mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Promote TfIdfvectorizer to ONNX ver 9 (#373)
* Advance ONNX commit, move Ngram files under ONNX and rename to TfIdfVectorizer * Rename Ngram to TfIdfVectorizer and redeclare in ONNX domain * Restore tfidfvectorizer tests * Remove ML definition.
This commit is contained in:
parent
89f643f04b
commit
829b2a5e81
7 changed files with 106 additions and 252 deletions
|
|
@ -13,9 +13,6 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Ngram);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int32_t, Ngram);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int64_t, Ngram);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear);
|
||||
|
|
@ -42,9 +39,6 @@ void RegisterContribKernels(KernelRegistry& kernel_registry) {
|
|||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Ngram)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int32_t, Ngram)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int64_t, Ngram)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear)>());
|
||||
|
|
|
|||
|
|
@ -275,28 +275,28 @@ activation and leaky_relu_alpha.)DOC")
|
|||
"",
|
||||
AttributeProto::FLOAT,
|
||||
OPTIONAL)
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromInputToOutput(ctx, 0, 0);
|
||||
if (hasNInputShapes(ctx, 2)) {
|
||||
auto transAAttr = ctx.getAttribute("transA");
|
||||
bool transA =
|
||||
transAAttr ? static_cast<int>(transAAttr->i()) != 0 : false;
|
||||
auto transBAttr = ctx.getAttribute("transB");
|
||||
bool transB =
|
||||
transBAttr ? static_cast<int>(transBAttr->i()) != 0 : false;
|
||||
auto& first_input_shape = getInputShape(ctx, 0);
|
||||
auto& second_input_shape = getInputShape(ctx, 1);
|
||||
if (first_input_shape.dim_size() != 2)
|
||||
fail_shape_inference("First input does not have rank 2");
|
||||
if (second_input_shape.dim_size() != 2)
|
||||
fail_shape_inference("Second input does not have rank 2");
|
||||
updateOutputShape(
|
||||
ctx,
|
||||
0,
|
||||
{first_input_shape.dim(transA ? 1 : 0),
|
||||
second_input_shape.dim(transB ? 0 : 1)});
|
||||
}
|
||||
});
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromInputToOutput(ctx, 0, 0);
|
||||
if (hasNInputShapes(ctx, 2)) {
|
||||
auto transAAttr = ctx.getAttribute("transA");
|
||||
bool transA =
|
||||
transAAttr ? static_cast<int>(transAAttr->i()) != 0 : false;
|
||||
auto transBAttr = ctx.getAttribute("transB");
|
||||
bool transB =
|
||||
transBAttr ? static_cast<int>(transBAttr->i()) != 0 : false;
|
||||
auto& first_input_shape = getInputShape(ctx, 0);
|
||||
auto& second_input_shape = getInputShape(ctx, 1);
|
||||
if (first_input_shape.dim_size() != 2)
|
||||
fail_shape_inference("First input does not have rank 2");
|
||||
if (second_input_shape.dim_size() != 2)
|
||||
fail_shape_inference("Second input does not have rank 2");
|
||||
updateOutputShape(
|
||||
ctx,
|
||||
0,
|
||||
{first_input_shape.dim(transA ? 1 : 0),
|
||||
second_input_shape.dim(transB ? 0 : 1)});
|
||||
}
|
||||
});
|
||||
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(ExpandDims)
|
||||
.SetDomain(kMSDomain)
|
||||
|
|
@ -374,139 +374,6 @@ activation and leaky_relu_alpha.)DOC")
|
|||
})
|
||||
.SetDoc(R"DOC(Tokenizer divides each string in X into a vector of strings along the last axis. All input strings including attributes are UTF-8 encoded.)DOC");
|
||||
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(Ngram)
|
||||
.SetDomain(kMSDomain)
|
||||
.SinceVersion(1)
|
||||
.Input(0, "X", "Input for n-gram extraction", "T")
|
||||
.Output(0, "Y", "Ngram results", "T1")
|
||||
.TypeConstraint(
|
||||
"T",
|
||||
{"tensor(string)", "tensor(int32)", "tensor(int64)"},
|
||||
"Input is ether string UTF-8 or int32/int64")
|
||||
.TypeConstraint(
|
||||
"T1",
|
||||
{"tensor(float)"},
|
||||
"1-D tensor of floats")
|
||||
.Attr(
|
||||
"max_gram_length",
|
||||
"Maximum n-gram length. If this value is 3, 3-grams will be used to generate the output.",
|
||||
AttributeProto::INT)
|
||||
.Attr(
|
||||
"min_gram_length",
|
||||
"Minimum n-gram length. If this value is 2 and max_gram_length is 3, output may contain counts of 2-grams and 3-grams.",
|
||||
AttributeProto::INT)
|
||||
.Attr(
|
||||
"max_skip_count",
|
||||
"Maximum number of items (integers/strings) to be skipped when constructing an n-gram from X."
|
||||
"If max_skip_count=1, min_gram_length=2, max_gram_length=3, this operator may generate 2-grams"
|
||||
"with skip_count=0 and skip_count=1, and 3-grams with skip_count=0 and skip_count=1",
|
||||
AttributeProto::INT)
|
||||
.Attr(
|
||||
"pool_strings",
|
||||
"List of strings n-grams learned from the training set. Either this or pool_int64s attributes must be present but not both."
|
||||
"It's an 1-D tensor starting with the collections of all 1-grams and ending with the collections of n-grams."
|
||||
"The i-th element in pool stores the n-gram that should be mapped to index ngram_indexes[i] in the output vector.",
|
||||
AttributeProto::STRINGS,
|
||||
OPTIONAL)
|
||||
.Attr(
|
||||
"pool_int64s",
|
||||
"List of int64 n-grams learned from the training set. Either this or pool_strings attributes must be present but not both."
|
||||
"It's an 1-D tensor starting with the collections of all 1-grams and ending with the collections of n-grams."
|
||||
"The i-th element in pool stores the n-gram that should be mapped to index ngram_indexes[i] in the output vector.",
|
||||
AttributeProto::INTS,
|
||||
OPTIONAL)
|
||||
.Attr(
|
||||
"ngram_counts",
|
||||
"The starting indexes of 1-grams, 2-grams, and so on in pool."
|
||||
"It is useful when determining the boundary between two consecutive collections of n-grams."
|
||||
"For example, if ngram_counts is [0, 17, 36], the first index (zero-based) of 1-gram/2-gram/3-gram"
|
||||
"in pool are 0/17/36. This format is essentially identical to CSR (or CSC) sparse matrix format, "
|
||||
"and we choose to keep this due to its popularity.",
|
||||
AttributeProto::INTS)
|
||||
.Attr(
|
||||
"ngram_indexes",
|
||||
"list of int64s (type: AttributeProto::INTS). This list is parallel to the specified 'pool_*' attribute."
|
||||
"The i-th element in ngram_indexes indicate the coordinate of the i-th n-gram in the output tensor.",
|
||||
AttributeProto::INTS)
|
||||
.Attr(
|
||||
"weights",
|
||||
"list of floats. This attribute stores the weight of each n-gram in pool. The i-th element in weights"
|
||||
"is the weight of the i-th n-gram in pool. Its length equals to the size of ngram_indexes."
|
||||
"By default, weights is an all-one tensor.This attribute is used when mode is \"IDF\" or \"TFIDF\""
|
||||
"to scale the associated word counts.",
|
||||
AttributeProto::FLOATS,
|
||||
OPTIONAL)
|
||||
.Attr(
|
||||
"mode",
|
||||
"The weighting criteria. It can be one of \"TF\" (term frequency),"
|
||||
"\"IDF\" (inverse document frequency), and \"TFIDF\" (the combination of TF and IDF)",
|
||||
AttributeProto::STRING)
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
auto output_elem_type = ctx.getOutputType(0)->mutable_tensor_type();
|
||||
output_elem_type->set_elem_type(ONNX_NAMESPACE::TensorProto::FLOAT);
|
||||
|
||||
if (hasInputShape(ctx, 0)) {
|
||||
std::vector<int64_t> ngram_indexes;
|
||||
ONNX_NAMESPACE::getRepeatedAttribute(ctx, "ngram_indexes", ngram_indexes);
|
||||
if (ngram_indexes.empty() || !std::all_of(ngram_indexes.cbegin(), ngram_indexes.cend(),
|
||||
[](int64_t i) { return i >= 0; })) {
|
||||
fail_shape_inference(
|
||||
"ngram_indexes must be non-empty with no negative values");
|
||||
}
|
||||
|
||||
auto greatest_hit = std::max_element(ngram_indexes.cbegin(), ngram_indexes.cend());
|
||||
auto max_last_axis = *greatest_hit + 1;
|
||||
|
||||
ONNX_NAMESPACE::TensorShapeProto output_shape;
|
||||
auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
|
||||
auto dim_size = input_shape.dim_size();
|
||||
if (dim_size == 0 || dim_size == 1) {
|
||||
output_shape.add_dim()->set_dim_value(max_last_axis);
|
||||
} else if (dim_size == 2) {
|
||||
auto& B_dim = input_shape.dim(0);
|
||||
if (!B_dim.has_dim_value()) {
|
||||
fail_shape_inference(
|
||||
"Input shape does not have first dimension value");
|
||||
}
|
||||
output_shape.add_dim()->set_dim_value(B_dim.dim_value());
|
||||
output_shape.add_dim()->set_dim_value(max_last_axis);
|
||||
} else {
|
||||
fail_shape_inference(
|
||||
"Input shape must have either [C] or [B,C] dimensions where C > 0 and B > 0");
|
||||
}
|
||||
updateOutputShape(ctx, 0, output_shape);
|
||||
}
|
||||
})
|
||||
.SetDoc(R"DOC(
|
||||
This transform extracts n-grams from the input sequence and save them as a vector. Input can
|
||||
be either a 1-D or 2-D tensor. For 1-D input, output is the n-gram representation of that input.
|
||||
For 2-D input, the output is also a 2-D tensor whose i-th row is the n-gram representation of the i-th input row.
|
||||
More specifically, if input shape is [C], the corresponding output shape would be [max(ngram_indexes) + 1].
|
||||
If input shape is [N, C], this operator produces a [N, max(ngram_indexes) + 1]-tensor.
|
||||
|
||||
In contrast to standard n-gram extraction, here, the indexes of extracting an n-gram from the original
|
||||
sequence are not necessarily consecutive numbers. The discontinuity between indexes are controlled by the number of skips.
|
||||
If the number of skips is 2, we should skip two tokens when scanning through the original sequence.
|
||||
Let's consider an example. Assume that input sequence is [94, 17, 36, 12, 28] and the number of skips is 2.
|
||||
The associated 2-grams are [94, 12] and [17, 28] respectively indexed by [0, 3] and [1, 4].
|
||||
If the number of skips becomes 0, the 2-grams generated are [94, 17], [17, 36], [36, 12], [12, 28]
|
||||
indexed by [0, 1], [1, 2], [2, 3], [3, 4], respectively.
|
||||
|
||||
The output vector stores the count of each n-gram;
|
||||
Y[i] indicates the times that the i-th n-gram is found. The attribute ngram_indexes is used to determine the mapping
|
||||
between index i and the corresponding n-gram. If pool_int64s is [94 , 17 ,17, 36], ngram_indexes is [1, 0],
|
||||
ngram_counts=[0, 0], then the Y[0] (first element in Y) and Y[1] (second element in Y) are the counts of [17, 36] and [94, 17],
|
||||
respectively. An n-gram which cannot be found in pool_strings/pool_int64s should be ignored and has no effect on the output.
|
||||
Note that we may consider all skips up to S when generating the n-grams.
|
||||
|
||||
The examples used above are true if mode is "TF". If mode is "IDF", all the counts larger than 1 would be truncated to 1 and
|
||||
the i-th element in weights would be used to scale (by multiplication) the count of the i-th n-gram in pool. If mode is "TFIDF",
|
||||
this operator first computes the counts of all n-grams and then scale them by the associated values in the weights attribute.
|
||||
|
||||
Only one of pool_strings and pool_int64s can be set. If pool_int64s is set, the input should be an integer tensor.
|
||||
If pool_strings is set, the input must be a string tensor.
|
||||
)DOC");
|
||||
|
||||
// Operators for linear 8-bit quantization support.
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinear)
|
||||
.SetDomain(kMSDomain)
|
||||
|
|
|
|||
|
|
@ -248,6 +248,9 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Asi
|
|||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Acosh);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Atanh);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Scan);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, string, TfIdfVectorizer);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int32_t, TfIdfVectorizer);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t, TfIdfVectorizer);
|
||||
|
||||
void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Clip)>());
|
||||
|
|
@ -489,6 +492,9 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
|
|||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Acosh)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Atanh)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Scan)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, string, TfIdfVectorizer)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int32_t, TfIdfVectorizer)>());
|
||||
kernel_registry.Register(BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t, TfIdfVectorizer)>());
|
||||
}
|
||||
|
||||
// Forward declarations of ml op kernels
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "ngram.h"
|
||||
#include "tfidfvectorizer.h"
|
||||
#include "onnx/defs/schema.h"
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/tensor.h"
|
||||
|
|
@ -12,34 +12,33 @@
|
|||
#include <iterator>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
|
||||
Ngram,
|
||||
1,
|
||||
ONNX_CPU_OPERATOR_TYPED_KERNEL(
|
||||
TfIdfVectorizer,
|
||||
9,
|
||||
string,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<std::string>())
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()),
|
||||
contrib::Ngram);
|
||||
TfIdfVectorizer);
|
||||
|
||||
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
|
||||
Ngram,
|
||||
1,
|
||||
ONNX_CPU_OPERATOR_TYPED_KERNEL(
|
||||
TfIdfVectorizer,
|
||||
9,
|
||||
int32_t,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<int32_t>())
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()),
|
||||
contrib::Ngram);
|
||||
TfIdfVectorizer);
|
||||
|
||||
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
|
||||
Ngram,
|
||||
1,
|
||||
ONNX_CPU_OPERATOR_TYPED_KERNEL(
|
||||
TfIdfVectorizer,
|
||||
9,
|
||||
int64_t,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<int64_t>())
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()),
|
||||
contrib::Ngram);
|
||||
TfIdfVectorizer);
|
||||
|
||||
namespace ngram_details {
|
||||
|
||||
|
|
@ -169,10 +168,9 @@ inline void Emplace(ForwardIter first, size_t ngrams, size_t ngram_size, size_t&
|
|||
}
|
||||
|
||||
} // namespace ngram_details
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
||||
using namespace onnxruntime::contrib::ngram_details;
|
||||
using namespace onnxruntime::ngram_details;
|
||||
|
||||
namespace std {
|
||||
template <typename T>
|
||||
|
|
@ -186,7 +184,6 @@ struct hash<NgramEntry<T>> {
|
|||
} // namespace std
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
// The weighting criteria.
|
||||
// "TF"(term frequency),
|
||||
|
|
@ -206,7 +203,7 @@ enum WeightingCriteria {
|
|||
kTFIDF = 3
|
||||
};
|
||||
|
||||
struct Ngram::Impl {
|
||||
struct TfIdfVectorizer::Impl {
|
||||
WeightingCriteria weighting_criteria_ = kNone;
|
||||
int64_t max_gram_length_ = 0;
|
||||
int64_t min_gram_length_ = 0;
|
||||
|
|
@ -251,36 +248,36 @@ struct Ngram::Impl {
|
|||
};
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolEnd<int64_t>() const {
|
||||
inline auto TfIdfVectorizer::Impl::PoolEnd<int64_t>() const {
|
||||
return int64_set_.cend();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolEnd<int32_t>() const {
|
||||
inline auto TfIdfVectorizer::Impl::PoolEnd<int32_t>() const {
|
||||
return PoolEnd<int64_t>();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolEnd<std::string>() const {
|
||||
inline auto TfIdfVectorizer::Impl::PoolEnd<std::string>() const {
|
||||
return str_set_.cend();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolFind<int64_t>(const NgramEntry<int64_t>& i) const {
|
||||
inline auto TfIdfVectorizer::Impl::PoolFind<int64_t>(const NgramEntry<int64_t>& i) const {
|
||||
return int64_set_.find(i);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolFind<int32_t>(const NgramEntry<int32_t>& i) const {
|
||||
inline auto TfIdfVectorizer::Impl::PoolFind<int32_t>(const NgramEntry<int32_t>& i) const {
|
||||
return int64_set_.find(i);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolFind<std::string>(const NgramEntry<std::string>& i) const {
|
||||
inline auto TfIdfVectorizer::Impl::PoolFind<std::string>(const NgramEntry<std::string>& i) const {
|
||||
return str_set_.find(i);
|
||||
}
|
||||
|
||||
Ngram::Ngram(const OpKernelInfo& info) : OpKernel(info), impl_(new Impl) {
|
||||
TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), impl_(new Impl) {
|
||||
std::string mode;
|
||||
Status status = info.GetAttr("mode", &mode);
|
||||
ORT_ENFORCE(status.IsOK(), "mode is required");
|
||||
|
|
@ -381,10 +378,10 @@ Ngram::Ngram(const OpKernelInfo& info) : OpKernel(info), impl_(new Impl) {
|
|||
}
|
||||
}
|
||||
|
||||
Ngram::~Ngram() {
|
||||
TfIdfVectorizer::~TfIdfVectorizer() {
|
||||
}
|
||||
|
||||
void Ngram::OutputResult(OpKernelContext* ctx, size_t B, const std::vector<uint32_t>& frequences) const {
|
||||
void TfIdfVectorizer::OutputResult(OpKernelContext* ctx, size_t B, const std::vector<uint32_t>& frequences) const {
|
||||
const Impl& impl = *impl_;
|
||||
std::vector<int64_t> output_dims;
|
||||
if (B == 0) {
|
||||
|
|
@ -437,7 +434,7 @@ void Ngram::OutputResult(OpKernelContext* ctx, size_t B, const std::vector<uint3
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
Status Ngram::ComputeImpl(OpKernelContext* ctx) const {
|
||||
Status TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx) const {
|
||||
const auto& impl = *impl_;
|
||||
auto const set_end = impl.PoolEnd<T>();
|
||||
|
||||
|
|
@ -559,7 +556,7 @@ Status Ngram::ComputeImpl(OpKernelContext* ctx) const {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Ngram::Compute(OpKernelContext* ctx) const {
|
||||
Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const {
|
||||
Status s;
|
||||
|
||||
auto X = ctx->Input<Tensor>(0);
|
||||
|
|
@ -578,5 +575,4 @@ Status Ngram::Compute(OpKernelContext* ctx) const {
|
|||
return s;
|
||||
}
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -9,13 +9,12 @@
|
|||
#include <vector>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
class Ngram final : public OpKernel {
|
||||
class TfIdfVectorizer final : public OpKernel {
|
||||
public:
|
||||
explicit Ngram(const OpKernelInfo& info);
|
||||
~Ngram();
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Ngram);
|
||||
explicit TfIdfVectorizer(const OpKernelInfo& info);
|
||||
~TfIdfVectorizer();
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TfIdfVectorizer);
|
||||
|
||||
Status Compute(OpKernelContext* ctx) const override;
|
||||
|
||||
|
|
@ -30,5 +29,4 @@ class Ngram final : public OpKernel {
|
|||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -344,14 +344,7 @@ int real_main(int argc, char* argv[]) {
|
|||
{"cast_FLOAT_to_STRING", "Cast opset 9 not supported yet"},
|
||||
{"cast_FLOAT_to_FLOAT16", "Cast opset 9 not supported yet"},
|
||||
{"cast_FLOAT16_to_DOUBLE", "Cast opset 9 not supported yet"},
|
||||
{"nonzero_example", "NonZero opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_uniandbigrams_skip5", "TfIdfVectorizer opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_batch_onlybigrams_skip0", "TfIdfVectorizer opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_onlybigrams_skip5", "TfIdfVectorizer opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_only_bigrams_skip0", "TfIdfVectorizer opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_onlybigrams_levelempty", "TfIdfVectorizer opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_batch_uniandbigrams_skip5", "TfIdfVectorizer opset 9 not supported yet"},
|
||||
{"tfidfvectorizer_tf_batch_onlybigrams_skip5", "TfIdfVectorizer opset 9 not supported yet"}};
|
||||
{"nonzero_example", "NonZero opset 9 not supported yet"}};
|
||||
|
||||
#ifdef USE_CUDA
|
||||
broken_tests["maxpool_2d_default"] = "cudnn pooling only support input dimension >= 3";
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@
|
|||
|
||||
namespace onnxruntime {
|
||||
namespace test {
|
||||
namespace ngram_test {
|
||||
namespace tfidfvectorizer_test {
|
||||
|
||||
constexpr const char* domain = onnxruntime::kMSDomain;
|
||||
const int opset_ver = 1;
|
||||
constexpr const char* domain = kOnnxDomain;
|
||||
const int opset_ver = 9;
|
||||
|
||||
void InitTestAttr(OpTester& test, const std::string& mode,
|
||||
int64_t min_gram_length, int64_t max_gram_length, int64_t max_skip_count,
|
||||
|
|
@ -36,9 +36,9 @@ void InitTestAttr(OpTester& test, const std::string& mode,
|
|||
test.AddAttribute("pool_strings", pool_strings);
|
||||
}
|
||||
}
|
||||
} // namespace ngram_test
|
||||
} // namespace tfidfvectorizer_test
|
||||
|
||||
using namespace ngram_test;
|
||||
using namespace tfidfvectorizer_test;
|
||||
|
||||
// Here is what takes place in general and in particular
|
||||
// in this unit test. There are 7 n-grams: 4 unigrams and 3 bigrams
|
||||
|
|
@ -48,8 +48,8 @@ using namespace ngram_test;
|
|||
// However, attribute all controls whether we consider all of the supplied ngram[M..N] sizes
|
||||
// into consideration or not. With all = false, we only consider N-grams.
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_onlyBigrams_Skip0) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
|
|
@ -70,8 +70,8 @@ TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_Skip0) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_BatchOnlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_BatchOnlyBigrams_Skip0) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
|
|
@ -95,8 +95,8 @@ TEST(ContribOpNgramTest, Int32_TF_BatchOnlyBigrams_Skip0) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_OnlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TF_OnlyBigrams_Skip0) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
|
|
@ -118,8 +118,8 @@ TEST(ContribOpNgramTest, String_TF_OnlyBigrams_Skip0) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_BatchOnlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TF_BatchOnlyBigrams_Skip0) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
|
|
@ -145,8 +145,8 @@ TEST(ContribOpNgramTest, String_TF_BatchOnlyBigrams_Skip0) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_LevelEmpty) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_onlyBigrams_LevelEmpty) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 0}, // no unigrams, bi-grams start immediately
|
||||
|
|
@ -171,8 +171,8 @@ TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_LevelEmpty) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -195,8 +195,8 @@ TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_BatchOnlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_BatchOnlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -221,8 +221,8 @@ TEST(ContribOpNgramTest, Int32_TF_BatchOnlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TF_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -246,8 +246,8 @@ TEST(ContribOpNgramTest, String_TF_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_BatchOnlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TF_BatchOnlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -270,8 +270,8 @@ TEST(ContribOpNgramTest, String_TF_BatchOnlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_UniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_UniAndBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -293,8 +293,8 @@ TEST(ContribOpNgramTest, Int32_TF_UniAndBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_BatchUniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TF_BatchUniAndBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -318,8 +318,8 @@ TEST(ContribOpNgramTest, Int32_TF_BatchUniAndBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_UniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TF_UniAndBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -341,8 +341,8 @@ TEST(ContribOpNgramTest, String_TF_UniAndBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_BatchUniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TF_BatchUniAndBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -366,8 +366,8 @@ TEST(ContribOpNgramTest, String_TF_BatchUniAndBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_IDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_IDF_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
// We change to IDF but do not supply weights so
|
||||
// we should get all 1.0f where count is not zero
|
||||
|
|
@ -390,8 +390,8 @@ TEST(ContribOpNgramTest, Int32_IDF_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_IDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_IDF_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "IDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -413,8 +413,8 @@ TEST(ContribOpNgramTest, String_IDF_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TFIDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TFIDF_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
// We change to TFIDF but do not supply weights so
|
||||
// we should get all the original values as weights are 1.0f by
|
||||
|
|
@ -438,8 +438,8 @@ TEST(ContribOpNgramTest, Int32_TFIDF_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TFIDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TFIDF_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TFIDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -461,8 +461,8 @@ TEST(ContribOpNgramTest, String_TFIDF_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_IDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_IDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, int32
|
||||
// We change to IDF with supplied weights. All
|
||||
// with non-zero counts must be replaced with the supplied weights
|
||||
|
|
@ -485,8 +485,8 @@ TEST(ContribOpNgramTest, Int32_IDFWeights_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_IDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_IDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, string
|
||||
InitTestAttr(test, "IDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
|
|
@ -508,8 +508,8 @@ TEST(ContribOpNgramTest, String_IDFWeights_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TFIDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, Int32_TFIDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, int32
|
||||
// We change to TFIDF with supplied weights.
|
||||
// We should have all counts scaled by weights
|
||||
|
|
@ -532,8 +532,8 @@ TEST(ContribOpNgramTest, Int32_TFIDFWeights_onlyBigrams_Skip5) {
|
|||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TFIDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
TEST(TfIdfVectorizerTest, String_TFIDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("TfIdfVectorizer", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, string
|
||||
InitTestAttr(test, "TFIDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
Loading…
Reference in a new issue