mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-05 04:17:53 +00:00
Implement N-gram (#180)
* Implement N-gram Do not load unnecessary pool n-grams. Add String typed tests. Set output size to the mav ngram_index value plus 1. * Address security warnings and some review comments. * Fix build issues, rework sampling to try all n-gram sizes at a given offset. * Rework the loop so all n should be tried at a given offset and we do not add the same items all over again such as b,c and next we try b,c,d but we no longer add b,c again. * Compute hash incrementally so we do not re-hash elements that were already there when we add more elements to n-gram. * Address review comments. TODO: Remove all attribute. * Remove all attribute, adjust tests. Correct docs. * Address more review comments. * Create Type And Shape inference function. * Address review comments. Implement batch mode per new spec. * Correct switch bracing in OutputResult and re-test. * Fix shape error message within TypeAndShapeInferenceFunction.
This commit is contained in:
parent
8fba324678
commit
058803086d
5 changed files with 1314 additions and 0 deletions
|
|
@ -12,6 +12,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Ngram);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int32_t, Ngram);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int64_t, Ngram);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear);
|
||||
|
|
@ -32,6 +35,9 @@ void RegisterContribKernels(std::function<void(KernelCreateInfo&&)> fn) {
|
|||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Ngram)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int32_t, Ngram)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int64_t, Ngram)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>());
|
||||
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear)>());
|
||||
|
|
|
|||
582
onnxruntime/contrib_ops/cpu/ngram.cc
Normal file
582
onnxruntime/contrib_ops/cpu/ngram.cc
Normal file
|
|
@ -0,0 +1,582 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "ngram.h"
|
||||
#include "onnx/defs/schema.h"
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/tensor.h"
|
||||
|
||||
#include <functional>
|
||||
#include <unordered_set>
|
||||
#include <ostream>
|
||||
#include <iterator>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
|
||||
Ngram,
|
||||
1,
|
||||
string,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<std::string>())
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()),
|
||||
contrib::Ngram);
|
||||
|
||||
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
|
||||
Ngram,
|
||||
1,
|
||||
int32_t,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<int32_t>())
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()),
|
||||
contrib::Ngram);
|
||||
|
||||
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
|
||||
Ngram,
|
||||
1,
|
||||
int64_t,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<int64_t>())
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()),
|
||||
contrib::Ngram);
|
||||
|
||||
namespace ngram_details {
|
||||
|
||||
class NgramEntryBase {
|
||||
size_t id_; // Id in the pool
|
||||
protected:
|
||||
NgramEntryBase(size_t id) : id_(id) {}
|
||||
~NgramEntryBase() = default;
|
||||
|
||||
public:
|
||||
size_t Id() const { return id_; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class NgramEntry;
|
||||
|
||||
template <>
|
||||
class NgramEntry<int64_t> : public NgramEntryBase {
|
||||
std::vector<int64_t> items_;
|
||||
size_t hash_ = 0;
|
||||
|
||||
void RunningHash(int64_t v) {
|
||||
std::hash<int64_t> hf{};
|
||||
hash_ ^= hf(v) + 0x9e3779b9 + (hash_ << 6) + (hash_ >> 2);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename ForwardIter>
|
||||
explicit NgramEntry(size_t id, ForwardIter first, ForwardIter last) : NgramEntryBase(id) {
|
||||
while (first != last) {
|
||||
RunningHash(*first);
|
||||
items_.push_back(*first);
|
||||
++first;
|
||||
}
|
||||
assert(!items_.empty());
|
||||
}
|
||||
// For sampling
|
||||
explicit NgramEntry() : NgramEntryBase(0) {}
|
||||
void AddItem(int64_t v) {
|
||||
items_.push_back(v);
|
||||
RunningHash(v);
|
||||
}
|
||||
void DebugPrint() const {
|
||||
std::copy(items_.cbegin(), items_.cend(), std::ostream_iterator<int64_t>(std::cout, ","));
|
||||
std::cout << std::endl;
|
||||
}
|
||||
void Clear() {
|
||||
items_.clear();
|
||||
hash_ = 0;
|
||||
}
|
||||
bool operator==(const NgramEntry& o) const {
|
||||
return items_ == o.items_;
|
||||
}
|
||||
size_t Hash() const {
|
||||
return hash_;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class NgramEntry<int32_t> : public NgramEntry<int64_t> {
|
||||
public:
|
||||
template <typename ForwardIter>
|
||||
explicit NgramEntry(size_t id, ForwardIter first, ForwardIter last) : NgramEntry<int64_t>(id, first, last) {}
|
||||
explicit NgramEntry() = default;
|
||||
};
|
||||
|
||||
template <>
|
||||
class NgramEntry<std::string> : public NgramEntryBase {
|
||||
private:
|
||||
std::vector<std::reference_wrapper<const std::string>> items_;
|
||||
size_t hash_ = 0;
|
||||
|
||||
void RunningHash(const std::string& s) {
|
||||
std::hash<std::string> hf{};
|
||||
hash_ ^= hf(s) + 0x9e3779b9 + (hash_ << 6) + (hash_ >> 2);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename ForwardIter>
|
||||
explicit NgramEntry(size_t id, ForwardIter first, ForwardIter last) : NgramEntryBase(id) {
|
||||
while (first != last) {
|
||||
RunningHash(*first);
|
||||
items_.push_back(std::cref(*first));
|
||||
++first;
|
||||
}
|
||||
assert(!items_.empty());
|
||||
}
|
||||
explicit NgramEntry() : NgramEntryBase(0) {}
|
||||
void AddItem(const std::string& s) {
|
||||
items_.push_back(std::cref(s));
|
||||
RunningHash(s);
|
||||
}
|
||||
void DebugPrint() const {
|
||||
std::copy(items_.cbegin(), items_.cend(), std::ostream_iterator<std::string>(std::cout, ","));
|
||||
std::cout << std::endl;
|
||||
}
|
||||
void Clear() {
|
||||
items_.clear();
|
||||
hash_ = 0;
|
||||
}
|
||||
|
||||
bool operator==(const NgramEntry& o) const {
|
||||
if (items_.size() == o.items_.size()) {
|
||||
return std::equal(items_.cbegin(), items_.cend(),
|
||||
o.items_.cbegin(), o.items_.cend(),
|
||||
std::equal_to<std::string>());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
size_t Hash() const {
|
||||
return hash_;
|
||||
}
|
||||
};
|
||||
|
||||
using IntegerPoolSet = std::unordered_set<NgramEntry<int64_t>>;
|
||||
// Does not own strings, contains references to them. This helps
|
||||
// to search by string references that point to the current input.
|
||||
using StringPoolSet = std::unordered_set<NgramEntry<std::string>>;
|
||||
|
||||
template <typename ForwardIter, typename Cont>
|
||||
inline void Emplace(ForwardIter first, size_t ngrams, size_t ngram_size, size_t& ngram_id, Cont& c) {
|
||||
for (; ngrams > 0; --ngrams) {
|
||||
c.emplace(ngram_id, first, first + ngram_size);
|
||||
first += ngram_size;
|
||||
++ngram_id;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ngram_details
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
||||
using namespace onnxruntime::contrib::ngram_details;
|
||||
|
||||
namespace std {
|
||||
template <typename T>
|
||||
struct hash<NgramEntry<T>> {
|
||||
typedef NgramEntry<T> argument_type;
|
||||
typedef size_t result_type;
|
||||
result_type operator()(const argument_type& a) const {
|
||||
return a.Hash();
|
||||
}
|
||||
};
|
||||
} // namespace std
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
// The weighting criteria.
|
||||
// "TF"(term frequency),
|
||||
// the counts are propagated to output
|
||||
// "IDF"(inverse document frequency),
|
||||
// all the counts larger than 1
|
||||
// would be truncated to 1 and the i-th element
|
||||
// in weights would be used to scale (by multiplication)
|
||||
// the count of the i-th n-gram in pool
|
||||
// "TFIDF" (the combination of TF and IDF).
|
||||
// counts are scaled by the associated values in the weights attribute.
|
||||
|
||||
enum WeightingCriteria {
|
||||
kNone = 0,
|
||||
kTF = 1,
|
||||
kIDF = 2,
|
||||
kTFIDF = 3
|
||||
};
|
||||
|
||||
struct Ngram::Impl {
|
||||
WeightingCriteria weighting_criteria_ = kNone;
|
||||
int64_t max_gram_length_ = 0;
|
||||
int64_t min_gram_length_ = 0;
|
||||
int64_t max_skip_count_ = 0;
|
||||
// This is the content of ngram_counts attribute.
|
||||
// The starting indexes of 1-grams, 2-grams,
|
||||
// and so on in pool. For example, if ngram_counts is [0, 17, 36],
|
||||
// the first index (zero-based) of 1-gram/2-gram/3-gram
|
||||
// in pool are 0/17/36.
|
||||
std::vector<int64_t> ngram_counts_;
|
||||
// Contains output indexes
|
||||
// represents ngram_indexes output
|
||||
std::vector<int64_t> ngram_indexes_;
|
||||
std::vector<float> weights_;
|
||||
|
||||
std::vector<std::string> pool_strings_;
|
||||
// This set contains references to pool_string_ entries
|
||||
// of pool_strings attribute
|
||||
StringPoolSet str_set_;
|
||||
// This set contains pool_int64s entries
|
||||
IntegerPoolSet int64_set_;
|
||||
size_t output_size_ = 0;
|
||||
|
||||
Impl() = default;
|
||||
~Impl() = default;
|
||||
Impl(const Impl&) = delete;
|
||||
Impl& operator=(const Impl&) = delete;
|
||||
|
||||
template <typename T>
|
||||
auto PoolEnd() const;
|
||||
|
||||
template <typename T>
|
||||
auto PoolFind(const ngram_details::NgramEntry<T>&) const;
|
||||
|
||||
void IncrementCount(size_t ngram_id, size_t row_num,
|
||||
std::vector<uint32_t>& frequencies) const {
|
||||
assert(ngram_id < ngram_indexes_.size());
|
||||
auto output_idx = row_num * output_size_ + ngram_indexes_[ngram_id];
|
||||
assert(static_cast<size_t>(output_idx) < frequencies.size());
|
||||
++frequencies[output_idx];
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolEnd<int64_t>() const {
|
||||
return int64_set_.cend();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolEnd<int32_t>() const {
|
||||
return PoolEnd<int64_t>();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolEnd<std::string>() const {
|
||||
return str_set_.cend();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolFind<int64_t>(const NgramEntry<int64_t>& i) const {
|
||||
return int64_set_.find(i);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolFind<int32_t>(const NgramEntry<int32_t>& i) const {
|
||||
return int64_set_.find(i);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline auto Ngram::Impl::PoolFind<std::string>(const NgramEntry<std::string>& i) const {
|
||||
return str_set_.find(i);
|
||||
}
|
||||
|
||||
Ngram::Ngram(const OpKernelInfo& info) : OpKernel(info), impl_(new Impl) {
|
||||
std::string mode;
|
||||
Status status = info.GetAttr("mode", &mode);
|
||||
ORT_ENFORCE(status.IsOK(), "mode is required");
|
||||
if (mode == "TF") {
|
||||
impl_->weighting_criteria_ = kTF;
|
||||
} else if (mode == "IDF") {
|
||||
impl_->weighting_criteria_ = kIDF;
|
||||
} else if (mode == "TFIDF") {
|
||||
impl_->weighting_criteria_ = kTFIDF;
|
||||
}
|
||||
ORT_ENFORCE(impl_->weighting_criteria_ != kNone, "mode: ", mode, " is unrecognized, acceptable values are TF,IDF,TFIDF");
|
||||
|
||||
status = info.GetAttr("min_gram_length", &impl_->min_gram_length_);
|
||||
ORT_ENFORCE(status.IsOK(), "min_gram_length is required");
|
||||
ORT_ENFORCE(impl_->min_gram_length_ > 0, "Required min_gram_length must be positive: ", std::to_string(impl_->min_gram_length_));
|
||||
|
||||
status = info.GetAttr("max_gram_length", &impl_->max_gram_length_);
|
||||
ORT_ENFORCE(status.IsOK(), "min_gram_length is required");
|
||||
ORT_ENFORCE(impl_->max_gram_length_ >= impl_->min_gram_length_,
|
||||
"min_gram_length >= max_gram_length required: ",
|
||||
std::to_string(impl_->max_gram_length_), " >= ", std::to_string(impl_->min_gram_length_));
|
||||
|
||||
status = info.GetAttr("max_skip_count", &impl_->max_skip_count_);
|
||||
ORT_ENFORCE(status.IsOK(), "max_skip_count is required");
|
||||
ORT_ENFORCE(impl_->max_skip_count_ >= 0, "max_skip_count must be non-negative: ", std::to_string(impl_->max_skip_count_));
|
||||
|
||||
status = info.GetAttrs(std::string("ngram_counts"), impl_->ngram_counts_);
|
||||
ORT_ENFORCE(status.IsOK() && !impl_->ngram_counts_.empty(), "Non-empty ngram_counts is required");
|
||||
ORT_ENFORCE(size_t(impl_->min_gram_length_) <= impl_->ngram_counts_.size(),
|
||||
"min_gram_length must be inbounds of ngram_counts: ",
|
||||
std::to_string(impl_->min_gram_length_), " <= ", std::to_string(impl_->ngram_counts_.size()));
|
||||
ORT_ENFORCE(size_t(impl_->max_gram_length_) <= impl_->ngram_counts_.size(),
|
||||
"max_gram_length must be inbounds of ngram_counts: ",
|
||||
std::to_string(impl_->max_gram_length_), " <= ", std::to_string(impl_->ngram_counts_.size()));
|
||||
|
||||
status = info.GetAttrs("ngram_indexes", impl_->ngram_indexes_);
|
||||
ORT_ENFORCE(status.IsOK() && !impl_->ngram_indexes_.empty(), "Non-empty ngram_indexes is required");
|
||||
{
|
||||
// Check that all are positive
|
||||
ORT_ENFORCE(std::all_of(impl_->ngram_indexes_.cbegin(), impl_->ngram_indexes_.cend(),
|
||||
[](int64_t i) { return i >= 0; }),
|
||||
"Negative ngram_indexes values are not allowed");
|
||||
// Set output size to max output index + 1;
|
||||
auto greatest_hit = std::max_element(impl_->ngram_indexes_.cbegin(), impl_->ngram_indexes_.cend());
|
||||
impl_->output_size_ = *greatest_hit + 1;
|
||||
}
|
||||
|
||||
status = info.GetAttrs("weights", impl_->weights_);
|
||||
if (status.IsOK()) {
|
||||
ORT_ENFORCE(impl_->weights_.size() == impl_->ngram_indexes_.size(),
|
||||
"Got weights of size: ", std::to_string(impl_->weights_.size()),
|
||||
" but ngram_indexes size: ", std::to_string(impl_->ngram_indexes_.size()),
|
||||
" must be of equal size");
|
||||
}
|
||||
|
||||
std::vector<int64_t> pool_int64s;
|
||||
status = info.GetAttrs("pool_strings", impl_->pool_strings_);
|
||||
if (status.IsOK()) {
|
||||
ORT_ENFORCE(!impl_->pool_strings_.empty(), "pool_strings must not be empty if specified");
|
||||
} else {
|
||||
status = info.GetAttrs("pool_int64s", pool_int64s);
|
||||
ORT_ENFORCE(status.IsOK() && !pool_int64s.empty(), "non-empty pool_int64s is required if pool_strings not provided");
|
||||
}
|
||||
|
||||
// Iterator via the pool. Insert 1 item for 1-grams, 2 items for 2-grams, etc.
|
||||
const auto total_items = (impl_->pool_strings_.empty()) ? pool_int64s.size() : impl_->pool_strings_.size();
|
||||
size_t ngram_id = 0;
|
||||
// Load into dictionary only required gram sizes
|
||||
const size_t min_gram_length = impl_->min_gram_length_;
|
||||
const size_t max_gram_length = impl_->max_gram_length_;
|
||||
size_t ngram_size = 1;
|
||||
for (size_t i = 0; i < impl_->ngram_counts_.size(); ++i) {
|
||||
size_t start_idx = impl_->ngram_counts_[i];
|
||||
size_t end_idx = ((i + 1) < impl_->ngram_counts_.size()) ? impl_->ngram_counts_[i + 1] : total_items;
|
||||
ORT_ENFORCE(end_idx >= start_idx && end_idx <= total_items,
|
||||
"n-gram counts out of bounds for ", std::to_string(ngram_size), "-grams");
|
||||
auto items = end_idx - start_idx;
|
||||
if (items > 0) {
|
||||
ORT_ENFORCE((items % ngram_size == 0),
|
||||
"Number of items must compose whole ", std::to_string(ngram_size), "-grams");
|
||||
auto ngrams = items / ngram_size;
|
||||
// Skip loading into hash_set ngrams that are not in the range of [min_gram_length-max_gram_length]
|
||||
if (ngram_size >= min_gram_length && ngram_size <= max_gram_length) {
|
||||
if (impl_->pool_strings_.empty()) {
|
||||
auto before_insert = impl_->int64_set_.size();
|
||||
Emplace(pool_int64s.begin() + start_idx, ngrams, ngram_size, ngram_id, impl_->int64_set_);
|
||||
ORT_ENFORCE((before_insert + ngrams) == impl_->int64_set_.size(), "pool_int64s duplicate ", std::to_string(ngram_size), "-grams detected");
|
||||
} else {
|
||||
auto before_insert = impl_->str_set_.size();
|
||||
Emplace(impl_->pool_strings_.begin() + start_idx, ngrams, ngram_size, ngram_id, impl_->str_set_);
|
||||
ORT_ENFORCE((before_insert + ngrams) == impl_->str_set_.size(), "poll_strings duplicate ", std::to_string(ngram_size), "-grams detected");
|
||||
}
|
||||
} else {
|
||||
ngram_id += ngrams;
|
||||
}
|
||||
}
|
||||
++ngram_size;
|
||||
}
|
||||
}
|
||||
|
||||
Ngram::~Ngram() {
|
||||
}
|
||||
|
||||
void Ngram::OutputResult(OpKernelContext* ctx, size_t B, const std::vector<uint32_t>& frequences) const {
|
||||
const Impl& impl = *impl_;
|
||||
std::vector<int64_t> output_dims;
|
||||
if (B == 0) {
|
||||
output_dims.push_back(impl.output_size_);
|
||||
} else {
|
||||
output_dims.push_back(B);
|
||||
output_dims.push_back(impl.output_size_);
|
||||
}
|
||||
|
||||
TensorShape output_shape(output_dims);
|
||||
assert(frequences.size() == static_cast<size_t>(output_shape.Size()));
|
||||
|
||||
auto Y = ctx->Output(0, output_shape);
|
||||
auto output_data = Y->MutableData<float>();
|
||||
const auto& w = impl.weights_;
|
||||
switch (impl.weighting_criteria_) {
|
||||
case kTF: {
|
||||
for (auto f : frequences) {
|
||||
*output_data++ = static_cast<float>(f);
|
||||
}
|
||||
} break;
|
||||
case kIDF: {
|
||||
if (!w.empty()) {
|
||||
assert(frequences.size() == w.size());
|
||||
for (size_t i = 0; i < frequences.size(); ++i) {
|
||||
*output_data++ = (frequences[i] > 0) ? w[i] : 0;
|
||||
}
|
||||
} else {
|
||||
for (auto f : frequences) {
|
||||
*output_data++ = (f > 0) ? 1.0f : 0;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case kTFIDF: {
|
||||
if (!w.empty()) {
|
||||
assert(frequences.size() == w.size());
|
||||
for (size_t i = 0; i < frequences.size(); ++i) {
|
||||
*output_data++ = frequences[i] * w[i];
|
||||
}
|
||||
} else {
|
||||
for (auto f : frequences) {
|
||||
*output_data++ = static_cast<float>(f);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case kNone: // fall-through
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status Ngram::ComputeImpl(OpKernelContext* ctx) const {
|
||||
const auto& impl = *impl_;
|
||||
auto const set_end = impl.PoolEnd<T>();
|
||||
|
||||
auto X = ctx->Input<Tensor>(0);
|
||||
auto& input_shape = X->Shape();
|
||||
const size_t total_items = input_shape.Size();
|
||||
|
||||
size_t b_dim = 0;
|
||||
size_t B = 0;
|
||||
size_t C = 0;
|
||||
auto& input_dims = input_shape.GetDims();
|
||||
if (input_dims.empty()) {
|
||||
b_dim = 1;
|
||||
C = 1;
|
||||
assert(total_items == 1);
|
||||
} else if (input_dims.size() == 1) {
|
||||
b_dim = 1;
|
||||
C = input_dims[0];
|
||||
if (C < 1) {
|
||||
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
|
||||
"Input shape must have either [C] or [B,C] dimensions where C > 0 and B > 0");
|
||||
}
|
||||
} else if (input_dims.size() == 2) {
|
||||
B = input_dims[0];
|
||||
C = input_dims[1];
|
||||
b_dim = B;
|
||||
if (B < 1 || C < 1) {
|
||||
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
|
||||
"Input shape must have either [C] or [B,C] dimensions where C > 0 and B > 0");
|
||||
}
|
||||
} else {
|
||||
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
|
||||
"Input shape must have either [C] or [B,C] dimensions where C > 0 and B > 0");
|
||||
}
|
||||
|
||||
assert((b_dim * C) == total_items);
|
||||
|
||||
// Frequency holder allocate [B..output_size_]
|
||||
// and init all to zero
|
||||
std::vector<uint32_t> frequencies;
|
||||
frequencies.resize(b_dim * impl.output_size_, 0);
|
||||
|
||||
const auto max_gram_length = impl.max_gram_length_;
|
||||
const auto max_skip_distance = impl.max_skip_count_ + 1; // Convert to distance
|
||||
auto start_ngram_size = impl.min_gram_length_;
|
||||
auto const input_data = X->template Data<T>();
|
||||
auto const end_data = input_data + total_items;
|
||||
NgramEntry<T> sample;
|
||||
|
||||
// Treat 1-grams in a special way
|
||||
if (start_ngram_size == 1) {
|
||||
size_t row_num = 0;
|
||||
auto ngram_start = input_data;
|
||||
while (ngram_start < end_data) {
|
||||
auto const ngram_row_end = ngram_start + C;
|
||||
while (ngram_start < ngram_row_end) {
|
||||
sample.Clear();
|
||||
sample.AddItem(*ngram_start);
|
||||
auto hit = impl.PoolFind<T>(sample);
|
||||
if (hit != set_end) {
|
||||
// record frequency
|
||||
auto ngram_id = hit->Id();
|
||||
impl.IncrementCount(ngram_id, row_num, frequencies);
|
||||
}
|
||||
++ngram_start;
|
||||
}
|
||||
++row_num;
|
||||
ngram_start = ngram_row_end;
|
||||
}
|
||||
if (++start_ngram_size > max_gram_length) {
|
||||
OutputResult(ctx, B, frequencies);
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
for (auto skip_distance = 1; skip_distance <= max_skip_distance; ++skip_distance) {
|
||||
auto ngram_start = input_data;
|
||||
size_t row_num = 0;
|
||||
while (ngram_start < end_data) {
|
||||
assert((B == 0) || (row_num < B));
|
||||
auto const ngram_row_end = ngram_start + C;
|
||||
assert(ngram_row_end <= end_data);
|
||||
while (ngram_start < ngram_row_end) {
|
||||
// Check if any n-gram size in [start_ngram_size..max_gram_length] range
|
||||
// fit before the end of the row so we do not waste time adding [1..start_ngram_size)
|
||||
// At least items of start_ngram_size should fit
|
||||
// last row should match end_data
|
||||
auto at_least_this = ngram_start + skip_distance * (start_ngram_size - 1);
|
||||
if (at_least_this >= ngram_row_end) {
|
||||
break;
|
||||
}
|
||||
sample.Clear();
|
||||
auto ngram_item = ngram_start;
|
||||
for (auto ngram_size = 1;
|
||||
ngram_size <= max_gram_length &&
|
||||
ngram_item < ngram_row_end;
|
||||
++ngram_size, ngram_item += skip_distance) {
|
||||
sample.AddItem(*ngram_item);
|
||||
|
||||
// Do not test anything before start_ngram_size
|
||||
if (ngram_size >= start_ngram_size) {
|
||||
auto hit = impl.PoolFind<T>(sample);
|
||||
if (hit != set_end) {
|
||||
// record frequency
|
||||
auto ngram_id = hit->Id();
|
||||
impl.IncrementCount(ngram_id, row_num, frequencies);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Sliding window shift
|
||||
++ngram_start;
|
||||
}
|
||||
// Next row
|
||||
ngram_start = ngram_row_end;
|
||||
++row_num;
|
||||
}
|
||||
}
|
||||
OutputResult(ctx, B, frequencies);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Ngram::Compute(OpKernelContext* ctx) const {
|
||||
Status s;
|
||||
|
||||
auto X = ctx->Input<Tensor>(0);
|
||||
|
||||
if (X->DataType() == DataTypeImpl::GetType<int32_t>()) {
|
||||
s = ComputeImpl<int32_t>(ctx);
|
||||
} else if (X->DataType() == DataTypeImpl::GetType<int64_t>()) {
|
||||
s = ComputeImpl<int64_t>(ctx);
|
||||
} else if (X->DataType() == DataTypeImpl::GetType<std::string>()) {
|
||||
s = ComputeImpl<std::string>(ctx);
|
||||
} else {
|
||||
s = Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
|
||||
"Invalid type of the input argument");
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
34
onnxruntime/contrib_ops/cpu/ngram.h
Normal file
34
onnxruntime/contrib_ops/cpu/ngram.h
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
class Ngram final : public OpKernel {
|
||||
public:
|
||||
explicit Ngram(const OpKernelInfo& info);
|
||||
~Ngram();
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Ngram);
|
||||
|
||||
Status Compute(OpKernelContext* ctx) const override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
Status ComputeImpl(OpKernelContext* ctx) const;
|
||||
|
||||
// Apply weighing criteria and output
|
||||
void OutputResult(OpKernelContext* ctx, size_t b_dim, const std::vector<uint32_t>& frequences) const;
|
||||
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -214,6 +214,139 @@ activation.)DOC")
|
|||
})
|
||||
.SetDoc(R"DOC(Tokenizer divides each string in X into a vector of strings along the last axis. All input strings including attributes are UTF-8 encoded.)DOC");
|
||||
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(Ngram)
|
||||
.SetDomain(kMSDomain)
|
||||
.SinceVersion(1)
|
||||
.Input(0, "X", "Input for n-gram extraction", "T")
|
||||
.Output(0, "Y", "Ngram results", "T1")
|
||||
.TypeConstraint(
|
||||
"T",
|
||||
{"tensor(string)", "tensor(int32)", "tensor(int64)"},
|
||||
"Input is ether string UTF-8 or int32/int64")
|
||||
.TypeConstraint(
|
||||
"T1",
|
||||
{"tensor(float)"},
|
||||
"1-D tensor of floats")
|
||||
.Attr(
|
||||
"max_gram_length",
|
||||
"Maximum n-gram length. If this value is 3, 3-grams will be used to generate the output.",
|
||||
AttributeProto::INT)
|
||||
.Attr(
|
||||
"min_gram_length",
|
||||
"Minimum n-gram length. If this value is 2 and max_gram_length is 3, output may contain counts of 2-grams and 3-grams.",
|
||||
AttributeProto::INT)
|
||||
.Attr(
|
||||
"max_skip_count",
|
||||
"Maximum number of items (integers/strings) to be skipped when constructing an n-gram from X."
|
||||
"If max_skip_count=1, min_gram_length=2, max_gram_length=3, this operator may generate 2-grams"
|
||||
"with skip_count=0 and skip_count=1, and 3-grams with skip_count=0 and skip_count=1",
|
||||
AttributeProto::INT)
|
||||
.Attr(
|
||||
"pool_strings",
|
||||
"List of strings n-grams learned from the training set. Either this or pool_int64s attributes must be present but not both."
|
||||
"It's an 1-D tensor starting with the collections of all 1-grams and ending with the collections of n-grams."
|
||||
"The i-th element in pool stores the n-gram that should be mapped to index ngram_indexes[i] in the output vector.",
|
||||
AttributeProto::STRINGS,
|
||||
OPTIONAL)
|
||||
.Attr(
|
||||
"pool_int64s",
|
||||
"List of int64 n-grams learned from the training set. Either this or pool_strings attributes must be present but not both."
|
||||
"It's an 1-D tensor starting with the collections of all 1-grams and ending with the collections of n-grams."
|
||||
"The i-th element in pool stores the n-gram that should be mapped to index ngram_indexes[i] in the output vector.",
|
||||
AttributeProto::INTS,
|
||||
OPTIONAL)
|
||||
.Attr(
|
||||
"ngram_counts",
|
||||
"The starting indexes of 1-grams, 2-grams, and so on in pool."
|
||||
"It is useful when determining the boundary between two consecutive collections of n-grams."
|
||||
"For example, if ngram_counts is [0, 17, 36], the first index (zero-based) of 1-gram/2-gram/3-gram"
|
||||
"in pool are 0/17/36. This format is essentially identical to CSR (or CSC) sparse matrix format, "
|
||||
"and we choose to keep this due to its popularity.",
|
||||
AttributeProto::INTS)
|
||||
.Attr(
|
||||
"ngram_indexes",
|
||||
"list of int64s (type: AttributeProto::INTS). This list is parallel to the specified 'pool_*' attribute."
|
||||
"The i-th element in ngram_indexes indicate the coordinate of the i-th n-gram in the output tensor.",
|
||||
AttributeProto::INTS)
|
||||
.Attr(
|
||||
"weights",
|
||||
"list of floats. This attribute stores the weight of each n-gram in pool. The i-th element in weights"
|
||||
"is the weight of the i-th n-gram in pool. Its length equals to the size of ngram_indexes."
|
||||
"By default, weights is an all-one tensor.This attribute is used when mode is \"IDF\" or \"TFIDF\""
|
||||
"to scale the associated word counts.",
|
||||
AttributeProto::FLOATS,
|
||||
OPTIONAL)
|
||||
.Attr(
|
||||
"mode",
|
||||
"The weighting criteria. It can be one of \"TF\" (term frequency),"
|
||||
"\"IDF\" (inverse document frequency), and \"TFIDF\" (the combination of TF and IDF)",
|
||||
AttributeProto::STRING)
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
auto output_elem_type = ctx.getOutputType(0)->mutable_tensor_type();
|
||||
output_elem_type->set_elem_type(ONNX_NAMESPACE::TensorProto::FLOAT);
|
||||
|
||||
if (hasInputShape(ctx, 0)) {
|
||||
std::vector<int64_t> ngram_indexes;
|
||||
ONNX_NAMESPACE::getRepeatedAttribute(ctx, "ngram_indexes", ngram_indexes);
|
||||
if (ngram_indexes.empty() || !std::all_of(ngram_indexes.cbegin(), ngram_indexes.cend(),
|
||||
[](int64_t i) { return i >= 0; })) {
|
||||
fail_shape_inference(
|
||||
"ngram_indexes must be non-empty with no negative values");
|
||||
}
|
||||
|
||||
auto greatest_hit = std::max_element(ngram_indexes.cbegin(), ngram_indexes.cend());
|
||||
auto max_last_axis = *greatest_hit + 1;
|
||||
|
||||
ONNX_NAMESPACE::TensorShapeProto output_shape;
|
||||
auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
|
||||
auto dim_size = input_shape.dim_size();
|
||||
if (dim_size == 0 || dim_size == 1) {
|
||||
output_shape.add_dim()->set_dim_value(max_last_axis);
|
||||
} else if (dim_size == 2) {
|
||||
auto& B_dim = input_shape.dim(0);
|
||||
if (!B_dim.has_dim_value()) {
|
||||
fail_shape_inference(
|
||||
"Input shape does not have first dimension value");
|
||||
}
|
||||
output_shape.add_dim()->set_dim_value(B_dim.dim_value());
|
||||
output_shape.add_dim()->set_dim_value(max_last_axis);
|
||||
} else {
|
||||
fail_shape_inference(
|
||||
"Input shape must have either [C] or [B,C] dimensions where C > 0 and B > 0");
|
||||
}
|
||||
updateOutputShape(ctx, 0, output_shape);
|
||||
}
|
||||
})
|
||||
.SetDoc(R"DOC(
|
||||
This transform extracts n-grams from the input sequence and save them as a vector. Input can
|
||||
be either a 1-D or 2-D tensor. For 1-D input, output is the n-gram representation of that input.
|
||||
For 2-D input, the output is also a 2-D tensor whose i-th row is the n-gram representation of the i-th input row.
|
||||
More specifically, if input shape is [C], the corresponding output shape would be [max(ngram_indexes) + 1].
|
||||
If input shape is [N, C], this operator produces a [N, max(ngram_indexes) + 1]-tensor.
|
||||
|
||||
In contrast to standard n-gram extraction, here, the indexes of extracting an n-gram from the original
|
||||
sequence are not necessarily consecutive numbers. The discontinuity between indexes are controlled by the number of skips.
|
||||
If the number of skips is 2, we should skip two tokens when scanning through the original sequence.
|
||||
Let's consider an example. Assume that input sequence is [94, 17, 36, 12, 28] and the number of skips is 2.
|
||||
The associated 2-grams are [94, 12] and [17, 28] respectively indexed by [0, 3] and [1, 4].
|
||||
If the number of skips becomes 0, the 2-grams generated are [94, 17], [17, 36], [36, 12], [12, 28]
|
||||
indexed by [0, 1], [1, 2], [2, 3], [3, 4], respectively.
|
||||
|
||||
The output vector stores the count of each n-gram;
|
||||
Y[i] indicates the times that the i-th n-gram is found. The attribute ngram_indexes is used to determine the mapping
|
||||
between index i and the corresponding n-gram. If pool_int64s is [94 , 17 ,17, 36], ngram_indexes is [1, 0],
|
||||
ngram_counts=[0, 0], then the Y[0] (first element in Y) and Y[1] (second element in Y) are the counts of [17, 36] and [94, 17],
|
||||
respectively. An n-gram which cannot be found in pool_strings/pool_int64s should be ignored and has no effect on the output.
|
||||
Note that we may consider all skips up to S when generating the n-grams.
|
||||
|
||||
The examples used above are true if mode is "TF". If mode is "IDF", all the counts larger than 1 would be truncated to 1 and
|
||||
the i-th element in weights would be used to scale (by multiplication) the count of the i-th n-gram in pool. If mode is "TFIDF",
|
||||
this operator first computes the counts of all n-grams and then scale them by the associated values in the weights attribute.
|
||||
|
||||
Only one of pool_strings and pool_int64s can be set. If pool_int64s is set, the input should be an integer tensor.
|
||||
If pool_strings is set, the input must be a string tensor.
|
||||
)DOC");
|
||||
|
||||
// Operators for linear 8 bit quanitzation support.
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinear)
|
||||
.SetDomain(kMSDomain)
|
||||
|
|
|
|||
559
onnxruntime/test/contrib_ops/ngram_test.cc
Normal file
559
onnxruntime/test/contrib_ops/ngram_test.cc
Normal file
|
|
@ -0,0 +1,559 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace test {
|
||||
namespace ngram_test {
|
||||
|
||||
constexpr const char* domain = onnxruntime::kMSDomain;
|
||||
const int opset_ver = 1;
|
||||
|
||||
void InitTestAttr(OpTester& test, const std::string& mode,
|
||||
int64_t min_gram_length, int64_t max_gram_length, int64_t max_skip_count,
|
||||
const std::vector<int64_t>& ngram_counts,
|
||||
const std::vector<int64_t>& ngram_indexes,
|
||||
const std::vector<float>& weights,
|
||||
const std::vector<int64_t>& pool_int64s,
|
||||
const std::vector<std::string>& pool_strings) {
|
||||
test.AddAttribute("mode", mode);
|
||||
test.AddAttribute("min_gram_length", min_gram_length);
|
||||
test.AddAttribute("max_gram_length", max_gram_length);
|
||||
test.AddAttribute("max_skip_count", max_skip_count);
|
||||
test.AddAttribute("ngram_counts", ngram_counts);
|
||||
test.AddAttribute("ngram_indexes", ngram_indexes);
|
||||
// optional
|
||||
if (!weights.empty()) {
|
||||
test.AddAttribute("weights", weights);
|
||||
}
|
||||
if (!pool_int64s.empty()) {
|
||||
test.AddAttribute("pool_int64s", pool_int64s);
|
||||
} else {
|
||||
test.AddAttribute("pool_strings", pool_strings);
|
||||
}
|
||||
}
|
||||
} // namespace ngram_test
|
||||
|
||||
using namespace ngram_test;
|
||||
|
||||
// Here is what takes place in general and in particular
|
||||
// in this unit test.There are 7 n - grams : 4 unigrams and 3 bigrams
|
||||
// that are expressed as 10 items(integers in this case) contained within pool_int64 attribute.
|
||||
// We only count and then optionally scale those ngrams that appear in the supplied pool parameter(either int64 or string).
|
||||
// M = 1 and N = 2 in this case.
|
||||
// However, attribute all controls whether we consider all of the supplied ngram[M..N] sizes
|
||||
// into consideration or not.With all = false, we only consider N - grams.
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_BatchOnlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
// Tow batches by six
|
||||
std::vector<int64_t> dims{2, 6};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7,
|
||||
8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{2, 7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_OnlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_BatchOnlyBigrams_Skip0) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{2, 6};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven",
|
||||
"eight", "six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{2, 7};
|
||||
// ["seven", "eight"] can not be found due to batch boundary and s=0
|
||||
// bigram elements have to be next to each other
|
||||
std::vector<float> output = {0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 1};
|
||||
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_LevelEmpty) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=0, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 0,
|
||||
{0, 0}, // no unigrams, bi-grams start immediately
|
||||
{
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
}, //7 output indexes
|
||||
{},
|
||||
{ //1-grams none
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{3};
|
||||
std::vector<float> output = {1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
// No 1-grams but Skip is 5 so we manage to count 3
|
||||
// occurrences of [7,8]
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 3, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_BatchOnlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, , Min=Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{2, 6};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7,
|
||||
8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{2, 7};
|
||||
// Skip is 5 but we are constraint by row boundaries
|
||||
// so count only 1 of each
|
||||
std::vector<float> output = {0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, , Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
// No 1-grams but Skip is 5 so we manage to count 3
|
||||
// occurrences of [7,8] in one batch (row)
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 3, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_BatchOnlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, , Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{2, 6};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{2, 7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_UniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, , Min=1, Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
// We consider both 1-grams and 2-grams so get all the counts here
|
||||
std::vector<float> output = {0, 3, 1, 0, 1, 3, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TF_BatchUniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, int32
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{2, 6};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7,
|
||||
8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{2, 7};
|
||||
// Counts are now per row (batch)
|
||||
std::vector<float> output = {0, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_UniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 3, 1, 0, 1, 3, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TF_BatchUniAndBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=1, Max=2, weights empty, string
|
||||
InitTestAttr(test, "TF", 1, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{2, 6};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{2, 7};
|
||||
std::vector<float> output = {0, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 0, 1, 1, 1};
|
||||
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_IDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
// We change to IDF but do not supply weights so
|
||||
// we should get all 1.0f where count is not zero
|
||||
InitTestAttr(test, "IDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_IDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "IDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 1, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TFIDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, int32
|
||||
// We change to TFIDF but do not supply weights so
|
||||
// we should all get the original values as weights are 1.0f by
|
||||
// default
|
||||
InitTestAttr(test, "TFIDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 3, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TFIDF_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights empty, string
|
||||
InitTestAttr(test, "TFIDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{},
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 1, 3, 1};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_IDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, int32
|
||||
// We change to IDF with supplied weights. All
|
||||
// with non-zero counts must be replaced with the supplied weights
|
||||
InitTestAttr(test, "IDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0}, // weights
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 2, 3, 2};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_IDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, string
|
||||
InitTestAttr(test, "IDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0}, // weights
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 2, 3, 2};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, Int32_TFIDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, int32
|
||||
// We change to TFIDF with supplied weights.
|
||||
// We should have all counts scaled by weights
|
||||
InitTestAttr(test, "TFIDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0}, // weights
|
||||
{2, 3, 5, 4, //1-grams
|
||||
5, 6, 7, 8, 6, 7}, //bi-grams
|
||||
{});
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<int32_t> input = {1, 1, 3, 3, 3, 7, 8, 6, 7, 5, 6, 8};
|
||||
test.AddInput<int32_t>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 2, 9, 2};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
TEST(ContribOpNgramTest, String_TFIDFWeights_onlyBigrams_Skip5) {
|
||||
OpTester test("Ngram", opset_ver, domain);
|
||||
// s=5, Min=Max=2, weights specified, string
|
||||
InitTestAttr(test, "TFIDF", 2, 2, 5,
|
||||
{0, 4},
|
||||
{0, 1, 2, 3, 4, 5, 6}, //7 output indexes
|
||||
{2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0}, // weights
|
||||
{},
|
||||
{"two", "three", "five", "four", //1-grams
|
||||
"five", "six", "seven", "eight", "six", "seven"}); //bi-grams
|
||||
|
||||
std::vector<int64_t> dims{12};
|
||||
std::vector<std::string> input{"one", "one", "three", "three", "three", "seven", "eight",
|
||||
"six", "seven", "five", "six", "eight"};
|
||||
test.AddInput<std::string>("T", dims, input);
|
||||
|
||||
std::vector<int64_t> out_dims{7};
|
||||
std::vector<float> output = {0, 0, 0, 0, 2, 9, 2};
|
||||
test.AddOutput<float>("Y", out_dims, output);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess);
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue