onnxruntime/onnxruntime/test/common/utf8_util_test.cc
Dmitri Smirnov c52636e187
Implement Tokenizer op (#31)
* Implement separator tokenizer with TST.
  TODO: Clarify what to do if the output is empty and no start/end text
  markers required. Also see if the current search algo is acceptable.

* Add utf8 util test

* For empty output produce [C] -> [C][0], [N][C] -> [N][C][0]

* Augment TST search with match conflict resolution in favor of
  earlier-specified pattern matches.

* Address macOS build error.

* Adjust error message

* Address review comments.

* Remove nested loops.

* Remove 3rd party utf8 validation code.

* Address review comments part I.

* Move padding outside start/end markers.
  Split unit tests into individual test cases.

* Fix a common prefix bug reported by Xavier.
2018-12-05 17:52:04 -08:00

46 lines
1.2 KiB
C++

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/common/utf8_util.h"

#include <cstring>
#include <vector>

#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
namespace onnxruntime {
namespace test {
// Kept in an anonymous namespace so this file-local helper type cannot clash
// (ODR) with a different `Sample` defined by another test source that is
// linked into the same test binary under onnxruntime::test.
namespace {
// One UTF-8 validation test case: an input byte sequence plus the verdict the
// validator is expected to return for it.
struct Sample {
  const char* sequence;  // NUL-terminated bytes handed to the validator
  bool valid;            // expected validation result for `sequence`
};
}  // namespace
const std::vector<Sample> samples = {
{"a", true},
{"\xc3\xb1", true},
{"\xc3\x28", false},
{"\xa0\xa1", false},
{"\xe2\x82\xa1", true},
{"\xe2\x28\xa1", false},
{"\xe2\x82\x28", false},
{"\xf0\x90\x8c\xbc", true},
{"\xf0\x28\x8c\xbc", false},
{"\xf0\x90\x28\xbc", false},
{"\xf0\x28\x8c\x28", false},
{"\xf8\xa1\xa1\xa1\xa1", false}, // valid but not Unicode
{"\xfc\xa1\xa1\xa1\xa1\xa1", false}}; // valid but not Unicode
// Feeds every entry of `samples` to utf8_validate and checks that the returned
// verdict matches the expected one. For sequences expected to be valid it also
// checks that the reported length is 1 (presumably the number of decoded
// characters -- each valid sample encodes exactly one).
TEST(Utf8UtilTest, Validate) {
  using namespace utf8_util;
  for (size_t i = 0; i < samples.size(); ++i) {
    const Sample& s = samples[i];
    size_t utf8_len = 0;
    const bool is_valid = utf8_validate(reinterpret_cast<const unsigned char*>(s.sequence),
                                        std::strlen(s.sequence), utf8_len);
    // Report the sample index rather than the raw bytes: many samples are
    // intentionally malformed UTF-8 and would garble the test log.
    ASSERT_EQ(s.valid, is_valid) << "unexpected validation result for sample #" << i;
    if (s.valid) {
      ASSERT_EQ(1U, utf8_len) << "unexpected length for sample #" << i;
    }
  }
}
} // namespace test
} // namespace onnxruntime