mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
* Implement separator tokenizer with TST. TODO: Clarify what to do if the output is empty and no start/end text markers required. Also see if the current search algo is acceptable. * Add utf8 util test * For empty output produce [C] -> [C][0], [N][C] -> [N][C][0] * Augment TST search with match conflict resolution in favor of earlier specified pattern matches. * Address macOS build error. * Adjust error message * Address review comments. * Remove nested loops. * Remove 3rd party utf8 validation code. * Address review comments part I. * Move padding outside start/end markers. Split unit tests for individual test cases. * Fix a common prefix bug reported by Xavier.
46 lines
1.2 KiB
C++
46 lines
1.2 KiB
C++
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
#include "core/common/utf8_util.h"
|
|
#include "gtest/gtest.h"
|
|
#include "test/providers/provider_test_utils.h"
|
|
|
|
namespace onnxruntime {
|
|
namespace test {
|
|
|
|
// One UTF-8 validation test case: an input byte sequence paired with the
// verdict the validator is expected to return for it.
struct Sample {
  // NUL-terminated bytes; the test measures the length with strlen().
  const char* sequence;
  // Expected result of validating `sequence`.
  bool valid;
};
|
|
|
|
const std::vector<Sample> samples = {
|
|
{"a", true},
|
|
{"\xc3\xb1", true},
|
|
{"\xc3\x28", false},
|
|
{"\xa0\xa1", false},
|
|
{"\xe2\x82\xa1", true},
|
|
{"\xe2\x28\xa1", false},
|
|
{"\xe2\x82\x28", false},
|
|
{"\xf0\x90\x8c\xbc", true},
|
|
{"\xf0\x28\x8c\xbc", false},
|
|
{"\xf0\x90\x28\xbc", false},
|
|
{"\xf0\x28\x8c\x28", false},
|
|
{"\xf8\xa1\xa1\xa1\xa1", false}, // valid but not Unicode
|
|
{"\xfc\xa1\xa1\xa1\xa1\xa1", false}}; // valid but not Unicode
|
|
|
|
// Runs utf8_validate over every entry in `samples` and checks that:
//   * the returned verdict matches the entry's expected `valid` flag, and
//   * for entries expected to be valid, the reported character count is 1.
// Failures report the zero-based index of the offending sample so the
// failing entry can be located in the table.
TEST(Utf8UtilTest, Validate) {
  using namespace utf8_util;
  size_t sample_index = 0;
  for (const auto& s : samples) {
    size_t utf8_len = 0;
    const auto actual = utf8_validate(reinterpret_cast<const unsigned char*>(s.sequence),
                                      strlen(s.sequence), utf8_len);
    // Assert on the comparison directly (instead of ASSERT_TRUE(false)) so
    // that gtest prints both expected and actual values on mismatch.
    ASSERT_EQ(s.valid, actual) << "unexpected validation result for sample #" << sample_index;
    if (s.valid) {
      ASSERT_EQ(1U, utf8_len) << "unexpected character count for sample #" << sample_index;
    }
    ++sample_index;
  }
}
|
|
|
|
} // namespace test
|
|
} // namespace onnxruntime
|