Fixes #626, remove posix option for regular expression in Tokenizer operator (#627)

* remove posix option
* add unit test for regular expression
This commit is contained in:
Xavier Dupré 2019-03-19 14:18:27 +01:00 committed by GitHub
parent da9af592d9
commit 4cc7121368
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 1 deletions

View file

@ -277,7 +277,6 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) {
// Use tokenexp
re2::RE2::Options options;
options.set_longest_match(true);
options.set_posix_syntax(true);
std::unique_ptr<re2::RE2> regex(new re2::RE2(tokenexp, options));
if (!regex->ok()) {
ORT_THROW("Can not digest regex: ", regex->error());

View file

@ -809,5 +809,26 @@ TEST(ContribOpTest, TokenizerExpression_RegDot) {
test.Run(OpTester::ExpectResult::kExpectSuccess);
}
// Runs the Tokenizer op with the token expression "\w" on the single input
// string "a;;;b" and expects the output [start_mark, "a", "b", end_mark].
TEST(ContribOpTest, TokenizerExpression_RegChar) {
  OpTester tester("Tokenizer", opset_ver, domain);

  // Configure the operator attributes; the regular expression is the last argument.
  const std::string word_char_regex = u8"\\w";
  InitTestAttr(tester, true, {}, 1, word_char_regex);

  // One input string, so the input shape is {1}.
  std::vector<int64_t> input_shape{1};
  std::vector<std::string> input_strings{u8"a;;;b"};
  tester.AddInput<std::string>("T", input_shape, input_strings);

  // Expected tokens: the two matches, framed by the start and end markers.
  std::vector<std::string> expected_tokens{start_mark, u8"a", u8"b", end_mark};

  // The output shape is the input shape with one extra trailing dimension
  // holding the number of expected strings (4 here).
  std::vector<int64_t> expected_shape = input_shape;
  expected_shape.push_back(static_cast<int64_t>(expected_tokens.size()));

  tester.AddOutput<std::string>("Y", expected_shape, expected_tokens);
  tester.Run(OpTester::ExpectResult::kExpectSuccess);
}
} // namespace test
} // namespace onnxruntime