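Fix memory leak caused by improper handling of std::string-typed output buffer (#227)

Tensor returns a buffer of fully constructed std::strings, and we should treat them as such: assign to the existing strings instead of constructing new ones over them with placement new.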
2026-07-13 18:08:13 +00:00 · 2018-12-19 17:46:21 -08:00 · 2018-12-19 17:46:21 -08:00 · 255ee39af6
commit 255ee39af6
parent e97caa7787
2 changed files with 15 additions and 16 deletions
--- a/onnxruntime/contrib_ops/cpu/string_normalizer.cc
+++ b/onnxruntime/contrib_ops/cpu/string_normalizer.cc
@ -42,7 +42,7 @@ class Locale {
    loc_ = _create_locale(LC_CTYPE, name.c_str());
    if (loc_ == nullptr) {
      ORT_THROW("Failed to construct locale with name:",
-                        name, ":", ":Please, install necessary language-pack-XX and configure locales");
+                name, ":", ":Please, install necessary language-pack-XX and configure locales");
    }
  }

@ -78,7 +78,7 @@ class Locale {
  explicit Locale(const std::string& name) try : loc_(name) {
  } catch (const std::runtime_error& e) {
    ORT_THROW("Failed to construct locale with name:",
-                      name, ":", e.what(), ":Please, install necessary language-pack-XX and configure locales");
+              name, ":", e.what(), ":Please, install necessary language-pack-XX and configure locales");
  }

  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Locale);
@ -118,9 +118,8 @@ Status CopyCaseAction(ForwardIter first, ForwardIter end, OpKernelContext* ctx,
  if (C == 0) {
    output_dims.push_back(1);
    TensorShape output_shape(output_dims);
-    auto output_ten = ctx->Output(0, output_shape);
-    auto output_default = output_ten->template MutableData<std::string>();
-    new (output_default) std::string();
+    // This will create one empty string
+    ctx->Output(0, output_shape);
    return Status::OK();
  }

@ -141,11 +140,11 @@ Status CopyCaseAction(ForwardIter first, ForwardIter end, OpKernelContext* ctx,
      }
      // In place transform
      loc.ChangeCase(caseaction, wstr);
-      new (output_data + output_idx) std::string(converter.to_bytes(wstr));
+      *(output_data + output_idx) = converter.to_bytes(wstr);
    } else {
      assert(caseaction == StringNormalizer::NONE);
      // Simple copy or move if the iterator points to a non-const string
-      new (output_data + output_idx) std::string(std::move(s));
+      *(output_data + output_idx) = std::move(s);
    }
    ++output_idx;
    ++first;
--- a/onnxruntime/contrib_ops/cpu/tokenizer.cc
+++ b/onnxruntime/contrib_ops/cpu/tokenizer.cc
@ -218,7 +218,7 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) {
                        separators[0].empty());

  ORT_ENFORCE(!char_tokenezation_ || mincharnum_ < 2,
-                      "mincharnum is too big for char level tokenezation");
+              "mincharnum is too big for char level tokenezation");

  // Create TST and insert separators
  if (!char_tokenezation_) {
@ -284,7 +284,7 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C,
  while (curr_input != last) {
    const auto& s = *curr_input;
    if (mark_) {
-      new (output_data + output_index) std::string(&start_text, 1);
+      (output_data + output_index)->assign(&start_text, 1);
      ++output_index;
    }
    size_t tokens = 0;
@ -295,20 +295,20 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C,
      assert(result);
      (void)result;
      assert(token_idx + tlen <= str_len);
-      new (output_data + output_index) std::string(s.substr(token_idx, tlen));
+      *(output_data + output_index) = s.substr(token_idx, tlen);
      ++output_index;
      token_idx += tlen;
      ++tokens;
    }
    if (mark_) {
-      new (output_data + output_index) std::string(&end_text, 1);
+      (output_data + output_index)->assign(&end_text, 1);
      ++output_index;
    }
    // Padding strings
    assert(tokens + (mark_ * 2) <= max_tokens);
    const size_t pads = max_tokens - (mark_ * 2) - tokens;
    for (size_t p = 0; p < pads; ++p) {
-      new (output_data + output_index) std::string(pad_value_);
+      *(output_data + output_index) = pad_value_;
      ++output_index;
    }
    ++curr_input;
@ -422,21 +422,21 @@ Status Tokenizer::SeparatorTokenize(OpKernelContext* ctx,
    size_t c_idx = output_index;
 #endif
    if (mark_) {
-      new (output_data + output_index) std::string(&start_text, 1);
+      (output_data + output_index)->assign(&start_text, 1);
      ++output_index;
    }
    // Output tokens for this row
    for (auto& token : row) {
-      new (output_data + output_index) std::string(converter.to_bytes(token));
+      *(output_data + output_index) = converter.to_bytes(token);
      ++output_index;
    }
    if (mark_) {
-      new (output_data + output_index) std::string(&end_text, 1);
+      (output_data + output_index)->assign(&end_text, 1);
      ++output_index;
    }
    const size_t pads = max_tokens - (mark_ * 2) - row.size();
    for (size_t p = 0; p < pads; ++p) {
-      new (output_data + output_index) std::string(pad_value_);
+      *(output_data + output_index) = pad_value_;
      ++output_index;
    }
 #ifdef _DEBUG