diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 4d656dd4a3..e536db6472 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -118,110 +118,173 @@ const DataTypeImpl* NumpyToOnnxRuntimeTensorType(int numpy_type) {
   }
 }
 
+TensorShape GetArrayShape(PyArrayObject* pyObject) {
+  int ndim = PyArray_NDIM(pyObject);
+  const npy_intp* npy_dims = PyArray_DIMS(pyObject);
+  std::vector<int64_t> dims(ndim);
+  for (int i = 0; i < ndim; ++i) {
+    dims[i] = npy_dims[i];
+  }
+  TensorShape shape(std::move(dims));
+  return shape;
+}
+
+// This is a one time use, ad-hoc allocator that allows Tensors to take ownership of
+// python array objects and use the underlying memory directly and
+// properly deallocated them when they are done.
+//
+// This addresses the case when our interfaces receive python lists on the input.
+// We have to convert them into new Numpy arrays which 1) needs to be properly deallocated
+// because they are not owned by the calling python code (we create it inside pybind code)
+// 2) we still want to avoid yet another data copy and use it directly if possible
+// 3) string data types still need to be copied
+//
+// This is a stateful allocator. It will always return the same pre-allocated
+// buffer pointer and will own references to underlying objects.
+class OrtPybindSingleUseAllocator : public IAllocator {
+ public:
+   // This constructor is used when we create numpy array from python list
+  OrtPybindSingleUseAllocator(PyArrayObject* pyObject, const std::string& value_name, const OrtMemoryInfo& mem_info)
+      : pyObject_(pyObject, DecRefFn<PyArrayObject>()),
+        pyObjectContiguous_(PyArray_GETCONTIGUOUS(pyObject), DecRefFn<PyArrayObject>()),
+        mem_info_(mem_info) {
+    ORT_ENFORCE(pyObjectContiguous_ != nullptr, "The object must be a contiguous array for input :", value_name);
+  }
+
+  // Constructor to use when a contiguous array had to be copied. Instead of creating yet another copy
+  // we are still able to use it directly for primitive types
+  OrtPybindSingleUseAllocator(UniqueDecRefPtr<PyArrayObject>&& pyContiguous, const std::string& value_name, const OrtMemoryInfo& mem_info) 
+    : pyObject_(nullptr, DecRefFn<PyArrayObject>()),
+      pyObjectContiguous_(std::move(pyContiguous)),
+      mem_info_(mem_info){
+    ORT_ENFORCE(pyObjectContiguous_ != nullptr, "Expecting a valid contiguous array:", value_name);
+  }
+
+  ORT_DISALLOW_COPY_AND_ASSIGNMENT(OrtPybindSingleUseAllocator);
+
+  // Always return pre-allocated buffer
+  // which actually contains the array data
+  void* Alloc(size_t) override {
+    return static_cast<void*>(PyArray_DATA(pyObjectContiguous_.get()));
+  }
+
+  void Free(void*) override {
+    // Free when requested, do not wait for
+    // destruction of the allocator which may
+    // be non-deterministic. However, we do not anticipate
+    // true shared ownership of the allocator object except
+    // at the creation stack.
+    pyObjectContiguous_.reset();
+    pyObject_.reset();
+  }
+
+  const OrtMemoryInfo& Info() const override {
+    return mem_info_;
+  }
+
+  PyArrayObject* GetContiguous() const {
+    return pyObjectContiguous_.get();
+  }
+
+ private:
+  UniqueDecRefPtr<PyArrayObject> pyObject_;
+  UniqueDecRefPtr<PyArrayObject> pyObjectContiguous_;
+  OrtMemoryInfo mem_info_;
+};
+
+using OrtPybindSingleUseAllocatorPtr = std::shared_ptr<OrtPybindSingleUseAllocator>;
+
 bool PyObjectCheck_Array(PyObject* o) {
   return PyObject_HasAttrString(o, "__array_finalize__");
 }
 
-std::unique_ptr<Tensor> CreateTensor(AllocatorPtr alloc, const std::string& name_input, PyArrayObject* pyObject) {
-  PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject);
-  if (darray == NULL) {
-    throw std::runtime_error(std::string("The object must be a contiguous array for input '") + name_input + std::string("'."));
-  }
-  bool dref = false;
-  std::unique_ptr<Tensor> p_tensor;
-  try {
-    const int npy_type = PyArray_TYPE(darray);
-    // numpy requires long int as its dims.
-    int ndim = PyArray_NDIM(darray);
-    npy_intp* npy_dims = PyArray_DIMS(darray);
-    std::vector<int64_t> dims(ndim);
-    for (int i = 0; i < ndim; ++i) {
-      dims[i] = npy_dims[i];
-    }
-
-    TensorShape shape(dims);
-    auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
-    if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
-        npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
-      p_tensor = onnxruntime::make_unique<Tensor>(
-          element_type, shape, static_cast<void*>(PyArray_DATA(darray)), alloc->Info());
-    } else {
-      p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
-      if (npy_type == NPY_UNICODE) {
-        // Copy string data which needs to be done after Tensor is allocated.
-        // Strings are Python strings or numpy.unicode string.
-        std::string* dst = p_tensor->MutableData<std::string>();
-        auto item_size = PyArray_ITEMSIZE(darray);
-        auto num_chars = item_size / PyUnicode_4BYTE_KIND;
-        char* src = static_cast<char*>(PyArray_DATA(darray));
-        const char* str;
-        Py_ssize_t size;
-        PyObject* pStr;
-        for (int i = 0; i < shape.Size(); i++, src += item_size) {
-          // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
-          pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
-          str = PyUnicode_AsUTF8AndSize(pStr, &size);
-          if (str == NULL) {
-            dst[i] = "";
-          } else {
-            // Size is equal to the longest string size, numpy stores
-            // strings in a single array. Those code assumes a string ends with a final 0.
-            dst[i] = str;
-          }
-          Py_XDECREF(pStr);
-        }
-      } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
-        // Copy string data which needs to be done after Tensor is allocated.
-        // Strings are given as bytes (encoded strings).
-        // NPY_VOID does not trim final 0.
-        // NPY_STRING assumes bytes string ends with a final 0.
-        std::string* dst = p_tensor->MutableData<std::string>();
-        auto item_size = PyArray_ITEMSIZE(darray);
-        char* src = static_cast<char*>(PyArray_DATA(darray));
-        for (int i = 0; i < shape.Size(); i++, src += item_size) {
-          if (npy_type == NPY_STRING) {
-            dst[i] = src;
-          } else {
-            dst[i].resize(item_size);
-            memcpy((void*)dst[i].c_str(), src, item_size);
-          }
-        }
-      } else if (npy_type == NPY_OBJECT) {
-        // Converts object into string.
-        std::string* dst = p_tensor->MutableData<std::string>();
-        auto item_size = PyArray_ITEMSIZE(darray);
-        char* src = static_cast<char*>(PyArray_DATA(darray));
-        PyObject *item, *pStr;
-        for (int i = 0; i < shape.Size(); ++i, src += item_size) {
-          // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
-          item = PyArray_GETITEM(darray, src);
-          pStr = PyObject_Str(item);
-          dst[i] = py::reinterpret_borrow<py::str>(pStr);
-          Py_XDECREF(pStr);
-        }
+// Expects p_tensor properly created
+// Does not manage darray life-cycle
+void CopyDataToTensor(PyArrayObject* darray, int npy_type, std::unique_ptr<Tensor>& p_tensor) {
+  const auto total_items = p_tensor->Shape().Size();
+  if (npy_type == NPY_UNICODE) {
+    // Copy string data which needs to be done after Tensor is allocated.
+    // Strings are Python strings or numpy.unicode string.
+    std::string* dst = p_tensor->MutableData<std::string>();
+    const auto item_size = PyArray_ITEMSIZE(darray);
+    const auto num_chars = item_size / PyUnicode_4BYTE_KIND;
+    const char* src = reinterpret_cast<const char*>(PyArray_DATA(darray));
+    for (int i = 0; i < total_items; i++, src += item_size) {
+      // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
+      PyObject* pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
+      UniqueDecRefPtr<PyObject> strGuard(pStr, DecRefFn<PyObject>());
+      const char* str = PyUnicode_AsUTF8(pStr);
+      if (str == NULL) {
+        dst[i].clear();
       } else {
-        void* buffer = p_tensor->MutableDataRaw();
-        size_t len;
-        if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
-          throw std::runtime_error("length overflow");
-        }
-        memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
+        // Size is equal to the longest string size, numpy stores
+        // strings in a single array.
+        dst[i] = str;
       }
     }
-  } catch (...) {
-    if (!dref) {
-      Py_XDECREF(darray);
-      dref = true;
+  } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
+    // Copy string data which needs to be done after Tensor is allocated.
+    // Strings are given as bytes (encoded strings).
+    // NPY_VOID does not trim final 0.
+    // NPY_STRING assumes bytes string ends with a final 0.
+    std::string* dst = p_tensor->MutableData<std::string>();
+    const auto item_size = PyArray_ITEMSIZE(darray);
+    const char* src = reinterpret_cast<const char*>(PyArray_DATA(darray));
+    for (int i = 0; i < total_items; i++, src += item_size) {
+      if (npy_type == NPY_STRING) {
+        dst[i] = src;
+      } else {
+        dst[i].assign(src, item_size);
+      }
     }
-
-    // allocator should be able to gc the memory created by it.
-    // ...
-
-    throw;
+  } else if (npy_type == NPY_OBJECT) {
+    // Converts object into string.
+    std::string* dst = p_tensor->MutableData<std::string>();
+    const auto item_size = PyArray_ITEMSIZE(darray);
+    const char* src = reinterpret_cast<const char*>(PyArray_DATA(darray));
+    for (int i = 0; i < total_items; ++i, src += item_size) {
+      // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
+      PyObject* item = PyArray_GETITEM(darray, src);
+      PyObject* pStr = PyObject_Str(item);
+      UniqueDecRefPtr<PyObject> strGuard(pStr, DecRefFn<PyObject>());
+      dst[i] = py::reinterpret_borrow<py::str>(pStr);
+    }
+  } else {
+    void* buffer = p_tensor->MutableDataRaw();
+    size_t len;
+    if (!IAllocator::CalcMemSizeForArray(p_tensor->DataType()->Size(), p_tensor->Shape().Size(), &len)) {
+      throw std::runtime_error("length overflow");
+    }
+    memcpy(buffer, PyArray_DATA(darray), len);
   }
+}
 
-  if (!dref) {
-    Py_XDECREF(darray);
+std::unique_ptr<Tensor> CreateTensor(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject) {
+  PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject);
+  ORT_ENFORCE(darray != nullptr, "The object must be a contiguous array for input '", name_input, "'.");
+
+  UniqueDecRefPtr<PyArrayObject> darray_guard(darray, DecRefFn<PyArrayObject>());
+  std::unique_ptr<Tensor> p_tensor;
+
+  const int npy_type = PyArray_TYPE(darray);
+  TensorShape shape = GetArrayShape(darray);
+  auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
+  if (npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
+      npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
+    if (pyObject == darray) {
+      // Use the memory of numpy array directly. The ownership belongs to the calling
+      // python code. In this case, the incoming pyObject must itself be contiguous (pyObject == darray).
+      // darray reference will be decremented but the original array is still alive
+      p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, PyArray_DATA(darray), alloc->Info());
+    } else {
+      // This is the case when a contiguous array is a copy. We still can use it directly with OrtPybindSingleUseAllocator
+      // which takes ownership of the array.
+      auto pybind_alloc = std::make_shared<OrtPybindSingleUseAllocator>(std::move(darray_guard), name_input, alloc->Info());
+      p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, std::move(pybind_alloc));
+    }
+  } else {
+    p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
+    CopyDataToTensor(darray, npy_type, p_tensor);
   }
 
   return p_tensor;
@@ -280,11 +343,34 @@ void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_input,
                   ml_tensor_sequence->GetDeleteFunc());
 }
 
-void CreateTensorMLValue(AllocatorPtr alloc, const std::string& name_input, PyArrayObject* pyObject,
+void CreateTensorMLValue(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject,
                          OrtValue* p_mlvalue) {
   auto p_tensor = CreateTensor(alloc, name_input, pyObject);
-  if (!p_tensor) {
-    throw std::runtime_error("Got exception while creating tensor for input: " + name_input);
+
+  auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+  p_mlvalue->Init(p_tensor.release(),
+                  ml_tensor,
+                  ml_tensor->GetDeleteFunc());
+}
+
+// This function will create a Tensor that owns the python array memory. This is done to properly
+// release python arrays allocated within the pybind code.
+void CreateTensorMLValueOwned(const OrtPybindSingleUseAllocatorPtr& pybind_alloc, const AllocatorPtr& alloc, OrtValue* p_mlvalue) {
+  auto npy_type = PyArray_TYPE(pybind_alloc->GetContiguous());
+  TensorShape shape = GetArrayShape(pybind_alloc->GetContiguous());
+  auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
+
+  std::unique_ptr<Tensor> p_tensor;
+
+  if (npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
+      npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
+    // We are able to reuse the memory of the contiguous python buffer and avoid
+    // extra copy using OrtPybindAllocator which will take care of the memory
+    p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, pybind_alloc);
+  } else {
+    // We still need to copy elements properly from the contiguous buffer
+    p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
+    CopyDataToTensor(pybind_alloc->GetContiguous(), npy_type, p_tensor);
   }
 
   auto ml_tensor = DataTypeImpl::GetType<Tensor>();
@@ -495,7 +581,7 @@ void CreateGenericIterableMLValue(PyObject* iterator, AllocatorPtr alloc, const
   }
 }
 
-void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input,
+void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input,
                           py::object& value, OrtValue* p_mlvalue) {
   onnx::TypeProto type_proto;
   if (PyObjectCheck_Array(value.ptr())) {
@@ -505,15 +591,14 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, Alloc
   } else if (PyList_Check(value.ptr()) &&
              !CheckIfInputIsSequenceType(name_input, input_def_list, type_proto)) {
     // This is not a sequence tensor. This is just a regular tensor fed through as a list.
-    if (!type_proto.tensor_type().has_elem_type()) {
-      std::runtime_error("The graph is missing type information needed to construct the ORT tensor");
-    }
+    ORT_ENFORCE(type_proto.tensor_type().has_elem_type(), "The graph is missing type information needed to construct the ORT tensor");
 
     MLDataType dtype = OrtTypeInfo::ElementTypeFromProto(
         static_cast<ONNX_NAMESPACE::TensorProto_DataType>(type_proto.tensor_type().elem_type()));
 
     int numpy_dtype = OnnxRuntimeTensorToNumpyType(dtype);
 
+    // This creates a new object with its own reference count
     PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(
         PyArray_FromAny(value.ptr(), PyArray_DescrFromType(numpy_dtype), 0, 0, 0, nullptr));
 
@@ -521,7 +606,10 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, Alloc
       throw std::runtime_error("Could not create tensor from given input list");
     }
 
-    CreateTensorMLValue(alloc, name_input, arr, p_mlvalue);
+    // The allocator will own the array memory and will decrement the reference on Free()
+    // or when destroyed
+    auto pybind_allloc = std::make_shared<OrtPybindSingleUseAllocator>(arr, name_input, alloc->Info());
+    CreateTensorMLValueOwned(pybind_allloc, alloc, p_mlvalue);
   } else if (PyList_Check(value.ptr())) {
     auto* seq_tensors = reinterpret_cast<PyObject*>(value.ptr());
     CreateSequenceOfTensors(alloc, name_input, input_def_list, seq_tensors, p_mlvalue);
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h
index 6dbd611872..fdcaeb04b6 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h
@@ -24,7 +24,17 @@ int OnnxRuntimeTensorToNumpyType(const DataTypeImpl* tensor_type);
 
 MLDataType NumpyTypeToOnnxRuntimeType(int numpy_type);
 
-void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input,
+void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input,
                           py::object& value, OrtValue* p_mlvalue);
+
+template <class T>
+struct DecRefFn {
+  void operator()(T* pyobject) const {
+    Py_XDECREF(pyobject);
+  }
+};
+
+template <class T>
+using UniqueDecRefPtr = std::unique_ptr<T, DecRefFn<T>>;
 }  // namespace python
 }  // namespace onnxruntime
diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc
index f841e0487f..bb726b553a 100644
--- a/orttraining/orttraining/python/orttraining_pybind_state.cc
+++ b/orttraining/orttraining/python/orttraining_pybind_state.cc
@@ -22,7 +22,7 @@ using namespace onnxruntime::logging;
 // BEGIN: forward declaration for stuff in onnxruntime_pybind_state
 void InitializeSession(InferenceSession* sess, const std::vector<std::string>& provider_types);
 void GetPyObjFromTensor(const Tensor& rtensor, py::object& obj, const DataTransferManager* data_transfer_manager = nullptr);
-void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input,
+void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input,
                           py::object& value, OrtValue* p_mlvalue);
 // END: forward declaration