diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 309b21e9b3..32487aca1a 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -90,7 +90,7 @@ extern "C" {
 #endif
 
 // Copied from TensorProto::DataType
-// Currently, Ort doesn't support complex64, complex128, bfloat16 types
+// Currently, Ort doesn't support complex64, complex128
 typedef enum ONNXTensorElementDataType {
   ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED,
   ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,   // maps to c type float
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 890d4f6112..5026cef60f 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -99,6 +99,77 @@ ORT_DEFINE_RELEASE(ModelMetadata);
 ORT_DEFINE_RELEASE(ThreadingOptions);
 ORT_DEFINE_RELEASE(IoBinding);
 
+/*! \class Ort::Float16_t
+  * \brief A structure that represents float16 data.
+  * \details It is necessary for type dispatching to make use of C++ API
+  * The type is implicitly convertible to/from uint16_t.
+  * The size of the structure should align with uint16_t and one can freely cast
+  * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data.
+  * 
+  * Generally, you can feed any of your types as float16/bfloat16 data to create a tensor
+  * on top of it, provided it can form a contiguous buffer of 16-bit elements with no padding.
+  * You can also feed an array of uint16_t elements directly. For example,
+  * 
+  * \code{.unparsed}
+  * uint16_t values[] = { 15360, 16384, 16896, 17408, 17664};
+  * constexpr size_t values_length = sizeof(values) / sizeof(values[0]);
+  * std::vector<int64_t> dims = {values_length};  // one dimensional example
+  * Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+  * // Note we are passing the byte count to this API, not the number of elements -> sizeof(values)
+  * auto float16_tensor = Ort::Value::CreateTensor(info, values, sizeof(values), 
+  *                                                dims.data(), dims.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
+  * \endcode
+  * 
+  * Here is another example, a little bit more elaborate. Let's assume that you use your own float16 type and you want to use
+  * a templated version of the API above so the type is automatically set based on your type. You will need to supply an extra
+  * template specialization.
+  * 
+  * \code{.unparsed}
+  * namespace yours { struct half {}; } // assume this is your type, define this:
+  * namespace Ort { 
+  * template<>
+  * struct TypeToTensorType<yours::half> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; };
+  * } //namespace Ort
+  * 
+  * std::vector<yours::half> values;
+  * std::vector<int64_t> dims = {static_cast<int64_t>(values.size())}; // one dimensional example
+  * Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+  * // Here we are passing element count -> values.size()
+  * auto float16_tensor = Ort::Value::CreateTensor<yours::half>(info, values.data(), values.size(), dims.data(), dims.size());
+  * 
+  *  \endcode
+  */
+struct Float16_t {
+  uint16_t value;  // raw float16 bit pattern; stored and returned as-is, no numeric conversion
+  constexpr Float16_t() noexcept : value(0) {}  // all-zero bit pattern
+  constexpr Float16_t(uint16_t v) noexcept : value(v) {}  // intentionally implicit: wraps raw bits
+  constexpr operator uint16_t() const noexcept { return value; }  // intentionally implicit: exposes raw bits
+  constexpr bool operator==(const Float16_t& rhs) const noexcept { return value == rhs.value; }  // compares stored bits
+  constexpr bool operator!=(const Float16_t& rhs) const noexcept { return value != rhs.value; }  // compares stored bits
+};
+
+static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match");  // the wrapper must add no padding
+
+/*! \class Ort::BFloat16_t
+  * \brief A structure that represents bfloat16 data.
+  * \details It is necessary for type dispatching to make use of C++ API
+  * The type is implicitly convertible to/from uint16_t.
+  * The size of the structure should align with uint16_t and one can freely cast
+  * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data.
+  * 
+  * See also code examples for Float16_t above.
+  */
+struct BFloat16_t {
+  uint16_t value;  // raw bfloat16 bit pattern; stored and returned as-is, no numeric conversion
+  constexpr BFloat16_t() noexcept : value(0) {}  // all-zero bit pattern
+  constexpr BFloat16_t(uint16_t v) noexcept : value(v) {}  // intentionally implicit: wraps raw bits
+  constexpr operator uint16_t() const noexcept { return value; }  // intentionally implicit: exposes raw bits
+  constexpr bool operator==(const BFloat16_t& rhs) const noexcept { return value == rhs.value; }  // compares stored bits
+  constexpr bool operator!=(const BFloat16_t& rhs) const noexcept { return value != rhs.value; }  // compares stored bits
+};
+
+static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");  // the wrapper must add no padding
+
 // This is used internally by the C++ API. This is the common base class used by the wrapper objects.
 template <typename T>
 struct Base {
@@ -252,7 +323,6 @@ struct SessionOptions : Base<OrtSessionOptions> {
   SessionOptions& AddConfigEntry(const char* config_key, const char* config_value);
   SessionOptions& AddInitializer(const char* name, const OrtValue* ort_val);
   OrtStatus* OrtSessionOptionsAppendExecutionProvider_CUDA(OrtSessionOptions* options, OrtCUDAProviderOptions* cuda_options);
-
 };
 
 struct ModelMetadata : Base<OrtModelMetadata> {
@@ -518,8 +588,7 @@ template <typename TOp, typename TKernel>
 struct CustomOpBase : OrtCustomOp {
   CustomOpBase() {
     OrtCustomOp::version = ORT_API_VERSION;
-    OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* api, const OrtKernelInfo* info) {
-      return static_cast<const TOp*>(this_)->CreateKernel(*api, info); };
+    OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* api, const OrtKernelInfo* info) { return static_cast<const TOp*>(this_)->CreateKernel(*api, info); };
     OrtCustomOp::GetName = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetName(); };
 
     OrtCustomOp::GetExecutionProviderType = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetExecutionProviderType(); };
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index e0fe933d9d..0172dc7867 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -27,6 +27,10 @@ template <typename T>
 struct TypeToTensorType;
 template <>
 struct TypeToTensorType<float> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
+template <>  // lets the templated tensor APIs tag Ort::Float16_t buffers as float16
+struct TypeToTensorType<Float16_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; };
+template <>  // lets the templated tensor APIs tag Ort::BFloat16_t buffers as bfloat16
+struct TypeToTensorType<BFloat16_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16; };
 template <>
 struct TypeToTensorType<double> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; };
 template <>
diff --git a/onnxruntime/test/framework/data_types_test.cc b/onnxruntime/test/framework/data_types_test.cc
index b764062037..222271c6e3 100644
--- a/onnxruntime/test/framework/data_types_test.cc
+++ b/onnxruntime/test/framework/data_types_test.cc
@@ -9,6 +9,9 @@
 #include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
+#include "core/util/math.h"
+#include <ostream>
+
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-qualifiers"
@@ -433,7 +436,7 @@ TEST_F(DataTypeTest, BFloat16Test) {
     FloatToBFloat16(sample, converted, sizeof(sample) / sizeof(float));
     for (size_t i = 0; i < sizeof(sample) / sizeof(float); ++i) {
       const double diff = std::fabs(sample[i] - converted[i].ToFloat());
-      if (diff > FLT_EPSILON || (std::isnan(diff) && !std::isnan(sample[i]))) {
+      if ((std::isnan(diff) && !std::isnan(sample[i])) || diff > FLT_EPSILON) {
         EXPECT_TRUE(false);
       }
     }
@@ -442,7 +445,7 @@ TEST_F(DataTypeTest, BFloat16Test) {
     BFloat16ToFloat(converted, back_converted, sizeof(sample) / sizeof(float));
     for (size_t i = 0; i < sizeof(sample) / sizeof(float); ++i) {
       const double diff = std::fabs(sample[i] - back_converted[i]);
-      if (diff > FLT_EPSILON || (std::isnan(diff) && !std::isnan(sample[i]))) {
+      if ((std::isnan(diff) && !std::isnan(sample[i])) || diff > FLT_EPSILON) {
         EXPECT_TRUE(false);
       }
     }
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index 4108176e83..12524be636 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -780,7 +780,7 @@ TEST(CApiTest, create_tensor_with_data) {
   std::vector<int64_t> dims = {4};
   Ort::Value tensor = Ort::Value::CreateTensor<float>(info, values, values_length, dims.data(), dims.size());
 
-  float* new_pointer = tensor.GetTensorMutableData<float>();
+  const float* new_pointer = tensor.GetTensorData<float>();
   ASSERT_EQ(new_pointer, values);
 
   auto type_info = tensor.GetTypeInfo();
@@ -790,6 +790,52 @@ TEST(CApiTest, create_tensor_with_data) {
   ASSERT_EQ(1u, tensor_info.GetDimensionsCount());
 }
 
+TEST(CApiTest, create_tensor_with_data_float16) {
+  // Creates a float16 tensor over a caller-owned array through the typed C++ API and checks its
+  // data pointer, rank, element type and one element. The buffer is really contiguous uint16_t data.
+  // Use 3rd party libraries such as Eigen to convert floats and doubles to float16 types.
+  Ort::Float16_t values[] = { 15360, 16384, 16896, 17408, 17664}; // 1.f, 2.f, 3.f, 4.f, 5.f
+  constexpr size_t values_length = sizeof(values) / sizeof(values[0]);  // element count, not bytes
+
+  std::vector<int64_t> dims = {values_length};  // one-dimensional shape
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  Ort::Value tensor = Ort::Value::CreateTensor<Ort::Float16_t>(info, values, values_length, dims.data(), dims.size());
+  const auto* new_pointer = tensor.GetTensorData<Ort::Float16_t>();
+  ASSERT_EQ(new_pointer, values);  // the tensor must expose the caller's buffer address
+  auto type_info = tensor.GetTypeInfo();
+  auto tensor_info = type_info.GetTensorTypeAndShapeInfo();  // NOTE(review): may depend on type_info staying alive - confirm
+  ASSERT_NE(tensor_info, nullptr);
+  ASSERT_EQ(1u, tensor_info.GetDimensionsCount());
+  ASSERT_EQ(tensor_info.GetElementType(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
+
+  Ort::Float16_t value_at_1 = tensor.At<Ort::Float16_t>({1});  // element at index 1 of the 1-D tensor
+  ASSERT_EQ(values[1], value_at_1);
+}
+
+TEST(CApiTest, create_tensor_with_data_bfloat16) {
+  // Creates a bfloat16 tensor over a caller-owned array through the typed C++ API and checks its
+  // data pointer, rank, element type and one element. The buffer is really contiguous uint16_t data.
+  // The values below are the upper 16 bits of the corresponding 32-bit floats (e.g. 1.f -> 0x3F80).
+  Ort::BFloat16_t values[] =  {16256, 16384, 16448, 16512, 16544}; // 1.f, 2.f, 3.f, 4.f, 5.f
+  constexpr size_t values_length = sizeof(values) / sizeof(values[0]);  // element count, not bytes
+  std::vector<int64_t> dims = {values_length};  // one-dimensional shape
+
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  Ort::Value tensor = Ort::Value::CreateTensor<Ort::BFloat16_t>(info, values, values_length, dims.data(), dims.size());
+  const auto* new_pointer = tensor.GetTensorData<Ort::BFloat16_t>();
+  ASSERT_EQ(new_pointer, values);  // the tensor must expose the caller's buffer address
+  auto type_info = tensor.GetTypeInfo();
+  auto tensor_info = type_info.GetTensorTypeAndShapeInfo();  // NOTE(review): may depend on type_info staying alive - confirm
+  ASSERT_NE(tensor_info, nullptr);
+  ASSERT_EQ(1u, tensor_info.GetDimensionsCount());
+  ASSERT_EQ(tensor_info.GetElementType(), ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16);
+
+  Ort::BFloat16_t value_at_1 = tensor.At<Ort::BFloat16_t>({1});  // element at index 1 of the 1-D tensor
+  ASSERT_EQ(values[1], value_at_1);
+}
+
 TEST(CApiTest, access_tensor_data_elements) {
   /**
    * Create a 2x3 data blob that looks like: