diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake index afe89813c5..5c14e3521b 100644 --- a/cmake/winml_unittests.cmake +++ b/cmake/winml_unittests.cmake @@ -40,6 +40,7 @@ function(add_winml_test) source_group(TREE ${WINML_TEST_SRC_DIR} FILES ${_UT_SOURCES}) set_winml_target_properties(${_UT_TARGET}) target_compile_definitions(${_UT_TARGET} PRIVATE BUILD_GOOGLE_TEST) + target_precompiled_header(${_UT_TARGET} testPch.h) if (_UT_DEPENDS) add_dependencies(${_UT_TARGET} ${_UT_DEPENDS}) @@ -74,6 +75,14 @@ function(get_winml_test_api_src set(${output_winml_test_api_src} ${winml_test_api_src} PARENT_SCOPE) endfunction() +function(get_winml_test_concurrency_src + winml_test_src_path + output_winml_test_concurrency_src +) + file(GLOB winml_test_concurrency_src CONFIGURE_DEPENDS "${winml_test_src_path}/concurrency/*.cpp") + set(${output_winml_test_concurrency_src} ${winml_test_concurrency_src} PARENT_SCOPE) +endfunction() + file(GLOB winml_test_common_src CONFIGURE_DEPENDS "${WINML_TEST_SRC_DIR}/common/*.cpp") add_library(winml_test_common STATIC ${winml_test_common_src}) add_dependencies(winml_test_common @@ -92,8 +101,6 @@ add_winml_test( SOURCES ${winml_test_api_src} LIBS winml_test_common delayimp.lib ) -target_precompiled_header(winml_test_api testPch.h) - target_link_options(winml_test_api PRIVATE /DELAYLOAD:dxgi.dll /DELAYLOAD:d3d12.dll /DELAYLOAD:api-ms-win-core-file-l1-2-2.dll /DELAYLOAD:api-ms-win-core-synch-l1-2-1.dll) if (onnxruntime_USE_DML) target_link_options(winml_test_api PRIVATE /DELAYLOAD:directml.dll) @@ -108,8 +115,6 @@ add_winml_test( SOURCES ${winml_test_scenario_src} LIBS winml_test_common delayimp.lib ${winml_test_scenario_libs} ) -target_precompiled_header(winml_test_scenario testPch.h) - target_link_options(winml_test_scenario PRIVATE /DELAYLOAD:d2d1.dll /DELAYLOAD:d3d11.dll /DELAYLOAD:dxgi.dll /DELAYLOAD:d3d12.dll /DELAYLOAD:api-ms-win-core-libraryloader-l1-2-1.dll /DELAYLOAD:api-ms-win-core-file-l1-2-2.dll /DELAYLOAD:api-ms-win-core-synch-l1-2-1.dll) if (onnxruntime_USE_DML) target_link_options(winml_test_scenario PRIVATE /DELAYLOAD:directml.dll) @@ -123,6 +128,14 @@ endif() target_link_options(winml_test_scenario PRIVATE /ignore:4199) +get_winml_test_concurrency_src(${WINML_TEST_SRC_DIR} winml_test_concurrency_src) +add_winml_test( + TARGET winml_test_concurrency + SOURCES ${winml_test_concurrency_src} + LIBS winml_test_common +) +target_include_directories(winml_test_concurrency PRIVATE ${ONNXRUNTIME_ROOT}/core/graph) + # During build time, copy any modified collaterals. 
# configure_file(source destination COPYONLY), which configures CMake to copy the file whenever source is modified, # can't be used here because we don't know the destination during configure time (in multi-configuration generators, diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp index 8beec6f48e..95ee72c038 100644 --- a/winml/test/common/SqueezeNetValidator.cpp +++ b/winml/test/common/SqueezeNetValidator.cpp @@ -121,7 +121,7 @@ ImageFeatureValue BindImageOutput( void ModelValidator::FnsCandy16( - std::string instance, + const std::string& instance, LearningModelDeviceKind deviceKind, OutputBindingStrategy outputBindingStrategy, bool bindInputsAsIInspectable, @@ -193,7 +193,7 @@ void ModelValidator::FnsCandy16( } void ModelValidator::SqueezeNet( - std::string instance, + const std::string& instance, LearningModelDeviceKind deviceKind, float dataTolerance, bool bindAsImage, @@ -251,7 +251,7 @@ void ModelValidator::SqueezeNet( outputBindingStrategy, modelBinding, outputDataBindingName, expectedResultsTensor.Shape()); // Evaluate the model - std::cout << "Calling EvaluateSync on instance" << instance << "\n"; + std::cout << "Calling EvaluateSync on instance " << instance << "\n"; LearningModelEvaluationResult result = nullptr; result = modelSession.Evaluate(modelBinding, {}); diff --git a/winml/test/common/SqueezeNetValidator.h b/winml/test/common/SqueezeNetValidator.h index ef8ad2cd00..5c6e6fa03d 100644 --- a/winml/test/common/SqueezeNetValidator.h +++ b/winml/test/common/SqueezeNetValidator.h @@ -10,14 +10,14 @@ enum OutputBindingStrategy { Bound, Unbound, Empty }; namespace WinML::Engine::Test::ModelValidator { void FnsCandy16( - std::string instance, + const std::string& instance, winrt::Windows::AI::MachineLearning::LearningModelDeviceKind deviceKind, OutputBindingStrategy outputBindingStrategy, bool bindInputsAsIInspectable, float dataTolerance = false); void SqueezeNet( - std::string instance, + const std::string& instance, winrt::Windows::AI::MachineLearning::LearningModelDeviceKind deviceKind, float dataTolerance, bool bindAsImage = false, diff --git a/winml/test/common/googleTestMacros.h b/winml/test/common/googleTestMacros.h index dd2bc26515..8e55e6b762 100644 --- a/winml/test/common/googleTestMacros.h +++ b/winml/test/common/googleTestMacros.h @@ -51,7 +51,7 @@ #endif #define WINML_SKIP_TEST(message) \ - GTEST_SKIP() << message; + WINML_SUPRESS_UNREACHABLE_BELOW(GTEST_SKIP() << message) #define WINML_EXPECT_NO_THROW(statement) EXPECT_NO_THROW(statement) #define WINML_EXPECT_TRUE(statement) EXPECT_TRUE(statement) @@ -69,13 +69,16 @@ #ifndef USE_DML #define GPUTEST \ - WINML_SUPRESS_UNREACHABLE_BELOW(WINML_SKIP_TEST("GPU tests disabled because this is a WinML only build (no DML)")) + WINML_SKIP_TEST("GPU tests disabled because this is a WinML only build (no DML)") +#define GPUTEST_ENABLED alwaysFalse() #else -#define GPUTEST \ - if (auto noGpuTests = RuntimeParameters::Parameters.find("noGPUtests"); \ - noGpuTests != RuntimeParameters::Parameters.end() && noGpuTests->second != "0") { \ - WINML_SKIP_TEST("GPU tests disabled"); \ +#define GPUTEST \ + if (auto no_gpu_tests = RuntimeParameters::Parameters.find("noGPUtests"); \ + no_gpu_tests != RuntimeParameters::Parameters.end() && no_gpu_tests->second != "0") { \ + WINML_SKIP_TEST("GPU tests disabled"); \ } +#define GPUTEST_ENABLED auto _no_gpu_tests = RuntimeParameters::Parameters.find("noGPUtests"); \ + _no_gpu_tests == RuntimeParameters::Parameters.end() || _no_gpu_tests->second == 
"0" #endif #define SKIP_EDGECORE \ diff --git a/winml/test/common/taefTestMacros.h b/winml/test/common/taefTestMacros.h index 9eab80bcdf..a4b3b6beab 100644 --- a/winml/test/common/taefTestMacros.h +++ b/winml/test/common/taefTestMacros.h @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#define INLINE_TEST_METHOD_MARKUP + #include "WexTestClass.h" using namespace WEX::Logging; @@ -27,11 +29,11 @@ using namespace WEX::TestExecution; } #define WINML_SKIP_TEST(message) \ - do { \ + WINML_SUPRESS_UNREACHABLE_BELOW( \ Log::Result(TestResults::Skipped, \ std::wstring_convert>().from_bytes(message).c_str()); \ return; \ - } while (0) + ) #define WINML_EXPECT_NO_THROW(statement) VERIFY_NO_THROW(statement) #define WINML_EXPECT_TRUE(statement) VERIFY_IS_TRUE(statement) @@ -48,19 +50,26 @@ using namespace WEX::TestExecution; #ifndef USE_DML #define GPUTEST \ - WINML_SUPRESS_UNREACHABLE_BELOW(WINML_SKIP_TEST("GPU tests disabled because this is a WinML only build (no DML)")) + WINML_SKIP_TEST("GPU tests disabled because this is a WinML only build (no DML)") +#define GPUTEST_ENABLED alwaysFalse() #else -#define GPUTEST \ - bool noGPUTests; \ - if (SUCCEEDED(RuntimeParameters::TryGetValue(L"noGPUtests", noGPUTests)) && noGPUTests) { \ - WINML_SKIP_TEST("This test is disabled by the noGPUTests runtime parameter."); \ - return; \ - } +#define GPUTEST \ + do { \ + bool noGPUTests; \ + if (SUCCEEDED(RuntimeParameters::TryGetValue(L"noGPUtests", noGPUTests)) && noGPUTests) { \ + WINML_SKIP_TEST("This test is disabled by the noGPUTests runtime parameter."); \ + return; \ + } \ + } while (0) +#define GPUTEST_ENABLED bool _no_gpu_tests; \ + !SUCCEEDED(RuntimeParameters::TryGetValue(L"noGPUtests", _no_gpu_tests)) || !_no_gpu_tests #endif -#define SKIP_EDGECORE \ - bool edgeCoreRun; \ - if (SUCCEEDED(RuntimeParameters::TryGetValue(L"EdgeCore", edgeCoreRun)) && edgeCoreRun) { \ - WINML_SKIP_TEST("This test is disabled by the EdgeCore runtime parameter."); \ - return; \ - } +#define SKIP_EDGECORE \ + do { \ + bool is_edge_core; \ + if (SUCCEEDED(RuntimeParameters::TryGetValue(L"EdgeCore", is_edge_core)) && is_edge_core) { \ + WINML_SKIP_TEST("This test is disabled by the EdgeCore runtime parameter."); \ + return; \ + } \ + } while (0) diff --git a/winml/test/common/test.h b/winml/test/common/test.h index 9a97331d06..7c122e992c 100644 --- a/winml/test/common/test.h +++ b/winml/test/common/test.h @@ -9,6 +9,9 @@ using SetupTest = VoidTest; constexpr bool alwaysTrue() { return true; } +constexpr bool alwaysFalse() { + return false; +} #define WINML_SUPRESS_UNREACHABLE_BELOW(statement) \ if (alwaysTrue()) { statement; } diff --git a/winml/test/common/testPch.h b/winml/test/common/testPch.h index ec055aee26..1dd14d2aa5 100644 --- a/winml/test/common/testPch.h +++ b/winml/test/common/testPch.h @@ -6,6 +6,12 @@ #endif #include "std.h" +// Windows pollutes with preprocessor that redefine OPTIONAL. +// Undefine OPTIONAL to get onnx macros to resolve correctly. 
+#ifdef OPTIONAL +#undef OPTIONAL +#endif + #include #include diff --git a/winml/test/concurrency/ConcurrencyTests.cpp b/winml/test/concurrency/ConcurrencyTests.cpp new file mode 100644 index 0000000000..d7e22d25a0 --- /dev/null +++ b/winml/test/concurrency/ConcurrencyTests.cpp @@ -0,0 +1,339 @@ +#include "testPch.h" + +#include "concurrencytests.h" +#include "model.h" +#include "SqueezeNetValidator.h" +#include "threadPool.h" +#include "windows.ai.machinelearning.native.internal.h" + +#include +#include +#include +#include + +using namespace winrt::Windows::AI::MachineLearning; +using namespace winrt; + +namespace { +void LoadBindEvalSqueezenetRealDataWithValidationConcurrently() { + WINML_SKIP_TEST("Skipping due to bug 21617097"); + + constexpr auto load_test_model = [](const std::string& instance, LearningModelDeviceKind device) { + WinML::Engine::Test::ModelValidator::SqueezeNet(instance, device, 0.00001f, false); + }; + + std::vector threads; + for (const auto& instance : {"1", "2", "3", "4"}) { + threads.emplace_back(load_test_model, instance, LearningModelDeviceKind::Cpu); + } + if (GPUTEST_ENABLED) { + for (const auto& instance : {"GPU_1", "GPU_2", "GPU_3", "GPU_4"}) { + threads.emplace_back(load_test_model, instance, LearningModelDeviceKind::DirectX); + } + } + + for (auto& thread : threads) { + thread.join(); + } +} + +void ConcurrencyTestsApiSetup() { + init_apartment(); + std::srand(static_cast(std::time(nullptr))); +} + +struct EvaluationUnit { + LearningModel model; + LearningModelSession session; + LearningModelBinding binding; + winrt::Windows::Foundation::IAsyncOperation operation; + LearningModelEvaluationResult result; + + EvaluationUnit() : model(nullptr), session(nullptr), binding(nullptr), result(nullptr) {} +}; + +// Run EvalAsync for each unit concurrently and get results +void RunAsync(std::vector &evaluation_units) { + std::for_each(evaluation_units.begin(), evaluation_units.end(), [](EvaluationUnit &unit) { + unit.operation = unit.session.EvaluateAsync(unit.binding, L""); + }); + // get results + std::for_each(evaluation_units.begin(), evaluation_units.end(), [](EvaluationUnit &unit) { + unit.result = unit.operation.get(); + }); +} + +void VerifyEvaluation(const std::vector &evaluation_units, std::vector expected_indices) { + assert(evaluation_units.size() == expected_indices.size()); + for (size_t i = 0; i < evaluation_units.size(); ++i) { + auto unit = evaluation_units[i]; + auto expectedIndex = expected_indices[i]; + auto result = unit.result.Outputs().Lookup(L"softmaxout_1").as().GetAsVectorView(); + int64_t maxIndex = 0; + float maxValue = 0; + for (uint32_t j = 0; j < result.Size(); ++j) + { + float val = result.GetAt(j); + if (val > maxValue) + { + maxValue = val; + maxIndex = j; + } + } + WINML_EXPECT_TRUE(maxIndex == expectedIndex); + } +} + +void CopyLocalFile(LPCWSTR from, LPCWSTR to) { + using namespace std; + ifstream source(FileHelpers::GetModulePath() + from, ios::binary); + ofstream dest(FileHelpers::GetModulePath() + to, ios::binary); + + dest << source.rdbuf(); +} + +// Run evaluations with different models +void EvalAsyncDifferentModels() { + CopyLocalFile(L"model.onnx", L"model2.onnx"); + + std::vector model_paths = { L"model.onnx", L"model2.onnx" }; + const unsigned int num_units = static_cast(model_paths.size()); + std::vector evaluation_units(num_units, EvaluationUnit()); + + auto ifv = FileHelpers::LoadImageFeatureValue(L"kitten_224.png"); + + for (unsigned int i = 0; i < num_units; ++i) { + evaluation_units[i].model = 
LearningModel::LoadFromFilePath(FileHelpers::GetModulePath() + model_paths[i]); + evaluation_units[i].session = LearningModelSession(evaluation_units[i].model); + evaluation_units[i].binding = LearningModelBinding(evaluation_units[i].session); + evaluation_units[i].binding.Bind(L"data_0", ifv); + } + + RunAsync(evaluation_units); + std::vector indices(num_units, TABBY_CAT_INDEX); + VerifyEvaluation(evaluation_units, indices); +} + +// Run evaluations with same model, different sessions +void EvalAsyncDifferentSessions() { + unsigned int num_units = 3; + std::vector evaluation_units(num_units, EvaluationUnit()); + auto ifv = FileHelpers::LoadImageFeatureValue(L"kitten_224.png"); + + // same model, different session + auto model = LearningModel::LoadFromFilePath(FileHelpers::GetModulePath() + L"model.onnx"); + for (unsigned int i = 0; i < num_units; ++i) { + evaluation_units[i].model = model; + evaluation_units[i].session = LearningModelSession(evaluation_units[i].model); + evaluation_units[i].binding = LearningModelBinding(evaluation_units[i].session); + evaluation_units[i].binding.Bind(L"data_0", ifv); + } + + RunAsync(evaluation_units); + std::vector indices(num_units, TABBY_CAT_INDEX); + VerifyEvaluation(evaluation_units, indices); +} + +// Run evaluations with same session (and model), with different bindings +void EvalAsyncDifferentBindings() { + unsigned int num_units = 2; + std::vector evaluation_units(num_units, EvaluationUnit()); + + std::vector ifvs = {FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), + FileHelpers::LoadImageFeatureValue(L"fish.png")}; + + // same session, different binding + auto model = LearningModel::LoadFromFilePath(FileHelpers::GetModulePath() + L"model.onnx"); + auto session = LearningModelSession(model); + for (unsigned int i = 0; i < num_units; ++i) { + evaluation_units[i].model = model; + evaluation_units[i].session = session; + evaluation_units[i].binding = LearningModelBinding(evaluation_units[i].session); + evaluation_units[i].binding.Bind(L"data_0", ifvs[i]); + } + + RunAsync(evaluation_units); + VerifyEvaluation(evaluation_units, { TABBY_CAT_INDEX, TENCH_INDEX }); +} + +winrt::Windows::AI::MachineLearning::ILearningModelFeatureDescriptor UnusedCreateFeatureDescriptor( + std::shared_ptr model, + const std::wstring& name, + const std::wstring& description, + bool is_required, + const ::onnx::TypeProto *type_proto); + +// Get random number in interval [1,max_number] +unsigned int GetRandomNumber(unsigned int max_number) { + return std::rand() % max_number + 1; +} + +void MultiThreadLoadModel() { + // load same model + auto path = FileHelpers::GetModulePath() + L"model.onnx"; + ThreadPool pool(NUM_THREADS); + try { + for (unsigned int i = 0; i < NUM_THREADS; ++i) { + pool.SubmitWork([&path]() { + auto model = LearningModel::LoadFromFilePath(path); + std::wstring name(model.Name()); + WINML_EXPECT_EQUAL(name, L"squeezenet_old"); + }); + } + } + catch (...) 
{ + WINML_LOG_ERROR("Failed to load model concurrently."); + } +} + +void MultiThreadMultiSessionOnDevice(const LearningModelDevice& device) { + auto path = FileHelpers::GetModulePath() + L"model.onnx"; + auto model = LearningModel::LoadFromFilePath(path); + std::vector ivfs = { + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), + FileHelpers::LoadImageFeatureValue(L"fish.png") + }; + std::vector max_indices = { + 281, // tabby, tabby cat + 0 // tench, Tinca tinca + }; + std::vector max_values = { + 0.9314f, + 0.7385f + }; + float tolerance = 0.001f; + std::vector modelSessions(NUM_THREADS, nullptr); + ThreadPool pool(NUM_THREADS); + try { + device.as()->SetMetacommandsEnabled(false); + // create all the sessions + for (unsigned i = 0; i < NUM_THREADS; ++i) { + modelSessions[i] = LearningModelSession(model, device); + } + // start all the threads + for (unsigned i = 0; i < NUM_THREADS; ++i) { + LearningModelSession &model_session = modelSessions[i]; + pool.SubmitWork([&model_session,&ivfs,&max_indices,&max_values,tolerance,i]() { + DWORD start_time = GetTickCount(); + while (((GetTickCount() - start_time) / 1000) < NUM_SECONDS) { + auto j = i % ivfs.size(); + auto input = ivfs[j]; + auto expected_index = max_indices[j]; + auto expected_value = max_values[j]; + LearningModelBinding bind(model_session); + bind.Bind(L"data_0", input); + auto result = model_session.Evaluate(bind, L"").Outputs(); + auto softmax = result.Lookup(L"softmaxout_1"); + if (auto tensor = softmax.try_as()) { + auto view = tensor.GetAsVectorView(); + float max_val = .0f; + int max_index = -1; + for (uint32_t i = 0; i < view.Size(); ++i) { + auto val = view.GetAt(i); + if (val > max_val) + { + max_index = i; + max_val = val; + } + } + WINML_EXPECT_EQUAL(expected_index, max_index); + WINML_EXPECT_TRUE(std::abs(expected_value - max_val) < tolerance); + } + } + }); + } + } + catch (...) 
{ + WINML_EXPECT_HRESULT_SUCCEEDED(E_FAIL, L"Failed to create session concurrently."); + } +} + +void MultiThreadMultiSession() { + MultiThreadMultiSessionOnDevice(LearningModelDeviceKind::Cpu); + if (GPUTEST_ENABLED) { + MultiThreadMultiSessionOnDevice(LearningModelDeviceKind::DirectX); + } +} + +// Create different sessions for each thread, and evaluate +void MultiThreadSingleSessionOnDevice(const LearningModelDevice& device) { + auto path = FileHelpers::GetModulePath() + L"model.onnx"; + auto model = LearningModel::LoadFromFilePath(path); + LearningModelSession model_session = nullptr; + WINML_EXPECT_NO_THROW(model_session = LearningModelSession(model, device)); + std::vector ivfs = { + FileHelpers::LoadImageFeatureValue(L"kitten_224.png"), + FileHelpers::LoadImageFeatureValue(L"fish.png") + }; + std::vector max_indices = { + 281, // tabby, tabby cat + 0 // tench, Tinca tinca + }; + std::vector max_values = { + 0.9314f, + 0.7385f + }; + float tolerance = 0.001f; + + ThreadPool pool(NUM_THREADS); + try { + for (unsigned i = 0; i < NUM_THREADS; ++i) { + pool.SubmitWork([&model_session, &ivfs, &max_indices, &max_values, tolerance, i]() { + DWORD start_time = GetTickCount(); + while (((GetTickCount() - start_time) / 1000) < NUM_SECONDS) { + auto j = i % ivfs.size(); + auto input = ivfs[j]; + auto expected_index = max_indices[j]; + auto expected_value = max_values[j]; + std::wstring name(model_session.Model().Name()); + LearningModelBinding bind(model_session); + bind.Bind(L"data_0", input); + auto result = model_session.Evaluate(bind, L"").Outputs(); + auto softmax = result.Lookup(L"softmaxout_1"); + if (auto tensor = softmax.try_as()) + { + auto view = tensor.GetAsVectorView(); + float max_val = .0f; + int max_index = -1; + for (uint32_t k = 0; k < view.Size(); ++k) + { + auto val = view.GetAt(k); + if (val > max_val) + { + max_index = k; + max_val = val; + } + } + WINML_EXPECT_EQUAL(expected_index, max_index); + WINML_EXPECT_TRUE(std::abs(expected_value - max_val) < tolerance); + } + } + }); + } + } + catch (...) { + WINML_LOG_ERROR("Failed to create session concurrently."); + } +} + +void MultiThreadSingleSession() { + MultiThreadSingleSessionOnDevice(LearningModelDeviceKind::Cpu); + if (GPUTEST_ENABLED) { + MultiThreadSingleSessionOnDevice(LearningModelDeviceKind::DirectX); + } +} +} + +const ConcurrencyTestsApi& getapi() { + static constexpr ConcurrencyTestsApi api = { + ConcurrencyTestsApiSetup, + LoadBindEvalSqueezenetRealDataWithValidationConcurrently, + MultiThreadLoadModel, + MultiThreadMultiSession, + MultiThreadSingleSession, + EvalAsyncDifferentModels, + EvalAsyncDifferentSessions, + EvalAsyncDifferentBindings + }; + return api; +} diff --git a/winml/test/concurrency/ConcurrencyTests.h b/winml/test/concurrency/ConcurrencyTests.h new file mode 100644 index 0000000000..45e0518870 --- /dev/null +++ b/winml/test/concurrency/ConcurrencyTests.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+#pragma once
+#include "test.h"
+
+struct ConcurrencyTestsApi
+{
+  SetupTest ConcurrencyTestsApiSetup;
+  VoidTest LoadBindEvalSqueezenetRealDataWithValidationConcurrently;
+  VoidTest MultiThreadLoadModel;
+  VoidTest MultiThreadMultiSession;
+  VoidTest MultiThreadSingleSession;
+  VoidTest EvalAsyncDifferentModels;
+  VoidTest EvalAsyncDifferentSessions;
+  VoidTest EvalAsyncDifferentBindings;
+};
+const ConcurrencyTestsApi& getapi();
+
+WINML_TEST_CLASS_BEGIN_WITH_SETUP(ConcurrencyTests, ConcurrencyTestsApiSetup)
+WINML_TEST(ConcurrencyTests, LoadBindEvalSqueezenetRealDataWithValidationConcurrently)
+WINML_TEST(ConcurrencyTests, MultiThreadLoadModel)
+WINML_TEST(ConcurrencyTests, MultiThreadMultiSession)
+WINML_TEST(ConcurrencyTests, MultiThreadSingleSession)
+WINML_TEST(ConcurrencyTests, EvalAsyncDifferentModels)
+WINML_TEST(ConcurrencyTests, EvalAsyncDifferentSessions)
+WINML_TEST(ConcurrencyTests, EvalAsyncDifferentBindings)
+WINML_TEST_CLASS_END()
+
+// indices for imagenet labels
+static constexpr uint32_t TABBY_CAT_INDEX = 281;
+static constexpr uint32_t TENCH_INDEX = 0;
+// concurrency bugs are often race conditions and hard to catch deterministically.
+//
+// there are several approaches to finding them, from consistent testing to
+// random-style stress/fuzz testing.
+//
+// the testing strategy for *this* test is:
+// - use a consistent and reasonable number of threads (10 vs. 1000)
+// - run them for a consistent and long enough period of time (60 seconds)
+// - the smaller number of threads is also to make sure memory pressure is not an issue
+//   on pre-checkin CI test machines
+static constexpr uint32_t NUM_THREADS = 10;
+static constexpr uint32_t NUM_SECONDS = 10;
diff --git a/winml/test/concurrency/ThreadPool.cpp b/winml/test/concurrency/ThreadPool.cpp
new file mode 100644
index 0000000000..cd81f5f58a
--- /dev/null
+++ b/winml/test/concurrency/ThreadPool.cpp
@@ -0,0 +1,34 @@
+#include "testPch.h"
+#include "ThreadPool.h"
+#include <mutex>
+
+ThreadPool::ThreadPool(unsigned int initial_pool_size) : m_threads(), m_destruct_pool(false) {
+  for (unsigned int i = 0; i < initial_pool_size; i++) {
+    m_threads.emplace_back([this]() {
+      while (true) {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        // wait until there is work in the queue or the pool is being destroyed
+        m_cond_var.wait(lock, [this] { return m_destruct_pool || !m_work_queue.empty(); });
+        if (!m_work_queue.empty()) {
+          auto work = m_work_queue.front();
+          m_work_queue.pop();
+          lock.unlock();
+          work();
+        }
+        else {
+          // the work queue is empty but the lock was acquired:
+          // the pool is being destructed, so exit the worker loop
+          break;
+        }
+      }
+    });
+  }
+}
+
+ThreadPool::~ThreadPool() {
+  m_destruct_pool = true;
+  m_cond_var.notify_all();  // notify all threads of destruction
+  for (auto& thread : m_threads) {
+    thread.join();
+  }
+}
diff --git a/winml/test/concurrency/ThreadPool.h b/winml/test/concurrency/ThreadPool.h
new file mode 100644
index 0000000000..96a279f53c
--- /dev/null
+++ b/winml/test/concurrency/ThreadPool.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+class ThreadPool {
+private:
+  std::condition_variable m_cond_var;
+  bool m_destruct_pool;
+  std::mutex m_mutex;
+  std::vector<std::thread> m_threads;
+  std::queue<std::function<void()>> m_work_queue;
+
+public:
+  ThreadPool(unsigned int initial_pool_size);
+  ~ThreadPool();
+  template <typename F, typename... Args>
+  inline auto SubmitWork(F&& f, Args&&... args) -> std::future<std::invoke_result_t<F, Args...>> {
+    auto func = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
+    auto task = std::make_shared<std::packaged_task<std::invoke_result_t<F, Args...>()>>(std::move(func));
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      // wrap the packaged task in a void-returning callable so it can be stored in the queue
+      m_work_queue.push([task]() { (*task)(); });
+    }
+
+    m_cond_var.notify_one();  // unblocks one of the waiting threads
+    return task->get_future();
+  }
+};
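For reference, a minimal usage sketch of the ThreadPool introduced by this change (not part of the patch; the pool size, lambdas, and output are illustrative, and it assumes the SubmitWork signature and includes as reconstructed above):

```cpp
#include <cstdio>

#include "ThreadPool.h"

int main() {
  ThreadPool pool(4);  // four workers, analogous to the tests' NUM_THREADS usage

  // fire-and-forget, the way the concurrency tests submit their lambdas
  pool.SubmitWork([] { std::puts("work item ran"); });

  // SubmitWork also forwards extra arguments and surfaces the result via std::future
  auto sum = pool.SubmitWork([](int a, int b) { return a + b; }, 2, 3);
  std::printf("2 + 3 = %d\n", sum.get());  // get() blocks until a worker has run the task

  return 0;  // ~ThreadPool drains the queue, notifies the workers, and joins them
}
```

Because SubmitWork hands back the packaged task's future, callers can either ignore it (fire-and-forget, as MultiThreadLoadModel does) or block on get(), which also rethrows any exception thrown inside the worker.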