diff --git a/dockerfiles/scripts/docker_run_vitisai.sh b/dockerfiles/scripts/docker_run_vitisai.sh
index 43fc08a7df..10d44de70b 100755
--- a/dockerfiles/scripts/docker_run_vitisai.sh
+++ b/dockerfiles/scripts/docker_run_vitisai.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # --------------------------------------------------------------
 # Copyright(C) Xilinx Inc.
 # Licensed under the MIT License.
diff --git a/docs/execution_providers/Vitis-AI-ExecutionProvider.md b/docs/execution_providers/Vitis-AI-ExecutionProvider.md
index 289010b403..0063d6bfb6 100644
--- a/docs/execution_providers/Vitis-AI-ExecutionProvider.md
+++ b/docs/execution_providers/Vitis-AI-ExecutionProvider.md
@@ -6,9 +6,13 @@
 
 [Vitis-AI](https://github.com/Xilinx/Vitis-AI) is Xilinx's development stack for hardware-accelerated AI inference on Xilinx platforms, including both edge devices and Alveo cards. It consists of optimized IP, tools, libraries, models, and example designs. It is designed with high efficiency and ease of use in mind, unleashing the full potential of AI acceleration on Xilinx FPGA and ACAP.
 
+The current Vitis-AI execution provider inside ONNXRuntime enables acceleration of Neural Network model inference using DPUv1. DPUv1 is a hardware accelerator for Convolutional Neural Networks (CNN) on top of the Xilinx [Alveo](https://www.xilinx.com/products/boards-and-kits/alveo.html) platform and targets U200 and U250 accelerator cards.
+
+On this page you will find information on how to [build](#Build) ONNXRuntime with Vitis-AI and on how to [get started](#Getting-started) with an example.
+
 ## Build
 
-For build instructions, please see the [BUILD page](../../BUILD.md#Vitis-AI). Please setup the hardware environment before starting the build: [Hardware setup](#Hardware-setup).
+To build ONNXRuntime with the Vitis-AI execution provider, you will have to set up the hardware environment and build the docker image; see the [build steps](#Hardware-setup-and-docker-build).
 
 ### System requirements
 
@@ -28,7 +32,7 @@ The following table lists system requirements for running docker containers as w
 | FPGA                                                | Xilinx Alveo U200 or U250                                  |
 | Docker Version                                      | 19\.03\.1                                                  |
 
-### Hardware setup
+### Hardware setup and docker build
 
 1. Clone the Vitis AI repository:
     ```
@@ -66,11 +70,24 @@ The following table lists system requirements for running docker containers as w
    conda activate vitis-ai-tensorflow
    ```
 
+## Getting started
 
+### On-the-fly quantization
+
+Usually, to be able to accelerate inference of Neural Network models with Vitis-AI DPU accelerators, those models need to be quantized upfront. In the ONNXRuntime Vitis-AI execution provider we make use of on-the-fly quantization to remove this additional preprocessing step. In this flow, one doesn't need to quantize the model upfront but can make use of the typical inference execution calls (InferenceSession.run) to quantize the model on-the-fly using the first N inputs that are provided (see more information below). This will set up and calibrate the Vitis-AI DPU and from that point onwards inference will be accelerated for all subsequent inputs.
+
+### Config/Settings
+
+A couple of environment variables can be used to customize the Vitis-AI execution provider.
+
+| **Environment Variable**   | **Default if unset**      | **Explanation**                                         |
+|----------------------------|---------------------------|---------------------------------------------------------|
+| PX_QUANT_SIZE              | 128                    | The number of inputs that will be used for quantization (necessary for Vitis-AI acceleration) |
+| PX_BUILD_DIR               | Use the on-the-fly quantization flow | Loads the quantization and compilation information from the provided build directory and immediately starts Vitis-AI hardware acceleration. This configuration can be used if the model has been executed before using on-the-fly quantization during which the quantization and compilation information was cached in a build directory. |
 
 ### Samples
 
-For python, you can base yourself on the following example:
+When using python, you can base yourself on the following example:
 
 ```
 # Import pyxir before onnxruntime
@@ -91,6 +108,7 @@ session = onnxruntime.InferenceSession('[model_file].onnx', None,["VitisAIExecut
 
 # First N (default = 128) inputs are used for quantization calibration and will
 #   be executed on the CPU
+# This config can be changed by setting the 'PX_QUANT_SIZE' environment variable (e.g. export PX_QUANT_SIZE=64)
 imput_name = [...]
 outputs = [session.run([], {input_name: calib_inputs[i]})[0] for i in range(128)]
 
diff --git a/onnxruntime/core/providers/vitisai/vitisai_custom_op.cc b/onnxruntime/core/providers/vitisai/vitisai_custom_op.cc
index 4cc163ced4..a7a919026e 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_custom_op.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_custom_op.cc
@@ -56,7 +56,7 @@ VitisAICustomOp::VitisAICustomOp(const ComputeContext* context,
                                  const onnxruntime::Node* fused_node,
                                  const std::string &backend_type,
                                  const logging::Logger* logger)
-  : backend_type_(backend_type) 
+  : backend_type_(backend_type)
 {
   SetLogger(logger);
 
@@ -74,18 +74,16 @@ VitisAICustomOp::VitisAICustomOp(const ComputeContext* context,
 
   auto input_defs = fused_node->InputDefs();
   for (auto idef : input_defs) {
-    // std::cout << "DPU input def: " << idef->Name() << std::endl;
     in_tensor_names_.push_back(idef->Name());
   }
 
   auto output_defs = fused_node->OutputDefs();
   for (auto odef : output_defs) {
-    // std::cout << "DPU output def: " << odef->Name() << std::endl;
     out_tensor_names_.push_back(odef->Name());
   }
   
   pyxir::RunOptionsHolder run_options(new pyxir::runtime::RunOptions());
-  run_options->online_quantization = true;
+  run_options->on_the_fly_quantization = true;
   rt_mod_ = pyxir::build_rt(xg_, backend_type_, in_tensor_names_, out_tensor_names_,
                             "vai", run_options);
 }
@@ -93,7 +91,7 @@ VitisAICustomOp::VitisAICustomOp(const ComputeContext* context,
 VitisAICustomOp::~VitisAICustomOp() {}
 
 
-Status VitisAICustomOp::Compute(const OrtApi* api, OrtKernelContext* context) const {
+Status VitisAICustomOp::Compute(const OrtApi* api, OrtKernelContext* context) const { 
   Ort::CustomOpApi ort{*api};
   const unsigned num_inputs = (unsigned) xg_->get_nb_inputs();
 
@@ -104,7 +102,6 @@ Status VitisAICustomOp::Compute(const OrtApi* api, OrtKernelContext* context) co
   // Initialize input tensors.
   try {
     for (unsigned i = 0; i < num_inputs; ++i) {
-      // std::cout << "Input name: " << in_tensor_names_[i];
       const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i);
       auto tensor_info = ort.GetTensorTypeAndShape(input_tensor);
       auto tensor_type = ort.GetTensorElementType(tensor_info);
@@ -134,7 +131,7 @@ Status VitisAICustomOp::Compute(const OrtApi* api, OrtKernelContext* context) co
       std::vector<ssize_t> out_shape{shape.begin(), shape.end()};
       out_shape[0] = batch_size;
       std::vector<int64_t> ort_shape{out_shape.begin(), out_shape.end()};
-        
+
       OrtValue* output_tensor = ort.KernelContext_GetOutput(context, i, ort_shape.data(), ort_shape.size());
       auto tensor_info = ort.GetTensorTypeAndShape(output_tensor);
       auto tensor_type = ort.GetTensorElementType(tensor_info);
@@ -162,7 +159,7 @@ Status VitisAICustomOp::Compute(const OrtApi* api, OrtKernelContext* context) co
   } catch (...) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, name_ + ": Unknown exception while executing Pyxir computation");
   }
-
+  
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/providers/vitisai/vitisai_custom_op.h b/onnxruntime/core/providers/vitisai/vitisai_custom_op.h
index 26dd507160..5c1604eb9c 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_custom_op.h
+++ b/onnxruntime/core/providers/vitisai/vitisai_custom_op.h
@@ -46,7 +46,6 @@ class VitisAICustomOp {
   }
 
  private:
-  Status Initialize(const OrtApi* api, OrtKernelContext* context) const;
 
   std::vector<std::string> in_tensor_names_;
   std::vector<std::string> out_tensor_names_;
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
index 74d6934cc5..699f0cb2e6 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
@@ -47,31 +47,50 @@ VitisAIExecutionProvider::VitisAIExecutionProvider(const VitisAIExecutionProvide
   InsertAllocator(CreateAllocator(default_memory_info));
 }
 
-
-static std::vector<NodeIndex>
-GetUnsupportedNodeIndices(const XGraphHolder &xg, const std::string &backend_type, 
-                          const GraphViewer& graph_viewer,
+/**
+ * Returns a vector of clusters (each a vector of node indices) whose nodes are
+ * supported by the given backend type, grouped by the XLayer subgraph they belong to
+ */
+static std::vector<std::vector<NodeIndex>>
+GetSupportedNodeClusters(const XGraphHolder &xg, const std::string &backend_type, 
+                         const GraphViewer& graph_viewer,
                           /*out*/ std::unordered_set<std::string>& required_initializers) {
 
-  // Retrieve 
-  std::set<std::string> supported_tensors;
+  std::vector<std::vector<NodeIndex>> clusters;
+
+  // Retrieve supported tensor names and corresponding subgraphs they belong to
+  int cur_idx = 0;
+  std::unordered_map<std::string, std::string> supported_tensors;
+  std::unordered_map<std::string, int> cluster_idx;
   for (auto &xl_name : xg->get_layer_names()) {
     XLayerHolder xl = xg->get(xl_name);
-    if (xl->target == backend_type)
-      supported_tensors.insert(xl->get_attr("onnx_id").get_string());
+    if (xl->target == backend_type) {
+      supported_tensors[xl->get_attr("onnx_id").get_string()] = xl->subgraph;
+      if (cluster_idx.find(xl->subgraph) == cluster_idx.end()) {
+        cluster_idx[xl->subgraph] = cur_idx++;  // post-increment: each new subgraph gets its own slot in `clusters`
+        std::vector<NodeIndex> new_cluster;
+        clusters.push_back(new_cluster);
+      }
+    }
   }
 
-  std::vector<NodeIndex> unsupported_nodes_idx;
-
   for (const auto& node_idx : graph_viewer.GetNodesInTopologicalOrder()) {
     ConstPointerContainer<std::vector<NodeArg*>> node_args
       = graph_viewer.GetNode(node_idx)->OutputDefs();
     
+    int cluster_id = -1;
     bool is_node_supported = false;
     for (ConstPointerContainer<std::vector<NodeArg*>>::ConstIterator it = 
          node_args.begin(); it != node_args.end(); ++it) {
       if (supported_tensors.find((*it)->Name()) != supported_tensors.end()) {
         is_node_supported = true;
+        int found_cluster_id = cluster_idx[supported_tensors[(*it)->Name()]];
+        if (cluster_id != -1 && found_cluster_id != cluster_id) {
+          //Output tensors belong to different clusters
+          LOGS_DEFAULT(FATAL) << "VITIS-AI EP: Found node which belongs to "
+            << "multiple clusters. This is an invalid case";
+        }
+        cluster_id = found_cluster_id;
       } else if (is_node_supported) {
         // Some output tensors are supported but not others,
         //  should not happen
@@ -87,41 +106,10 @@ GetUnsupportedNodeIndices(const XGraphHolder &xg, const std::string &backend_typ
         if(is_input && graph_viewer.GetAllInitializedTensors().count(node_arg.Name())) {
           required_initializers.insert(node_arg.Name());
         } }, true);
-    } else {
-      unsupported_nodes_idx.push_back(node_idx);
+      clusters[cluster_id].push_back(node_idx);
     }
   }
 
-  return unsupported_nodes_idx;
-}
-
-/**
- * Returns a vector clusters(or node_idx). For each unsupported node, the graph is split into 3 parts.
- * supported_cluster + (UNsupported_node + rest_of_the_graph). This functions returns vector of all supported_clusters by DPU
- */
-static std::vector<std::vector<NodeIndex>>
-GetPartitionedClusters(const std::vector<NodeIndex>& topological_order, const std::vector<NodeIndex>& unsupported_nodes) {
-  std::vector<std::vector<NodeIndex>> clusters;
-
-  auto prev = topological_order.begin();
-
-  for (const auto& unsup_node : unsupported_nodes) {
-    auto it = std::find(prev, topological_order.end(), unsup_node);
-    // Create a cluster vector[supported_node_idx, unsupported_node_idx) and append it to return list.
-    std::vector<NodeIndex> this_cluster{prev, it};
-    if (!this_cluster.empty()) {
-      clusters.push_back(std::move(this_cluster));
-    }
-    // Point prev to node idx past this unsuported node.
-    prev = ++it;
-  }
-
-  //Tail
-  std::vector<NodeIndex> this_cluster{prev, topological_order.end()};
-  if (!this_cluster.empty()) {
-    clusters.push_back(std::move(this_cluster));
-  }
-
   return clusters;
 }
 
@@ -279,10 +267,8 @@ VitisAIExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   }
 
   std::unordered_set<std::string> required_initializers;
-  const auto unsupported_nodes = GetUnsupportedNodeIndices(xg, backend_type_, graph, required_initializers);
-
-  const auto clusters = GetPartitionedClusters(graph.GetNodesInTopologicalOrder(), unsupported_nodes);
-
+  const auto clusters = GetSupportedNodeClusters(xg, backend_type_, graph, required_initializers);
+  
   for (const auto& this_cluster : clusters) {
     std::vector<std::string> cluster_inputs, cluster_outputs;
     GetInputsOutputsOfCluster(graph, this_cluster, required_initializers, cluster_inputs, cluster_outputs);