enable serialize prepacked weights into data file (#22256)

### Description part of https://github.com/microsoft/onnxruntime/issues/21448 This change is intend to save CPU memory during model load for inference. Added session option save_prepacked_constant_initializers, with save_prepacked_constant_initializers turn on: 1. optimize model with inference session, prepacked external initializer will be saved into data file. 2. load optimized model and external data file with prepacked initializer, no prepack is needed 3. run inference with optimized model and data file Tested with model Phi-3-mini-instruct-onnx, with ORT 1.12.0: ![image](https://github.com/user-attachments/assets/3c0337be-f340-4bb7-8f9f-30f3552072ef) with this change: ![image](https://github.com/user-attachments/assets/23282990-2e1e-4a1f-92de-afa8ed7e6a43) Peak memory usage dropped from **5.438 GB to 2.726GB**. This change takes advantage of ORT loads external initializer with mmap on CPU. Prepack will use extra memory on heap, omit prepack process can save this part of memory (roughly same size as external initializers). next step: Change all the kernels on CPU with PrePack method implemented and test properly. Will do in next PR. ### Motivation and Context
2026-06-07 00:13:17 +00:00 · 2024-10-24 22:24:48 -07:00 · 2024-10-24 22:24:48 -07:00 · c5b6be045f
commit c5b6be045f
parent 4ed5bec2e7
72 changed files with 870 additions and 135 deletions
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@ -79,6 +79,7 @@ class OpKernel {
  //               the allocator tied to the session if the kernel owns the pre-packed buffer or an
  //               allocator shared between sessions if the pre-packed buffer is to be shared across sessions
  //               (i.e.) the kernel does not own the buffer.
+  // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file.
  // @param is_packed: Set it to true if the kernel packed the tensor or to false
  //                   The kernel is responsible for keeping the packed data and related metadata if is_packed is true,
  //                   and the original initialized constant tensor will be released and not accessible anymore in
@ -88,6 +89,7 @@ class OpKernel {

  virtual Status
  PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
+          bool, /*save_prepacked_initializers*/
          /*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) {
    is_packed = false;
    return Status::OK();
@ -129,6 +131,26 @@ class OpKernel {
    return Status::OK();
  }

+  // Override this function to get pre-packed tensors from this kernel.
+  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
+  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
+  // @param input_idx : The index of input we prepacked before and intend to get packed tensor back.
+  // Please refer to matmul_nbits kernel for a complete example.
+  virtual std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) {
+    return std::nullopt;
+  }
+
+  // Override this function to set pre-packed tensors to this kernel and restore prepacked weight buffer.
+  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
+  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
+  // Please refer to matmul_nbits kernel for a complete example.
+  // @param input_idx : The input index of the tensor in this kernel.
+  // @param pre_packed_tensor: The prepacked tensor read from onnx data file and use the prepacked tensor
+  // to restore prepacked weight buffer.
+  virtual Status SetPrePackTensor(int /*input_idx*/, const Tensor& /*pre_packed_tensor*/) {
+    return Status::OK();
+  }
+
  const OrtDevice GetDevice(OrtMemType mem_type) const;
  const OpKernelInfo& Info() const {
    return *op_kernel_info_;
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@ -1148,6 +1148,11 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
  void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node);
 #endif

+  // Since one constant initializer could be used by different kernels
+  // and prepacked differently, use an unordered_map to store prepacked
+  // initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>>
+  typedef std::unordered_map<std::string, std::unordered_map<std::string, ONNX_NAMESPACE::TensorProto>> PrePackedTensorProtoToSave;
+
 #if !defined(ORT_MINIMAL_BUILD)
  /** Gets the GraphProto representation of this Graph. */
  const ONNX_NAMESPACE::GraphProto& ToGraphProto();
@ -1182,18 +1187,26 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
  @param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
  in the external file. Initializer smaller than this threshold are included in the onnx file.
  @param align_info offset alignment info.
+  @param save_prepacked_constant_initializers whether to save prepacked initializer into external data file.
+         If set false to this boolean, prepacked initializer will not be saved into onnxruntime data file,
+         we keep constant initializer as it is.
+  @param pre_packed_initializers struct used to store all the prepacked initializers.
  @returns GraphProto serialization of the graph.
  */
  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                  const std::filesystem::path& model_file_path,
                                                                  size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info) const;
+                                                                  const OffsetAlignmentInfo& align_info,
+                                                                  bool save_prepacked_constant_initializers,
+                                                                  PrePackedTensorProtoToSave& pre_packed_initializers) const;

  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                  const std::filesystem::path& model_file_path,
                                                                  size_t initializer_size_threshold) const {
    OffsetAlignmentInfo default_options;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
+    PrePackedTensorProtoToSave pre_packed_initializers;
+    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options,
+                                                false, pre_packed_initializers);
  }

  /** Gets the ISchemaRegistry instances being used with this Graph. */
@ -1508,6 +1521,18 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
 private:
  void InitializeStateFromModelFileGraphProto();

+  // Private method used to setup external initializer properly during model save,
+  // this external initializer could be oroginal initializer or prepacked initializer.
+  static void SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
+                                       size_t tensor_bytes_size,
+                                       int64_t& external_offset,
+                                       std::ofstream& external_stream,
+                                       gsl::span<const uint8_t> raw_data,
+                                       ONNX_NAMESPACE::TensorProto& output_proto,
+                                       const std::filesystem::path& external_file_path,
+                                       const ONNX_NAMESPACE::TensorProto& initializer,
+                                       bool is_prepacked);
+
  // Add node with specified <node_proto>.
  Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto,
                const ArgNameToTypeMap& name_to_type);
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@ -246,6 +246,12 @@ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disab
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
    "session.optimized_model_external_initializers_file_name";

+// Use this config when save prepacked constant initializers to onnx external data file.
+// Default is not save prepacked initializers to onnx data file.
+// Sample usage: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers',  "1")
+static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
+    "session.save_prepacked_constant_initializers";
+
 // Use this config to control the minimum size of the initializer when externalizing it during serialization
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
    "session.optimized_model_external_initializers_min_size_in_bytes";
--- a/onnxruntime/contrib_ops/cpu/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc
@ -30,6 +30,7 @@ class Attention : public OpKernel, public AttentionCPUBase {
  Status Compute(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

@ -101,6 +102,7 @@ bool Attention<T>::IsPackWeightsSuccessful(int qkv_index,

 template <typename T>
 Status Attention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
+                             bool /*save_prepacked_initializers*/,
                             /*out*/ bool& is_packed,
                             /*out*/ PrePackedWeights* prepacked_weights) {
  /* The PrePack() massages the weights to speed up Compute(), there is an option to
--- a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
@ -24,6 +24,7 @@ class QAttention : public OpKernel, public AttentionCPUBase {
  Status Compute(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 bool& /*out*/ is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

@ -58,6 +59,7 @@ QAttention<T>::QAttention(const OpKernelInfo& info) : OpKernel(info), AttentionC

 template <typename T>
 Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* prepacked_weights) {
  if (1 != input_idx) {
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
@ -13,7 +13,7 @@ class DynamicQuantizeLSTM : public OpKernel, public LSTMBase {
  DynamicQuantizeLSTM(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}

  Status PrePack(const Tensor& tensor, int input_idx,
-                 AllocatorPtr alloc, /*out*/ bool& is_packed,
+                 AllocatorPtr alloc, bool save_prepacked_initializers, /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@ -91,6 +91,7 @@ static void UseSharedPrePackedBuffersImpl(std::vector<BufferUniquePtr>& prepacke
 }

 Status DynamicQuantizeLSTM::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                    bool /*save_prepacked_initializers*/,
                                    /*out*/ bool& is_packed,
                                    /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@ -98,12 +98,19 @@ class MatMulNBits final : public OpKernel {
  Status Compute(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

+  void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);
+
  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
                                   /*out*/ bool& used_shared_buffers) override;

+  std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;
+
+  Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;
+
 private:
  const size_t K_;
  const size_t N_;
@ -119,6 +126,8 @@ class MatMulNBits final : public OpKernel {
  size_t packed_b_size_{0};
  IAllocatorUniquePtr<float> scales_fp32_{};
  IAllocatorUniquePtr<float> bias_fp32_{};
+  std::optional<Tensor> packed_tensor_{std::nullopt};
+  MLDataType prepack_tensor_data_type_;

  bool has_zp_input_{false};

@ -148,8 +157,22 @@ class MatMulNBits final : public OpKernel {
  }
 };

+template <typename T1>
+void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
+  if (input_idx == InputIndex::B) {
+    prepack_tensor_data_type_ = tensor.DataType();
+  }
+
+  TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
+  packed_tensor_ = Tensor(prepack_tensor_data_type_,
+                          TensorShape(weights_dims),
+                          packed_b_.get(),
+                          OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
+}
+
 template <typename T1>
 Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
+                                bool save_prepacked_initializers,
                                /*out*/ bool& is_packed,
                                /*out*/ PrePackedWeights* prepacked_weights) {
  ORT_UNUSED_PARAMETER(prepacked_weights);
@ -185,11 +208,16 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
 #endif  // MLAS_TARGET_AMD64_IX86
  }

+  if (save_prepacked_initializers) {
+    ConvertPrepackWeightIntoTensor(tensor, input_idx);
+  }
+
  return Status::OK();
 }

 template <>
 Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
+                                       bool save_prepacked_initializers,
                                       /*out*/ bool& is_packed,
                                       /*out*/ PrePackedWeights* prepacked_weights) {
  ORT_UNUSED_PARAMETER(prepacked_weights);
@ -239,6 +267,34 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
 #endif  // MLAS_TARGET_AMD64_IX86
  }

+  if (save_prepacked_initializers) {
+    ConvertPrepackWeightIntoTensor(tensor, input_idx);
+  }
+
+  return Status::OK();
+}
+
+template <typename T1>
+std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
+  // For this kernel, prepack is performed on input_B, and possibly scales, zeros_points.
+  // During compute process, scales and zeros_points will keep as it is and only use prepacked
+  // buffer to replace input_B.
+  // Inorder to cope with this logic, we need to return latest prepacked buffer and only serialize
+  // the latest one. So, we need to always return packed_tensor_ here not only for input_B.
+  ORT_UNUSED_PARAMETER(input_idx);
+  return std::move(packed_tensor_);
+}
+
+template <typename T1>
+Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
+  if (input_idx == 1) {
+    // pre_packed_tensor is constant initialized tensor and its lifecycle is managed by session_state,
+    // session_state will release memory from pre_packed_tensor. packed_b_ will not release memory so
+    // pass empty/default buffer deleter here.
+    // const_cast here is temporary, will fix in follow up PR.
+    packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
+  }
+
  return Status::OK();
 }

--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@ -278,6 +278,7 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {

 template <typename T, bool simplified>
 Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                             bool /*save_prepacked_initializers*/,
                                             bool& is_packed, PrePackedWeights* prepacked_weights) {
  ORT_UNUSED_PARAMETER(prepacked_weights);

--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
@ -16,7 +16,7 @@ class SkipLayerNorm final : public OpKernel {
  SkipLayerNorm(const OpKernelInfo& op_kernel_info);
  Status Compute(OpKernelContext* p_op_kernel_context) const override;

-  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool save_prepacked_initializers,
                 bool& is_packed, PrePackedWeights* prepacked_weights) override;

 private:
--- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
@ -95,6 +95,7 @@ GroupNorm::GroupNorm(const OpKernelInfo& op_info) : CudaKernel(op_info) {
 }

 Status GroupNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr /*alloc*/,
+                          bool /*save_prepacked_initializers*/,
                          bool& is_packed, PrePackedWeights* /*prepacked_weights*/) {
  is_packed = false;

--- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.h
+++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.h
@ -17,6 +17,7 @@ class GroupNorm final : public CudaKernel {
  Status ComputeInternal(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 bool& is_packed, PrePackedWeights* prepacked_weights) override;

 private:
--- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc
@ -99,6 +99,7 @@ Status QOrderedAttention::PutIntoMergedBias(const Tensor& tensor, AllocatorPtr a
 }

 Status QOrderedAttention::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
+                                  bool /*save_prepacked_initializers*/,
                                  /*out*/ bool& is_packed,
                                  /*out*/ PrePackedWeights* /*prepacked_weights*/) {
  is_packed = false;
--- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.h
+++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.h
@ -20,6 +20,7 @@ class QOrderedAttention final : public CudaKernel, public AttentionBase {

 public:
  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_matmul.cc
+++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_matmul.cc
@ -51,6 +51,7 @@ QOrderedMatMul::QOrderedMatMul(const OpKernelInfo& info) : CudaKernel(info) {
 }

 Status QOrderedMatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                               bool /*save_prepacked_initializers*/,
                               /*out*/ bool& is_packed,
                               /*out*/ PrePackedWeights* /* prepacked_weights */) {
  is_packed = false;
--- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_matmul.h
+++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_matmul.h
@ -18,6 +18,7 @@ class QOrderedMatMul final : public CudaKernel {
  Status ComputeInternal(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@ -83,6 +83,11 @@ struct SessionOptions {
  // enable profiling for this session.
  bool enable_profiling = false;

+  // save pre-packed constant external initializers instead of original initializers to onnxruntime data file.
+  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
+  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
+  bool save_prepacked_constant_initializers = false;
+
  // Non empty filepath enables serialization of the transformed optimized model to the specified filepath.
  //
  // Set session config value for ORT_SESSION_OPTIONS_CONFIG_SAVE_MODEL_FORMAT to 'ORT' or 'ONNX' to explicitly
@ -191,6 +196,7 @@ inline std::ostream& operator<<(std::ostream& os, const SessionOptions& session_
     << " execution_mode:" << session_options.execution_mode
     << " execution_order:" << session_options.execution_order
     << " enable_profiling:" << session_options.enable_profiling
+     << " save_prepacked_constant_initializers:" << session_options.save_prepacked_constant_initializers
     << " optimized_model_filepath:" << ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(session_options.optimized_model_filepath)
     << " enable_mem_pattern:" << session_options.enable_mem_pattern
     << " enable_mem_reuse:" << session_options.enable_mem_reuse
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@ -14,6 +14,7 @@
 #include "core/framework/op_kernel.h"
 #include "core/framework/ort_value_pattern_planner.h"
 #include "core/framework/session_state_utils.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/controlflow/utils.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
@ -397,12 +398,18 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type,
 }

 Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
-                                                       const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map) {
-  auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map](
+                                                       const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map,
+                                                       bool save_prepacked_constant_initializers,
+                                                       PrePackInitializers& pre_packed_initializers) {
+  auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map,
+                                     save_prepacked_constant_initializers, &pre_packed_initializers](
                                        bool should_cache_prepacked_weights_for_shared_initializers) -> Status {
+    std::unordered_map<std::string, std::string> pre_packed_kernel_input_map;
    for (auto& node : GetGraphViewer().Nodes()) {
      auto kernel = GetMutableKernel(node.Index());
+      auto kernel_name = kernel->Info().node().Name();
      int input_idx = 0;
+      bool is_kernel_prepacked = false;
      for (auto& input_def : node.InputDefs()) {
        if (input_def->Exists()) {
          const std::string& input_name = input_def->Name();
@ -414,16 +421,27 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
            if (st->GetOrtValueNameIdxMap().GetIdx(input_name, ort_value_idx).IsOK()) {
              std::unordered_map<int, OrtValue>& constant_initialized_tensors = st->constant_initialized_tensors_;

-              if (constant_initialized_tensors.count(ort_value_idx)) {
+              if (constant_initialized_tensors.count(ort_value_idx) && !is_kernel_prepacked) {
                bool is_packed = false;
                const Tensor& const_initialized_tensor = constant_initialized_tensors[ort_value_idx].Get<Tensor>();

                auto iter = initializers_to_share_map.find(input_name);
                bool is_shared_initializer = (iter != initializers_to_share_map.end());

+                // found pre-packed constant initializers from data file, no need to do pre-packing again
+                // apply pre-packed tensor to kernel so kernel can use it directly
+                if (pre_packed_initializers.pre_packed_initializer_names_read_from_file.count(input_name) != 0) {
+                  is_packed = true;
+
+                  // kernel like Matmul_nbits will call prepack multiple times with input_B and possibly scales/zero_points.
+                  // If prepacked weights already read from ONNX data file (this happens we ORT reads data file with prepacked
+                  // weights serialized), only need to set prepacked weights once to kernel.
+                  is_kernel_prepacked = true;
+                  ORT_THROW_IF_ERROR(kernel->SetPrePackTensor(input_idx, const_initialized_tensor));
+                }
                // Caching pre-packed weights is limited to shared initializers associated with the CPU EP for now
-                if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers &&
-                    node.GetExecutionProviderType() == kCpuExecutionProvider) {  // caching of pre-packed weights' turned ON
+                else if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers &&
+                         node.GetExecutionProviderType() == kCpuExecutionProvider) {  // caching of pre-packed weights' turned ON

                  AllocatorPtr allocator_for_caching = prepacked_weights_container_->GetOrCreateAllocator(CPU);
                  ORT_ENFORCE(allocator_for_caching.get() != nullptr);
@ -435,7 +453,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
                  // weight with the pre-packed weight generated by this instance of the same op_type because other static
                  // properties of the node like node attributes could play a role in the pre-packed weights' contents.
                  ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, allocator_for_caching,
-                                                      is_packed,
+                                                      save_prepacked_constant_initializers, is_packed,
                                                      &weights_to_be_filled_in));

                  if (is_packed) {
@ -482,18 +500,50 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
                  AllocatorPtr session_cpu_alloc = GetAllocator(kernel->Info().GetDevice(OrtMemType::OrtMemTypeDefault));
                  ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx,
                                                      session_cpu_alloc,  // use allocator tied to this session
+                                                      save_prepacked_constant_initializers,
                                                      is_packed,
                                                      nullptr  // no caching required
                                                      ));
                }
                if (is_packed) {
+                  // if intended to save prepacked initializers, get prepacked tensors from kernel and save in hashmap,
+                  // will save to data file later
+                  if (save_prepacked_constant_initializers) {
+                    auto tensor = kernel->GetPrePackTensor(input_idx);
+
+                    if (tensor != std::nullopt) {
+                      // save prepacked initializers per initializer and kernel since one initializer could
+                      // be used by multiple kernels
+                      pre_packed_initializers.pre_packed_initializers_to_save[input_name][kernel_name] = std::move(tensor.value());
+
+                      pre_packed_kernel_input_map[kernel_name] = input_name;
+                    }
+                  }
+
                  ++number_of_prepacks_counter_;

-                  if (constant_initializers_use_count.count(input_name) && --constant_initializers_use_count[input_name] == 0) {
+                  // if constant_initialized_tensor is already pre-packed, don't need to remove it
+                  if (pre_packed_initializers.pre_packed_initializer_names_read_from_file.count(input_name) == 0 &&
+                      constant_initializers_use_count.count(input_name) && --constant_initializers_use_count[input_name] == 0) {
                    // release the constant initialized tensor
                    st->initialized_tensors_.erase(ort_value_idx);
                    constant_initialized_tensors.erase(ort_value_idx);
                  }
+                } else {
+                  // handle prepack for matmul_nbits, it will prepack several times but set is_packed
+                  // to false for scales and zero_points, we keep scales and zero_points as it is only
+                  // update packed_tensor to input_B.
+                  // TODO: this logic works with matmul_nbits kernel but if other kernels also call prepack
+                  // multiple times and use different initializers to store prepacked weights, this piece of logic
+                  // might introduce bug and need a per kernel strategy to update prepacked weights.
+                  if (save_prepacked_constant_initializers && pre_packed_kernel_input_map.count(kernel_name)) {
+                    auto tensor = kernel->GetPrePackTensor(input_idx);
+
+                    if (tensor != std::nullopt) {
+                      auto existing_input_name = pre_packed_kernel_input_map[kernel_name];
+                      pre_packed_initializers.pre_packed_initializers_to_save[existing_input_name][kernel_name] = std::move(tensor.value());
+                    }
+                  }
                }
              }
              // stop searching in 2 cases:
@ -1176,6 +1226,7 @@ static Status VerifyEachNodeIsAssignedToAnEp(const Graph& graph, const logging::

 Status SessionState::FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE>& graph_location,
                                          const KernelRegistryManager& kernel_registry_manager,
+                                          PrePackInitializers& pre_packed_initializers,
                                          bool remove_initializers,
                                          bool saving_ort_format) {
  // recursively create the subgraph session state instances and populate the kernel create info in them.
@ -1189,7 +1240,7 @@ Status SessionState::FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE
  InlinedHashMap<std::string, size_t> constant_initializers_use_count;
  ComputeConstantInitializerUseCount(graph_, constant_initializers_use_count);
  return FinalizeSessionStateImpl(graph_location, kernel_registry_manager, nullptr, sess_options_,
-                                  remove_initializers, constant_initializers_use_count);
+                                  remove_initializers, constant_initializers_use_count, pre_packed_initializers);
 }

 static Status Index(const OrtValueNameIdxMap& ort_value_name_idx_map,
@ -1323,6 +1374,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
                                              const SessionOptions& session_options,
                                              bool remove_initializers,
                                              InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
+                                              PrePackInitializers& pre_packed_initializers,
                                              const InlinedHashMap<OrtValueName, OrtDevice>& outer_scope_node_arg_to_location_map,
                                              bool graph_info_already_created) {
  if (!graph_info_already_created) {
@ -1422,6 +1474,8 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
  // For inference it is enabled by default, but users can choose to disable it via session options.
  const bool disable_prepacking =
      session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigDisablePrepacking, "0") == "1";
+  const bool save_prepacked_constant_initializers =
+      session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, "0") == "1";
  // Memory pattern tracer allocates all initializers on a single contiguous
  // buffer. This has the effect of reducing memory fragmentation.
  // Further more, in training scenarios NCCL kernels require initializers to be allocated
@ -1474,7 +1528,9 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
    }
  }
 #endif
-
+  // This unorder set is used during model load with prepacked initializer saved in ONNX data file.
+  // ORT reads prepacked initializers and store them into this set so we could skip PrePack
+  // process later to save heap memory.
  ORT_RETURN_IF_ERROR(
      session_state_utils::SaveInitializedTensors(
          Env::Default(), graph_location, *graph_viewer_,
@ -1489,7 +1545,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
            return Status::OK();
          },
          logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options,
-          memory_profile_func, name_to_buffered_tensor_));
+          memory_profile_func, name_to_buffered_tensor_, pre_packed_initializers.pre_packed_initializer_names_read_from_file));

 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
  // Record Weight allocation info on device
@ -1505,9 +1561,14 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_

  ORT_RETURN_IF_ERROR(CreateKernels(kernel_registry_manager));

+  // pre-packed constant initializers only when:
+  //   1. pre-packing is not disabled
+  //   2. pre-packed not initializers found and loaded during model load
  if (!disable_prepacking) {
    ORT_RETURN_IF_ERROR(PrepackConstantInitializedTensors(constant_initializers_use_count,
-                                                          session_options.initializers_to_share_map));
+                                                          session_options.initializers_to_share_map,
+                                                          save_prepacked_constant_initializers,
+                                                          pre_packed_initializers));
  }

  ORT_RETURN_IF_ERROR(
@ -1546,7 +1607,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
                                                               subgraph_outer_scope_node_arg_to_location_map));
      ORT_RETURN_IF_ERROR(subgraph_session_state.FinalizeSessionStateImpl(
          graph_location, kernel_registry_manager, &node, subgraph_session_options, remove_initializers,
-          constant_initializers_use_count, subgraph_outer_scope_node_arg_to_location_map, true));
+          constant_initializers_use_count, pre_packed_initializers, subgraph_outer_scope_node_arg_to_location_map, true));

      // setup all the info for handling the feeds and fetches used in subgraph execution
      auto* p_op_kernel = GetMutableKernel(node.Index());
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@ -312,8 +312,25 @@ class SessionState {
    return &name_to_buffered_tensor_;
  }

+  // Data structure stores prepacked initializers in format of Tensor.
+  struct PrePackInitializers {
+    // This map is used during model save for prepacked initializers.
+    // Since one constant initializer could be used by different kernels
+    // and prepacked differently, use an unordered_map to store prepacked
+    // initializer in format of <[initializer_name], <[kernel_name], [prepacked_initializer]>>
+    typedef std::unordered_map<std::string, std::unordered_map<std::string, Tensor>> PrePackedTensorsToSave;
+    PrePackedTensorsToSave pre_packed_initializers_to_save;
+
+    // This set is used during model load with prepacked initializer serialized in external data file.
+    // ORT reads prepacked initializers and store their name into this set so we could skip PrePack
+    // process later to save heap memory. Prepacked tensor itself is saved in session state's constant_initialized_tensors_.
+    typedef std::unordered_set<std::string> PrePackedTensorNamesReadFromFile;
+    PrePackedTensorNamesReadFromFile pre_packed_initializer_names_read_from_file;
+  };
+
  Status FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
                              const KernelRegistryManager& kernel_registry_manager,
+                              PrePackInitializers& pre_packed_initializers,
                              bool remove_initializers = true,
                              bool saving_ort_format = false);

@ -321,6 +338,15 @@ class SessionState {
    return parent_;
  }

+  Status FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
+                              const KernelRegistryManager& kernel_registry_manager,
+                              bool remove_initializers = true,
+                              bool saving_ort_format = false) {
+    PrePackInitializers pre_packed_initializers;
+    return FinalizeSessionState(graph_loc, kernel_registry_manager, pre_packed_initializers,
+                                remove_initializers, saving_ort_format);
+  }
+
  // Clear all removable attributes if they exists.
  // The function logs the list of removable attributes for every node.
  void PruneRemovableAttributes();
@ -380,9 +406,13 @@ class SessionState {
  /**
   * Prepack the constant initialized tensors for better performance.
   * The original constant initialized tensors will be removed to save memory.
+   * For model with prepacked initializer serialized into ONNX data file,
+   * PrePack will be skipped to save memory.
   */
  Status PrepackConstantInitializedTensors(InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
-                                           const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map);
+                                           const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map,
+                                           bool save_prepacked_constant_initializers,
+                                           PrePackInitializers& pre_packed_initializers);

  SessionState* GetMutableSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name);

@ -400,6 +430,7 @@ class SessionState {
                                  const SessionOptions& session_options,
                                  bool remove_initializers,
                                  InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
+                                  PrePackInitializers& pre_packed_initializers,
                                  const InlinedHashMap<OrtValueName, OrtDevice>& outer_scope_node_arg_to_location_map = {},
                                  bool graph_info_already_created = false);

--- a/onnxruntime/core/framework/session_state_utils.cc
+++ b/onnxruntime/core/framework/session_state_utils.cc
@ -21,7 +21,6 @@
 #include "core/framework/ort_value_pattern_planner.h"
 #include "core/framework/ort_value_name_idx_map.h"
 #include "core/framework/sequential_execution_plan.h"
-#include "core/framework/session_state.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/framework/bfc_arena.h"
@ -72,6 +71,7 @@ static inline common::Status ExtDataTensorProtoToTensor(const Env& env,
                                                        const std::basic_string<PATH_CHAR_TYPE>& proto_path,
                                                        const ONNX_NAMESPACE::TensorProto& tensor_proto,
                                                        Tensor& tensor, OrtCallback& ext_data_deleter,
+                                                        SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile& pre_packed_initializers_name_set,
                                                        Tensor* buffered_tensor = nullptr) {
  ORT_ENFORCE(utils::HasExternalData(tensor_proto));

@ -79,7 +79,7 @@ static inline common::Status ExtDataTensorProtoToTensor(const Env& env,
  SafeInt<size_t> ext_data_len = 0;
  ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto,
                                                       ext_data_buf, ext_data_len, ext_data_deleter,
-                                                       buffered_tensor));
+                                                       &pre_packed_initializers_name_set, buffered_tensor));

  // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be
  // avoided if the Tensor class implements the do-nothing behavior when given a
@ -100,6 +100,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
                                             const AllocatorPtr& alloc, const AllocatorPtr& default_cpu_alloc,
                                             OrtValue& ort_value, const DataTransferManager& data_transfer_mgr,
                                             const ExternalDataLoaderManager& external_data_loader_mgr,
+                                             SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile& pre_packed_initializers_name_set,
                                             bool use_device_allocator_for_initializers = false,
                                             Tensor* buffered_tensor = nullptr) {
  if (bool(alloc) == (m != nullptr)) {
@ -139,7 +140,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
      // TensorProtoToTensor it would copy the data, causing unnecessary overhead
      OrtCallback ext_data_deleter;
      ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_tensor,
-                                                     ext_data_deleter, buffered_tensor));
+                                                     ext_data_deleter, pre_packed_initializers_name_set, buffered_tensor));

      ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()};
      MLDataType ml_tensor_type = DataTypeImpl::GetType<Tensor>();
@ -163,7 +164,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
      OrtCallback ext_data_deleter;
      std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
      ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor,
-                                                     ext_data_deleter, buffered_tensor));
+                                                     ext_data_deleter, pre_packed_initializers_name_set, buffered_tensor));
      scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter);
      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.

@ -272,7 +273,8 @@ common::Status SaveInitializedTensors(
    const ExecutionPlanBase& exec_plan,
    const SessionOptions& session_options,
    const MemoryProfileFunction& memory_profile_func,
-    std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors) {
+    std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors,
+    SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile& pre_packed_initializers_name_set) {
  LOGS(logger, INFO) << "Saving initialized tensors.";
  ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > -1, "OrtValue indexes should have been populated.");

@ -401,6 +403,7 @@ common::Status SaveInitializedTensors(

      Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc,
                                         default_cpu_alloc, ort_value, data_transfer_mgr, external_data_loader_mgr,
+                                         pre_packed_initializers_name_set,
                                         use_device_allocator_for_initializers, p_tensor);
      if (!st.IsOK()) {
        std::ostringstream oss;
--- a/onnxruntime/core/framework/session_state_utils.h
+++ b/onnxruntime/core/framework/session_state_utils.h
@ -12,6 +12,7 @@
 #include "core/framework/tensor.h"
 #include "core/framework/tensor_allocator.h"
 #include "core/framework/session_options.h"
+#include "core/framework/session_state.h"
 #include "core/framework/sequential_execution_plan.h"
 #include "core/platform/path_lib.h"

@ -50,7 +51,8 @@ common::Status SaveInitializedTensors(
    const ExecutionPlanBase& exec_plan,
    const SessionOptions& session_options,
    const MemoryProfileFunction& memory_profile_func,
-    std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors);
+    std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors,
+    SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile& pre_packed_initializers_name_set);

 common::Status AllocateTensor(
    const onnxruntime::MemBuffer* m,
--- a/onnxruntime/core/framework/tensor_external_data_info.cc
+++ b/onnxruntime/core/framework/tensor_external_data_info.cc
@ -40,6 +40,8 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed");
    } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) {
      out->checksum_ = stringmap.value();
+    } else if (stringmap.key() == "prepacked" && !stringmap.value().empty()) {
+      out->prepacked_ = stringmap.value() == "1";
    } else {
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error!");
    }
--- a/onnxruntime/core/framework/tensor_external_data_info.h
+++ b/onnxruntime/core/framework/tensor_external_data_info.h
@ -23,6 +23,8 @@ class ExternalDataInfo {

  const std::string& GetChecksum() const { return checksum_; }

+  bool GetPrePacked() const noexcept { return prepacked_; }
+
  // If the value of 'offset' or 'length' field is larger the max value of ssize_t, this function will treat it as a
  // wrong value and return FAIL.
  static common::Status Create(
@ -36,5 +38,6 @@ class ExternalDataInfo {
  // 0 means the whole file
  size_t length_ = 0;
  std::string checksum_;
+  bool prepacked_ = false;
 };
 }  // namespace onnxruntime
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@ -230,11 +230,12 @@ Status TensorProtoToOrtValueImpl(const Env& env, const std::filesystem::path& mo

 namespace utils {

-Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
-                           const std::filesystem::path& tensor_proto_dir,
-                           std::basic_string<ORTCHAR_T>& external_file_path,
-                           onnxruntime::FileOffsetType& file_offset,
-                           SafeInt<size_t>& tensor_byte_size) {
+static Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
+                                  const std::filesystem::path& tensor_proto_dir,
+                                  std::basic_string<ORTCHAR_T>& external_file_path,
+                                  onnxruntime::FileOffsetType& file_offset,
+                                  SafeInt<size_t>& tensor_byte_size,
+                                  bool& pre_packed) {
  ORT_RETURN_IF_NOT(onnxruntime::utils::HasExternalData(tensor_proto),
                    "Tensor does not have external data to read from.");

@ -244,6 +245,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
  std::unique_ptr<onnxruntime::ExternalDataInfo> external_data_info;
  ORT_RETURN_IF_ERROR(onnxruntime::ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info));

+  pre_packed = external_data_info->GetPrePacked();
+
  const auto& location = external_data_info->GetRelPath();

  external_file_path = location == onnxruntime::utils::kTensorProtoMemoryAddressTag ? std::filesystem::path(location)
@ -265,6 +268,11 @@ void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::str
  tensor_proto.set_raw_data(std::move(param));
 }

+Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const std::filesystem::path& tensor_proto_dir, std::basic_string<ORTCHAR_T>& external_file_path, onnxruntime::FileOffsetType& file_offset, SafeInt<size_t>& tensor_byte_size) {
+  bool pre_packed = false;
+  return GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_file_path, file_offset, tensor_byte_size, pre_packed);
+}
+
 void ConvertRawDataInTensorProto(TensorProto* tensor) {
  size_t element_size = 1;
  char* bytes = NULL;
@ -988,7 +996,7 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p
 Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path,
                                 const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf,
                                 SafeInt<size_t>& ext_data_len, OrtCallback& ext_data_deleter,
-                                 Tensor* buffered_tensor) {
+                                 SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile* pre_packed_initializers_name_set, Tensor* buffered_tensor) {
  ORT_ENFORCE(utils::HasExternalData(tensor_proto));
  std::basic_string<ORTCHAR_T> tensor_proto_dir;
  if (!model_path.empty()) {
@ -997,8 +1005,13 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
  std::basic_string<ORTCHAR_T> external_data_file_path;
  FileOffsetType file_offset;
  SafeInt<size_t> raw_data_safe_len = 0;
+  bool pre_packed = false;
  ORT_RETURN_IF_ERROR(
-      GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len));
+      GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len, pre_packed));
+
+  if (pre_packed && pre_packed_initializers_name_set != nullptr) {
+    (*pre_packed_initializers_name_set).insert(tensor_proto.name());
+  }

  if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) {
    // the value in location is the memory address of the data
@ -1108,7 +1121,7 @@ Status TensorProtoToTensor(const Env& env, const std::filesystem::path& model_pa
  OrtCallback& d = deleter_for_file_data.d;

  if (utils::HasExternalData(tensor_proto)) {
-    ORT_RETURN_IF_ERROR(GetExtDataFromTensorProto(env, model_path, tensor_proto, raw_data, raw_data_len, d));
+    ORT_RETURN_IF_ERROR(GetExtDataFromTensorProto(env, model_path, tensor_proto, raw_data, raw_data_len, d, nullptr));
  } else if (utils::HasRawData(tensor_proto)) {
    raw_data = const_cast<char*>(tensor_proto.raw_data().data());
    // TODO The line above has const-correctness issues. Below is a possible fix which copies the tensor_proto data
--- a/onnxruntime/core/framework/tensorprotoutils.h
+++ b/onnxruntime/core/framework/tensorprotoutils.h
@ -17,26 +17,19 @@
 #include "core/framework/external_data_loader.h"
 #include "core/framework/ort_value.h"
 #include "core/framework/mem_buffer.h"
+#include "core/framework/session_state.h"
 #include "core/framework/tensor_external_data_info.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/platform/env.h"

 namespace onnxruntime {
 namespace utils {
-/**
- * This function is used to get the external data info from the given tensor proto.
- * @param tensor_proto       given initializer tensor
- * @param tensor_proto_dir   directory of the tensor proto file
- * @param external_file_path output external file path
- * @param file_offset        output tensor offset
- * @param tensor_byte_size   output tensor byte size
- * @returns                  Status::OK() if the function is executed successfully
- */
 Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
                           const std::filesystem::path& tensor_proto_dir,
                           std::basic_string<ORTCHAR_T>& external_file_path,
                           onnxruntime::FileOffsetType& file_offset,
                           SafeInt<size_t>& tensor_byte_size);
+
 /**
 * This function is used to convert the endianess of Tensor data.
 * Mostly, will be used in big endian system to support the model file
@ -172,6 +165,7 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::
                                         const ONNX_NAMESPACE::TensorProto& tensor_proto,
                                         void*& ext_data_buf, SafeInt<size_t>& ext_data_len,
                                         OrtCallback& ext_data_deleter,
+                                         SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile* pre_packed_initializers_name_set,
                                         Tensor* buffered_tensor = nullptr);

 // Given a tensor proto with external data obtain a tensor using the specified custom external data loader.
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@ -1064,5 +1064,11 @@ bool IsOutputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index
  return false;
 }

+std::string GetPrepackedInitializerName(const std::string& initializer_name, const std::string& node_name) {
+  const std::string seperator = ":";
+
+  return initializer_name + seperator + node_name;
+}
+
 }  // namespace utils
 }  // namespace onnxruntime
--- a/onnxruntime/core/framework/utils.h
+++ b/onnxruntime/core/framework/utils.h
@ -234,6 +234,8 @@ constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<UInt4x2>() {

 int32_t ONNXTensorElementDataTypeToProtoTensorType(ONNXTensorElementDataType);

+std::string GetPrepackedInitializerName(const std::string& initializer_name, const std::string& node_name);
+
 #ifdef ENABLE_TRAINING
 common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context);
 #endif
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@ -4084,10 +4084,75 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const {
  return result;
 }

+void Graph::SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
+                                     size_t tensor_bytes_size,
+                                     int64_t& external_offset,
+                                     std::ofstream& external_stream,
+                                     gsl::span<const uint8_t> raw_data,
+                                     ONNX_NAMESPACE::TensorProto& output_proto,
+                                     const std::filesystem::path& external_file_path,
+                                     const ONNX_NAMESPACE::TensorProto& initializer,
+                                     bool is_prepacked) {
+  // update external_offset for alignment
+  // need to do padding before write actual tensor data as we do offset alignment at the begin of
+  // large tensors (offset need to be page aligned and alloction granularity aligned) like below:
+  // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
+  // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
+  if (align_info.align_offset && static_cast<int64_t>(tensor_bytes_size) > align_info.align_threshold) {
+    // Align to the larger of the page size or the allocation granularity
+    int64_t alignment_factor = std::max(static_cast<int64_t>(4096), align_info.allocation_granularity);
+    // Align to the next page or alloc granularity boundary
+    int64_t new_external_offset = static_cast<int64_t>(
+                                      std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
+                                  alignment_factor;
+
+    // padding tensor with zeros for alignment
+    InlinedVector<uint8_t> paddings;
+    size_t padding_size = SafeInt<size_t>(new_external_offset - external_offset);
+    paddings.reserve(padding_size);
+    for (size_t index = 0; index != padding_size; ++index) {
+      paddings.push_back(0x0);
+    }
+    external_stream.write(reinterpret_cast<const char*>(paddings.data()), padding_size);
+
+    external_offset = new_external_offset;
+  }
+
+  external_stream.write(reinterpret_cast<const char*>(raw_data.data()), tensor_bytes_size);
+
+  output_proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL);
+  ONNX_NAMESPACE::StringStringEntryProto* location = output_proto.add_external_data();
+  location->set_key("location");
+  location->set_value(ToUTF8String(external_file_path.native()));
+  ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto.add_external_data();
+  offset->set_key("offset");
+  offset->set_value(std::to_string(external_offset));
+  ONNX_NAMESPACE::StringStringEntryProto* length = output_proto.add_external_data();
+  length->set_key("length");
+  length->set_value(std::to_string(tensor_bytes_size));
+
+  if (is_prepacked) {
+    ONNX_NAMESPACE::StringStringEntryProto* pre_packed = output_proto.add_external_data();
+    pre_packed->set_key("prepacked");
+    pre_packed->set_value("1");
+  }
+
+  output_proto.set_name(initializer.name());
+  output_proto.set_data_type(initializer.data_type());
+  for (int i = 0; i != initializer.dims_size(); ++i) {
+    output_proto.add_dims(initializer.dims(i));
+  }
+  output_proto.set_doc_string(initializer.doc_string());
+
+  external_offset += tensor_bytes_size;
+}
+
 ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                       const std::filesystem::path& model_file_path,
                                                                       size_t initializer_size_threshold,
-                                                                       const OffsetAlignmentInfo& align_info) const {
+                                                                       const OffsetAlignmentInfo& align_info,
+                                                                       bool save_prepacked_constant_initializers,
+                                                                       PrePackedTensorProtoToSave& pre_packed_initializers) const {
  GraphProto result;
  ToGraphProtoInternal(result);
  ORT_ENFORCE(external_file_path.is_relative());
@ -4106,6 +4171,34 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std
 #endif

  for (const auto& initializer : graph_proto_->initializer()) {
+    bool use_pre_packed_initializer = false;
+    InlinedVector<TensorProto> pre_packed_initializers_tensor_proto;
+    // If this initializer has been prepacked, saved prepacked external initializer instead of original one.
+    // Since one initializer could be used by multiple kernels and been prepacked differently,
+    // Save each prepacked initializers seperately, chagne the initializer name to [initializer_name]:[node_name]
+    // to avoid conflict. Change the node input name accordingly.
+    // IT could potentially make the ONNX data file larger since we store multiple prepacked initializers into disk
+    // but this could be rare case.
+    if (save_prepacked_constant_initializers && pre_packed_initializers.count(initializer.name())) {
+      for (const auto& item : pre_packed_initializers[initializer.name()]) {
+        auto& node_name = item.first;
+        std::string prepacked_initializer_name = utils::GetPrepackedInitializerName(initializer.name(), node_name);
+        pre_packed_initializers_tensor_proto.push_back(item.second);
+        use_pre_packed_initializer = true;
+
+        for (auto& node : *result.mutable_node()) {
+          if (node.name() == node_name) {
+            int input_index = 0;
+            for (const auto& input : node.input()) {
+              if (input == initializer.name()) {
+                node.set_input(input_index, prepacked_initializer_name);
+              }
+              input_index += 1;
+            }
+          }
+        }
+      }
+    }
 #if !defined(DISABLE_SPARSE_TENSORS)
    if (sparse_end != sparse_tensor_names_.find(initializer.name())) {
      // Sparse tensors are added to the ONNX file.
@ -4114,61 +4207,39 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std
      ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse");
    } else {
 #endif
-      // Dense tensors larger than the threshold are added to the external file.
-      TensorProto* output_proto = result.add_initializer();
+      if (use_pre_packed_initializer) {
+        for (const auto& pre_packed_initializer : pre_packed_initializers_tensor_proto) {
+          // Dense tensors larger than the threshold are added to the external file.
+          TensorProto* output_proto = result.add_initializer();
+          std::vector<uint8_t> raw_data;
+          size_t tensor_bytes_size = 0;

-      std::vector<uint8_t> raw_data;
-      ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
-      size_t tensor_bytes_size = raw_data.size();
-      if (tensor_bytes_size < initializer_size_threshold) {
-        *output_proto = initializer;
-        continue;
-      }
+          ORT_THROW_IF_ERROR(utils::UnpackInitializerData(pre_packed_initializer, model_path, raw_data));
+          tensor_bytes_size = raw_data.size();
+          if (tensor_bytes_size < initializer_size_threshold) {
+            *output_proto = pre_packed_initializer;
+            continue;
+          }

-      // update external_offset for alignment
-      // need to do padding before write actual tensor data as we do offset alignment at the begin of
-      // large tensors (offset need to be page aligned and alloction granularity aligned) like below:
-      // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
-      // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
-      if (align_info.align_offset && static_cast<int64_t>(tensor_bytes_size) > align_info.align_threshold) {
-        // Align to the larger of the page size or the allocation granularity
-        int64_t alignment_factor = std::max(static_cast<int64_t>(4096), align_info.allocation_granularity);
-        // Align to the next page or alloc granularity boundary
-        int64_t new_external_offset = static_cast<int64_t>(
-                                          std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
-                                      alignment_factor;
+          SetUpExternalInitializer(align_info, tensor_bytes_size, external_offset, external_stream,
+                                   raw_data, *output_proto, external_file_path, pre_packed_initializer, true);
+        }
+      } else {
+        // Dense tensors larger than the threshold are added to the external file.
+        TensorProto* output_proto = result.add_initializer();
+        std::vector<uint8_t> raw_data;
+        size_t tensor_bytes_size = 0;

-        // padding tensor with zeros for alignment
-        for (int64_t index = external_offset; index != new_external_offset; ++index) {
-          external_stream << '0';
+        ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
+        tensor_bytes_size = raw_data.size();
+        if (tensor_bytes_size < initializer_size_threshold) {
+          *output_proto = initializer;
+          continue;
        }

-        external_offset = new_external_offset;
+        SetUpExternalInitializer(align_info, tensor_bytes_size, external_offset, external_stream,
+                                 raw_data, *output_proto, external_file_path, initializer, false);
      }
-
-      for (size_t index = 0; index != tensor_bytes_size; ++index) {
-        external_stream << raw_data[index];
-      }
-
-      output_proto->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL);
-      ONNX_NAMESPACE::StringStringEntryProto* location = output_proto->add_external_data();
-      location->set_key("location");
-      location->set_value(ToUTF8String(external_file_path.native()));
-      ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto->add_external_data();
-      offset->set_key("offset");
-      offset->set_value(std::to_string(external_offset));
-      ONNX_NAMESPACE::StringStringEntryProto* length = output_proto->add_external_data();
-      length->set_key("length");
-      length->set_value(std::to_string(tensor_bytes_size));
-
-      output_proto->set_name(initializer.name());
-      output_proto->set_data_type(initializer.data_type());
-      for (int i = 0; i != initializer.dims_size(); ++i) {
-        output_proto->add_dims(initializer.dims(i));
-      }
-      output_proto->set_doc_string(initializer.doc_string());
-
-      external_offset += tensor_bytes_size;
 #if !defined(DISABLE_SPARSE_TENSORS)
    }
 #endif
--- a/onnxruntime/core/graph/model.cc
+++ b/onnxruntime/core/graph/model.cc
@ -384,13 +384,17 @@ ModelProto Model::ToProto() const {
 ModelProto Model::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name,
                                                       const std::filesystem::path& file_path,
                                                       size_t initializer_size_threshold,
-                                                       const Graph::OffsetAlignmentInfo& align_info) const {
+                                                       const Graph::OffsetAlignmentInfo& align_info,
+                                                       bool save_prepacked_constant_initializers,
+                                                       Graph::PrePackedTensorProtoToSave& pre_packed_initializers) const {
  ModelProto result(model_proto_);
  const auto& graph = *graph_;
  *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name,
                                                                         file_path,
                                                                         initializer_size_threshold,
-                                                                         align_info);
+                                                                         align_info,
+                                                                         save_prepacked_constant_initializers,
+                                                                         pre_packed_initializers);
  return result;
 }

@ -608,7 +612,9 @@ static Status SaveModelWithExternalInitializers(Model& model,
                                                const T& file_path,
                                                const std::filesystem::path& external_file_name,
                                                size_t initializer_size_threshold,
-                                                const Graph::OffsetAlignmentInfo& align_info) {
+                                                const Graph::OffsetAlignmentInfo& align_info,
+                                                bool save_prepacked_constant_initializers,
+                                                Graph::PrePackedTensorProtoToSave& pre_packed_initializers) {
  int fd = 0;
  Status status = Env::Default().FileOpenWr(file_path, fd);
  ORT_RETURN_IF_ERROR(status);
@ -616,7 +622,8 @@ static Status SaveModelWithExternalInitializers(Model& model,
  ORT_TRY {
    status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name,
                                                 initializer_size_threshold,
-                                                 align_info);
+                                                 align_info, save_prepacked_constant_initializers,
+                                                 pre_packed_initializers);
  }
  ORT_CATCH(const std::exception& ex) {
    ORT_HANDLE_EXCEPTION([&]() {
@ -647,9 +654,12 @@ Status Model::Load(const PathString& file_path, std::shared_ptr<Model>& p_model,
 Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path,
                                           const std::filesystem::path& external_file_name,
                                           size_t initializer_size_threshold,
-                                           const Graph::OffsetAlignmentInfo& align_info) {
+                                           const Graph::OffsetAlignmentInfo& align_info,
+                                           bool save_prepacked_constant_initializers,
+                                           Graph::PrePackedTensorProtoToSave& pre_packed_initializers) {
  return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold,
-                                           align_info);
+                                           align_info, save_prepacked_constant_initializers,
+                                           pre_packed_initializers);
 }

 Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) {
@ -766,7 +776,9 @@ Status Model::SaveWithExternalInitializers(Model& model,
                                           const std::filesystem::path& file_path,
                                           const std::filesystem::path& external_file_name,
                                           size_t initializer_size_threshold,
-                                           const Graph::OffsetAlignmentInfo& align_info) {
+                                           const Graph::OffsetAlignmentInfo& align_info,
+                                           bool save_prepacked_constant_initializers,
+                                           Graph::PrePackedTensorProtoToSave& pre_packed_initializers) {
  if (fd < 0) {
    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "<fd> is less than 0.");
  }
@ -775,7 +787,8 @@ Status Model::SaveWithExternalInitializers(Model& model,

  auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path,
                                                                initializer_size_threshold,
-                                                                align_info);
+                                                                align_info, save_prepacked_constant_initializers,
+                                                                pre_packed_initializers);
  google::protobuf::io::FileOutputStream output(fd);
  const bool result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush();
  if (result) {
--- a/onnxruntime/core/graph/model.h
+++ b/onnxruntime/core/graph/model.h
@ -191,13 +191,17 @@ class Model {
  ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name,
                                                                  const std::filesystem::path& file_path,
                                                                  size_t initializer_size_threshold,
-                                                                  const Graph::OffsetAlignmentInfo& align_info) const;
+                                                                  const Graph::OffsetAlignmentInfo& align_info,
+                                                                  bool save_prepacked_constant_initializers,
+                                                                  Graph::PrePackedTensorProtoToSave& pre_packed_initializers) const;

  ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name,
                                                                  const std::filesystem::path& file_path,
                                                                  size_t initializer_size_threshold) const {
    Graph::OffsetAlignmentInfo default_align_info;
-    return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info);
+    Graph::PrePackedTensorProtoToSave pre_packed_initializers;
+    return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info,
+                                                false, pre_packed_initializers);
  }

  static common::Status Save(Model& model, const PathString& file_path);
@ -210,14 +214,18 @@ class Model {
                                                     const std::filesystem::path& file_path,
                                                     const std::filesystem::path& external_file_path,
                                                     size_t initializer_size_threshold,
-                                                     const Graph::OffsetAlignmentInfo& align_info);
+                                                     const Graph::OffsetAlignmentInfo& align_info,
+                                                     bool save_prepacked_constant_initializers,
+                                                     Graph::PrePackedTensorProtoToSave& pre_packed_initializers);

  static common::Status SaveWithExternalInitializers(Model& model,
                                                     const std::filesystem::path& file_path,
                                                     const std::filesystem::path& external_file_path,
                                                     size_t initializer_size_threshold) {
    Graph::OffsetAlignmentInfo default_align_info;
-    return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info);
+    Graph::PrePackedTensorProtoToSave pre_packed_initializers;
+    return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info,
+                                        false, pre_packed_initializers);
  }

  static common::Status SaveWithExternalInitializers(Model& model,
@ -225,7 +233,9 @@ class Model {
                                                     const std::filesystem::path& file_path,
                                                     const std::filesystem::path& external_file_path,
                                                     size_t initializer_size_threshold,
-                                                     const Graph::OffsetAlignmentInfo& align_info);
+                                                     const Graph::OffsetAlignmentInfo& align_info,
+                                                     bool save_prepacked_constant_initializers,
+                                                     Graph::PrePackedTensorProtoToSave& pre_packed_initializers);

  static common::Status SaveWithExternalInitializers(Model& model,
                                                     int fd,
@ -233,7 +243,9 @@ class Model {
                                                     const std::filesystem::path& external_file_path,
                                                     size_t initializer_size_threshold) {
    Graph::OffsetAlignmentInfo default_align_info;
-    return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info);
+    Graph::PrePackedTensorProtoToSave pre_packed_initializers;
+    return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info,
+                                        false, pre_packed_initializers);
  }

  static common::Status Load(std::istream& model_istream, ONNX_NAMESPACE::ModelProto* p_model_proto);
--- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
+++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@ -51,6 +51,7 @@ class FusedConvFp16 final : public OpKernel {
  Status Compute(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) override;

  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@ -101,6 +102,7 @@ class FusedConvFp16 final : public OpKernel {
 };

 Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;
--- a/onnxruntime/core/providers/cpu/math/gemm.cc
+++ b/onnxruntime/core/providers/cpu/math/gemm.cc
@ -248,6 +248,7 @@ template void Gemm<float>::ComputeGemm(CBLAS_TRANSPOSE trans_a, CBLAS_TRANSPOSE

 template <typename T>
 Status Gemm<T>::PrePack(const Tensor& /* tensor */, int /* input_idx */, AllocatorPtr /*alloc_for_caching*/,
+                        bool /*save_prepacked_initializers*/,
                        /*out*/ bool& is_packed,
                        /*out*/ PrePackedWeights* /*prepacked_weight_for_caching*/) {
  is_packed = false;
@ -256,7 +257,7 @@ Status Gemm<T>::PrePack(const Tensor& /* tensor */, int /* input_idx */, Allocat

 template <>
 Status Gemm<float>::PrePack(const Tensor& tensor, int input_idx,
-                            AllocatorPtr alloc, /*out*/ bool& is_packed,
+                            AllocatorPtr alloc, bool /*save_prepacked_initializers*/, /*out*/ bool& is_packed,
                            /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;

--- a/onnxruntime/core/providers/cpu/math/gemm.h
+++ b/onnxruntime/core/providers/cpu/math/gemm.h
@ -21,6 +21,7 @@ class Gemm : protected GemmBase, public OpKernel {
  Status Compute(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/providers/cpu/math/matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul.cc
@ -173,6 +173,7 @@ bool GemmPackBBfloat16(AllocatorPtr& alloc,
 #endif

 Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;
--- a/onnxruntime/core/providers/cpu/math/matmul.h
+++ b/onnxruntime/core/providers/cpu/math/matmul.h
@ -37,6 +37,7 @@ class MatMul<float> final : public OpKernel {
  }

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
@ -38,6 +38,7 @@ ONNX_CPU_OPERATOR_KERNEL(

 template <typename T>
 Status ConvTranspose<T>::PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
+                                 bool /*save_prepacked_initializers*/,
                                 /*out*/ bool& is_packed,
                                 /*out*/ PrePackedWeights* /*prepacked_weights*/
 ) {
@ -47,6 +48,7 @@ Status ConvTranspose<T>::PrePack(const Tensor& /*tensor*/, int /*input_idx*/, Al

 template <>
 Status ConvTranspose<float>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                     bool /*save_prepacked_initializers*/,
                                     /*out*/ bool& is_packed,
                                     /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;
--- a/onnxruntime/core/providers/cpu/nn/conv_transpose.h
+++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.h
@ -28,6 +28,7 @@ class ConvTranspose : public OpKernel {
  ConvTranspose(const OpKernelInfo& info) : OpKernel(info), conv_transpose_attrs_(info) {}

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@ -229,6 +229,7 @@ Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const {
 }

 Status LayerNormImpl::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                              bool& is_packed, PrePackedWeights* prepacked_weights) {
  ORT_UNUSED_PARAMETER(prepacked_weights);

--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@ -15,7 +15,7 @@ class LayerNormImpl : public OpKernel {
  LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified = false, bool contrib_op = false);
  Status Compute(OpKernelContext* p_op_kernel_context) const override;

-  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool save_prepacked_initializers,
                 bool& is_packed, PrePackedWeights* prepacked_weights) override;

  // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`.
--- a/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h
+++ b/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h
@ -14,6 +14,7 @@ class MatMulIntegerBase : public OpKernel {
  MatMulIntegerBase(const OpKernelInfo& info) : OpKernel(info) {}

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool /*save_prepacked_initializers*/,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override {
    is_packed = false;
--- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
+++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
@ -25,6 +25,7 @@ class QLinearConv : public OpKernel {
  Status Compute(OpKernelContext* context) const override;

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

@ -360,6 +361,7 @@ REGISTER_QLINEARCONV_INT8_KERNEL(kMSDomain, 1);

 template <typename ActType>
 Status QLinearConv<ActType>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                     bool /*save_prepacked_initializers*/,
                                     /*out*/ bool& is_packed,
                                     /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
@ -284,6 +284,7 @@ bool DeepCpuGruOp::TryPackRecurrentWeights(const Tensor& weights, AllocatorPtr&
 }

 Status DeepCpuGruOp::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                             bool /*save_prepacked_initializers*/,
                             bool& is_packed, PrePackedWeights* prepacked_weights) {
  is_packed = false;

--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.h
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.h
@ -62,6 +62,7 @@ class DeepCpuGruOp final : public OpKernel {

 private:
  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

@ -197,4 +198,4 @@ class UniDirectionalGru {
 };
 }  // namespace detail

-}  // namespace onnxruntime
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
@ -225,7 +225,9 @@ static void UseSharedPrePackedBuffersImpl(std::vector<BufferUniquePtr>& prepacke
 }

 Status DeepCpuLstmOp::PrePack(const Tensor& tensor, int input_idx,
-                              AllocatorPtr alloc, /*out*/ bool& is_packed,
+                              AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
+                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* prepacked_weights) {
  is_packed = false;

--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
@ -19,6 +19,7 @@ class DeepCpuLstmOp final : public OpKernel, public LSTMBase {
  DeepCpuLstmOp(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@ -52,6 +52,7 @@ REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true)
 // First input (in this case X) is in case NHWC == true also in NHWC format, the other inputs in NCHW
 template <typename T, bool NHWC>
 Status Conv<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                              bool& is_packed, PrePackedWeights* /*prepacked_weights*/) {
  is_packed = false;
  // only layout of weight input is adjusted via PrePack
--- a/onnxruntime/core/providers/cuda/nn/conv.h
+++ b/onnxruntime/core/providers/cuda/nn/conv.h
@ -219,6 +219,7 @@ class Conv : public CudaKernel {
  }

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 bool& is_packed, PrePackedWeights* prepacked_weights) override;

  Status ComputeInternal(OpKernelContext* context) const override;
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
@ -45,7 +45,8 @@ REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true)

 // First input (in this case X) is in case NHWC == true also in NHWC format, the other inputs in NCHW
 template <typename T, bool NHWC>
-Status ConvTranspose<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed,
+Status ConvTranspose<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                       bool /*save_prepacked_initializers*/, bool& is_packed,
                                       [[maybe_unused]] PrePackedWeights* prepacked_weights) {
  is_packed = false;
  // only layout of weight input is adjusted via PrePack
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.h
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.h
@ -22,6 +22,7 @@ class ConvTranspose : public CudaKernel {

  ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info) {};
  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override;
  Status ComputeInternal(OpKernelContext* context) const override;
  Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const;
--- a/onnxruntime/core/providers/js/operators/conv.h
+++ b/onnxruntime/core/providers/js/operators/conv.h
@ -78,6 +78,7 @@ class ConvBase : public JsKernel {
  }

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* /* prepacked_weights */) override {
    is_packed = false;
--- a/onnxruntime/core/providers/js/operators/conv_transpose.h
+++ b/onnxruntime/core/providers/js/operators/conv_transpose.h
@ -126,8 +126,10 @@ class ConvTranspose : public JsKernel {
  }

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* /* prepacked_weights */) override {
+    ORT_UNUSED_PARAMETER(save_prepacked_initializers);
    is_packed = false;

    if (input_idx == 1) {
--- a/onnxruntime/core/providers/xnnpack/math/gemm.cc
+++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc
@ -117,6 +117,7 @@ Gemm::Gemm(const OpKernelInfo& info) : GemmBase(info), XnnpackKernel(info, /*ena
 }

 Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr,
+                     bool /*save_prepacked_initializers*/,
                     /*out*/ bool& is_packed,
                     /*out*/ PrePackedWeights*) {
  is_packed = false;
--- a/onnxruntime/core/providers/xnnpack/math/gemm.h
+++ b/onnxruntime/core/providers/xnnpack/math/gemm.h
@ -23,6 +23,7 @@ class Gemm : protected GemmBase, public XnnpackKernel {
  static bool IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& graph);

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/providers/xnnpack/math/matmul.cc
+++ b/onnxruntime/core/providers/xnnpack/math/matmul.cc
@ -78,6 +78,7 @@ MatMul::MatMul(const OpKernelInfo& info) : XnnpackKernel(info, /*enable_caches*/
 }

 Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                       bool /*save_prepacked_initializers*/,
                       /*out*/ bool& is_packed,
                       /*out*/ PrePackedWeights* /*Not used*/) {
  is_packed = false;
--- a/onnxruntime/core/providers/xnnpack/math/matmul.h
+++ b/onnxruntime/core/providers/xnnpack/math/matmul.h
@ -23,6 +23,7 @@ class MatMul : public XnnpackKernel {
  // Required for checking XNNpack restrictions on ORT side
  static bool IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& graph);
  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;

--- a/onnxruntime/core/providers/xnnpack/nn/conv.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc
@ -18,6 +18,7 @@ namespace xnnpack {

 // use PrePack to handle the weight layout change as that's not a simple NCHW -> NHWC transpose
 Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                     bool /*save_prepacked_initializers*/,
                     /*out*/ bool& is_packed,
                     /*out*/ PrePackedWeights* /*prepacked_weights*/) {
  is_packed = false;
--- a/onnxruntime/core/providers/xnnpack/nn/conv.h
+++ b/onnxruntime/core/providers/xnnpack/nn/conv.h
@ -19,6 +19,7 @@ class Conv : public ConvBase {

  // use PrePack to handle the weight layout change as that's not a simple NCHW -> NHWC transpose
  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;
 };
--- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc
@ -15,6 +15,7 @@ namespace xnnpack {

 // use PrePack to handle the weight layout change as that's not a simple NCHW -> NHWC transpose
 Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* /*prepacked_weights*/) {
  is_packed = false;
--- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.h
+++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.h
@ -18,6 +18,7 @@ class ConvTranspose : public ConvBase {

  // use PrePack to handle the weight layout change as that's not a simple NCHW -> NHWC transpose
  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override;
 };
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@ -2027,9 +2027,11 @@ common::Status InferenceSession::Initialize() {
 #endif  // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
    }

+    SessionState::PrePackInitializers pre_packed_initializers;
    ORT_RETURN_IF_ERROR_SESSIONID_(
        session_state_->FinalizeSessionState(model_location_, kernel_registry_manager_,
                                             // need to keep the initializers if saving the optimized model
+                                             pre_packed_initializers,
                                             !saving_model,
                                             saving_ort_format));

@ -2065,11 +2067,47 @@ common::Status InferenceSession::Initialize() {
                  kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024"));
          Graph::OffsetAlignmentInfo align_info;
          align_info.align_offset = true;
+          bool save_prepacked_constant_initializers =
+              session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, "0") == "1" ? true : false;
+          Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto;
+          if (save_prepacked_constant_initializers) {
+            LOGS(*session_logger_, WARNING) << "Serialize prepacked initializers option has been turn on."
+                                            << "Use this option only when run model inference on PC with CPU."
+                                            << "Make sure to save and load model in same device as prepack is device specific."
+                                            << "Note: this feature in only work with ONNX model format."
+                                            << "Process of use this option is like below:"
+                                            << "1. Optimize model with external data file with save_prepacked_constant_initializers on:"
+                                            << "       sample: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers',  ' 1 ')"
+                                            << "   With save_prepacked_constant_initializers option, prepacked initializer will be serialized into data file."
+                                            << "2. Load optimized model and external data file in same device, no prepack is need."
+                                            << "3. Run inference with optimized model.";
+
+            if (fbs::utils::IsOrtFormatModel(session_options_.optimized_model_filepath)) {
+              ORT_RETURN_IF_ERROR_SESSIONID_(
+                  ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                  "Unable to serialize prepacked external constant initializer for ORT format model."
+                                  "Please use ONNX format model with save_prepacked_constant_initializers."));
+            }
+
+            // convert pre_packed_initializers to tensorproto format and save to external data file
+            for (const auto& name_item_pair : pre_packed_initializers.pre_packed_initializers_to_save) {
+              auto initializer_name = name_item_pair.first;
+
+              for (const auto& kernel_name_initializer_item_pair : name_item_pair.second) {
+                auto kernel_name = kernel_name_initializer_item_pair.first;
+                auto prepacked_initializer_name = utils::GetPrepackedInitializerName(initializer_name, kernel_name);
+
+                pre_packed_initializers_tensor_proto[initializer_name][kernel_name] = utils::TensorToTensorProto(kernel_name_initializer_item_pair.second, prepacked_initializer_name);
+              }
+            }
+          }
          ORT_RETURN_IF_ERROR_SESSIONID_(Model::SaveWithExternalInitializers(*model_,
                                                                             session_options_.optimized_model_filepath,
                                                                             optimized_model_external_initializers_file_name,
                                                                             optimized_model_external_initializers_min_size_in_bytes,
-                                                                             align_info));
+                                                                             align_info,
+                                                                             save_prepacked_constant_initializers,
+                                                                             pre_packed_initializers_tensor_proto));
        }
      }
    }
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@ -45,6 +45,7 @@
 #include "core/session/environment.h"
 #include "core/session/IOBinding.h"
 #include "core/session/inference_session_utils.h"
+#include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "core/session/onnxruntime_run_options_config_keys.h"
 #include "dummy_provider.h"
@ -64,6 +65,8 @@ using namespace ONNX_NAMESPACE;
 using namespace onnxruntime::logging;
 using namespace onnxruntime::concurrency;

+extern std::unique_ptr<Ort::Env> ort_env;
+
 namespace {
 struct KernelRegistryAndStatus {
  std::shared_ptr<onnxruntime::KernelRegistry> kernel_registry = std::make_shared<onnxruntime::KernelRegistry>();
@ -496,6 +499,57 @@ TEST(InferenceSessionTests, TestModelSerialization) {
  ASSERT_TRUE(session_object_emptyValidation.Initialize().IsOK());
 }

+// Test feature serialize prepack weight is only used in PC with CPU on inference,
+// disable this test for training, other device and eps
+#if !ENABLE_TRAINING && !defined(USE_CUDA) && !defined(__wasm__) && !defined(USE_DNNL) && !defined(USE_QNN) && !defined(__ANDROID__) && !defined(USE_COREML)
+// MLAS dispatcher used in matmul_nbits kernels here is 64 bit only
+#if defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64)
+TEST(InferenceSessionTests, TestPrePackSerialization) {
+  SessionOptions so;
+  std::string model_name = "model_with_matmul_nbits";
+
+  const std::string test_model = "testdata/prepack/" + model_name + ".onnx";
+  const std::string optimized_model = "testdata/prepack/" + model_name + "_opt.onnx";
+
+  so.session_logid = "InferenceSessionTests.TestPrepackSerialization";
+  so.enable_cpu_mem_arena = false;
+  so.graph_optimization_level = TransformerLevel::Default;
+  so.optimized_model_filepath = optimized_model;
+  std::string external_initializer_file_name = model_name + "_opt.onnx.data";
+
+  // enable serialize prepack initializer to data file
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsSavePrePackedConstantInitializers,
+                                                    "1"));
+  // always save external initializer to data file for test
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes,
+                                                    "0"));
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsOptimizedModelExternalInitializersFileName,
+                                                    external_initializer_file_name.c_str()));
+
+  // optimize model with serialize prepack constant initializers
+  InferenceSessionWrapper session_object{so, GetEnvironment()};
+  ASSERT_TRUE(session_object.Load(test_model).IsOK());
+  ASSERT_TRUE(session_object.Initialize().IsOK());
+
+  // Verify prepack initializers are serialized into optimized model and data file
+  // load optimized model and check initializer are prepacked
+  auto logger = DefaultLoggingManager().CreateLogger("TestPrepackSerialization");
+  std::shared_ptr<Model> model;
+  auto load_status = Model::Load(ToWideString(optimized_model), model, nullptr, *logger);
+  ASSERT_EQ(Status::OK(), load_status);
+  Graph& graph = model->MainGraph();
+
+  bool found_prepack_initializer = false;
+  for (const auto& item : graph.GetAllInitializedTensors()) {
+    if (item.first.find(':') != std::string::npos) {
+      found_prepack_initializer = true;
+    }
+  }
+  ASSERT_TRUE(found_prepack_initializer);
+}
+#endif
+#endif
+
 #ifdef ORT_RUN_EXTERNAL_ONNX_TESTS
 static bool Compare(const InputDefList& f_arg, const InputDefList& s_arg) {
  if (f_arg.size() != s_arg.size()) {
--- a/onnxruntime/test/framework/save_model_with_external_initializers.cc
+++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc
@ -7,6 +7,7 @@
 #include "core/framework/data_types.h"
 #include "core/graph/model.h"
 #include "core/framework/tensorprotoutils.h"
+#include "core/framework/session_state.h"
 #include "test/test_environment.h"
 #include "test_utils.h"
 #include "test/util/include/asserts.h"
@ -19,19 +20,34 @@ using namespace onnxruntime;
 namespace onnxruntime {
 namespace test {

+std::vector<std::string> split(const std::string& str, char delimiter) {
+  std::vector<std::string> result;
+  std::stringstream ss(str);
+  std::string token;
+
+  // Use getline with a delimiter to split the string
+  while (std::getline(ss, token, delimiter)) {
+    result.push_back(token);
+  }
+
+  return result;
+}
+
 Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
                               const std::filesystem::path& input_external_init_file,
                               const std::filesystem::path& output_onnx,
                               const std::filesystem::path& output_external_init_file,
                               size_t initializer_size_threshold,
-                               const Graph::OffsetAlignmentInfo& align_info) {
+                               const Graph::OffsetAlignmentInfo& align_info,
+                               Graph::PrePackedTensorProtoToSave& pre_packed_initializers_tensor_proto,
+                               bool save_prepacked_constant_initializers = false) {
  auto logger = DefaultLoggingManager().CreateLogger("LoadSaveAndCompareModel");
  std::shared_ptr<Model> model;
  ORT_RETURN_IF_ERROR(Model::Load(input_onnx, model, nullptr, *logger));
  std::filesystem::remove(output_onnx);
  std::filesystem::remove(output_external_init_file);
  ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, initializer_size_threshold,
-                                                          align_info));
+                                                          align_info, save_prepacked_constant_initializers, pre_packed_initializers_tensor_proto));

  std::shared_ptr<Model> model_from_external;
  ORT_RETURN_IF_ERROR(Model::Load(output_onnx.native(), model_from_external, nullptr, *logger));
@ -50,10 +66,11 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
  // Compare the initializers of the two versions.
  std::filesystem::path model_path{};
  std::filesystem::path external_data_path{};
-  for (const auto& i : initializers) {
+  for (const auto& i : initializers_from_external) {
    const std::string kInitName = i.first;
-    const ONNX_NAMESPACE::TensorProto* tensor_proto = i.second;
-    const ONNX_NAMESPACE::TensorProto* from_external_tensor_proto = initializers_from_external[kInitName];
+    const ONNX_NAMESPACE::TensorProto* from_external_tensor_proto = i.second;
+    // prepack initializer will have name as [original name]:[kernel name] in case initializer used by multiple kernels
+    const ONNX_NAMESPACE::TensorProto* tensor_proto = save_prepacked_constant_initializers ? initializers[split(kInitName, ':')[0]] : initializers[kInitName];

    std::vector<uint8_t> tensor_proto_data;
    model_path = input_onnx;
@ -75,8 +92,12 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
      ORT_RETURN_IF_NOT(from_external_tensor_proto->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL, "location mismatch");
    }

-    ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch");
-    ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch");
+    if (!save_prepacked_constant_initializers) {
+      ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch");
+      ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch");
+    } else {
+      ORT_RETURN_IF_NOT(from_external_tensor_proto_size >= tensor_proto_size, "prepack initializer's size is at least same as original tensor, might be larger");
+    }

    if (align_info.align_offset) {
      for (const StringStringEntryProto& entry : from_external_tensor_proto->external_data()) {
@ -89,6 +110,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
      }
    }
  }
+
  // Cleanup.
  ORT_RETURN_IF_NOT(std::filesystem::remove(output_onnx), "delete file failed");
  ORT_RETURN_IF_NOT(std::filesystem::remove(external_data_path), "delete file failed");
@ -98,13 +120,15 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
 // Original model does not have external initializers
 TEST(SaveWithExternalInitializers, Mnist) {
  Graph::OffsetAlignmentInfo align_info;
-  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info));
+  Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto;
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info, pre_packed_initializers_tensor_proto));
 }

 // Original model has external initializers
 TEST(SaveWithExternalInitializers, ModelWithOriginalExternalData) {
  Graph::OffsetAlignmentInfo align_info;
-  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info));
+  Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto;
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info, pre_packed_initializers_tensor_proto));
 }

 // Original model has external initializers, align offset
@ -112,7 +136,22 @@ TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffset) {
  Graph::OffsetAlignmentInfo align_info;
  align_info.align_offset = true;
  align_info.align_threshold = 0;
-  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info));
+  Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto;
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info, pre_packed_initializers_tensor_proto));
+}
+
+// Original model has external initializers, align offset and serialize prepacked external initializer to model file
+TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffsetAndSavePrepackTensors) {
+  Graph::OffsetAlignmentInfo align_info;
+  align_info.align_offset = true;
+  align_info.align_threshold = 0;
+  std::shared_ptr<CPUAllocator> alloc = std::make_shared<CPUAllocator>();
+  TensorShape shape = {178};
+  // prepack both initializers for test purpose
+  Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto;
+  pre_packed_initializers_tensor_proto["MatMul.Weight"]["MatMul_0"] = utils::TensorToTensorProto(Tensor(DataTypeImpl::GetType<uint8_t>(), shape, alloc), "MatMul.Weight:MatMul_0");
+  pre_packed_initializers_tensor_proto["scales"]["MatMul_0"] = utils::TensorToTensorProto(Tensor(DataTypeImpl::GetType<float>(), shape, alloc), "scales:MatMul_0");
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/prepack/model_with_matmul_nbits.onnx"), ORT_TSTR("model_with_matmul_nbits.onnx.data"), ORT_TSTR("testdata/prepack/model_with_matmul_nbits_opt.onnx"), ORT_TSTR("model_with_matmul_nbits_opt.onnx.data"), 0, align_info, pre_packed_initializers_tensor_proto, true));
 }

 }  // namespace test
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@ -372,10 +372,11 @@ class PrePackingTestOpKernel : public OpKernel {
    return Status::OK();
  }

-  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool save_prepacked_initializers,
                 /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) override {
    ORT_UNUSED_PARAMETER(tensor);
    ORT_UNUSED_PARAMETER(input_idx);
+    ORT_UNUSED_PARAMETER(save_prepacked_initializers);

    size_t weight_packed_len = 8;
    weight_packed_ = IAllocator::MakeUniquePtr<void>(alloc, weight_packed_len, true);
@ -393,9 +394,20 @@ class PrePackingTestOpKernel : public OpKernel {
    return Status::OK();
  }

+  std::optional<Tensor> GetPrePackTensor(int input_idx) override {
+    ORT_UNUSED_PARAMETER(input_idx);
+    ++get_prepack_tensors_count;
+
+    TensorShape shape = {2};
+    packed_tensor = Tensor(DataTypeImpl::GetType<float>(), shape, std::make_shared<CPUAllocator>());
+    return std::move(packed_tensor);
+  }
+
  int prepack_calls_count = 0;
  int store_pre_packed_weight_calls_count = 0;
+  int get_prepack_tensors_count = 0;
  IAllocatorUniquePtr<void> weight_packed_;
+  Tensor packed_tensor;
 };

 static void CreateSimpleGraph(Graph& graph) {
@ -530,6 +542,7 @@ static void PlaceAllNodesToCPUEP(Graph& graph) {
 struct PrepackingTestParam {
  bool test_subgraph;
  bool test_prepacking;
+  bool test_save_prepack_initializer;
 };

 class SessionStatePrepackingTest : public testing::TestWithParam<PrepackingTestParam> {};
@ -572,6 +585,8 @@ TEST_P(SessionStatePrepackingTest, PrePackingTest) {
  sess_options.enable_mem_reuse = true;
  sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] =
      test_param.test_prepacking ? "0" : "1";
+  sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] =
+      test_param.test_save_prepack_initializer ? "1" : "0";

  SessionState session_state(model.MainGraph(),
                             execution_providers,
@ -597,12 +612,47 @@ TEST_P(SessionStatePrepackingTest, PrePackingTest) {
  kernel_registry_manager.RegisterKernelRegistry(kernel_registry);

  PlaceAllNodesToCPUEP(model.MainGraph());
+  SessionState::PrePackInitializers pre_packed_initializers;
  ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
-                                                      kernel_registry_manager));
+                                                      kernel_registry_manager,
+                                                      pre_packed_initializers));

  const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors();
  // check prepacking
  ASSERT_EQ(const_initialized_tensors.size(), size_t(test_param.test_prepacking ? 0 : 1));
+
+  // check get prepack tensor method called when set save_prepacked_constant_initializers
+  if (!test_param.test_subgraph) {
+    const auto* kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state.GetKernel(0));
+    ASSERT_EQ(kernel->get_prepack_tensors_count, (test_param.test_prepacking && test_param.test_save_prepack_initializer) ? 1 : 0);
+  } else {
+    auto if_index = 1;
+    if (session_state.GetKernel(0)->Node().OpType() == "If") {
+      if_index = 0;
+    }
+
+    const auto& subgraph_session_states = session_state.GetSubgraphSessionStateMap();
+    const auto& if_node_session_states = subgraph_session_states.at(if_index);
+    const auto& session_state_1_then_branch_session_state = *if_node_session_states.at("then_branch");
+    const auto& session_state_1_else_branch_session_state = *if_node_session_states.at("else_branch");
+
+    const auto* kernel_if_0 = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1_then_branch_session_state.GetKernel(0));
+    const auto* kernel_if_1 = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1_else_branch_session_state.GetKernel(0));
+    ASSERT_EQ(kernel_if_0->get_prepack_tensors_count, (test_param.test_prepacking && test_param.test_save_prepack_initializer) ? 1 : 0);
+    ASSERT_EQ(kernel_if_1->get_prepack_tensors_count, (test_param.test_prepacking && test_param.test_save_prepack_initializer) ? 1 : 0);
+  }
+
+  // check pre_packed_initializers_to_save will be set properly when set save_prepacked_constant_initializers
+  if (!test_param.test_subgraph && test_param.test_prepacking && test_param.test_save_prepack_initializer) {
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save.size(), size_t(1));
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save.count("node_0_input_1"), size_t(1));
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save["node_0_input_1"].count("node_0"), size_t(1));
+  } else if (test_param.test_subgraph && test_param.test_prepacking && test_param.test_save_prepack_initializer) {
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save.size(), size_t(1));
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save.count("if_shared"), size_t(1));
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save["if_shared"].count("if_node_1"), size_t(1));
+    ASSERT_EQ(pre_packed_initializers.pre_packed_initializers_to_save["if_shared"].count("if_node_0"), size_t(1));
+  }
 }

 class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test {
@ -1000,10 +1050,14 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) {

 INSTANTIATE_TEST_SUITE_P(SessionStateTests,
                         SessionStatePrepackingTest,
-                         testing::Values(PrepackingTestParam{false, false},
-                                         PrepackingTestParam{false, true},
-                                         PrepackingTestParam{true, false},
-                                         PrepackingTestParam{true, true}));
+                         testing::Values(PrepackingTestParam{false, false, false},
+                                         PrepackingTestParam{false, true, false},
+                                         PrepackingTestParam{true, false, false},
+                                         PrepackingTestParam{true, true, false},
+                                         PrepackingTestParam{false, false, true},
+                                         PrepackingTestParam{false, true, true},
+                                         PrepackingTestParam{true, false, true},
+                                         PrepackingTestParam{true, true, true}));
 #endif

 }  // namespace test
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@ -4600,3 +4600,86 @@ TEST(CApiTest, OrtCustomOp_GetInPlace) {
  ASSERT_EQ(len, static_cast<size_t>(2));
  mock_gqa.ReleaseAliasMap(input_index, output_index);
 }
+
+TEST(CApiTest, Serialize_PrePack_Initializers) {
+  std::string model_name = "model_with_matmul_nbits";
+
+  const std::string test_model = "testdata/prepack/" + model_name + ".onnx";
+  const std::string optimized_model = "testdata/prepack/" + model_name + "_opt.onnx";
+  std::string external_initializer_file_name = model_name + "_opt.onnx.data";
+
+  // Generate optimized with prepacked weights serialized
+  Ort::SessionOptions session_options_opt;
+  session_options_opt.AddConfigEntry(kOrtSessionOptionsOptimizedModelExternalInitializersFileName, external_initializer_file_name.c_str());
+  session_options_opt.AddConfigEntry(kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "0");
+  session_options_opt.AddConfigEntry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1");
+
+#if defined(_WIN32) || defined(_WIN64)
+  std::wstring test_model_wide = onnxruntime::ToWideString(test_model);
+  session_options_opt.SetOptimizedModelFilePath(onnxruntime::ToWideString(optimized_model).c_str());
+  Ort::Session session_opt_model(*ort_env, test_model_wide.c_str(), session_options_opt);
+#else
+  session_options_opt.SetOptimizedModelFilePath(optimized_model.c_str());
+  Ort::Session session_opt_model(*ort_env, test_model.c_str(), session_options_opt);
+#endif
+
+  // Do inference with original model and optimized model and check output is identical
+  // set inputs and session options
+  Ort::SessionOptions session_options;
+  const char* input_names[] = {"A"};
+  const char* const output_names[] = {"Y"};
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  std::vector<Ort::Value> ort_inputs;
+  std::vector<float> input_0_data = {1.3f};
+  std::vector<int64_t> input_0_dims = {1, 1};
+  ort_inputs.emplace_back(
+      Ort::Value::CreateTensor<float>(info, const_cast<float*>(input_0_data.data()),
+                                      input_0_data.size(), input_0_dims.data(), input_0_dims.size()));
+
+  // run inference with original model
+  // Convert std::string to std::wstring
+#if defined(_WIN32) || defined(_WIN64)
+  Ort::Session session(*ort_env, test_model_wide.c_str(), session_options);
+#else
+  Ort::Session session(*ort_env, test_model.c_str(), session_options);
+#endif
+  auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
+                                 output_names, 1);
+
+  // run inference with optimized model which load serialized prepack initializer
+#if defined(_WIN32) || defined(_WIN64)
+  std::wstring optimized_model_wide = onnxruntime::ToWideString(optimized_model);
+  Ort::Session session_opt(*ort_env, optimized_model_wide.c_str(), session_options);
+#else
+  Ort::Session session_opt(*ort_env, optimized_model.c_str(), session_options);
+#endif
+  auto ort_outputs_opt = session_opt.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
+                                         output_names, 1);
+
+  // check output of original model and optimized model are equal
+  ASSERT_EQ(ort_outputs.size(), ort_outputs_opt.size());
+
+  for (size_t i = 0; i < ort_outputs.size(); ++i) {
+    const auto& sequences = ort_outputs[i];
+    ASSERT_TRUE(sequences.IsTensor());
+
+    const auto& sequences_opt = ort_outputs_opt[i];
+    ASSERT_TRUE(sequences_opt.IsTensor());
+
+    auto result_ts = sequences.GetTensorTypeAndShapeInfo();
+    auto result_ts_opt = sequences_opt.GetTensorTypeAndShapeInfo();
+
+    ASSERT_EQ(result_ts.GetElementType(), result_ts_opt.GetElementType());
+
+    ASSERT_EQ(result_ts.GetShape(), result_ts_opt.GetShape());
+
+    const auto* result_vals = sequences.GetTensorData<float>();
+    auto result_span = gsl::make_span(result_vals, ort_outputs.size());
+
+    const auto* result_vals_opt = sequences_opt.GetTensorData<float>();
+    auto result_span_opt = gsl::make_span(result_vals_opt, ort_outputs_opt.size());
+
+    ASSERT_TRUE(std::equal(result_span_opt.begin(), result_span_opt.end(), result_span.begin(), result_span.end()));
+  }
+}
--- a/onnxruntime/test/testdata/model_with_external_initializers.onnx
+++ b/onnxruntime/test/testdata/model_with_external_initializers.onnx
@ -1,7 +1,8 @@
-onnx-example:–
-&
+
+onnx-example:œ
+,
 X
-PadsY"Pad*
+PadsYpad0"Pad*
 mode"constant 
 test-model*"BPadsj
 locationPads.binpZ
@ -16,4 +17,4 @@ test-model*"BPadsj
 Y


-B
+B
--- a/onnxruntime/test/testdata/model_with_external_initializers.py
+++ b/onnxruntime/test/testdata/model_with_external_initializers.py
@ -35,9 +35,10 @@ def GenerateModel(model_name, external_data_name):  # noqa: N802

    # Create a node (NodeProto)
    node_def = helper.make_node(
-        "Pad",  # node name
+        "Pad",  # op type
        ["X", external_data_name],  # inputs
        ["Y"],  # outputs
+        "pad0",  # node name
        mode="constant",  # Attributes
    )

--- a/onnxruntime/test/testdata/model_with_orig_ext_data.onnx
+++ b/onnxruntime/test/testdata/model_with_orig_ext_data.onnx
@ -1,7 +1,8 @@
-	onnx-example:æ
-:
+
+onnx-example:ì
+@
 X
-model_with_orig_ext_dataY"Pad*
+model_with_orig_ext_dataYpad0"Pad*
 mode"constant 
 test-model*JBmodel_with_orig_ext_dataj(
 locationmodel_with_orig_ext_data.binpZ
@ -16,4 +17,4 @@ test-model*JBmodel_with_orig_ext_dataj(
 Y


-B
+B
--- a/onnxruntime/test/testdata/prepack/MatMul.Weight.bin
+++ b/onnxruntime/test/testdata/prepack/MatMul.Weight.bin
--- a/onnxruntime/test/testdata/prepack/model_with_external_initializers_and_prepack_kernel.py
+++ b/onnxruntime/test/testdata/prepack/model_with_external_initializers_and_prepack_kernel.py
@ -0,0 +1,88 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+
+import numpy as np
+import onnx
+from onnx import TensorProto, helper
+from onnx.external_data_helper import set_external_data
+from onnx.numpy_helper import from_array
+
+M = 1
+K = 1
+N = 1
+q_cols = 1
+q_rows = 1
+q_scale_size = 1
+
+
+def create_external_data_tensor(value, tensor_name, data_type):
+    tensor = from_array(np.array(value))
+    tensor.name = tensor_name
+    tensor_filename = f"{tensor_name}.bin"
+    set_external_data(tensor, location=tensor_filename)
+
+    with open(os.path.join(tensor_filename), "wb") as data_file:
+        data_file.write(tensor.raw_data)
+    tensor.ClearField("raw_data")
+    tensor.data_location = onnx.TensorProto.EXTERNAL
+    tensor.data_type = data_type
+    return tensor
+
+
+def create_internal_data_tensor(value, tensor_name, data_type):
+    tensor = helper.make_tensor(name=tensor_name, data_type=data_type, dims=value.shape, vals=value.flatten().tolist())
+    print(tensor)
+    tensor.data_location = onnx.TensorProto.DEFAULT
+    return tensor
+
+
+def GenerateMatmulNBitsModel(model_name, external_data_name):  # noqa: N802
+    A = helper.make_tensor_value_info("A", TensorProto.FLOAT, [M, K])  # noqa: N806
+    Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [M, N])  # noqa: N806
+
+    # Create a node (NodeProto)
+    node_def = helper.make_node(
+        op_type="MatMulNBits",  # op type
+        inputs=["A", external_data_name, "scales"],  # inputs
+        outputs=["Y"],  # outputs
+        name="MatMul_0",  # node name
+        domain="com.microsoft",  # Custom domain for this operator
+        accuracy_level=4,  # Attributes
+        bits=4,  # Attributes
+        block_size=32,  # Attributes
+        K=K,  # Attributes
+        N=N,  # Attributes
+    )
+
+    # Create the graph (GraphProto)
+    graph_def = helper.make_graph(
+        [node_def],
+        "test-model-matmul4bits",
+        [A],
+        [Y],
+        [
+            create_external_data_tensor([[171]], external_data_name, TensorProto.UINT8),
+            create_internal_data_tensor(np.array([1.5], dtype=np.float32), "scales", TensorProto.FLOAT),
+        ],
+    )
+
+    # Create the model
+    model_def = helper.make_model(
+        graph_def,
+        producer_name="onnx-example",
+        opset_imports=[helper.make_operatorsetid("", 14), helper.make_operatorsetid("com.microsoft", 1)],
+    )
+
+    print(f"The ir_version in model: {model_def.ir_version}\n")
+    print(f"The producer_name in model: {model_def.producer_name}\n")
+    print(f"The graph in model:\n{model_def.graph}")
+    onnx.checker.check_model(model_def)
+    print("The model is checked!")
+    with open(model_name, "wb") as model_file:
+        model_file.write(model_def.SerializeToString())
+
+
+if __name__ == "__main__":
+    GenerateMatmulNBitsModel("model_with_matmul_nbits.onnx", "MatMul.Weight")
--- a/onnxruntime/test/testdata/prepack/model_with_matmul_nbits.onnx
+++ b/onnxruntime/test/testdata/prepack/model_with_matmul_nbits.onnx
--- a/orttraining/orttraining/models/bert/main.cc
+++ b/orttraining/orttraining/models/bert/main.cc
@ -42,6 +42,7 @@ static SessionOptions session_options = {
    ExecutionMode::ORT_SEQUENTIAL,     // execution_mode
    ExecutionOrder::PRIORITY_BASED,    // execution_order
    false,                             // enable_profiling
+    false,                             // save prepacked initializer
    ORT_TSTR(""),                      // optimized_model_filepath
    true,                              // enable_mem_pattern
    true,                              // enable_mem_reuse
--- a/orttraining/orttraining/models/pipeline_poc/main.cc
+++ b/orttraining/orttraining/models/pipeline_poc/main.cc
@ -89,6 +89,7 @@ int main(int argc, char* argv[]) {
      ExecutionMode::ORT_SEQUENTIAL,     // execution_mode
      ExecutionOrder::DEFAULT,           // execution_order
      false,                             // enable_profiling
+      false,                             // save prepacked initializer
      ORT_TSTR(""),                      // optimized_model_filepath
      true,                              // enable_mem_pattern
      true,                              // enable_mem_reuse
--- a/orttraining/orttraining/models/runner/training_runner.cc
+++ b/orttraining/orttraining/models/runner/training_runner.cc
@ -37,6 +37,7 @@ static SessionOptions SESSION_OPTION = {
    ExecutionMode::ORT_SEQUENTIAL,     // execution_mode
    ExecutionOrder::PRIORITY_BASED,    // execution_order
    false,                             // enable_profiling
+    false,                             // save prepacked initializer
    ORT_TSTR(""),                      // optimized_model_filepath
    true,                              // enable_mem_pattern
    true,                              // enable_mem_reuse