From a82a0907e0afda1b0bd623792699e2a7cd0dc7eb Mon Sep 17 00:00:00 2001
From: linkerzhang <kezhan@microsoft.com>
Date: Fri, 23 Nov 2018 11:56:14 -0800
Subject: [PATCH 01/16] add ops for quantization support.

---
 onnxruntime/contrib_ops/contrib_ops.cc | 223 +++++++++++++++++++++++++
 1 file changed, 223 insertions(+)

diff --git a/onnxruntime/contrib_ops/contrib_ops.cc b/onnxruntime/contrib_ops/contrib_ops.cc
index 49c8a133ea..d698cdcbd3 100644
--- a/onnxruntime/contrib_ops/contrib_ops.cc
+++ b/onnxruntime/contrib_ops/contrib_ops.cc
@@ -104,6 +104,229 @@ The quantization formula is y = (x / y_scale) + y_zero_point. For (x / y_scale),
 The linear de-quantization operator. It consumes a quantized data, a scale, a zero point and computes the full precision data.
 The dequantization formula is y = (x - x_zero_point) * x_scale.
  Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC");
+
+  // Shared description of the `auto_pad` attribute, used by the convolution schemas registered below.
+  const char* auto_pad_doc =
+      "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
+      "default value is NOTSET, which means explicit padding is used. "
+      "SAME_UPPER or SAME_LOWER mean pad the input so that the output size match the input. In case of odd "
+      "number add the extra padding at the end for SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.";
+
+  // Quantized convolution: inputs 0-7 are the (tensor, scale, zero point) data for x, w and y; input 8 is an optional bias.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearConv)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+The convolution operator consumes a quantized input tensor, its scale and zero point, 
+a quantized filter, its scale and zero point, and output's scale and zero point, 
+and computes the quantized output. Each scale and zero point pair must have same shape.
+It means they must be either scalars (per tensor) or 1-D tensors (per channel).)DOC")
+      .Input(
+          0,
+          "x",
+          "Input data tensor from previous layer; "
+          "has size (N x C x H x W), where N is the batch size, "
+          "C is the number of channels, and H and W are the "
+          "height and width. Note that this is for the 2D image. "
+          "Otherwise the size is (N x C x D1 x D2 ... x Dn). "
+          "Optionally, if dimension denotation is "
+          "in effect, the operation expects input data tensor "
+          "to arrive with the dimension denotation of [DATA_BATCH, "
+          "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T1")
+      .Input(1, "x_scale", "Scale tensor for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'x'.", "T3")
+      .Input(2, "x_zero_point", "Zero point tensor for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'x'.", "T1")
+      .Input(
+          3,
+          "w",
+          "The weight tensor that will be used in the "
+          "convolutions; has size (M x C/group x kH x kW), where C "
+          "is the number of channels, and kH and kW are the "
+          "height and width of the kernel, and M is the number "
+          "of feature maps. For more than 2 dimensions, the "
+          "kernel shape will be (M x C/group x k1 x k2 x ... x kn), "
+          "where (k1 x k2 x ... kn) is the dimension of the kernel. "
+          "Optionally, if dimension denotation is in effect, "
+          "the operation expects the weight tensor to arrive "
+          "with the dimension denotation of [FILTER_OUT_CHANNEL, "
+          "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. "
+          "X.shape[1] == (W.shape[1] * group) == C "
+          "(assuming zero based indices for the shape array). "
+          "Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL. ",
+          "T1")
+      .Input(4, "w_scale", "Scale tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'w'.", "T3")
+      .Input(5, "w_zero_point", "Zero point tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'w'.", "T1")
+      .Input(6, "y_scale", "Scale tensor for output 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of output 'y'.", "T3")
+      .Input(7, "y_zero_point", "Zero point tensor for output 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of output 'y'.", "T1")
+      .Input(8, "B", "Optional 1D bias to be added to the convolution, has size of M.", "T2", OpSchema::Optional)
+      .Output(
+          0,
+          "y",
+          "Output data tensor that contains the result of the "
+          "convolution. The output dimensions are functions "
+          "of the kernel size, stride size, and pad lengths.",
+          "T1")
+      .TypeConstraint(
+          "T1",
+          {"tensor(int8)", "tensor(uint8)"},
+          "Constrain input, filter, and output types to 8-bit integer tensors.")
+      .TypeConstraint("T2", {"tensor(int32)", "tensor(uint32)"}, "Constrain bias type to 32-bit integer tensor.")
+      .TypeConstraint("T3", {"tensor(float)"}, "Constrain scale of input, filter and output to float tensor.")
+      .Attr(
+          "auto_pad",
+          auto_pad_doc,
+          AttributeProto::STRING,
+          std::string("NOTSET"))
+      .Attr(
+          "kernel_shape",
+          "The shape of the convolution kernel. If not present, should be inferred from input 'w'.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "dilations",
+          "dilation value along each axis of the filter. If not present, the dilation defaults to 1 along each axis.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr("strides", "Stride along each axis. If not present, the stride defaults to 1 along each axis.", AttributeProto::INTS, OPTIONAL)
+      .Attr("pads",
+            "Padding for the beginning and ending along each axis, it can take any value greater than or equal to 0. "
+            "The value represent the number of pixels added to the beginning and end part of the corresponding axis. "
+            "`pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of "
+            "pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. "
+            "This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults "
+            "to 0 along start and end of each axis.",
+            AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "group",
+          "number of groups input channels and output channels are divided into. default is 1.",
+          AttributeProto::INT,
+          static_cast<int64_t>(1));
+
+  // Integer convolution: 8-bit input x (T1) and filter w (T2) with an optional padding value z, producing a 32-bit output (T3).
+  ONNX_CONTRIB_OPERATOR_SCHEMA(ConvInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+The integer convolution operator consumes an input tensor, a filter, and a padding value,
+ and computes the output. The production MUST never overflow. The accumulation may overflow
+ if and only if in 32 bits.)DOC")
+      .Input(
+          0,
+          "x",
+          "Input data tensor from previous layer; "
+          "has size (N x C x H x W), where N is the batch size, "
+          "C is the number of channels, and H and W are the "
+          "height and width. Note that this is for the 2D image. "
+          "Otherwise the size is (N x C x D1 x D2 ... x Dn). "
+          "Optionally, if dimension denotation is "
+          "in effect, the operation expects input data tensor "
+          "to arrive with the dimension denotation of [DATA_BATCH, "
+          "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T1")
+      .Input(
+          1,
+          "w",
+          "The weight tensor that will be used in the "
+          "convolutions; has size (M x C/group x kH x kW), where C "
+          "is the number of channels, and kH and kW are the "
+          "height and width of the kernel, and M is the number "
+          "of feature maps. For more than 2 dimensions, the "
+          "kernel shape will be (M x C/group x k1 x k2 x ... x kn), "
+          "where (k1 x k2 x ... kn) is the dimension of the kernel. "
+          "Optionally, if dimension denotation is in effect, "
+          "the operation expects the weight tensor to arrive "
+          "with the dimension denotation of [FILTER_OUT_CHANNEL, "
+          "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. "
+          "X.shape[1] == (W.shape[1] * group) == C "
+          "(assuming zero based indices for the shape array). "
+          "Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL. ",
+          "T2")
+      .Input(2, "z", "Padding value (zero_point normally), it's optional and default value is 0.", "T1", OpSchema::Optional)
+      .Output(
+          0,
+          "y",
+          "Output data tensor that contains the result of the "
+          "convolution. The output dimensions are functions "
+          "of the kernel size, stride size, and pad lengths.",
+          "T3")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input X and Z data types as 8-bit integer tensors")
+      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input W data types as 8-bit integer tensor")
+      .TypeConstraint("T3",
+                      {"tensor(int32)", "tensor(uint32)"},
+                      "Constrain output Y data types as 32-bits integer tensors. "
+                      "T3 must be tensor(uint32) when both T1 and T2 are tensor(uint8), "
+                      "or must be tensor(int32) when either T1 or T2 is tensor(int8).")
+      .Attr(
+          "auto_pad",
+          auto_pad_doc,
+          AttributeProto::STRING,
+          std::string("NOTSET"))
+      .Attr(
+          "kernel_shape",
+          "The shape of the convolution kernel. If not present, should be inferred from input 'w'.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "dilations",
+          "dilation value along each axis of the filter. If not present, the dilation defaults to 1 along each axis.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr("strides", "Stride along each axis. If not present, the stride defaults to 1 along each axis.", AttributeProto::INTS, OPTIONAL)
+      .Attr("pads",
+            "Padding for the beginning and ending along each axis, it can take any value greater than or equal to 0. "
+            "The value represent the number of pixels added to the beginning and end part of the corresponding axis. "
+            "`pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of "
+            "pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. "
+            "This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults "
+            "to 0 along start and end of each axis.",
+            AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "group",
+          "number of groups input channels and output channels are divided into. default is 1.",
+          AttributeProto::INT,
+          static_cast<int64_t>(1));
+
+  // Integer matrix multiply: 8-bit inputs A (index 0) and B (index 1) producing a 32-bit output Y.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+ The production MUST never overflow. The accumulation may overflow if and only if in 32 bits.)DOC")
+      .Input(0, "A", "N-dimensional matrix A", "T1")
+      .Input(1, "B", "N-dimensional matrix B", "T2")
+      .Output(0, "Y", "Matrix multiply results from A * B", "T3")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data types as 8-bit integer tensor")
+      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data types as 8-bit integer tensor")
+      .TypeConstraint("T3", {"tensor(int32)", "tensor(uint32)"},
+                      "Constrain output Y data types as 32-bit integer tensor. "
+                      "T3 must be tensor(uint32) when both T1 and T2 are tensor(uint8), "
+                      "or must be tensor(int32) when either T1 or T2 is tensor(int8).");
+
+  // Integer reduce-sum: 8-bit input, 32-bit output. Both attributes are optional, matching their documented defaults.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(ReduceSumInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Computes the sum of the low-precision input tensor's element along the provided axes.
+The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
+then the resulting tensor have the reduced dimension pruned. The above behavior is similar to numpy,
+with the exception that numpy default keepdims to False instead of True.)DOC")
+      .Input(0, "data", "An input tensor.", "T1")
+      .Output(0, "reduced", "Reduced output tensor.", "T2")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input type to 8-bit integer tensor.")
+      .TypeConstraint("T2", {"tensor(int32)", "tensor(uint32)"},
+                      "Constrain output data type to 32-bit integer tensor. "
+                      "T2 must be tensor(uint32) when T1 is tensor(uint8), "
+                      "or must be tensor(int32) when T1 is tensor(int8).")
+      .Attr(
+          "axes",
+          "A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor.",
+          AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "keepdims",
+          "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
+          AttributeProto::INT, static_cast<int64_t>(1));
 }
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SampleOp);

From 5d3992f999ec8bc2d30ddf9ba8686b3054bce595 Mon Sep 17 00:00:00 2001
From: Scott McKay <scmckay@microsoft.com>
Date: Sat, 24 Nov 2018 17:51:35 +1000
Subject: [PATCH 02/16] Handle the Scan subgraph producing outputs with a
 symbolic dimension.

If the output has a symbolic dimension
  * Infer the shape if it is a loop state variable as we have the input value, and the shape from the subgraph output must match
  * Use a temporary MLValue for the first subgraph execution if it is a subgraph output with a symbolic dimension.
    * After the first execution make the overall output shape concrete and allocate the full output buffer.
    * Use slices of the full output buffer for all other subgraph executions to avoid copies.

Add unit test to validate.
---
 .../core/providers/cpu/controlflow/scan.cc    | 305 +++++++++++++++---
 .../test/providers/cpu/controlflow/if_test.cc |   4 +-
 .../providers/cpu/controlflow/scan_test.cc    |  57 +++-
 .../test/providers/provider_test_utils.cc     |   4 +-
 .../test/providers/provider_test_utils.h      |  21 +-
 5 files changed, 325 insertions(+), 66 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/controlflow/scan.cc b/onnxruntime/core/providers/cpu/controlflow/scan.cc
index cbd6044c82..0504fe5a2f 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan.cc
@@ -118,6 +118,59 @@ class LoopStateVariable {
   MLValue b_;
 };
 
+/*
+Class that co-ordinates writes to slices of the overall Scan output, which is the data returned by
+OpKernelContextInternal.Output(i). A temporary MLValue is used for the first write if the shape is not concrete.
+*/
+class OutputIterator {
+ public:
+  static Status Create(OpKernelContextInternal& context,
+                       int output_index,
+                       bool is_loop_state_var,
+                       TensorShape final_shape,
+                       std::unique_ptr<OutputIterator>& iterator) {
+    iterator.reset(new OutputIterator(context, output_index, is_loop_state_var, final_shape));  // ctor is private
+    return iterator->Initialize();
+  }
+
+  MLValue& operator*();         // MLValue to write the output for the current iteration to
+  OutputIterator& operator++();  // move to the next iteration. no-op once all iterations are done
+
+  void ZeroOutCurrent() {
+    auto* tensor = (**this).GetMutable<Tensor>();
+    memset(tensor->MutableDataRaw(), 0, tensor->Size());
+  }
+
+ private:
+  OutputIterator(OpKernelContextInternal& context,
+                 int output_index,
+                 bool is_loop_state_var,
+                 TensorShape final_shape);
+
+  Status Initialize();
+  Status AllocateFinalBuffer();
+  Status MakeConcrete();
+
+  OpKernelContextInternal& context_;
+  const int output_index_;
+  std::vector<int64_t> dims_;
+  TensorShapeProto per_iteration_shape_;
+  TensorShape final_shape_;
+  bool is_loop_state_var_;
+  int64_t num_iterations_{0};
+  int64_t cur_iteration_{0};
+
+  bool is_concrete_shape_{false};
+  std::vector<MLValueTensorSlicer<MLValue>::Iterator> slicer_iterators_;
+  std::vector<MLValueTensorSlicer<MLValue>::Iterator>::iterator cur_slicer_iterator_;
+
+  // if shape is not concrete we need the first output to know the missing dimension before
+  // we can allocate final_output_mlvalue_ and use the slicers.
+  MLValue first_output_;
+
+  MLValue* final_output_mlvalue_{nullptr};  // set by AllocateFinalBuffer
+};
+
 class ScanImpl {
  public:
   ScanImpl(OpKernelContextInternal& context,
@@ -135,10 +188,10 @@ class ScanImpl {
  private:
   // validate inputs and setup batch size and max sequence length.
   Status ValidateInput();
-  Status ValidateSubgraphInput(int start_input, int end_input, bool has_seq_len_dim,
+  Status ValidateSubgraphInput(int start_input, int end_input, bool is_loop_state_var,
                                const std::vector<const NodeArg*>& graph_inputs);
 
-  Status AllocateOutput(int index, bool has_sequence_len);
+  Status AllocateOutput(int index, bool is_loop_state_var);
   Status AllocateOutputTensors();
   Status CreateLoopStateVariables(std::vector<std::vector<LoopStateVariable>>& loop_state_variables);
 
@@ -147,7 +200,6 @@ class ScanImpl {
 
   Status IterateSequence(std::vector<LoopStateVariable>& loop_state_variables,
                          ConstTensorSlicerIterators& scan_input_stream_iterators,
-                         MutableTensorSlicerIterators& scan_output_stream_iterators,
                          int64_t seq_length);
 
   OpKernelContextInternal& context_;
@@ -166,6 +218,7 @@ class ScanImpl {
   std::vector<int64_t> sequence_lens_;
 
   std::vector<std::string> subgraph_output_names_;
+  std::vector<std::unique_ptr<OutputIterator>> output_iterators_;
 
   std::unordered_map<std::string, const MLValue*> implicit_inputs_;
 };
@@ -249,6 +302,149 @@ void LoopStateVariable::Next() {
   ++iteration_num_;
 }
 
+// Replaces symbolic (-1) entries in the trailing dimensions of final_shape with the values from per_iteration_shape.
+static Status MakeShapeConcrete(const TensorShape& per_iteration_shape, TensorShape& final_shape) {
+  const auto num_dims_per_iteration = per_iteration_shape.NumDimensions();
+  if (num_dims_per_iteration > final_shape.NumDimensions())  // otherwise the offset calculated below is invalid
+    return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "Rank of ", per_iteration_shape, " exceeds rank of ", final_shape);
+  const auto final_shape_offset = final_shape.NumDimensions() - num_dims_per_iteration;
+  for (size_t i = 0; i < num_dims_per_iteration; ++i) {
+    const auto existing_value = final_shape[i + final_shape_offset];
+    if (existing_value == -1) {
+      final_shape[i + final_shape_offset] = per_iteration_shape[i];
+    } else if (existing_value != per_iteration_shape[i]) {
+      return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "Mismatch between expected shape and shape from first output. ",
+                                     final_shape, " is not compatible with ", per_iteration_shape);
+    }
+  }
+
+  return Status::OK();
+}
+
+OutputIterator::OutputIterator(OpKernelContextInternal& context,
+                               int output_index,
+                               bool is_loop_state_var,
+                               TensorShape final_shape)
+    : context_{context},
+      output_index_{output_index},
+      final_shape_{final_shape},  // initializers are listed in member declaration order
+      is_loop_state_var_{is_loop_state_var},
+      cur_iteration_{0},
+      final_output_mlvalue_{nullptr} {
+  is_concrete_shape_ = final_shape_.Size() >= 0;
+  // there are one or two dimensions being iterated depending on whether it's a loop state variable or scan output.
+  auto num_iteration_dims = is_loop_state_var_ ? 1 : 2;
+  num_iterations_ = final_shape_.Slice(0, num_iteration_dims).Size();
+}
+
+// Resolves the output shape where that is possible upfront, and allocates the final buffer if the shape is concrete.
+Status OutputIterator::Initialize() {
+  const bool infer_from_input = is_loop_state_var_ && !is_concrete_shape_;
+
+  if (infer_from_input) {
+    // the initial value for the loop state variable is an input with a concrete shape, so copy the shape from that.
+    // +1 to skip the sequence_len input
+    const auto* initial_value = context_.Input<Tensor>(output_index_ + 1);
+
+    ONNXRUNTIME_RETURN_IF_ERROR(MakeShapeConcrete(initial_value->Shape(), final_shape_));
+
+    is_concrete_shape_ = true;
+  }
+
+  // if the shape is still not concrete nothing is allocated here and first_output_ is used instead
+  if (is_concrete_shape_) {
+    ONNXRUNTIME_RETURN_IF_ERROR(AllocateFinalBuffer());
+  }
+
+  return Status::OK();
+}
+
+Status OutputIterator::AllocateFinalBuffer() {
+  // make sure a single buffer for the full output is created upfront.
+  // we slice this into per-iteration pieces in Execute using MLValueTensorSlicer.
+  auto* tensor = context_.Output(output_index_, final_shape_);
+  if (!tensor)
+    return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for output #", output_index_);
+
+  // get the output tensor we just created as an MLValue. it is dereferenced below so must not be null.
+  final_output_mlvalue_ = context_.GetOutputMLValue(output_index_);
+  if (!final_output_mlvalue_)
+    return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to get MLValue for output #", output_index_);
+
+  if (is_loop_state_var_) {
+    // only one entry is required as we slice on a single dimension
+    slicer_iterators_.push_back(MLValueTensorSlicer<MLValue>::Create(*final_output_mlvalue_).begin());
+  } else {
+    const int64_t batch_size = final_shape_[0];
+    for (int64_t i = 0; i < batch_size; ++i) {
+      // the slicer handles the sequence dimension (dim 1) so create an entry for each batch
+      slicer_iterators_.push_back(MLValueTensorSlicer<MLValue>::Create(*final_output_mlvalue_, 1, i).begin());
+    }
+  }
+
+  cur_slicer_iterator_ = slicer_iterators_.begin();
+  return Status::OK();
+}
+
+// Uses the shape of the first output to make final_shape_ concrete, allocates the final buffer,
+// and copies the first output into it.
+Status OutputIterator::MakeConcrete() {
+  ONNXRUNTIME_ENFORCE(first_output_.IsAllocated(), "First usage of OutputIterator did not result in any output.");
+
+  const auto& first_tensor = first_output_.Get<Tensor>();
+
+  // fill in the missing dimensions of the final shape
+  ONNXRUNTIME_RETURN_IF_ERROR(MakeShapeConcrete(first_tensor.Shape(), final_shape_));
+
+  // from here on operator* returns the current slice of the final buffer
+  is_concrete_shape_ = true;
+  ONNXRUNTIME_RETURN_IF_ERROR(AllocateFinalBuffer());
+
+  // copy the bytes of the first output to the final buffer
+  auto source_bytes = gsl::make_span<const gsl::byte>(static_cast<const gsl::byte*>(first_tensor.DataRaw()),
+                                                      first_tensor.Size());
+
+  auto* destination = (**this).GetMutable<Tensor>();
+  auto destination_bytes = gsl::make_span<gsl::byte>(static_cast<gsl::byte*>(destination->MutableDataRaw()),
+                                                     destination->Size());
+
+  gsl::copy(source_bytes, destination_bytes);
+
+  // the temporary MLValue is no longer needed so release it
+  first_output_ = {};
+  return Status::OK();
+}
+
+MLValue& OutputIterator::operator*() {
+  ONNXRUNTIME_ENFORCE(cur_iteration_ < num_iterations_);
+
+  // until the shape is concrete the slicers have not been created, so the temporary first output is returned
+  if (!is_concrete_shape_)
+    return first_output_;
+  return **cur_slicer_iterator_;
+}
+
+OutputIterator& OutputIterator::operator++() {
+  // nothing to do once all iterations have been completed
+  if (cur_iteration_ >= num_iterations_) {
+    return *this;
+  }
+
+  if (!is_concrete_shape_) {
+    // an output should have been written by now, so switch to the overall output buffer and slicers
+    auto status = MakeConcrete();
+    ONNXRUNTIME_ENFORCE(status.IsOK(), status.ErrorMessage());
+  }
+
+  ++cur_iteration_;
+
+  // for a scan output, completing the current sequence (dim 1) means moving to the next entry in slicer_iterators_
+  const bool finished_sequence = !is_loop_state_var_ && cur_iteration_ % final_shape_[1] == 0;
+  if (finished_sequence) { ++cur_slicer_iterator_; } else { ++(*cur_slicer_iterator_); }
+
+  return *this;
+}
+
 ScanImpl::ScanImpl(OpKernelContextInternal& context,
                    const SessionState& session_state,
                    int64_t num_scan_inputs,
@@ -258,7 +454,7 @@ ScanImpl::ScanImpl(OpKernelContextInternal& context,
       subgraph_{*session_state.GetGraphViewer()},
       directions_{directions},
       implicit_inputs_{context_.GetImplicitInputs()} {
-  //optional first input so may be nullptr
+  // optional first input so may be nullptr
   sequence_lens_tensor_ = context.Input<Tensor>(0);
 
   num_variadic_inputs_ = context_.NumVariadicInputs(1);
@@ -271,12 +467,12 @@ Status ScanImpl::Initialize() {
   auto status = ValidateInput();
   ONNXRUNTIME_RETURN_IF_ERROR(status);
 
-  auto& graph_outputs = subgraph_.GetOutputs();
-  subgraph_output_names_.reserve(graph_outputs.size());
+  auto& subgraph_outputs = subgraph_.GetOutputs();
+  subgraph_output_names_.reserve(subgraph_outputs.size());
 
   // save list of subgraph output names in their provided order to use when fetching the results
   // from each subgraph execution. the Scan outputs will match this order.
-  for (auto& output : graph_outputs) {
+  for (auto& output : subgraph_outputs) {
     subgraph_output_names_.push_back(output->Name());
   }
 
@@ -301,9 +497,10 @@ static const MLValue& GetSubgraphInputMLValue(const OpKernelContextInternal& con
 }
 
 // Validate that the subgraph input has valid shapes
-Status ScanImpl::ValidateSubgraphInput(int start_input, int end_input, bool has_seq_len_dim,
+Status ScanImpl::ValidateSubgraphInput(int start_input, int end_input, bool is_loop_state_var,
                                        const std::vector<const NodeArg*>& graph_inputs) {
   // first dim is batch size. optional sequence dim. dim/s for the data
+  bool has_seq_len_dim = !is_loop_state_var;
   auto min_dims_required = has_seq_len_dim ? 3 : 2;
 
   for (int i = start_input; i < end_input; ++i) {
@@ -355,11 +552,11 @@ Status ScanImpl::ValidateInput() {
   }
 
   // process any loop state variables, which will set the batch size
-  auto status = ValidateSubgraphInput(0, num_loop_state_variables_, false, graph_inputs);
+  auto status = ValidateSubgraphInput(0, num_loop_state_variables_, true, graph_inputs);
   ONNXRUNTIME_RETURN_IF_ERROR(status);
 
   // process the scan inputs. sets/validates batch size and sequence length
-  status = ValidateSubgraphInput(num_loop_state_variables_, num_variadic_inputs_, true, graph_inputs);
+  status = ValidateSubgraphInput(num_loop_state_variables_, num_variadic_inputs_, false, graph_inputs);
   ONNXRUNTIME_RETURN_IF_ERROR(status);
 
   if (sequence_lens_tensor_ != nullptr) {
@@ -386,11 +583,12 @@ Status ScanImpl::ValidateInput() {
   return Status::OK();
 }
 
-Status ScanImpl::AllocateOutput(int index, bool has_sequence_len_dimension) {
+Status ScanImpl::AllocateOutput(int index, bool is_loop_state_var) {
   // use the shape from the subgraph output. we require this to be specified in the model or inferable.
   auto& graph_outputs = subgraph_.GetOutputs();
   auto* graph_output = graph_outputs.at(index);
   auto* graph_output_shape = graph_output->Shape();
+
   if (!graph_output_shape) {
     return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "Subgraph must have the shape set for all outputs but ",
                                    graph_output->Name(), " did not.");
@@ -404,24 +602,16 @@ Status ScanImpl::AllocateOutput(int index, bool has_sequence_len_dimension) {
 
   scan_output_dims.push_back(batch_size_);
 
-  if (has_sequence_len_dimension) {
+  if (!is_loop_state_var) {
     scan_output_dims.push_back(max_sequence_len_);
   }
 
   scan_output_dims.insert(scan_output_dims.cend(), graph_output_dims.cbegin(), graph_output_dims.cend());
 
-  // make sure a single buffer for the full output is created upfront.
-  // we slice this into per-iteration pieces in Execute using MLValueTensorSlicer.
-  auto* tensor = context_.Output(index, TensorShape(scan_output_dims));
+  std::unique_ptr<OutputIterator> output_iter;  // Create can fail, so its Status must not be ignored
+  ONNXRUNTIME_RETURN_IF_ERROR(OutputIterator::Create(context_, index, is_loop_state_var, TensorShape(scan_output_dims), output_iter));
 
-  if (!tensor)
-    return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for ", graph_output->Name());
-
-  // zero out the output so that any short sequences have deterministic values in unused slots.
-  // strictly speaking this isn't required, and alternatively we could fill with zeros when we
-  // encounter a short sequence and are creating output, but one memset is easy, involves
-  // less code complexity, and should be relatively cheap.
-  memset(tensor->MutableDataRaw(), 0, tensor->Size());
+  output_iterators_.push_back(std::move(output_iter));
 
   return Status::OK();
 }
@@ -435,17 +625,13 @@ Status ScanImpl::AllocateOutputTensors() {
                                    " outputs but Scan expects ", num_variadic_outputs_);
   }
 
-  // TODO: Need to handle shape/type inference for subgraphs.
-  // For now copy shape from subgraph output and expand based on batch size and sequence length
-
   for (int i = 0; i < num_loop_state_variables_; ++i) {
-    const bool has_sequence_len_dimension = false;  // loop state variables don't have a sequence_len dimension;
-    status = AllocateOutput(i, has_sequence_len_dimension);
+    status = AllocateOutput(i, true);
     ONNXRUNTIME_RETURN_IF_ERROR(status);
   }
 
   for (int i = num_loop_state_variables_, end = num_variadic_outputs_; i < end; ++i) {
-    status = AllocateOutput(i, true);
+    status = AllocateOutput(i, false);
     ONNXRUNTIME_RETURN_IF_ERROR(status);
   }
 
@@ -461,9 +647,7 @@ Status ScanImpl::CreateLoopStateVariables(std::vector<std::vector<LoopStateVaria
   //    each iteration of the subgraph. This minimizes copying of data during each iteration.
 
   std::vector<MLValueTensorSlicer<const MLValue>::Iterator> loop_state_input_iterators;
-  std::vector<MLValueTensorSlicer<MLValue>::Iterator> loop_state_output_iterators;
   loop_state_input_iterators.reserve(num_loop_state_variables_);
-  loop_state_output_iterators.reserve(num_loop_state_variables_);
 
   // create the input and output slice iterator for each loop state variable.
   for (int i = 0; i < num_loop_state_variables_; ++i) {
@@ -473,7 +657,6 @@ Status ScanImpl::CreateLoopStateVariables(std::vector<std::vector<LoopStateVaria
     ONNXRUNTIME_ENFORCE(p_mlvalue, "Output MLValue has not been created for loop state variable output ", i);
 
     loop_state_input_iterators.push_back(MLValueTensorSlicer<const MLValue>::Create(mlvalue).begin());
-    loop_state_output_iterators.push_back(MLValueTensorSlicer<MLValue>::Create(*p_mlvalue).begin());
   }
 
   batch_loop_state_variables.clear();
@@ -490,7 +673,7 @@ Status ScanImpl::CreateLoopStateVariables(std::vector<std::vector<LoopStateVaria
 
     for (int i = 0; i < num_loop_state_variables_; ++i) {
       auto& input_iter = loop_state_input_iterators[i];
-      auto& output_iter = loop_state_output_iterators[i];
+      auto& output_iter = *output_iterators_[i];
 
       variables.push_back(LoopStateVariable(*input_iter, *output_iter, sequence_lens_[b], alloc));
 
@@ -533,21 +716,9 @@ Status ScanImpl::Execute() {
       }
     }
 
-    // Setup output MLValue streams
-    std::vector<MLValueTensorSlicer<MLValue>::Iterator> scan_output_stream_iterators;
-    scan_output_stream_iterators.reserve(num_variadic_outputs_);
-
-    for (int i = num_loop_state_variables_, end = num_variadic_outputs_; i < end; ++i) {
-      MLValue* p_mlvalue = context_.GetOutputMLValue(i);
-      ONNXRUNTIME_ENFORCE(p_mlvalue, "Output MLValue has not been created for output ", i);
-
-      scan_output_stream_iterators.push_back(MLValueTensorSlicer<MLValue>::Create(*p_mlvalue, 1, b).begin());
-    }
-
     // Call the subgraph for each item in the sequence
     status = IterateSequence(batch_loop_state_variables[b],
                              scan_input_stream_iterators,
-                             scan_output_stream_iterators,
                              sequence_lens_[b]);
 
     ONNXRUNTIME_RETURN_IF_ERROR(status);
@@ -558,7 +729,6 @@ Status ScanImpl::Execute() {
 
 Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_variables,
                                  ConstTensorSlicerIterators& scan_input_stream_iterators,
-                                 MutableTensorSlicerIterators& scan_output_stream_iterators,
                                  int64_t seq_length) {
   Status status = Status::OK();
   auto& graph_inputs = subgraph_.GetInputs();
@@ -575,9 +745,8 @@ Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_vari
     feeds[entry.first] = *entry.second;
   }
 
-  // as we fill all the outputs with 0 initially, just iterate seq_length not max_seq_length_
-  // as we don't need to pad the output for a short sequence here.
-  for (int64_t seq_no = 0; seq_no < seq_length; ++seq_no) {
+  int64_t seq_no = 0;
+  for (; seq_no < seq_length; ++seq_no) {
     for (int input = 0; input < num_variadic_inputs_; ++input) {
       // the ordering of the Scan inputs should match the ordering of the subgraph inputs
       auto name = graph_inputs[input]->Name();
@@ -596,15 +765,24 @@ Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_vari
 
     fetches.clear();
 
+    bool copy_fetch_to_iter = false;
+
     for (int output = 0, end = num_variadic_outputs_; output < end; ++output) {
       if (output < num_loop_state_variables_) {
         // add loop state variable output
         fetches.push_back(loop_state_variables[output].Output());
       } else {
-        // add sliced output
-        auto& iterator = scan_output_stream_iterators[output - num_loop_state_variables_];
-        fetches.push_back(*iterator);
-        ++iterator;
+        // add MLValue from sliced output
+        auto& iterator = *output_iterators_[output];
+        auto& mlvalue = *iterator;
+        fetches.push_back(mlvalue);
+
+        // If there is a dynamic shape in an output we need to copy it back to the OutputIterator
+        // so it can setup the overall output and avoid copies for all other output values.
+        // The mlvalue in the iterator will point to data once we have the overall output initialized.
+        // Check current value as we don't want to unset copy_fetch_to_iter if it is true.
+        if (!copy_fetch_to_iter)
+          copy_fetch_to_iter = (seq_no == 0) && (mlvalue.IsAllocated() == false);
       }
     }
 
@@ -620,6 +798,27 @@ Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_vari
 
     // cycle the LoopStateVariable input/output in preparation for the next iteration
     std::for_each(loop_state_variables.begin(), loop_state_variables.end(), [](LoopStateVariable& v) { v.Next(); });
+
+    // and move the output iterators.
+    for (int output = num_loop_state_variables_; output < num_variadic_outputs_; ++output) {
+      auto& iterator = *output_iterators_[output];
+
+      // copy the data from fetches to the iterator so it can setup the overall output
+      if (copy_fetch_to_iter && (*iterator).IsAllocated() == false) {
+        *iterator = fetches[output];
+      }
+
+      ++iterator;
+    }
+  }
+
+  // zero out any remaining values in the sequence
+  for (; seq_length < max_sequence_len_; ++seq_length) {
+    for (int output = num_loop_state_variables_; output < num_variadic_outputs_; ++output) {
+      auto& iterator = *output_iterators_[output];
+      iterator.ZeroOutCurrent();
+      ++iterator;
+    }
   }
 
   return status;
diff --git a/onnxruntime/test/providers/cpu/controlflow/if_test.cc b/onnxruntime/test/providers/cpu/controlflow/if_test.cc
index b53b451a0d..858d9c550f 100644
--- a/onnxruntime/test/providers/cpu/controlflow/if_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/if_test.cc
@@ -16,7 +16,7 @@ namespace test {
 
 struct RunOptions {
   bool include_dim_values_in_main_graph = false;
-  bool symbolic_dim_values_in_main_graph = false;
+  int symbolic_dim_value_in_main_graph = -1;  // passed to AddShapeToTensorData as symbolic_dim; -1 = no symbolic dim
   bool include_dim_values_in_subgraph = true;
 };
 
@@ -181,7 +181,7 @@ void RunTest(bool condition_value,
   IfOpTester test{options};
 
   test.AddShapeToTensorData(options.include_dim_values_in_main_graph,
-                            options.symbolic_dim_values_in_main_graph);
+                            options.symbolic_dim_value_in_main_graph);
 
   // add the main graph inputs and outputs.
   // we will handle the 'If' inputs in the AddNodes override, and as 'If' is the last node
diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
index 965b7295d8..856bf4e145 100644
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@@ -261,8 +261,6 @@ void RunTest(const std::string test_name, int64_t batch_size, int64_t max_sequen
 
   ScanOpTester test;
 
-  test.AddShapeToTensorData(options.include_dim_values_in_main_graph);
-
   test.AddAttribute("body", proto);
   test.AddAttribute<int64_t>("num_scan_inputs", 2);
 
@@ -277,6 +275,8 @@ void RunTest(const std::string test_name, int64_t batch_size, int64_t max_sequen
     test.AddInput<int64_t>("sequence_lens", sequence_lens_dims, *sequence_lens);
   }
 
+  test.AddShapeToTensorData(options.include_dim_values_in_main_graph);
+
   test.AddInput<float>("scan_loop_state_in_0", {batch_size, 1}, loop_state_in_0);
 
   std::vector<int64_t> input_shape{batch_size, max_sequence_len, input_size};
@@ -665,5 +665,58 @@ TEST(Scan, MixedTypeInputs) {
   test.Run();
 }
 
+TEST(Scan, UnknownDimInSubgraphOutput) {
+  Model model("ScanBody");
+  auto& graph = model.MainGraph();
+
+  TypeProto float_tensor;
+  float_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
+  float_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("param");
+  TypeProto int_tensor;  // NOTE(review): int_tensor is configured but never used by this test
+  int_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
+  int_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("param");
+
+  auto& state_in_1 = graph.GetOrCreateNodeArg("state_in_1", &float_tensor);
+  auto& scan_in_1 = graph.GetOrCreateNodeArg("scan_in_1", &float_tensor);
+
+  auto& state_out_1 = graph.GetOrCreateNodeArg("state_out_1", &float_tensor);
+  auto& scan_out_1 = graph.GetOrCreateNodeArg("scan_out_1", &float_tensor);
+
+  graph.AddNode("node1", "Identity", "Copy state_in_1 to scan_out_1", {&state_in_1}, {&scan_out_1});
+  graph.AddNode("node2", "Identity", "Copy scan_in_1 to state_out_1", {&scan_in_1}, {&state_out_1});
+
+  graph.SetInputOrder({&state_in_1, &scan_in_1});
+  graph.SetOutputOrder({&state_out_1, &scan_out_1});
+
+  auto status = graph.Resolve();
+  EXPECT_EQ(status, Status::OK());
+
+  auto& scan_body = graph.ToGraphProto();
+
+  // Construct and run scan test
+  ScanOpTester test;
+
+  int64_t batch_size = 1, sequence_len = 3, input_size = 1;
+  std::vector<int64_t> seq_shape{batch_size, sequence_len, input_size};
+  std::vector<int64_t> state_shape{batch_size, input_size};
+
+  test.AddAttribute("body", scan_body);
+  test.AddAttribute<int64_t>("num_scan_inputs", 1);
+
+  // we add a symbolic dimension to bot the initial state and the scan input so we test the path that handles loop
+  // state variables (prior to execution) and the path that handles subgraph outputs (post first execution).
+  // Note that we cross the values over in the subgraph, so the symbolic dimension in
+  // initial_state_1 affects scan_out_1, and the symbolic dimension in scan_input_1 affects state_out_1.
+  test.AddMissingOptionalInput<int64_t>();
+  test.AddShapeToTensorData(true, 1);  // add shape and symbolic dim in dim 1 for initial_state_1
+  test.AddInput<float>("initial_state_1", state_shape, {0.0});
+  test.AddShapeToTensorData(true, 2);  // add shape and symbolic dim in dim 2 for scan_input_1
+  test.AddInput<float>("scan_input_1", seq_shape, {1.0, 2.0, 3.0});
+
+  test.AddOutput<float>("final_state_1", state_shape, {3.0});  // state_out_1 copies scan_in_1, so it ends as the last scan input
+  test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});  // scan_out_1 copies state_in_1: the initial 0.0, then each earlier scan input
+
+  test.Run();
+}
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc
index 2c118f1372..7815ab4d37 100644
--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@@ -407,7 +407,9 @@ void OpTester::Run(ExpectResult expect_result,
               const auto& expected_shape = expected_data.data_.Get<Tensor>().Shape();
               EXPECT_TRUE(inferred_dims.size() == expected_shape.NumDimensions());
               for (int d = 0; d < inferred_dims.size(); ++d) {
-                EXPECT_EQ(expected_shape[d], inferred_dims[d]);
+                // check equal unless the input involved a symbolic dimension
+                if (inferred_dims[d] != -1)
+                  EXPECT_EQ(expected_shape[d], inferred_dims[d]) << "Output idx = " << idx << " dim = " << d;
               }
             }
             Check(expected_data, mlvalue.Get<Tensor>(), provider_type);
diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h
index 0ce06aea34..7c6abcebcc 100644
--- a/onnxruntime/test/providers/provider_test_utils.h
+++ b/onnxruntime/test/providers/provider_test_utils.h
@@ -91,7 +91,11 @@ struct TTypeProto : ONNX_NAMESPACE::TypeProto {
     if (shape) {
       auto mutable_shape = mutable_tensor_type()->mutable_shape();
       for (auto i : *shape) {
-        mutable_shape->add_dim()->set_dim_value(i);
+        auto* mutable_dim = mutable_shape->add_dim();
+        if (i != -1)
+          mutable_dim->set_dim_value(i);
+        else
+          mutable_dim->set_dim_param("symbolic");
       }
     }
   }
@@ -145,10 +149,11 @@ class OpTester {
 
   // Set whether the NodeArg created by AddInput/AddOutput should include shape information
   // for Tensor types. If not added, shape inferencing should resolve. If added, shape inferencing
-  // should validate. Default is to not add.
-  OpTester& AddShapeToTensorData(bool add_shape = true, bool add_symbolic_dim = false) {
+  // should validate. Shape information is added by default.
+  // symbolic_dim is the zero-based index of a dimension to make symbolic; -1 (the default) adds none.
+  OpTester& AddShapeToTensorData(bool add_shape = true, int symbolic_dim = -1) {
     add_shape_to_tensor_data_ = add_shape;
-    add_symbolic_dim_to_tensor_data_ = add_symbolic_dim;
+    add_symbolic_dim_to_tensor_data_ = symbolic_dim;
     return *this;
   }
 
@@ -268,7 +273,7 @@ class OpTester {
       ONNXRUNTIME_ENFORCE(shape.Size() == values_count, values_count,
                           " input values doesn't match tensor size of ", shape.Size());
 
-      auto allocator = ::onnxruntime::test::AllocatorManager::Instance().GetAllocator(CPU);
+      auto allocator = test::AllocatorManager::Instance().GetAllocator(CPU);
       auto size_in_bytes = values_count * sizeof(T);
       void* buffer = allocator->Alloc(size_in_bytes);
       auto p_tensor = std::make_unique<Tensor>(DataTypeImpl::GetType<T>(),
@@ -283,8 +288,8 @@ class OpTester {
       }
 
       std::vector<int64_t> dims_for_proto{dims};
-      if (add_symbolic_dim_to_tensor_data_ && !dims.empty()) {
-        dims_for_proto[0] = -1;
+      if (add_symbolic_dim_to_tensor_data_ >= 0 && dims.size() > add_symbolic_dim_to_tensor_data_) {
+        dims_for_proto[add_symbolic_dim_to_tensor_data_] = -1;
       }
 
       TTypeProto<T> type_proto(add_shape_to_tensor_data_ ? &dims_for_proto : nullptr);
@@ -302,7 +307,7 @@ class OpTester {
   const char* domain_;
   int opset_version_;
   bool add_shape_to_tensor_data_ = true;
-  bool add_symbolic_dim_to_tensor_data_ = false;
+  int add_symbolic_dim_to_tensor_data_ = -1;
   std::vector<Data> input_data_;
   std::vector<Data> output_data_;
   std::vector<size_t> initializer_index_;

From e7e801b45e1a2844f5230f4b7fd9fa3340036c63 Mon Sep 17 00:00:00 2001
From: Du Li <duli1@microsoft.com>
Date: Sat, 24 Nov 2018 18:53:42 -0800
Subject: [PATCH 03/16] Adding shape inference for Op expand_dims

---
 onnxruntime/contrib_ops/contrib_ops.cc        | 34 +++++++++++++++++--
 .../test/contrib_ops/expand_dims_test.cc      |  4 +--
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/contrib_ops/contrib_ops.cc b/onnxruntime/contrib_ops/contrib_ops.cc
index 49c8a133ea..5aef0e96e6 100644
--- a/onnxruntime/contrib_ops/contrib_ops.cc
+++ b/onnxruntime/contrib_ops/contrib_ops.cc
@@ -12,8 +12,8 @@
 namespace onnxruntime {
 namespace contrib {
 using ::ONNX_NAMESPACE::AttributeProto;
-using ::ONNX_NAMESPACE::OPTIONAL;
 using ::ONNX_NAMESPACE::OpSchema;
+using ::ONNX_NAMESPACE::OPTIONAL;
 
 void RegisterContribSchemas() {
   ONNX_CONTRIB_OPERATOR_SCHEMA(SampleOp)
@@ -41,7 +41,37 @@ Sample echo operator.)DOC");
           "T",
           ONNX_NAMESPACE::OpSchema::all_tensor_types(),
           "Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.")
-      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        // Type inference: the output has the same element type as input 0.
+        propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        // Shape inference: output shape = input 0 shape with a dim of size 1 inserted at 'axis' (input 1).
+        if (!hasInputShape(ctx, 0))
+          return;
+        auto& input_shape = getInputShape(ctx, 0);
+        const int rank = input_shape.dim_size();
+        const ONNX_NAMESPACE::TensorProto* axis_initializer = ctx.getInputData(1);
+        if (!axis_initializer || axis_initializer->int32_data_size() == 0)  // no axis value to read
+          return;
+        const int axis = axis_initializer->int32_data()[0];
+        if (axis > rank || axis < -rank - 1) {
+          fail_shape_inference("Input axis is invalid: ", axis);
+        }
+        // a negative axis counts back from the end of the output (rank + 1 dims): -1 appends, -rank - 1 prepends
+        const int pos = axis >= 0 ? axis : rank + axis + 1;
+        ONNX_NAMESPACE::TensorShapeProto output_shape;
+        for (int i = 0; i < pos; ++i) {
+          output_shape.add_dim();
+          *(output_shape.mutable_dim(i)) = input_shape.dim(i);
+        }
+        output_shape.add_dim();
+        output_shape.mutable_dim(pos)->set_dim_value(1);
+        for (int i = pos + 1; i < rank + 1; ++i) {
+          output_shape.add_dim();
+          *(output_shape.mutable_dim(i)) = input_shape.dim(i - 1);
+        }
+        updateOutputShape(ctx, 0, output_shape);
+      })
       .SetDoc(R"DOC(ExpandDims echo operator.)DOC");
 
   ONNX_CONTRIB_OPERATOR_SCHEMA_ELSEWHERE(AttnLSTM, RegisterAttnLSTMContribOpSchema);
diff --git a/onnxruntime/test/contrib_ops/expand_dims_test.cc b/onnxruntime/test/contrib_ops/expand_dims_test.cc
index df34a5eb8d..335a4d3e8f 100644
--- a/onnxruntime/test/contrib_ops/expand_dims_test.cc
+++ b/onnxruntime/test/contrib_ops/expand_dims_test.cc
@@ -9,7 +9,7 @@ namespace test {
 
 TEST(ContribOpTest, ExpandDims_0) {
   OpTester test("ExpandDims", 1, onnxruntime::kMSDomain);
-  test.AddShapeToTensorData(false);  // TODO: re-enable shape inference test
+  test.AddShapeToTensorData(true);  // TODO: re-enable shape inference test
   test.AddInput<float>("X", {2, 3}, std::vector<float>(6, 1.0f));
   test.AddInput<int32_t>("axis", {}, {-1});
   test.AddOutput<float>("Y", {2, 3, 1}, std::vector<float>(6, 1.0f));
@@ -18,7 +18,7 @@ TEST(ContribOpTest, ExpandDims_0) {
 
 TEST(ContribOpTest, ExpandDims_1) {
   OpTester test("ExpandDims", 1, onnxruntime::kMSDomain);
-  test.AddShapeToTensorData(false);  // TODO: re-enable shape inference test
+  test.AddShapeToTensorData(true);  // TODO: re-enable shape inference test
   test.AddInput<float>("X", {2, 3}, std::vector<float>(6, 1.0f));
   test.AddInput<int32_t>("axis", {}, {1});
   test.AddOutput<float>("Y", {2, 1, 3}, std::vector<float>(6, 1.0f));

From eef36d2fbf14986472348e6be2b4d9e5132d3f28 Mon Sep 17 00:00:00 2001
From: Scott McKay <scmckay@microsoft.com>
Date: Mon, 26 Nov 2018 07:40:08 +1000
Subject: [PATCH 04/16] Update some comments

---
 onnxruntime/core/providers/cpu/controlflow/scan.cc | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/controlflow/scan.cc b/onnxruntime/core/providers/cpu/controlflow/scan.cc
index 0504fe5a2f..2952d5f92e 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan.cc
@@ -119,8 +119,10 @@ class LoopStateVariable {
 };
 
 /*
-Class that co-ordinates writes to slices of the overall Scan output.
-It will directly update the data returned by OpKernelContextInternal.Output(i).
+Class that co-ordinates writing to slices of the overall Scan output buffer returned by OpKernelContext.Output(i). 
+If the subgraph has a symbolic dimension in an output it will use a temporary MLValue for the first execution
+in order to discover the output shape. Once the shape is known, it will switch to using the overall output buffer 
+to avoid copies.
 */
 class OutputIterator {
  public:
@@ -136,6 +138,7 @@ class OutputIterator {
   MLValue& operator*();
   OutputIterator& operator++();
 
+  // set the output for the current iteration to zeros. used for short sequence lengths
   void ZeroOutCurrent() {
     auto* tensor = (**this).GetMutable<Tensor>();
     memset(tensor->MutableDataRaw(), 0, tensor->Size());
@@ -160,7 +163,10 @@ class OutputIterator {
   int64_t num_iterations_;
   int64_t cur_iteration_;
 
+  // is the final shape concrete, or does it have symbolic dimensions
   bool is_concrete_shape_;
+
+  // one or more slicers for writing to the output
   std::vector<MLValueTensorSlicer<MLValue>::Iterator> slicer_iterators_;
   std::vector<MLValueTensorSlicer<MLValue>::Iterator>::iterator cur_slicer_iterator_;
 
@@ -302,6 +308,7 @@ void LoopStateVariable::Next() {
   ++iteration_num_;
 }
 
+// fill in a symbolic dimension in the overall output using the output shape from an iteration of the subgraph
 static Status MakeShapeConcrete(const TensorShape& per_iteration_shape, TensorShape& final_shape) {
   auto num_dims_per_iteration = per_iteration_shape.NumDimensions();
   auto final_shape_offset = final_shape.NumDimensions() - num_dims_per_iteration;

From 720aca581a2f2b918b61d7f609848100b35c58f6 Mon Sep 17 00:00:00 2001
From: Scott McKay <scmckay@microsoft.com>
Date: Mon, 26 Nov 2018 08:22:28 +1000
Subject: [PATCH 05/16] Update comments

---
 .../core/providers/cpu/controlflow/scan.cc    | 19 ++++++++++---------
 .../providers/cpu/controlflow/scan_test.cc    |  9 ++++++---
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/controlflow/scan.cc b/onnxruntime/core/providers/cpu/controlflow/scan.cc
index 2952d5f92e..b4f340c3c7 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan.cc
@@ -772,7 +772,8 @@ Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_vari
 
     fetches.clear();
 
-    bool copy_fetch_to_iter = false;
+    // one or more outputs have symbolic dimensions and need the first fetch to be copied to the OutputIterator
+    bool have_symbolic_dim_in_output = false;
 
     for (int output = 0, end = num_variadic_outputs_; output < end; ++output) {
       if (output < num_loop_state_variables_) {
@@ -784,12 +785,11 @@ Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_vari
         auto& mlvalue = *iterator;
         fetches.push_back(mlvalue);
 
-        // If there is a dynamic shape in an output we need to copy it back to the OutputIterator
-        // so it can setup the overall output and avoid copies for all other output values.
-        // The mlvalue in the iterator will point to data once we have the overall output initialized.
-        // Check current value as we don't want to unset copy_fetch_to_iter if it is true.
-        if (!copy_fetch_to_iter)
-          copy_fetch_to_iter = (seq_no == 0) && (mlvalue.IsAllocated() == false);
+        // mlvalue.IsAllocated will be false when the OutputIterator is using a temporary MLValue
+        // and not the overall output buffer.
+        have_symbolic_dim_in_output = seq_no == 0 &&
+                                      (mlvalue.IsAllocated() == false ||
+                                       have_symbolic_dim_in_output);  // don't unset
       }
     }
 
@@ -810,8 +810,9 @@ Status ScanImpl::IterateSequence(std::vector<LoopStateVariable>& loop_state_vari
     for (int output = num_loop_state_variables_; output < num_variadic_outputs_; ++output) {
       auto& iterator = *output_iterators_[output];
 
-      // copy the data from fetches to the iterator so it can setup the overall output
-      if (copy_fetch_to_iter && (*iterator).IsAllocated() == false) {
+      // copy data from the fetch to the iterator so it can setup the overall output when the iterator is incremented.
+      // if the iterator is already using the overall output buffer IsAllocated() will be true and no copy is required.
+      if (have_symbolic_dim_in_output && (*iterator).IsAllocated() == false) {
         *iterator = fetches[output];
       }
 
diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
index 856bf4e145..9d54e53c4b 100644
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@@ -665,6 +665,8 @@ TEST(Scan, MixedTypeInputs) {
   test.Run();
 }
 
+// create a subgraph that will have unknown dimensions in both the loop state variable and output
+// after shape inferencing.
 TEST(Scan, UnknownDimInSubgraphOutput) {
   Model model("ScanBody");
   auto& graph = model.MainGraph();
@@ -702,12 +704,13 @@ TEST(Scan, UnknownDimInSubgraphOutput) {
 
   test.AddAttribute("body", scan_body);
   test.AddAttribute<int64_t>("num_scan_inputs", 1);
+  test.AddMissingOptionalInput<int64_t>();
 
-  // we add a symbolic dimension to bot the initial state and the scan input so we test the path that handles loop
-  // state variables (prior to execution) and the path that handles subgraph outputs (post first execution).
+  // we add a symbolic dimension to both the initial state and the scan input so we test
+  // the path that handles loop state variables (OutputIterator::Initialize) and
+  // the path that handles subgraph outputs (OutputIterator::MakeConcrete).
   // Note that we cross the values over in the subgraph, so the symbolic dimension in
   // initial_state_1 affects scan_out_1, and the symbolic dimension in scan_input_1 affects state_out_1.
-  test.AddMissingOptionalInput<int64_t>();
   test.AddShapeToTensorData(true, 1);  // add shape and symbolic dim in dim 1 for initial_state_1
   test.AddInput<float>("initial_state_1", state_shape, {0.0});
   test.AddShapeToTensorData(true, 2);  // add shape and symbolic dim in dim 2 for scan_input_1

From 03d7d25989dc21d08ec2aa70632483092bfdce98 Mon Sep 17 00:00:00 2001
From: Scott McKay <scmckay@microsoft.com>
Date: Mon, 26 Nov 2018 10:23:26 +1000
Subject: [PATCH 06/16] Support scalars (zero dimensions) in Scan by allowing
 the parameters to Scan to have no dimension for the input data.

---
 .../core/framework/mlvalue_tensor_slicer.cc   |  4 +-
 .../core/providers/cpu/controlflow/scan.cc    |  5 +-
 .../providers/cpu/controlflow/scan_test.cc    | 68 +++++++++++++------
 3 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/onnxruntime/core/framework/mlvalue_tensor_slicer.cc b/onnxruntime/core/framework/mlvalue_tensor_slicer.cc
index 11d2fde87b..294e95668f 100644
--- a/onnxruntime/core/framework/mlvalue_tensor_slicer.cc
+++ b/onnxruntime/core/framework/mlvalue_tensor_slicer.cc
@@ -15,8 +15,8 @@ MLValueTensorSlicer<T> MLValueTensorSlicer<T>::Create(T& mlvalue, int64_t slice_
   ONNXRUNTIME_ENFORCE(mlvalue.IsAllocated(), "MLValue has not been allocated so can't be sliced.");
 
   auto& tensor_shape{mlvalue.template Get<Tensor>().Shape()};
-  ONNXRUNTIME_ENFORCE(gsl::narrow_cast<int64_t>(tensor_shape.NumDimensions()) > slice_dimension,
-              "Insufficient dimensions to slice on ", slice_dimension, ". Shape:", tensor_shape);
+  ONNXRUNTIME_ENFORCE(gsl::narrow_cast<int64_t>(tensor_shape.NumDimensions()) >= slice_dimension,
+                      "Insufficient dimensions to slice on ", slice_dimension, ". Shape:", tensor_shape);
 
   auto dim0_size = tensor_shape[0];
   ONNXRUNTIME_ENFORCE(dim0_offset < dim0_size, "Invalid dim0_offset of ", dim0_offset, ". Dimension 0 is ", dim0_size);
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan.cc b/onnxruntime/core/providers/cpu/controlflow/scan.cc
index cbd6044c82..3366c68cb7 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan.cc
@@ -303,8 +303,9 @@ static const MLValue& GetSubgraphInputMLValue(const OpKernelContextInternal& con
 // Validate that the subgraph input has valid shapes
 Status ScanImpl::ValidateSubgraphInput(int start_input, int end_input, bool has_seq_len_dim,
                                        const std::vector<const NodeArg*>& graph_inputs) {
-  // first dim is batch size. optional sequence dim. dim/s for the data
-  auto min_dims_required = has_seq_len_dim ? 3 : 2;
+  // first dim is batch size. optional sequence dim. dim/s for the data.
+  // if there is no dim for the data treat it as a scalar.
+  auto min_dims_required = has_seq_len_dim ? 2 : 1;
 
   for (int i = start_input; i < end_input; ++i) {
     auto& input_tensor = GetSubgraphInputTensor(context_, i);
diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
index 965b7295d8..b943f90655 100644
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@@ -17,6 +17,7 @@ struct RunOptions {
   bool include_dim_values_in_subgraph = true;
   bool include_types_in_subgraph = true;
   bool include_outer_scope_add = false;
+  bool scalar_loop_state_value = false;
   bool add_bad_shape = false;
 };
 
@@ -37,13 +38,13 @@ class ScanOpTester : public OpTester {
     // add outer_scope_0 node. push the value through an extra Identity node as a Constant gets lifted into an
     // initializer which results in different treatment by the allocation planner
     {
-      TypeProto float_scalar;
-      float_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
-      auto mutable_dim = float_scalar.mutable_tensor_type()->mutable_shape()->add_dim();
+      TypeProto float_single_value;
+      float_single_value.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
+      auto mutable_dim = float_single_value.mutable_tensor_type()->mutable_shape()->add_dim();
       mutable_dim->set_dim_value(1);
 
       {
-        auto& outer_scope_constant = graph.GetOrCreateNodeArg("outer_scope_constant", &float_scalar);
+        auto& outer_scope_constant = graph.GetOrCreateNodeArg("outer_scope_constant", &float_single_value);
         auto* constant = graph.AddNode("outer_scope_constant", "Constant", "Constant with value kOuterNodeAddValue",
                                        {}, {&outer_scope_constant});
 
@@ -54,7 +55,7 @@ class ScanOpTester : public OpTester {
 
         constant->AddAttribute("value", value_tensor);
 
-        auto& outer_scope_node_arg = graph.GetOrCreateNodeArg("outer_scope_0", &float_scalar);
+        auto& outer_scope_node_arg = graph.GetOrCreateNodeArg("outer_scope_0", &float_single_value);
         graph.AddNode("outer_scope_id", "Identity", "Identity for outer_scope_0",
                       {&outer_scope_constant}, {&outer_scope_node_arg});
       }
@@ -66,7 +67,7 @@ class ScanOpTester : public OpTester {
 };
 
 static void CreateSubgraph(Graph& graph, RunOptions& options, const std::string& failure_message) {
-  bool include_shapes = options.include_dim_values_in_subgraph;
+  bool include_dim_values = options.include_dim_values_in_subgraph;
   bool include_types = options.include_types_in_subgraph;
 
   std::vector<NodeArg*> inputs;
@@ -94,21 +95,27 @@ static void CreateSubgraph(Graph& graph, RunOptions& options, const std::string&
     inputs = {};
     outputs = {};
 
-    TypeProto float_scalar;
+    TypeProto float_input;
     // inputs must have type information and a rank
-    float_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
-    auto mutable_dim = float_scalar.mutable_tensor_type()->mutable_shape()->add_dim();
-    if (include_shapes)
-      mutable_dim->set_dim_value(1);
+    float_input.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
+    auto mutable_shape = float_input.mutable_tensor_type()->mutable_shape();
+    if (options.scalar_loop_state_value) {
+      // no dims
+    } else {
+      auto mutable_dim = mutable_shape->add_dim();  // set rank
+      if (include_dim_values)
+        mutable_dim->set_dim_value(1);
+    }
 
     {
-      auto& output_arg = graph.GetOrCreateNodeArg("constant_1", &float_scalar);
+      auto& output_arg = graph.GetOrCreateNodeArg("constant_1", &float_input);
       outputs.push_back(&output_arg);
 
       auto* constant = graph.AddNode("constant", "Constant", "Constant with value 1", inputs, outputs);
 
       TensorProto value_tensor;
-      value_tensor.add_dims(1);
+      if (!options.scalar_loop_state_value)
+        value_tensor.add_dims(1);
       value_tensor.add_float_data(1.f);
       value_tensor.set_data_type(onnx::TensorProto_DataType_FLOAT);
 
@@ -118,7 +125,7 @@ static void CreateSubgraph(Graph& graph, RunOptions& options, const std::string&
     inputs = outputs;  // start with output from Constant node
     outputs = {};
 
-    auto& input_arg = graph.GetOrCreateNodeArg("loop_state_in_1", &float_scalar);
+    auto& input_arg = graph.GetOrCreateNodeArg("loop_state_in_1", &float_input);
     inputs.push_back(&input_arg);
 
     TypeProto loop_state_output_tensor;
@@ -128,15 +135,17 @@ static void CreateSubgraph(Graph& graph, RunOptions& options, const std::string&
     // it has to come from here.
     bool type_and_shape_required = options.include_dim_values_in_main_graph == false;
 
-    if (include_shapes || type_and_shape_required)
-      loop_state_output_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
+    if (include_dim_values || type_and_shape_required) {
+      mutable_shape = loop_state_output_tensor.mutable_tensor_type()->mutable_shape();
+      if (!options.scalar_loop_state_value)
+        mutable_shape->add_dim()->set_dim_value(1);
+    }
 
     TypeProto* type_proto = include_types || type_and_shape_required ? &loop_state_output_tensor : nullptr;
     auto& output_arg = graph.GetOrCreateNodeArg("loop_state_out_1", type_proto);
     outputs.push_back(&output_arg);
 
-    auto* add = graph.AddNode("add", "Add", "Add 1 to the loop state", inputs, outputs);
-    (void)add;
+    graph.AddNode("add", "Add", "Add 1 to the loop state", inputs, outputs);
   }
 
   // subgraph with multiple inputs and outputs to test variadic behaviour.
@@ -152,7 +161,7 @@ static void CreateSubgraph(Graph& graph, RunOptions& options, const std::string&
     // inputs must have type information and rank, but dimension can have no value if we're not providing shape info.
     concat_input_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
     auto mutable_dim = concat_input_tensor.mutable_tensor_type()->mutable_shape()->add_dim();
-    if (include_shapes) {
+    if (include_dim_values) {
       mutable_dim->set_dim_value(2);
 
       if (options.add_bad_shape) {
@@ -168,7 +177,7 @@ static void CreateSubgraph(Graph& graph, RunOptions& options, const std::string&
     // one output from concatenate of {4} tensor
     TypeProto concat_output_tensor;
     concat_output_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
-    if (include_shapes)
+    if (include_dim_values)
       concat_output_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(4);
 
     TypeProto* type_proto = include_types ? &concat_output_tensor : nullptr;
@@ -277,13 +286,18 @@ void RunTest(const std::string test_name, int64_t batch_size, int64_t max_sequen
     test.AddInput<int64_t>("sequence_lens", sequence_lens_dims, *sequence_lens);
   }
 
-  test.AddInput<float>("scan_loop_state_in_0", {batch_size, 1}, loop_state_in_0);
+  std::vector<int64_t> loop_state_shape{batch_size};
+  if (!options.scalar_loop_state_value) {
+    loop_state_shape.push_back(1);
+  }
+
+  test.AddInput<float>("scan_loop_state_in_0", loop_state_shape, loop_state_in_0);
 
   std::vector<int64_t> input_shape{batch_size, max_sequence_len, input_size};
   test.AddInput<float>("scan_input_0", input_shape, input_0);
   test.AddInput<float>("scan_input_1", input_shape, input_1);
 
-  test.AddOutput<float>("scan_loop_state_out_0", {batch_size, 1}, loop_state_out_0);
+  test.AddOutput<float>("scan_loop_state_out_0", loop_state_shape, loop_state_out_0);
 
   std::vector<int64_t> output_shape{batch_size, max_sequence_len, 1};
   test.AddOutput<float>("scan_output_0", output_shape, output_0);
@@ -353,6 +367,16 @@ TEST(Scan, ShortSequenceOneInBatchOneLoopStateVar_NoShapeInMainGraph_NoTypeAndSh
   ShortSequenceOneInBatchOneLoopStateVar(options);
 }
 
+TEST(Scan, OnnxScalarLoopState) {
+  RunOptions options{};
+  options.include_dim_values_in_main_graph = true;
+  options.include_types_in_subgraph = false;
+  options.include_dim_values_in_subgraph = false;
+  options.scalar_loop_state_value = true;
+
+  ShortSequenceOneInBatchOneLoopStateVar(options);
+}
+
 // test when there is an operator in the subgraph that uses a value coming from outer scope
 TEST(Scan, OuterScopeAccess_NoShapeInMainGraph_TypeAndShapeInSubgraph) {
   RunOptions options{};

From eef4db37f335c5f2af220b4019f29c7eb33a59e5 Mon Sep 17 00:00:00 2001
From: Scott McKay <scmckay@microsoft.com>
Date: Mon, 26 Nov 2018 10:31:33 +1000
Subject: [PATCH 07/16] Fix a couple of warnings from the linux and VC 14.11
 toolset builds

---
 onnxruntime/core/providers/cpu/controlflow/scan.cc | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/controlflow/scan.cc b/onnxruntime/core/providers/cpu/controlflow/scan.cc
index b4f340c3c7..be8a1df3f1 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan.cc
@@ -1,6 +1,13 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+// there's no way to use a raw pointer as the copy destination with std::copy_n
+// (which gsl::copy uses with span::data() which returns a raw pointer) with the 14.11 toolset
+// without generating a 4996 warning. going through an iterator is way too much overhead so turn off the warning.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996)
+#endif
 #include "core/providers/cpu/controlflow/scan.h"
 
 #include "core/framework/framework_common.h"
@@ -12,6 +19,10 @@
 
 #include "core/providers/cpu/tensor/utils.h"
 
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 using namespace ONNX_NAMESPACE;
 using namespace onnxruntime::common;
 
@@ -156,7 +167,6 @@ class OutputIterator {
 
   OpKernelContextInternal& context_;
   const int output_index_;
-  std::vector<int64_t> dims_;
   TensorShapeProto per_iteration_shape_;
   TensorShape final_shape_;
   bool is_loop_state_var_;
@@ -334,8 +344,8 @@ OutputIterator::OutputIterator(OpKernelContextInternal& context,
                                TensorShape final_shape)
     : context_{context},
       output_index_{output_index},
-      is_loop_state_var_{is_loop_state_var},
       final_shape_{final_shape},
+      is_loop_state_var_{is_loop_state_var},
       cur_iteration_{0} {
   is_concrete_shape_ = final_shape_.Size() >= 0;
 

From 84fa1018a3fb939941775e8f6f88f41fb114beb4 Mon Sep 17 00:00:00 2001
From: Pranav Sharma <emailpranav@gmail.com>
Date: Mon, 26 Nov 2018 01:14:09 -0800
Subject: [PATCH 08/16] Create CODEOWNERS (#27)

---
 CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 CODEOWNERS

diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000000..ad7e05b9cf
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1 @@
+@Microsoft/onnxruntime

From 3761d2d718b9147670d3a5c454b5e998d6b1baa2 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Mon, 26 Nov 2018 14:57:54 -0800
Subject: [PATCH 09/16] Update C_API.md

Rephrasing
---
 docs/C_API.md | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/docs/C_API.md b/docs/C_API.md
index 7661b90519..5c2cbe567b 100644
--- a/docs/C_API.md
+++ b/docs/C_API.md
@@ -1,17 +1,17 @@
 # C API
 
-# Q: Why having a C API? 
-Q: Why not just live in C++ world? Why must C?    
-A: We want to distribute onnxruntime as a DLL, which can be used in .Net languages through [P/Invoke](https://docs.microsoft.com/en-us/cpp/dotnet/how-to-call-native-dlls-from-managed-code-using-pinvoke).
-Then this is the only option we have.
+# Q: Why have a C API? 
+Q: Why not just live in a C++ world? Why C?    
+A: We want to distribute the onnxruntime as a DLL, which can be used in .Net languages through [P/Invoke](https://docs.microsoft.com/en-us/cpp/dotnet/how-to-call-native-dlls-from-managed-code-using-pinvoke).
+This is the only option we have.
 
 Q: Is it only for .Net?    
-A: No. It is designed for
-1. Creating language bindings for onnxruntime.e.g. C#, python, java, ...
-2. Dynamic linking always has some benefits. For example, for solving diamond dependency problem.
+A: No. It is designed for:
+1. Creating language bindings for the onnxruntime, e.g. C#, Python, Java, ...
+2. Dynamic linking, which has some benefits. For example, it helps solve diamond dependency problems.
 
 Q: Can I export C++ types and functions across DLL or "Shared Object" Library(.so) boundaries?    
-A: Well, you can, but it's not a good practice. And we won't do it in this project.
+A: Well, you can, but it's not a good practice. We won't do it in this project.
 
 
 ## What's inside
@@ -26,14 +26,12 @@ A: Well, you can, but it's not a good practice. And we won't do it in this proje
 
 ## How to use it
 
-Include [onnxruntime_c_api.h](include/onnxruntime/core/session/onnxruntime_c_api.h) in your source code.
-
-Then,
-1. Call ONNXRuntimeInitialize
-2. Create Session: ONNXRuntimeCreateInferenceSession(env, model_uri, nullptr,...)
-3. Create Tensor
+1. Include [onnxruntime_c_api.h](include/onnxruntime/core/session/onnxruntime_c_api.h).
+2. Call ONNXRuntimeInitialize
+3. Create Session: ONNXRuntimeCreateInferenceSession(env, model_uri, nullptr,...)
+4. Create Tensor
    1) ONNXRuntimeCreateAllocatorInfo
    2) ONNXRuntimeCreateTensorWithDataAsONNXValue
-4. ONNXRuntimeRunInference
+5. ONNXRuntimeRunInference
 
 

From 0bb5cb7a3075905f6d9894151b76f4dffdf295c4 Mon Sep 17 00:00:00 2001
From: Du Li <duli1@microsoft.com>
Date: Mon, 26 Nov 2018 21:02:18 -0800
Subject: [PATCH 10/16] Accommodating PR comments.

---
 onnxruntime/test/contrib_ops/expand_dims_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/expand_dims_test.cc b/onnxruntime/test/contrib_ops/expand_dims_test.cc
index 335a4d3e8f..abf1da0941 100644
--- a/onnxruntime/test/contrib_ops/expand_dims_test.cc
+++ b/onnxruntime/test/contrib_ops/expand_dims_test.cc
@@ -9,7 +9,7 @@ namespace test {
 
 TEST(ContribOpTest, ExpandDims_0) {
   OpTester test("ExpandDims", 1, onnxruntime::kMSDomain);
-  test.AddShapeToTensorData(true);  // TODO: re-enable shape inference test
+  test.AddShapeToTensorData(true);
   test.AddInput<float>("X", {2, 3}, std::vector<float>(6, 1.0f));
   test.AddInput<int32_t>("axis", {}, {-1});
   test.AddOutput<float>("Y", {2, 3, 1}, std::vector<float>(6, 1.0f));
@@ -18,7 +18,7 @@ TEST(ContribOpTest, ExpandDims_0) {
 
 TEST(ContribOpTest, ExpandDims_1) {
   OpTester test("ExpandDims", 1, onnxruntime::kMSDomain);
-  test.AddShapeToTensorData(true);  // TODO: re-enable shape inference test
+  test.AddShapeToTensorData(true);
   test.AddInput<float>("X", {2, 3}, std::vector<float>(6, 1.0f));
   test.AddInput<int32_t>("axis", {}, {1});
   test.AddOutput<float>("Y", {2, 1, 3}, std::vector<float>(6, 1.0f));

From 4fe704521462252754fbe1fe5df3f74db9711db9 Mon Sep 17 00:00:00 2001
From: Faith Xu <txsafx@gmail.com>
Date: Tue, 27 Nov 2018 02:28:55 -0800
Subject: [PATCH 11/16] Faxu documentation (#16)

* Update README.md

* Update CONTRIBUTING.md

* Update README.md

* Update README.md
---
 CONTRIBUTING.md |  3 ---
 README.md       | 71 +++++++++++++++++++++++++++++++++++++------------
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 601f0cea3e..eeba2307ab 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,9 +20,6 @@ New code *must* be accompanied by unit tests.
 # Build
 [Build](BUILD.md)
 
-# Additional Documentation
-   * [Adding a custom operator](docs/AddingCustomOp.md)
-
 # Coding guidelines
 Please see [Coding Conventions and Standards](./docs/Coding_Conventions_and_Standards.md)
 
diff --git a/README.md b/README.md
index 3a7de122f2..91f47ca686 100644
--- a/README.md
+++ b/README.md
@@ -2,34 +2,71 @@
 
 [![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/onnxruntime%20CI%20Pipelines)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=1)
 
-ONNX Runtime is the runtime for [ONNX](https://github.com/onnx/onnx).
+# Introduction 
+ONNX Runtime is an open-source scoring engine for Open Neural Network Exchange (ONNX) models. 
 
-# Engineering Design
-[Engineering Design](docs/HighLevelDesign.md)
+ONNX is an open format for machine learning (ML) models that is supported by various ML and DNN frameworks and tools. This format makes it easier to interoperate between frameworks and to maximize the reach of your hardware optimization investments. Learn more about ONNX at [https://onnx.ai](https://onnx.ai) or view the [GitHub repo](https://github.com/onnx/onnx). 
+ 
+# Why use ONNX Runtime 
+## Run any ONNX model
+ONNX Runtime provides comprehensive support of the ONNX spec and can be used to run all models based on ONNX v1.2.1 and higher. See ONNX version release details [here](https://github.com/onnx/onnx/releases).
 
-# API
-| API | CPU package | GPU package |
+In order to support popular and leading AI models, the runtime stays up-to-date with evolving ONNX operators and functionalities. 
+ 
+## Cross Platform 
+ONNX Runtime offers:
+* APIs for Python, C#, and C
+* Support for Linux, Windows, and Mac 
+
+See API documentation and package installation instructions [below](#Installation). 
+ 
+## High Performance 
+You can use the ONNX Runtime with both CPU and GPU hardware. You can also plug in additional execution providers to ONNX Runtime. With many graph optimizations and various accelerators, ONNX Runtime can often provide lower latency and higher efficiency compared to other runtimes. This provides smoother end-to-end customer experiences and lower costs from improved machine utilization.
+
+Currently ONNX Runtime supports CUDA, MKL, and MKL-DNN for computation acceleration, with more coming soon. To add an execution provider, please refer to [this page](docs/AddingExecutionProvider.md).
+ 
+# Getting Started 
+If you need a model:  
+* Check out the [ONNX Model Zoo](https://github.com/onnx/models) for ready-to-use pre-trained models. 
+* To get an ONNX model by exporting from various frameworks, see [ONNX Tutorials](https://github.com/onnx/tutorials).
+
+If you already have an ONNX model, just [install the runtime](#Installation) for your machine to try it out. One easy way to operationalize the model on the cloud is by using [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service). See a how-to guide [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-build-deploy-onnx). 
+
+# Installation
+## APIs and Official Builds
+| API Documentation | CPU package | GPU package |
 |-----|-------------|-------------|
 | [Python](https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) | [Windows](TODO)<br>[Linux](https://pypi.org/project/onnxruntime/)<br>[Mac](TODO)| [Windows](TODO)<br>[Linux](https://pypi.org/project/onnxruntime-gpu/) |
-| [C#](docs/CSharp_API.md) | [Windows](TODO)| Not available |
-| [C](docs/C_API.md) | [Windows](TODO)<br>[Linux](TODO) | Not available |
+| [C#](docs/CSharp_API.md) | [Windows](TODO)<br>Linux - Coming Soon<br>Mac - Coming Soon| Coming Soon |
+| [C](docs/C_API.md) | [Windows](TODO)<br>[Linux](TODO) | Coming Soon |
 
-# Build
-[Build](BUILD.md)
+## Build Details
+For details on the build configurations and information on how to create a build, see [Build ONNX Runtime](BUILD.md).
+
+## Versioning
+See more details on API and ABI Versioning and ONNX Compatibility in [Versioning](docs/Versioning.md).
+
+# Design and Key Features
+For an overview of the high level architecture and key decisions in the technical design of ONNX Runtime, see [Engineering Design](docs/HighLevelDesign.md).
+
+ONNX Runtime is built with an extensible design that makes it versatile enough to support a wide array of models with high performance.
+
+* [Add a custom operator/kernel](docs/AddingCustomOp.md)
+* [Add an execution provider](docs/AddingExecutionProvider.md)
+* [Add a new graph
+transform](include/onnxruntime/core/graph/graph_transformer.h)
+* [Add a new rewrite rule](include/onnxruntime/core/graph/rewrite_rule.h)
 
 # Contribute
-[Contribute](CONTRIBUTING.md)
+We welcome your contributions! Please see the [contribution guidelines](CONTRIBUTING.md).
 
-# Versioning
-[Versioning](docs/Versioning.md)
+## Feedback
+For any feedback or to report a bug, please file a [GitHub Issue](https://github.com/Microsoft/onnxruntime/issues).
 
-# Feedback
-   * File a bug in [GitHub Issues](https://github.com/Microsoft/onnxruntime/issues)
-
-# Code of Conduct
+## Code of Conduct
 This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
 
 # License
-[LICENSE](LICENSE)
+[MIT License](LICENSE)

From 408fd21a7ff293e4ed4d62675576b8f2e63b60c0 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Wed, 28 Nov 2018 00:13:31 +0800
Subject: [PATCH 12/16] Windows CI: enable pybind (#34)

---
 tools/ci_build/github/azure-pipelines/azure-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
index 7bfeff5b74..6b78c4be7a 100644
--- a/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
@@ -44,7 +44,7 @@ jobs:
     - task: BatchScript@1
       inputs:
         filename: build.bat
-        arguments: ' --enable_onnx_tests'
+        arguments: ' --enable_onnx_tests --enable_pybind'
         workingFolder: "$(Build.SourcesDirectory)"
 
     - task: CmdLine@1

From e75bde2e975647c0ccb617900125cc64a0943aed Mon Sep 17 00:00:00 2001
From: Du Li <duli1@microsoft.com>
Date: Mon, 26 Nov 2018 21:02:18 -0800
Subject: [PATCH 13/16] Accommodating PR comments.

---
 onnxruntime/test/contrib_ops/expand_dims_test.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/expand_dims_test.cc b/onnxruntime/test/contrib_ops/expand_dims_test.cc
index 335a4d3e8f..d1239ad332 100644
--- a/onnxruntime/test/contrib_ops/expand_dims_test.cc
+++ b/onnxruntime/test/contrib_ops/expand_dims_test.cc
@@ -9,7 +9,6 @@ namespace test {
 
 TEST(ContribOpTest, ExpandDims_0) {
   OpTester test("ExpandDims", 1, onnxruntime::kMSDomain);
-  test.AddShapeToTensorData(true);  // TODO: re-enable shape inference test
   test.AddInput<float>("X", {2, 3}, std::vector<float>(6, 1.0f));
   test.AddInput<int32_t>("axis", {}, {-1});
   test.AddOutput<float>("Y", {2, 3, 1}, std::vector<float>(6, 1.0f));
@@ -18,7 +17,6 @@ TEST(ContribOpTest, ExpandDims_0) {
 
 TEST(ContribOpTest, ExpandDims_1) {
   OpTester test("ExpandDims", 1, onnxruntime::kMSDomain);
-  test.AddShapeToTensorData(true);  // TODO: re-enable shape inference test
   test.AddInput<float>("X", {2, 3}, std::vector<float>(6, 1.0f));
   test.AddInput<int32_t>("axis", {}, {1});
   test.AddOutput<float>("Y", {2, 1, 3}, std::vector<float>(6, 1.0f));

From dcd4e0cb8fe839d6693a2d92f694d4feecba3a41 Mon Sep 17 00:00:00 2001
From: RyanUnderhill <ryanhill@microsoft.com>
Date: Tue, 27 Nov 2018 14:21:49 -0800
Subject: [PATCH 14/16] Change debug_alloc to not exit(-1) on the command line
 so that our build tests pass

---
 .../core/platform/windows/debug_alloc.cc      | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/platform/windows/debug_alloc.cc b/onnxruntime/core/platform/windows/debug_alloc.cc
index c4ddbb8069..ec9febb37b 100644
--- a/onnxruntime/core/platform/windows/debug_alloc.cc
+++ b/onnxruntime/core/platform/windows/debug_alloc.cc
@@ -150,7 +150,8 @@ void DebugHeapFree(void* p) noexcept {
 
   g_allocationCount--;
   p = static_cast<BYTE*>(p) - sizeof(MemoryBlock);  // Adjust incoming pointer
-  HeapFree(g_heap, 0, p);
+  if (HeapFree(g_heap, 0, p) == 0)
+    __debugbreak();  // If this hits, we either double deleted memory or we somehow tried to delete main heap memory after the leak checker started
 }
 
 static struct Memory_LeakCheck {
@@ -224,17 +225,19 @@ Memory_LeakCheck::~Memory_LeakCheck() {
     _snprintf_s(buffer, _TRUNCATE, "%d bytes of memory leaked in %d allocations", leaked_bytes, leak_count);
     string.append(buffer);
 
-    // Check if we're running on the build machine, if so just exit(-1)
-    size_t requiredSize;
-    if (getenv_s(&requiredSize, nullptr, 0, "AGENT_BUILDDIRECTORY") == 0 && requiredSize > 0) {
+    // If we're being actively debugged, show a message box to get the dev's attention
+    if (IsDebuggerPresent())
+      MessageBoxA(nullptr, string.c_str(), "Warning", MB_OK | MB_ICONWARNING);
+    else {
+      // If we're on the command line (like on a build machine), output to the console; the exit(-1) below is currently disabled
       std::cout << "\n----- MEMORY LEAKS: " << string.c_str() << "\n";
+#if 0
+      // There is currently a memory leak due to a static thread_local variable not being destroyed on exit in mkldnn_common.h
+      // The bug is caused by sync_api.h using the windows thread pool functions instead of C++ std::async libraries.
       exit(-1);
+#endif
     }
 
-    // Otherwise we're running on a dev system, show a message box to get their attention
-    if (IsDebuggerPresent()) {
-      MessageBoxA(nullptr, string.c_str(), "Warning", MB_OK | MB_ICONWARNING);
-    }
   } else {
     OutputDebugStringA("\n----- No memory leaks detected -----\n\n");
   }

From df1d01f85349540177725ea2615b06a499bc1bdc Mon Sep 17 00:00:00 2001
From: Raymond Yang <zihao.yang@microsoft.com>
Date: Tue, 27 Nov 2018 15:47:08 -0800
Subject: [PATCH 15/16] Update CI configs to test mkldnn

---
 tools/ci_build/github/azure-pipelines/azure-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
index 7bfeff5b74..d708b1d20c 100644
--- a/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
@@ -44,7 +44,7 @@ jobs:
     - task: BatchScript@1
       inputs:
         filename: build.bat
-        arguments: ' --enable_onnx_tests'
+        arguments: ' --enable_onnx_tests --use_mkldnn'
         workingFolder: "$(Build.SourcesDirectory)"
 
     - task: CmdLine@1

From a9b52f399d9f04f72e330c35cc3fa2e84027317f Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Tue, 27 Nov 2018 18:06:00 -0800
Subject: [PATCH 16/16] Add pre-release notice to c_api (#38)

---
 include/onnxruntime/core/session/onnxruntime_c_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 1f449db4e4..3d1fddd195 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -1,6 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+// =====================================================================================================
+// NOTE: This header is PRE-RELEASE and subject to change. Please do not rely on this file not changing.
+// =====================================================================================================
+
 #pragma once
 #include <stdbool.h>
 #include <stdlib.h>