From c1613aa28e35f3ecf5fccfa88591573e1b6a1a63 Mon Sep 17 00:00:00 2001
From: Shah Asaduzzaman <shahasad@microsoft.com>
Date: Wed, 28 Nov 2018 15:09:51 -0800
Subject: [PATCH 01/11] Fixes in the NuGet package metadata. Include sourcelink

---
 .../Microsoft.ML.OnnxRuntime.csproj           | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
index 05ed0811f3..3dd2e09405 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
@@ -12,16 +12,23 @@
     <Authors>Microsoft Corporation</Authors>
     <Description>This package contains Microsoft's implementation of ONNX runtime, usable in .Net platforms</Description>
     <PackageTags>ONNX;ONNX Runtime;Machine Learning</PackageTags>
-    <Copyright>Microsoft Corporation</Copyright>
-    <IncludeSymbols>true</IncludeSymbols>
+    <Copyright>© Microsoft Corporation. All rights reserved.</Copyright>
+    <!--IncludeSymbols>true</IncludeSymbols>
     <PackageLicenseUrl>https://github.com/Microsoft/onnxruntime/blob/master/LICENSE</PackageLicenseUrl>
     <RepositoryUrl>https://github.com/Microsoft/onnxruntime.git</RepositoryUrl>
-    <RepositoryType>git</RepositoryType>
+    <RepositoryType>git</RepositoryType-->
 
     <!--internal build related properties-->
     <OnnxRuntimeCsharpRoot>..\..</OnnxRuntimeCsharpRoot>
     <buildDirectory Condition="'$(buildDirectory)'==''">$(OnnxRuntimeCsharpRoot)\..\build\Windows</buildDirectory>
     <NativeBuildOutputDir>$(buildDirectory)\$(Configuration)\$(Configuration)</NativeBuildOutputDir>
+
+
+    <PublishRepositoryUrl>true</PublishRepositoryUrl>
+     <!-- Optional: Embed source files that are not tracked by the source control manager in the PDB -->
+    <!--EmbedUntrackedSources>true</EmbedUntrackedSources-->
+    <!-- Optional: Include the PDB in the built .nupkg -->
+    <AllowedOutputExtensionsInPackageBuildOutputFolder>$(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb</AllowedOutputExtensionsInPackageBuildOutputFolder>
   </PropertyGroup>
 
   <!--TODO: this works for single platform only. Need separate packaging scripts for multi-target packaging -->
@@ -66,20 +73,24 @@
       <Output TaskParameter="Lines" ItemName="MajorVersionNumber"/>
     </ReadLinesFromFile>
     <Exec Command="git rev-parse --short HEAD" ConsoleToMSBuild="true">
-        <Output TaskParameter="ConsoleOutput" PropertyName="GitCommitHash" />
+        <Output TaskParameter="ConsoleOutput" PropertyName="GitCommitHashShort" />
+    </Exec>
+    <Exec Command="git rev-parse HEAD" ConsoleToMSBuild="true">
+        <Output TaskParameter="ConsoleOutput" PropertyName="GitCommitHashFull" />
     </Exec>
     
     <PropertyGroup>
-      <RepositoryCommit>$(GitCommitHash)</RepositoryCommit>
+      <!--RepositoryCommit>$(GitCommitHashFull)</RepositoryCommit-->
       <PackageVersion>@(MajorVersionNumber)</PackageVersion>
       <Version>$(PackageVersion)</Version>
-      <PackageVersion Condition="'$(IsReleaseBuild)'==''">$(PackageVersion)-dev-$(GitCommitHash)</PackageVersion>
+      <PackageVersion Condition="'$(IsReleaseBuild)'!='true'">$(PackageVersion)-dev-$(GitCommitHashShort)</PackageVersion>
     </PropertyGroup>
     <Message Importance="High" Text="PackageVersion=$(PackageVersion)" />
   </Target>
 
   <ItemGroup>
     <PackageReference Include="System.Numerics.Tensors" Version="0.1.0" />
+    <PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.0.0-beta-63127-02" PrivateAssets="All"/>
   </ItemGroup>
 
 </Project>

From 6f65a03939251e3f966410a225affe9299e1f567 Mon Sep 17 00:00:00 2001
From: Shah Asaduzzaman <shahasad@microsoft.com>
Date: Wed, 28 Nov 2018 15:21:17 -0800
Subject: [PATCH 02/11] some cleanup

---
 .../Microsoft.ML.OnnxRuntime.csproj                  | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
index 3dd2e09405..b52213d52b 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
@@ -13,21 +13,17 @@
     <Description>This package contains Microsoft's implementation of ONNX runtime, usable in .Net platforms</Description>
     <PackageTags>ONNX;ONNX Runtime;Machine Learning</PackageTags>
     <Copyright>© Microsoft Corporation. All rights reserved.</Copyright>
-    <!--IncludeSymbols>true</IncludeSymbols>
     <PackageLicenseUrl>https://github.com/Microsoft/onnxruntime/blob/master/LICENSE</PackageLicenseUrl>
-    <RepositoryUrl>https://github.com/Microsoft/onnxruntime.git</RepositoryUrl>
-    <RepositoryType>git</RepositoryType-->
 
     <!--internal build related properties-->
     <OnnxRuntimeCsharpRoot>..\..</OnnxRuntimeCsharpRoot>
     <buildDirectory Condition="'$(buildDirectory)'==''">$(OnnxRuntimeCsharpRoot)\..\build\Windows</buildDirectory>
     <NativeBuildOutputDir>$(buildDirectory)\$(Configuration)\$(Configuration)</NativeBuildOutputDir>
 
-
+    <!-- sourcelink flags -->
     <PublishRepositoryUrl>true</PublishRepositoryUrl>
-     <!-- Optional: Embed source files that are not tracked by the source control manager in the PDB -->
+    <!-- Optional: Embed source files that are not tracked by the source control manager in the PDB -->
     <!--EmbedUntrackedSources>true</EmbedUntrackedSources-->
-    <!-- Optional: Include the PDB in the built .nupkg -->
     <AllowedOutputExtensionsInPackageBuildOutputFolder>$(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb</AllowedOutputExtensionsInPackageBuildOutputFolder>
   </PropertyGroup>
 
@@ -75,12 +71,8 @@
     <Exec Command="git rev-parse --short HEAD" ConsoleToMSBuild="true">
         <Output TaskParameter="ConsoleOutput" PropertyName="GitCommitHashShort" />
     </Exec>
-    <Exec Command="git rev-parse HEAD" ConsoleToMSBuild="true">
-        <Output TaskParameter="ConsoleOutput" PropertyName="GitCommitHashFull" />
-    </Exec>
     
     <PropertyGroup>
-      <!--RepositoryCommit>$(GitCommitHashFull)</RepositoryCommit-->
       <PackageVersion>@(MajorVersionNumber)</PackageVersion>
       <Version>$(PackageVersion)</Version>
       <PackageVersion Condition="'$(IsReleaseBuild)'!='true'">$(PackageVersion)-dev-$(GitCommitHashShort)</PackageVersion>

From 7cb3dfc18a043fe1d9c402cfa75024150742e431 Mon Sep 17 00:00:00 2001
From: Lei Zhang <zhalei@microsoft.com>
Date: Tue, 27 Nov 2018 10:44:19 -0800
Subject: [PATCH 03/11] Optimize ReduceMean/ReduceSum when all reduce axes are
 located at the tail of the input tensor's dims by not making an extra copy.
 Also use OpenMP to parallelize the reduction over the results.

---
 .../providers/cpu/reduction/reduction_ops.cc  | 61 +++++++++++++++----
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
index ae17d9c563..80ee17cde0 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@@ -34,14 +34,20 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 1);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMax, 1);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 1);
 
+// When all reduce axises located at the tail of the dims, quite general cases, copy could be
+// skip to improve performance, if required by check_no_copy = true;
+// return value: true means transposedInputData is not created/copied, input tensor data could
+//               be direct use as row major matrix [block_size, blocks], where blocks is the
+//               size of each reduce.
 template <typename T>
-void PrepareForReduce(OpKernelContext* ctx,
+bool PrepareForReduce(OpKernelContext* ctx,
                       std::vector<T>& transposedInputData,
                       Tensor** reducedTensor,
                       int64_t& block_size,
                       int64_t& blocks,
                       const std::vector<int64_t>& axes_,
-                      bool keepdims_) {
+                      bool keepdims_,
+                      bool check_no_copy = false) {
   const Tensor* input_tensor_ptr = ctx->Input<Tensor>(0);
   ONNXRUNTIME_ENFORCE(input_tensor_ptr != nullptr);
   const Tensor& input = *input_tensor_ptr;
@@ -51,8 +57,6 @@ void PrepareForReduce(OpKernelContext* ctx,
     ONNXRUNTIME_ENFORCE(axe >= 0 && axe < (int64_t)ndim, "Axis attribute out of range");
   }
 
-  transposedInputData.resize(input.Shape().Size(), 0);
-
   std::vector<int64_t> axes = axes_;
   if (axes.empty()) {
     // This is the default case for non-arg kind reductions. Reduce on all dimensions.
@@ -62,6 +66,13 @@ void PrepareForReduce(OpKernelContext* ctx,
 
   std::sort(axes.begin(), axes.end());
 
+  // If all reduced axes are located at the tail of the input shape, then the copy can be skipped if requested
+  bool need_copy = true;
+  if (axes.size() <= ndim && axes.front() == static_cast<int64_t>(ndim - axes.size()) 
+      && axes.back() == static_cast<int64_t>(ndim) - 1) {
+    need_copy = false;
+  }
+
   vector<bool> keep_axis(ndim, true);
   for (auto i : axes) {
     keep_axis[i] = false;
@@ -96,7 +107,6 @@ void PrepareForReduce(OpKernelContext* ctx,
   }
 
   const T* from_data = input.template Data<T>();
-  T* to_data = &transposedInputData[0];
   size_t count = input.Shape().Size();
 
   //set to-be-reduced axes to one. squeeze is keepdims_ is false
@@ -117,9 +127,15 @@ void PrepareForReduce(OpKernelContext* ctx,
   block_size = input.Shape().Size() / first_dim;
   blocks = first_dim;
 
+  if (!need_copy && check_no_copy) {
+    return true;
+  }
+
+  transposedInputData.resize(input.Shape().Size(), 0);
+  T* to_data = &transposedInputData[0];
   if (num_axes < 2 || n_shared_idxs == num_axes) {
     memcpy(to_data, from_data, count * sizeof(T));
-    return;
+    return false;
   }
 
   int itr_axes = num_axes - n_shared_idxs;
@@ -178,6 +194,7 @@ void PrepareForReduce(OpKernelContext* ctx,
       }
     }
   }
+  return false;
 }
 
 template <typename T>
@@ -272,12 +289,22 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
   std::vector<T> transposedInputData;
   int64_t block_size, blocks;
   Tensor* reduced;
-  PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_);
+  bool no_copy = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
 
   T* output_data = reduced->template MutableData<T>();
 
-  EigenVectorMap<T> out_vec(output_data, block_size);
-  out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().mean();
+  if (no_copy) {
+    const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
+
+    #pragma omp parallel for
+    for (int64_t i = 0; i < block_size; ++i) {
+      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).mean();
+    }
+  }
+  else {
+    EigenVectorMap<T> out_vec(output_data, block_size);
+    out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().mean();
+  }
 
   return Status::OK();
 }
@@ -317,12 +344,22 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
   std::vector<T> transposedInputData;
   int64_t block_size, blocks;
   Tensor* reduced;
-  PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_);
+  bool no_copy = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
 
   T* output_data = reduced->template MutableData<T>();
 
-  EigenVectorMap<T> out_vec(output_data, block_size);
-  out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().sum();
+  if (no_copy) {
+    const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
+    
+    #pragma omp parallel for
+    for (int64_t i = 0; i < block_size; ++i) {
+      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).sum();
+    }
+  }
+  else {
+    EigenVectorMap<T> out_vec(output_data, block_size);
+    out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().sum();
+  }
 
   return Status::OK();
 }

From 0540d8e5f71835e836a188cf6ce0de7dd23b2d81 Mon Sep 17 00:00:00 2001
From: Lei Zhang <zhalei@microsoft.com>
Date: Wed, 28 Nov 2018 12:20:53 -0800
Subject: [PATCH 04/11] Better name for read.

---
 .../providers/cpu/reduction/reduction_ops.cc     | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
index 80ee17cde0..a332aa2c33 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@@ -34,8 +34,8 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 1);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMax, 1);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 1);
 
-// When all reduce axises located at the tail of the dims, quite general cases, copy could be
-// skip to improve performance, if required by check_no_copy = true;
+// When all reduce axes are located at the tail of the dims, a quite common case, the transpose and
+// extra copy can be skipped to improve performance, if requested by check_no_transpose = true;
 // return value: true means transposedInputData is not created/copied, input tensor data could
 //               be direct use as row major matrix [block_size, blocks], where blocks is the
 //               size of each reduce.
@@ -47,7 +47,7 @@ bool PrepareForReduce(OpKernelContext* ctx,
                       int64_t& blocks,
                       const std::vector<int64_t>& axes_,
                       bool keepdims_,
-                      bool check_no_copy = false) {
+                      bool check_no_transpose = false) {
   const Tensor* input_tensor_ptr = ctx->Input<Tensor>(0);
   ONNXRUNTIME_ENFORCE(input_tensor_ptr != nullptr);
   const Tensor& input = *input_tensor_ptr;
@@ -127,7 +127,7 @@ bool PrepareForReduce(OpKernelContext* ctx,
   block_size = input.Shape().Size() / first_dim;
   blocks = first_dim;
 
-  if (!need_copy && check_no_copy) {
+  if (!need_copy && check_no_transpose) {
     return true;
   }
 
@@ -289,11 +289,11 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
   std::vector<T> transposedInputData;
   int64_t block_size, blocks;
   Tensor* reduced;
-  bool no_copy = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
+  bool no_transpose = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
 
   T* output_data = reduced->template MutableData<T>();
 
-  if (no_copy) {
+  if (no_transpose) {
     const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
 
     #pragma omp parallel for
@@ -344,11 +344,11 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
   std::vector<T> transposedInputData;
   int64_t block_size, blocks;
   Tensor* reduced;
-  bool no_copy = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
+  bool no_transpose = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
 
   T* output_data = reduced->template MutableData<T>();
 
-  if (no_copy) {
+  if (no_transpose) {
     const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
     
     #pragma omp parallel for

From 564907fa1a5b20c5742c3322e63f40877f7514f2 Mon Sep 17 00:00:00 2001
From: "Tang, Cheng" <souptc@gmail.com>
Date: Wed, 28 Nov 2018 16:20:04 -0800
Subject: [PATCH 05/11] remove the non ascii char (#48)

---
 onnxruntime/contrib_ops/contrib_ops.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/contrib_ops/contrib_ops.cc b/onnxruntime/contrib_ops/contrib_ops.cc
index f6151690e7..3b3d948ee9 100644
--- a/onnxruntime/contrib_ops/contrib_ops.cc
+++ b/onnxruntime/contrib_ops/contrib_ops.cc
@@ -147,7 +147,7 @@ The dequantization formula is y = (x - x_zero_point) * x_scale.
       .SinceVersion(1)
       .SetDoc(R"DOC(
 The convolution operator consumes a quantized input tensor, its scale and zero point, 
-a quantized filter, its scale and zero point, and output’s scale and zero point, 
+a quantized filter, its scale and zero point, and output's scale and zero point, 
 and computes the quantized output. Each scale and zero point pair must have same shape.
 It means they must be either scalars (per tensor) or 1-D tensors (per channel).)DOC")
       .Input(
@@ -163,8 +163,8 @@ It means they must be either scalars (per tensor) or 1-D tensors (per channel).)
           "to arrive with the dimension denotation of [DATA_BATCH, "
           "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
           "T1")
-      .Input(1, "x_scale", "Scale tensor for input ‘x’. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it’s a 1-D tensor, its number of elements should be equal to the number of channels of input ‘x’.", "T3")
-      .Input(2, "x_zero_point", "Zero point tensor for input ‘x’. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it’s a 1-D tensor, its number of elements should be equal to the number of channels of input ‘x’.", "T1")
+      .Input(1, "x_scale", "Scale tensor for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'x'.", "T3")
+      .Input(2, "x_zero_point", "Zero point tensor for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'x'.", "T1")
       .Input(
           3,
           "w",
@@ -183,10 +183,10 @@ It means they must be either scalars (per tensor) or 1-D tensors (per channel).)
           "(assuming zero based indices for the shape array). "
           "Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL. ",
           "T1")
-      .Input(4, "w_scale", "Scale tensor for input ‘w’. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it’s a 1-D tensor, its number of elements should be equal to the number of channels of input ‘w’.", "T3")
-      .Input(5, "w_zero_point", "Scale tensor for input ‘w’. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it’s a 1-D tensor, its number of elements should be equal to the number of channels of input ‘w’.", "T1")
-      .Input(6, "y_scale", "Scale tensor for output ‘y’. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it’s a 1-D tensor, its number of elements should be equal to the number of channels of input ‘y’.", "T3")
-      .Input(7, "y_zero_point", "Scale tensor for output ‘y’. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it’s a 1-D tensor, its number of elements should be equal to the number of channels of input ‘y’.", "T1")
+      .Input(4, "w_scale", "Scale tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'w'.", "T3")
+      .Input(5, "w_zero_point", "Scale tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'w'.", "T1")
+      .Input(6, "y_scale", "Scale tensor for output 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'y'.", "T3")
+      .Input(7, "y_zero_point", "Scale tensor for output 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'y'.", "T1")
       .Input(8, "B", "Optional 1D bias to be added to the convolution, has size of M.", "T2", OpSchema::Optional)
       .Output(
           0,

From 2f234e4e7864dfddf07d9012ebdf357ed1efeab9 Mon Sep 17 00:00:00 2001
From: jignparm <jignparm@microsoft.com>
Date: Thu, 29 Nov 2018 00:30:34 +0000
Subject: [PATCH 06/11] Minor fixes to nuget README.md file (#49)

---
 docs/CSharp_API.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/CSharp_API.md b/docs/CSharp_API.md
index d8339f1fa9..ae333af3d6 100644
--- a/docs/CSharp_API.md
+++ b/docs/CSharp_API.md
@@ -2,14 +2,14 @@
 The ONNX runtime provides a C# .Net binding for running inference on ONNX models in any of the .Net standard platforms. The API is .Net standard 1.1 compliant for maximum portability. This document describes the API. 
 
 ## NuGet Package
-There is a NuGet package Microsoft.ML.OnnxRuntime available for .Net consumers, which includes the prebuilt binaries for ONNX runtime.  The API is portable across all platforms and architectures supported by the .Net standard, although currently the NuGet package contains the prebuilt binaries for Windows 10 platform on x64 CPUs only.
+The Microsoft.ML.OnnxRuntime Nuget package includes the precompiled binaries for ONNX runtime, and includes libraries for Windows 10 platform and X64 CPUs. The APIs conform to .Net Standard 1.1.
 
 ## Getting Started
-Here is simple tutorial for getting started with running inference on an existing ONNX model for a given input data (a.k.a query). Say the model is trained using any of the well-known training frameworks and exported as an ONNX model into a file named `model.onnx`. The runtime incarnation of a model is an `InferenceSession` object. You simply construct an `InferenceSession` object using the model file as parameter --
+Here is a simple tutorial for getting started with running inference on an existing ONNX model for a given input data. The model is typically trained using any of the well-known training frameworks and exported into the ONNX format. To start scoring using the model, open a session using the `InferenceSession` class, passing in the file path to the model as a parameter.
     
     var session = new InferenceSession("model.onnx");
 
-Once a session is created, you can run queries on the session using your input data, using the `Run` method of the  `InferenceSession`. Both input and output of `Run` method are represented as collections of .Net `Tensor` objects (as defined in [System.Numerics.Tensor](https://www.nuget.org/packages/System.Numerics.Tensors)) -
+Once a session is created, you can execute queries using the `Run` method of the `InferenceSession` object. Currently, only `Tensor` type inputs and outputs are supported. The results of the `Run` method are represented as a collection of .Net `Tensor` objects (as defined in [System.Numerics.Tensor](https://www.nuget.org/packages/System.Numerics.Tensors)).
     
     Tensor<float> t1, t2;  // let's say data is fed into the Tensor objects
     var inputs = new List<NamedOnnxValue>()
@@ -19,7 +19,8 @@ Once a session is created, you can run queries on the session using your input d
                  };
     IReadOnlyCollection<NamedOnnxValue> results = session.Run(inputs);
 
-You can load your input data into Tensor<T> objects in several ways. A simple example is to create the Tensor from arrays -
+You can load your input data into Tensor<T> objects in several ways. A simple example is to create the Tensor from arrays.
+
     float[] sourceData;  // assume your data is loaded into a flat float array
     int[] dimensions;    // and the dimensions of the input is stored here
     Tensor<float> t1 = new DenseTensor<float>(sourceData, dimensions);    
@@ -84,7 +85,7 @@ Accessor to the default static option object
 
 #### Methods
     AppendExecutionProvider(ExecutionProvider provider);
-Appends execution provider to the session. For any operator in the graph the first execution provider that implements the operator will be user. ExecutionProvider is defined as the following enum -
+Appends execution provider to the session. For any operator in the graph the first execution provider that implements the operator will be used. ExecutionProvider is defined as the following enum.
 
     enum ExecutionProvider
     {
@@ -112,4 +113,3 @@ The type of Exception that is thrown in most of the error conditions related to
 
 
 
-

From cb1781927fae1d2840f4554324d4d6a73e14370d Mon Sep 17 00:00:00 2001
From: Raymond Yang <zihao.yang@microsoft.com>
Date: Wed, 28 Nov 2018 17:37:49 -0800
Subject: [PATCH 07/11] Remove mklml in Linux python wheel packaging (#53)

---
 tools/ci_build/github/azure-pipelines/azure-pipelines.yml | 2 +-
 tools/ci_build/github/linux/run_build.sh                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
index 1ebfce85de..047e0cda91 100644
--- a/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/azure-pipelines.yml
@@ -5,7 +5,7 @@ jobs:
   pool: Linux-CPU
 
   steps:
-    - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory)'
+    - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--use_mklml"'
       displayName: 'Command Line Script'
       env:
         AZURE_BLOB_KEY: $(onnxruntime-storage-key)
diff --git a/tools/ci_build/github/linux/run_build.sh b/tools/ci_build/github/linux/run_build.sh
index 083204a6a6..83110363ce 100755
--- a/tools/ci_build/github/linux/run_build.sh
+++ b/tools/ci_build/github/linux/run_build.sh
@@ -38,6 +38,6 @@ else
         --config Debug Release --build_shared_lib \
         --skip_submodule_sync \
         --enable_pybind \
-        --parallel --use_mkldnn --use_mklml --build_shared_lib $BUILD_EXTR_PAR
+        --parallel --use_mkldnn --build_shared_lib $BUILD_EXTR_PAR
     /home/onnxruntimedev/Release/onnx_test_runner /data/onnx
 fi

From d60507d2e9ea5f55afb72e864195b62a580da464 Mon Sep 17 00:00:00 2001
From: Yulong Wang <f.s@qq.com>
Date: Wed, 28 Nov 2018 18:29:16 -0800
Subject: [PATCH 08/11] [Mac] fix python binding (#54)

---
 cmake/onnxruntime_python.cmake | 10 +++++-----
 setup.py                       |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 8f2f55c69c..2411717001 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -75,13 +75,13 @@ add_dependencies(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_depende
 if (MSVC)
   # if MSVC, pybind11 looks for release version of python lib (pybind11/detail/common.h undefs _DEBUG)
   target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY_RELEASE} ${ONNXRUNTIME_SO_LINK_FLAG})
+elseif (APPLE)
+  set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+  target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${ONNXRUNTIME_SO_LINK_FLAG})
+  set_target_properties(onnxruntime_pybind11_state PROPERTIES INSTALL_RPATH "@loader_path")
 else()
   target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY} ${ONNXRUNTIME_SO_LINK_FLAG})
-  if (APPLE)
-    set_target_properties(onnxruntime_pybind11_state PROPERTIES INSTALL_RPATH "@loader_path")
-  else()
-    set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
-  endif()
+  set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
 endif()
 
 set_target_properties(onnxruntime_pybind11_state PROPERTIES PREFIX "")
diff --git a/setup.py b/setup.py
index ea32fa84bf..2a8954809d 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ except ImportError:
     bdist_wheel = None
 
 # Additional binaries
-if platform.system() == 'Linux':
+if platform.system() == 'Linux' or platform.system() == 'Darwin':
   libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.so.0', 'libmklml_intel.so', 'libiomp5.so']
 else:
   libs = ['onnxruntime_pybind11_state.pyd', 'mkldnn.dll', 'mklml.dll', 'libiomp5md.dll']

From 6371025860dada8dc30671ac148cdb5b179a5698 Mon Sep 17 00:00:00 2001
From: Raymond Yang <zihao.yang@microsoft.com>
Date: Wed, 28 Nov 2018 19:22:29 -0800
Subject: [PATCH 09/11] Add flag for mac compliance (#45)

* Refine windows CI configs

* Add flag for mac compliance
---
 cmake/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 158356b67f..3d2c7fa4b7 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -110,6 +110,13 @@ else()
       add_definitions(-DUSE_OPENMP)
   endif()
 endif()
+
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    #For Mac compliance
+    message("Adding flags for Mac builds")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
+endif()
+
 find_package(PNG)
 set(ENABLE_DATE_TESTING  OFF CACHE BOOL "" FORCE)
 set(USE_SYSTEM_TZ_DB  ON CACHE BOOL "" FORCE)

From 7523e76649b8d6ba4610f7f6532bd73fd04f6b2b Mon Sep 17 00:00:00 2001
From: Faith Xu <txsafx@gmail.com>
Date: Wed, 28 Nov 2018 19:43:03 -0800
Subject: [PATCH 10/11] Minor wording changes to design doc (#51)

* Update HighLevelDesign.md

* Update HighLevelDesign.md

* Update HighLevelDesign.md
---
 docs/HighLevelDesign.md | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/docs/HighLevelDesign.md b/docs/HighLevelDesign.md
index 7b9dc27e9a..8e6b955934 100644
--- a/docs/HighLevelDesign.md
+++ b/docs/HighLevelDesign.md
@@ -1,7 +1,7 @@
 # ONNX Runtime High Level Design
 
 This document outlines the high level design of
-ONNXRuntime - a high performance, cross platform engine.
+ONNX Runtime - a high performance, cross platform engine.
 
 ## Key objectives
 * Maximally and automatically leverage the custom accelerators and runtimes
@@ -10,8 +10,8 @@ available on disparate platforms.
 runtimes. We call this abstraction an [execution
 provider](../include/onnxruntime/core/framework/execution_provider.h). It defines and exposes a set of
 its capabilities to ONNXRuntime: a set of single or fused nodes it can
-execute, its memory allocator and more. Custom accelerators and runtimes are
-instances of execution provider.
+execute, its memory allocator, and more. Custom accelerators and runtimes are
+instances of execution providers.
 * We don't expect that an execution provider can always run an ONNX model fully
 on its device. This means that ONNXRuntime must be able to execute a single
 model in a heterogeneous environment involving multiple execution providers.
@@ -35,46 +35,45 @@ provider using the GetCapability() API.
 
 ![ONNXRuntime high level system architecture](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/228d22d3-6e3e-48b1-811c-1d48353f031c.png)
 
-*Note: TensorRT and nGraph support in the works.*
+*Note: TensorRT and nGraph support is in progress.*
 
 ### More about partitioning
-ONNXRuntime partitions a model graph based on the available execution providers
-into subgraphs, each for a distinct provider respectively. ONNXRuntime provides
-a default execution provider that is used for fallback execution for the
+ONNXRuntime partitions a model graph into subgraphs based on the available execution providers, one for each distinct provider. ONNXRuntime provides
+a default execution provider that is used as the fallback execution for the
 operators that cannot be pushed onto the more specialized but more efficient
-execution providers. Intuitively we probably want to push computation to the
-specialized execution providers as much as possible.
+execution providers. Intuitively we want to push computation to more
+specialized execution providers whenever possible.
 
 We use a simple graph partitioning technique. The available execution providers
 will be considered in a specific order, and each will be assigned the maximal
 subgraphs (possibly more than one) that it is able to handle. The
-ONNXRuntime-provided default execution provider will be the last one to be
+ONNXRuntime-provided default execution provider will be the last one
 considered, and it ensures completeness. More sophisticated optimizations can be
 considered in the future (or can even be implemented as a composite execution
 provider).
 
 Conceptually, each partition is reduced to a single fused operator. It is
-created by invoking the execution provider's Compile() method and wrap it as a
+created by invoking the execution provider's Compile() method and wrapped as a
 custom operator. Currently we support only synchronous mode of execution. An execution
 provider exposes its memory allocator, which is used to allocate the input
 tensors for the execution provider. The rewriting and partitioning transform the
-initial model graph into a new graph composed with operators assigned to either
+initial model graph into a new graph composed of operators assigned to either
 the default execution provider or other registered execution
-providers. ONNXRuntime execution engine is responsible for running this graph.
+providers. The ONNXRuntime execution engine is responsible for running this graph.
 
 ## Key design decisions
-* Multiple threads should be able to inovke the Run() method on the same
+* Multiple threads can invoke the Run() method on the same
 inference session object. See [API doc](C_API.md) for more details.
-* To facilitate the above the Compute() function of all kernels is const
+* To facilitate this, the Compute() function of all kernels is const
 implying the kernels are stateless.
-* We call implementations of the operators by execution providers as
+* Implementations of the operators by execution providers are called
 kernels. Each execution provider supports a subset of the (ONNX)
 operators/kernels.
-* ONNXRuntime runtime guarantees that all operators are supported by the default
+* The ONNXRuntime runtime guarantees that all operators are supported by the default
 execution provider.
 * Tensor representation: ONNXRuntime will utilize a standard representation for
 the tensor runtime values. The execution providers can internally use a
-different representation, if they choose to, but it is their responsibility to
+different representation if they choose to, but it is their responsibility to
 convert the values from/to the standard representation at the boundaries of
 their subgraph.
 

From 846044e28289defe2408aca22bebd1710f582f30 Mon Sep 17 00:00:00 2001
From: Yulong Wang <f.s@qq.com>
Date: Wed, 28 Nov 2018 20:01:21 -0800
Subject: [PATCH 11/11] [Mac] support mkldnn for macOS (#56)

---
 cmake/external/mkldnn.cmake    | 6 +++++-
 cmake/onnxruntime_python.cmake | 5 ++++-
 setup.py                       | 4 +++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 69d20b8f77..b8bec9d0d0 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -22,7 +22,11 @@ if(WIN32)
   endif()
   set(MKLDNN_PATCH_COMMAND2 "")
 else()
-  set(MKLDNN_SHARED_LIB libmkldnn.so.0)
+  if (APPLE)
+    set(MKLDNN_SHARED_LIB libmkldnn.0.dylib)
+  else()
+    set(MKLDNN_SHARED_LIB libmkldnn.so.0)
+  endif()
   if(onnxruntime_USE_MKLML)
     set(DOWNLOAD_MKLML ${MKLDNN_SOURCE}/scripts/prepare_mkl.sh)
     set(MKLML_SHARED_LIB libmklml_intel.so)
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 2411717001..22db4d45f2 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -78,7 +78,10 @@ if (MSVC)
 elseif (APPLE)
   set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
   target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${ONNXRUNTIME_SO_LINK_FLAG})
-  set_target_properties(onnxruntime_pybind11_state PROPERTIES INSTALL_RPATH "@loader_path")
+  set_target_properties(onnxruntime_pybind11_state PROPERTIES
+    INSTALL_RPATH "@loader_path"
+    BUILD_WITH_INSTALL_RPATH TRUE
+    INSTALL_RPATH_USE_LINK_PATH FALSE)
 else()
   target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY} ${ONNXRUNTIME_SO_LINK_FLAG})
   set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
diff --git a/setup.py b/setup.py
index 2a8954809d..812886f0bd 100644
--- a/setup.py
+++ b/setup.py
@@ -23,8 +23,10 @@ except ImportError:
     bdist_wheel = None
 
 # Additional binaries
-if platform.system() == 'Linux' or platform.system() == 'Darwin':
+if platform.system() == 'Linux':
   libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.so.0', 'libmklml_intel.so', 'libiomp5.so']
+elif platform.system() == "Darwin":
+  libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.0.dylib'] # TODO add libmklml and libiomp5 later.
 else:
   libs = ['onnxruntime_pybind11_state.pyd', 'mkldnn.dll', 'mklml.dll', 'libiomp5md.dll']