From 41d2ff622c49aa3628c04f6b64ed7f33c8d80f30 Mon Sep 17 00:00:00 2001
From: Jiajia Qin <jiajia.qin@intel.com>
Date: Fri, 15 Sep 2023 08:03:18 +0800
Subject: [PATCH] [js/webgpu] Optimize InstanceNormalization (#17491)

### Description
<!-- Describe your changes. -->
In the previous implementation, a single thread ran two loops over the
H * W elements to calculate the `mean` and `squaredNorm` values, and the
same thread also wrote all H * W output elements. As a result, the op is
very slow when H * W is large, which is usually the case in real models.
For example, in the `candy-8` model, the [H, W] shapes seen by the
`InstanceNormalization` op are [224,224], [112,112], and [56,56]. On my
ADL machine, `[1,224,224,32]` takes 17 ms. See below:
```
[profiling] kernel "23848328|[InstanceNormalization] 23848328" input[0]: [1,224,224,32] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,224,224,32] | float32, execution time: 17007914 ns
```

This PR uses workgroup memory to optimize the original algorithm. The
advantage is that the 64 (workgroupSize) threads in one workgroup
compute the `mean` and `squaredNorm` values in parallel. In addition,
each thread writes only `H * W / workgroupSize` outputs, which greatly
reduces the per-thread work. With this optimization, `[1,224,224,32]`
drops to about 3 ms, and the main overhead is the two extra `transpose`
kernels. The program built by `createInstanceNormProgramInfo` itself
only needs `0.64` ms. See below:
```
[profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,224,224,32] | float32, output[0]: [1,32,224,224] | float32, execution time: 1543792 ns
program-manager.ts:115
[profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,32,224,224] | float32, execution time: 642652 ns
program-manager.ts:115
[profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, output[0]: [1,224,224,32] | float32, execution time: 991608 ns
```
This PR currently applies the new algorithm only to the NCHW format. For
the NHWC format, one option is to transpose the input so that it can use
the new algorithm, but the disadvantage is that two extra transposes are
added. @dakenf also proposes another way to optimize NHWC. For details, see
[here](https://github.com/microsoft/onnxruntime/blob/d45a96616da9843b037210f2d48d6b4e5bdae5c6/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts).
I checked @dakenf's method. Its performance is similar to transpose +
optimized NCHW; depending on the GPU, either one can be slightly faster
than the other. So I prefer that this PR only handles the NCHW part.
@dakenf can submit his optimization for NHWC.
---
 js/web/lib/wasm/jsep/webgpu/ops/common.ts     |   3 +-
 .../lib/wasm/jsep/webgpu/ops/instance-norm.ts | 118 ++++++++++--------
 js/web/test/data/ops/instance-norm.jsonc      |  79 ++++++++++++
 js/web/test/suite-test-list.jsonc             |   2 +
 4 files changed, 148 insertions(+), 54 deletions(-)
 create mode 100644 js/web/test/data/ops/instance-norm.jsonc
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index f3845e3110..c054da51a3 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -592,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper {
     const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2];
 
     const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
-    const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3<u32>' :
+    const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3<u32>,
+    @builtin(local_invocation_id) local_id : vec3<u32>` :
                                              `@builtin(local_invocation_index) local_index : u32,
     @builtin(workgroup_id) workgroup_id : vec3<u32>`;
     const globalIdxDefinition = is1DimensionDispatch ?
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
index f62c766aa9..449073a133 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
@@ -1,83 +1,97 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {DataType} from '../../../wasm-common';
 import {TensorView} from '../../tensor';
 import {ShapeUtil} from '../../util';
 import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
 import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
 
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common';
 
 export interface InstanceNormAttributes extends AttributeWithCacheKey {
   epsilon: number;
   format: 'NHWC'|'NCHW';
 }
 
-const validateInputs = (inputs: readonly TensorView[]): void => {
-  if (!inputs || inputs.length !== 3) {
-    throw new Error('instanceNorm requires 3 inputs.');
-  }
-
-  if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) {
-    throw new Error('inputs should be float type');
-  }
-};
-
 const createInstanceNormProgramInfo =
     (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => {
       const xShape = inputs[0].dims;
-      const scale = inputs[1];
-      const bias = inputs[2];
 
       const outputShape = xShape;
-      const outputSize = ShapeUtil.size(outputShape);
       const axis = 2;
       const normCount = ShapeUtil.sizeToDimension(xShape, axis);
       const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
       const C = xShape[1];
-
-      const scaleSize = ShapeUtil.size(scale.dims);
-      const biasSize = bias ? ShapeUtil.size(bias.dims) : 0;
-      if (scaleSize !== normSize || (bias && biasSize !== normSize)) {
-        throw new Error(`Size of X.shape()[axis:] == ${normSize}.
-             Size of scale and bias (if provided) must match this. 
-             Got scale size of ${scaleSize} and bias size of ${biasSize}`);
-      }
-
-      const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
-
+      const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+      const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
+      const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+      const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+      const variables = [x, scale, bias, output];
+      const dataType = x.type.value;
+      const workgroupSize = 64;
       const getShaderSource = (shaderHelper: ShaderHelper) => `
+
   const C: u32 = ${C};
   const normSize: u32 = ${normSize};
-  const normSizeTyped: ${dataType} = ${normSize};
   const epsilon: f32 = ${attributes.epsilon};
+  var<workgroup> meanShared : ${dataType};
+  var<workgroup> squaredNormShared : ${dataType};
+  var<workgroup> workgroupShared : array<${dataType}, ${workgroupSize}>;
+  const workgroupSize = ${workgroupSize}u;
+  ${shaderHelper.declareVariables(...variables)}
+  ${shaderHelper.mainStart(workgroupSize)}
+    let norm = global_idx / workgroupSize;
+    let batch = norm / C;
+    let channel = norm % C;
+    let localIndex = local_id.x;
 
-  @group(0) @binding(0) var<storage, read> x : array<${dataType}>;
-  @group(0) @binding(1) var<storage, read> scale : array<${dataType}>;
-  @group(0) @binding(2) var<storage, read> bias : array<${dataType}>;
-  @group(0) @binding(3) var<storage, read_write> output : array<${dataType}>;
-
-  ${shaderHelper.mainStart()}
-    let offset = global_idx * normSize;
-    if (offset + normSize >= ${outputSize}) { return; }
-    var mean: ${dataType} = 0;
-
-    for (var h: u32 = 0u; h < normSize; h++) {
-        mean = mean + x[h + offset];
+    // initialize workgroup memory
+    var initial: ${dataType} = 0;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      initial = initial + ${x.get('batch', 'channel', 'h')};
     }
-    mean = mean / normSizeTyped;
+    workgroupShared[localIndex] = initial;
+    workgroupBarrier();
 
-    var squaredNorm: ${dataType} = 0;
-    for (var h: u32 = 0u; h < normSize; h++) {
-        let deviation: f32 = x[h + offset] - mean;
-        squaredNorm = squaredNorm + deviation * deviation;
+    // Calculate the mean of current channel data.
+    for (var currSize = workgroupSize >> 1;  currSize > 0; currSize = currSize >> 1) {
+      if (localIndex < currSize) {
+        workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
+      }
+      workgroupBarrier();
     }
-    let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon);
-    let channelScale = invStdDev * scale[global_idx % C];
-    let channelShift = bias[global_idx % C] - mean * channelScale;
-    for (var j: u32 = 0; j < normSize; j++) {
-        output[j + offset] = x[j + offset] * channelScale + channelShift;
+    if (localIndex == 0) {
+      meanShared = workgroupShared[0] / ${dataType}(normSize);
+    }
+    workgroupBarrier();
+
+    // reinitialize workgroup memory.
+    initial = 0;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      let deviation =  ${x.get('batch', 'channel', 'h')} - meanShared;
+      initial = initial + deviation * deviation;
+    }
+    workgroupShared[localIndex] = initial;
+    workgroupBarrier();
+
+    // Calculate the sum of square of deviation of current channel data.
+    for (var currSize = workgroupSize >> 1;  currSize > 0; currSize = currSize >> 1) {
+      if (localIndex < currSize) {
+        workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
+      }
+      workgroupBarrier();
+    }
+    if (localIndex == 0) {
+      squaredNormShared = workgroupShared[0];
+    }
+    workgroupBarrier();
+
+    let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon);
+    let channelScale = invStdDev * ${scale.getByOffset('channel')};
+    let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift;
+      ${output.set('batch', 'channel', 'h', 'value')};
     }
   }`;
       return {
@@ -86,7 +100,7 @@ const createInstanceNormProgramInfo =
           {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default},
         ],
         getShaderSource,
-        dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)})
+        dispatchGroup: () => ({x: normCount})
       };
     };
 
@@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo =
   ${shaderHelper.mainStart()}
     let currentImageNumber = global_idx / C;
     let currentChannelNumber = global_idx % C;
-    
+
     // offset is channel num * N
     let offset = currentImageNumber * imageSize;
     if (offset >= ${outputSize}) { return; }
@@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes):
     createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format});
 
 export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => {
-  validateInputs(context.inputs);
-
   const metadata = {
     name: 'InstanceNormalization',
     inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default],
diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc
new file mode 100644
index 0000000000..6a4e691240
--- /dev/null
+++ b/js/web/test/data/ops/instance-norm.jsonc
@@ -0,0 +1,79 @@
+[
+  {
+    "name": "Simple test with NHWC",
+    "operator": "InstanceNormalization",
+    "inputShapeDefinitions": "rankOnly",
+    "opset": { "domain": "", "version": 17 },
+    "cases": [
+      {
+        "name": "Simple test",
+        "inputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [4],
+            "type": "float32"
+          },
+          {
+            "data": [4, 5, 6, 7],
+            "dims": [4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
+              4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
+              12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
+            ],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  },
+  {
+    "name": "Simple test with NCHW",
+    "operator": "InstanceNormalization",
+    "opset": { "domain": "", "version": 17 },
+    "cases": [
+      {
+        "name": "Simple test",
+        "inputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [4],
+            "type": "float32"
+          },
+          {
+            "data": [4, 5, 6, 7],
+            "dims": [4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
+              4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
+              12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
+            ],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  }
+]
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index e580259071..94592884cc 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -257,6 +257,7 @@
       "greater.jsonc",
       //"identity.jsonc",
       "image-scaler.jsonc",
+      "instance-norm.jsonc",
       "less.jsonc",
       "log.jsonc",
       "matmul.jsonc",
@@ -1347,6 +1348,7 @@
       "gemm.jsonc",
       "global-average-pool.jsonc",
       "greater.jsonc",
+      "instance-norm.jsonc",
       "less.jsonc",
       "log.jsonc",
       "matmul.jsonc",