[js/webgpu] Optimize InstanceNormalization (#17491)

### Description  In the previous implementation, a single thread ran two loops over H * W elements to calculate the `mean` and `squaredNorm` values, and that same thread also wrote all H * W output elements. As a result, it is very slow when H * W is large, and H * W usually is large in real models. For example, in the `candy-8` model, the shapes of [H, W] are [224,224], [112,112], [56,56] for the `InstanceNormalization` op. On my ADL machine, `[1,224,224,32]` takes 17 ms. See below: ``` [profiling] kernel "23848328|[InstanceNormalization] 23848328" input[0]: [1,224,224,32] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,224,224,32] | float32, execution time: 17007914 ns ``` This PR uses workgroup memory to optimize the original algorithm. The advantage is that all 64 (workgroupSize) threads in one workgroup work in parallel to calculate the `mean` and `squaredNorm` values. In addition, each thread only writes `H * W / workgroupSize` outputs, which greatly reduces the per-thread overhead. With this optimization, `[1,224,224,32]` takes 3 ms, and the main overhead is the two extra `transpose` ops. The `createInstanceNormProgramInfo` kernel itself only needs `0.64` ms. See below: ``` [profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,224,224,32] | float32, output[0]: [1,32,224,224] | float32, execution time: 1543792 ns program-manager.ts:115 [profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,32,224,224] | float32, execution time: 642652 ns program-manager.ts:115 [profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, output[0]: [1,224,224,32] | float32, execution time: 991608 ns ``` This PR currently only applies the new algorithm to the NCHW format. For the NHWC format, one option is to transpose the input so that it can use the new algorithm.
But the disadvantage is that 2 extra transposes are added. @dakenf also suggests another way to optimize NHWC; for details, see [here](d45a96616d/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts). I checked @dakenf's method. Its performance is similar to transpose + optimized NCHW, but depending on the GPU, either one can be slightly faster than the other. So I prefer that this PR only does the NCHW part. @dakenf can submit his optimization for NHWC separately.
2026-07-17 18:40:28 +00:00 · 2023-09-15 08:03:18 +08:00 · 2023-09-15 08:03:18 +08:00 · 41d2ff622c
commit 41d2ff622c
parent 46fe08226f
4 changed files with 148 additions and 54 deletions
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@ -592,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper {
    const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2];

    const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
-    const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3<u32>' :
+    const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3<u32>,
+    @builtin(local_invocation_id) local_id : vec3<u32>` :
                                             `@builtin(local_invocation_index) local_index : u32,
    @builtin(workgroup_id) workgroup_id : vec3<u32>`;
    const globalIdxDefinition = is1DimensionDispatch ?
--- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
@ -1,83 +1,97 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-import {DataType} from '../../../wasm-common';
 import {TensorView} from '../../tensor';
 import {ShapeUtil} from '../../util';
 import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
 import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';

-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common';

 export interface InstanceNormAttributes extends AttributeWithCacheKey {
  epsilon: number;
  format: 'NHWC'|'NCHW';
 }

-const validateInputs = (inputs: readonly TensorView[]): void => {
-  if (!inputs || inputs.length !== 3) {
-    throw new Error('instanceNorm requires 3 inputs.');
-  }
-
-  if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) {
-    throw new Error('inputs should be float type');
-  }
-};
-
 const createInstanceNormProgramInfo =
    (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => {
      const xShape = inputs[0].dims;
-      const scale = inputs[1];
-      const bias = inputs[2];

      const outputShape = xShape;
-      const outputSize = ShapeUtil.size(outputShape);
      const axis = 2;
      const normCount = ShapeUtil.sizeToDimension(xShape, axis);
      const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
      const C = xShape[1];
-
-      const scaleSize = ShapeUtil.size(scale.dims);
-      const biasSize = bias ? ShapeUtil.size(bias.dims) : 0;
-      if (scaleSize !== normSize || (bias && biasSize !== normSize)) {
-        throw new Error(`Size of X.shape()[axis:] == ${normSize}.
-             Size of scale and bias (if provided) must match this. 
-             Got scale size of ${scaleSize} and bias size of ${biasSize}`);
-      }
-
-      const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
-
+      const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+      const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
+      const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+      const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+      const variables = [x, scale, bias, output];
+      const dataType = x.type.value;
+      const workgroupSize = 64;
      const getShaderSource = (shaderHelper: ShaderHelper) => `
+
  const C: u32 = ${C};
  const normSize: u32 = ${normSize};
-  const normSizeTyped: ${dataType} = ${normSize};
  const epsilon: f32 = ${attributes.epsilon};
+  var<workgroup> meanShared : ${dataType};
+  var<workgroup> squaredNormShared : ${dataType};
+  var<workgroup> workgroupShared : array<${dataType}, ${workgroupSize}>;
+  const workgroupSize = ${workgroupSize}u;
+  ${shaderHelper.declareVariables(...variables)}
+  ${shaderHelper.mainStart(workgroupSize)}
+    let norm = global_idx / workgroupSize;
+    let batch = norm / C;
+    let channel = norm % C;
+    let localIndex = local_id.x;

-  @group(0) @binding(0) var<storage, read> x : array<${dataType}>;
-  @group(0) @binding(1) var<storage, read> scale : array<${dataType}>;
-  @group(0) @binding(2) var<storage, read> bias : array<${dataType}>;
-  @group(0) @binding(3) var<storage, read_write> output : array<${dataType}>;
-
-  ${shaderHelper.mainStart()}
-    let offset = global_idx * normSize;
-    if (offset + normSize >= ${outputSize}) { return; }
-    var mean: ${dataType} = 0;
-
-    for (var h: u32 = 0u; h < normSize; h++) {
-        mean = mean + x[h + offset];
+    // initialize workgroup memory
+    var initial: ${dataType} = 0;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      initial = initial + ${x.get('batch', 'channel', 'h')};
    }
-    mean = mean / normSizeTyped;
+    workgroupShared[localIndex] = initial;
+    workgroupBarrier();

-    var squaredNorm: ${dataType} = 0;
-    for (var h: u32 = 0u; h < normSize; h++) {
-        let deviation: f32 = x[h + offset] - mean;
-        squaredNorm = squaredNorm + deviation * deviation;
+    // Calculate the mean of current channel data.
+    for (var currSize = workgroupSize >> 1;  currSize > 0; currSize = currSize >> 1) {
+      if (localIndex < currSize) {
+        workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
+      }
+      workgroupBarrier();
    }
-    let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon);
-    let channelScale = invStdDev * scale[global_idx % C];
-    let channelShift = bias[global_idx % C] - mean * channelScale;
-    for (var j: u32 = 0; j < normSize; j++) {
-        output[j + offset] = x[j + offset] * channelScale + channelShift;
+    if (localIndex == 0) {
+      meanShared = workgroupShared[0] / ${dataType}(normSize);
+    }
+    workgroupBarrier();
+
+    // reinitialize workgroup memory.
+    initial = 0;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      let deviation =  ${x.get('batch', 'channel', 'h')} - meanShared;
+      initial = initial + deviation * deviation;
+    }
+    workgroupShared[localIndex] = initial;
+    workgroupBarrier();
+
+    // Calculate the sum of square of deviation of current channel data.
+    for (var currSize = workgroupSize >> 1;  currSize > 0; currSize = currSize >> 1) {
+      if (localIndex < currSize) {
+        workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
+      }
+      workgroupBarrier();
+    }
+    if (localIndex == 0) {
+      squaredNormShared = workgroupShared[0];
+    }
+    workgroupBarrier();
+
+    let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon);
+    let channelScale = invStdDev * ${scale.getByOffset('channel')};
+    let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift;
+      ${output.set('batch', 'channel', 'h', 'value')};
    }
  }`;
      return {
@ -86,7 +100,7 @@ const createInstanceNormProgramInfo =
          {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default},
        ],
        getShaderSource,
-        dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)})
+        dispatchGroup: () => ({x: normCount})
      };
    };

@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo =
  ${shaderHelper.mainStart()}
    let currentImageNumber = global_idx / C;
    let currentChannelNumber = global_idx % C;
-    
+
    // offset is channel num * N
    let offset = currentImageNumber * imageSize;
    if (offset >= ${outputSize}) { return; }
@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes):
    createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format});

 export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => {
-  validateInputs(context.inputs);
-
  const metadata = {
    name: 'InstanceNormalization',
    inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default],
--- a/js/web/test/data/ops/instance-norm.jsonc
+++ b/js/web/test/data/ops/instance-norm.jsonc
@ -0,0 +1,79 @@
+[
+  {
+    "name": "Simple test with NHWC",
+    "operator": "InstanceNormalization",
+    "inputShapeDefinitions": "rankOnly",
+    "opset": { "domain": "", "version": 17 },
+    "cases": [
+      {
+        "name": "Simple test",
+        "inputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [4],
+            "type": "float32"
+          },
+          {
+            "data": [4, 5, 6, 7],
+            "dims": [4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
+              4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
+              12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
+            ],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  },
+  {
+    "name": "Simple test with NCHW",
+    "operator": "InstanceNormalization",
+    "opset": { "domain": "", "version": 17 },
+    "cases": [
+      {
+        "name": "Simple test",
+        "inputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [4],
+            "type": "float32"
+          },
+          {
+            "data": [4, 5, 6, 7],
+            "dims": [4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
+              4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
+              12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
+            ],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  }
+]
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@ -257,6 +257,7 @@
      "greater.jsonc",
      //"identity.jsonc",
      "image-scaler.jsonc",
+      "instance-norm.jsonc",
      "less.jsonc",
      "log.jsonc",
      "matmul.jsonc",
@ -1347,6 +1348,7 @@
      "gemm.jsonc",
      "global-average-pool.jsonc",
      "greater.jsonc",
+      "instance-norm.jsonc",
      "less.jsonc",
      "log.jsonc",
      "matmul.jsonc",