[js/webgpu] Optimize InstanceNormalization (#17491)

### Description
<!-- Describe your changes. -->
In the previous implementation, a single thread ran two loops over all
H * W elements to calculate the `mean` and `squaredNorm` values, and the
same thread then wrote all H * W output elements. As a result, the kernel
is very slow when H * W is large, which is usually the case in real
models. For example, in the `candy-8` model, the [H, W] shapes seen by the
`InstanceNormalization` op are [224,224], [112,112], and [56,56]. On my
ADL machine, `[1,224,224,32]` takes 17 ms. See below:
```
[profiling] kernel "23848328|[InstanceNormalization] 23848328" input[0]: [1,224,224,32] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,224,224,32] | float32, execution time: 17007914 ns
```

This PR uses workgroup memory to optimize the original algorithm.
The advantage is that all 64 (workgroupSize) threads in a workgroup
calculate the `mean` and `squaredNorm` values in parallel.
In addition, each thread writes only `H * W / workgroupSize` outputs,
which greatly reduces the work done per thread. With this
optimization, `[1,224,224,32]` takes 3 ms, and the main overhead is the
two extra `transpose` operations. The program created by
`createInstanceNormProgramInfo` itself needs only `0.64` ms. See below:
```
[profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,224,224,32] | float32, output[0]: [1,32,224,224] | float32, execution time: 1543792 ns
program-manager.ts:115 
[profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,32,224,224] | float32, execution time: 642652 ns
program-manager.ts:115 
[profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, output[0]: [1,224,224,32] | float32, execution time: 991608 ns
```
This PR currently applies the new algorithm only to the NCHW format. For
the NHWC format, one option is to transpose the input so that it can use
the new algorithm, but the disadvantage is that two extra transposes are
added. @dakenf also suggests another way to optimize NHWC. For details, see
[here](d45a96616d/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts).
I checked @dakenf's method. Its performance is similar to transpose +
optimized NCHW, but which of the two is slightly faster varies from GPU
to GPU. So I prefer that this PR cover only the NCHW part;
@dakenf can submit his NHWC optimization separately.
This commit is contained in:
Jiajia Qin 2023-09-15 08:03:18 +08:00 committed by GitHub
parent 46fe08226f
commit 41d2ff622c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 148 additions and 54 deletions

View file

@ -592,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper {
const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2];
const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3<u32>' :
const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3<u32>,
@builtin(local_invocation_id) local_id : vec3<u32>` :
`@builtin(local_invocation_index) local_index : u32,
@builtin(workgroup_id) workgroup_id : vec3<u32>`;
const globalIdxDefinition = is1DimensionDispatch ?

View file

@ -1,83 +1,97 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import {DataType} from '../../../wasm-common';
import {TensorView} from '../../tensor';
import {ShapeUtil} from '../../util';
import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common';
/**
 * Attributes of the InstanceNormalization operator.
 */
export interface InstanceNormAttributes extends AttributeWithCacheKey {
  /** Added to the variance term inside the square root when computing the inverse standard deviation. */
  epsilon: number;
  /** Tensor layout the operator is run with. */
  format: 'NCHW'|'NHWC';
}
/**
 * Validates the inputs of the InstanceNormalization operator.
 *
 * @param inputs - the operator inputs; exactly three tensors are expected (input, scale and bias).
 * @throws Error if the number of inputs is not 3, or if any of the three inputs is not of float type.
 */
const validateInputs = (inputs: readonly TensorView[]): void => {
  if (!inputs || inputs.length !== 3) {
    throw new Error('instanceNorm requires 3 inputs.');
  }

  // Check every input, including the bias (inputs[2]), so that a non-float tensor is rejected here.
  if (inputs.some((input) => input.dataType !== DataType.float)) {
    throw new Error('inputs should be float type');
  }
};
const createInstanceNormProgramInfo =
(metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => {
const xShape = inputs[0].dims;
const scale = inputs[1];
const bias = inputs[2];
const outputShape = xShape;
const outputSize = ShapeUtil.size(outputShape);
const axis = 2;
const normCount = ShapeUtil.sizeToDimension(xShape, axis);
const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
const C = xShape[1];
const scaleSize = ShapeUtil.size(scale.dims);
const biasSize = bias ? ShapeUtil.size(bias.dims) : 0;
if (scaleSize !== normSize || (bias && biasSize !== normSize)) {
throw new Error(`Size of X.shape()[axis:] == ${normSize}.
Size of scale and bias (if provided) must match this.
Got scale size of ${scaleSize} and bias size of ${biasSize}`);
}
const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
const variables = [x, scale, bias, output];
const dataType = x.type.value;
const workgroupSize = 64;
const getShaderSource = (shaderHelper: ShaderHelper) => `
const C: u32 = ${C};
const normSize: u32 = ${normSize};
const normSizeTyped: ${dataType} = ${normSize};
const epsilon: f32 = ${attributes.epsilon};
var<workgroup> meanShared : ${dataType};
var<workgroup> squaredNormShared : ${dataType};
var<workgroup> workgroupShared : array<${dataType}, ${workgroupSize}>;
const workgroupSize = ${workgroupSize}u;
${shaderHelper.declareVariables(...variables)}
${shaderHelper.mainStart(workgroupSize)}
let norm = global_idx / workgroupSize;
let batch = norm / C;
let channel = norm % C;
let localIndex = local_id.x;
@group(0) @binding(0) var<storage, read> x : array<${dataType}>;
@group(0) @binding(1) var<storage, read> scale : array<${dataType}>;
@group(0) @binding(2) var<storage, read> bias : array<${dataType}>;
@group(0) @binding(3) var<storage, read_write> output : array<${dataType}>;
${shaderHelper.mainStart()}
let offset = global_idx * normSize;
if (offset + normSize >= ${outputSize}) { return; }
var mean: ${dataType} = 0;
for (var h: u32 = 0u; h < normSize; h++) {
mean = mean + x[h + offset];
// initialize workgroup memory
var initial: ${dataType} = 0;
for (var h = localIndex; h < normSize; h += workgroupSize) {
initial = initial + ${x.get('batch', 'channel', 'h')};
}
mean = mean / normSizeTyped;
workgroupShared[localIndex] = initial;
workgroupBarrier();
var squaredNorm: ${dataType} = 0;
for (var h: u32 = 0u; h < normSize; h++) {
let deviation: f32 = x[h + offset] - mean;
squaredNorm = squaredNorm + deviation * deviation;
// Calculate the mean of current channel data.
for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) {
if (localIndex < currSize) {
workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
}
workgroupBarrier();
}
let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon);
let channelScale = invStdDev * scale[global_idx % C];
let channelShift = bias[global_idx % C] - mean * channelScale;
for (var j: u32 = 0; j < normSize; j++) {
output[j + offset] = x[j + offset] * channelScale + channelShift;
if (localIndex == 0) {
meanShared = workgroupShared[0] / ${dataType}(normSize);
}
workgroupBarrier();
// reinitialize workgroup memory.
initial = 0;
for (var h = localIndex; h < normSize; h += workgroupSize) {
let deviation = ${x.get('batch', 'channel', 'h')} - meanShared;
initial = initial + deviation * deviation;
}
workgroupShared[localIndex] = initial;
workgroupBarrier();
// Calculate the sum of square of deviation of current channel data.
for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) {
if (localIndex < currSize) {
workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
}
workgroupBarrier();
}
if (localIndex == 0) {
squaredNormShared = workgroupShared[0];
}
workgroupBarrier();
let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon);
let channelScale = invStdDev * ${scale.getByOffset('channel')};
let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale;
for (var h = localIndex; h < normSize; h += workgroupSize) {
let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift;
${output.set('batch', 'channel', 'h', 'value')};
}
}`;
return {
@ -86,7 +100,7 @@ const createInstanceNormProgramInfo =
{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default},
],
getShaderSource,
dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)})
dispatchGroup: () => ({x: normCount})
};
};
@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo =
${shaderHelper.mainStart()}
let currentImageNumber = global_idx / C;
let currentChannelNumber = global_idx % C;
// offset is channel num * N
let offset = currentImageNumber * imageSize;
if (offset >= ${outputSize}) { return; }
@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes):
createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format});
export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => {
validateInputs(context.inputs);
const metadata = {
name: 'InstanceNormalization',
inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default],

View file

@ -0,0 +1,79 @@
[
{
"name": "Simple test with NHWC",
"operator": "InstanceNormalization",
"inputShapeDefinitions": "rankOnly",
"opset": { "domain": "", "version": 17 },
"cases": [
{
"name": "Simple test",
"inputs": [
{
"data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
"dims": [1, 4, 2, 2],
"type": "float32"
},
{
"data": [1, 2, 3, 4],
"dims": [4],
"type": "float32"
},
{
"data": [4, 5, 6, 7],
"dims": [4],
"type": "float32"
}
],
"outputs": [
{
"data": [
2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
],
"dims": [1, 4, 2, 2],
"type": "float32"
}
]
}
]
},
{
"name": "Simple test with NCHW",
"operator": "InstanceNormalization",
"opset": { "domain": "", "version": 17 },
"cases": [
{
"name": "Simple test",
"inputs": [
{
"data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
"dims": [1, 4, 2, 2],
"type": "float32"
},
{
"data": [1, 2, 3, 4],
"dims": [4],
"type": "float32"
},
{
"data": [4, 5, 6, 7],
"dims": [4],
"type": "float32"
}
],
"outputs": [
{
"data": [
2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
],
"dims": [1, 4, 2, 2],
"type": "float32"
}
]
}
]
}
]

View file

@ -257,6 +257,7 @@
"greater.jsonc",
//"identity.jsonc",
"image-scaler.jsonc",
"instance-norm.jsonc",
"less.jsonc",
"log.jsonc",
"matmul.jsonc",
@ -1347,6 +1348,7 @@
"gemm.jsonc",
"global-average-pool.jsonc",
"greater.jsonc",
"instance-norm.jsonc",
"less.jsonc",
"log.jsonc",
"matmul.jsonc",