mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-26 03:00:54 +00:00
### Description

<!-- Describe your changes. -->

Optimize conv1d by routing it through the conv2d path so that it benefits from conv2d's optimizations. With this change the whisper-tiny-encoder model takes 158.66 ms instead of 532.28 ms: Conv now goes to Conv2DMatMul (8 ms) instead of GroupedConv (382 ms).

Old profiling result:

Kernel | Time (ms) | Percentage (%)
-- | -- | --
Conv\|GroupedConv | 382.99 | 71.95
MatMul | 126.16 | 23.70
Softmax | 7.01 | 1.32
Transpose | 4.59 | 0.86
Add | 4.39 | 0.82
Mul | 2.36 | 0.44
Div | 1.44 | 0.27
ReduceMean\|ReduceMeanShared | 1.25 | 0.23
Erf | 0.85 | 0.16
Sub | 0.72 | 0.14
Pow | 0.46 | 0.09
Sqrt | 0.07 | 0.01
Sum | 532.28 |

New profiling result with this PR:

Kernel | Time (ms) | Percentage (%)
-- | -- | --
MatMul | 127.07 | 80.09
Conv\|Conv2DMatMul | 8.00 | 5.04
Softmax | 6.95 | 4.38
Transpose | 4.65 | 2.93
Add | 4.26 | 2.68
Mul | 2.56 | 1.61
Div | 1.51 | 0.95
ReduceMean\|ReduceMeanShared | 1.31 | 0.83
Erf | 0.85 | 0.54
Sub | 0.79 | 0.50
Pow | 0.46 | 0.29
Conv\|Transpose | 0.26 | 0.17
Sqrt | 0.00 | 0.00
Sum | 158.66 |

---------

Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
206 lines
7.9 KiB
TypeScript
206 lines
7.9 KiB
TypeScript
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
import { DataType } from '../../../wasm-common';
|
|
import { TensorView } from '../../tensor-view';
|
|
import { BroadcastUtil, ShapeUtil } from '../../util';
|
|
import { ComputeContext, ProgramInfo, ProgramUniform } from '../types';
|
|
|
|
import { createMatmulProgramInfo } from './3rd-party/matmul_packed_webgpu';
|
|
import {
|
|
createTensorShapeVariables,
|
|
getBroadcastDims,
|
|
getMaxComponents,
|
|
IndicesHelper,
|
|
inputVariable,
|
|
internalVariable,
|
|
outputVariable,
|
|
ShaderHelper,
|
|
tensorTypeToWsglStorageType,
|
|
UniformsArrayType,
|
|
} from './common';
|
|
import {
|
|
appendActivationUniforms,
|
|
appendActivationUniformsData,
|
|
getActivationSnippet,
|
|
InternalActivationAttributes,
|
|
} from './fuse-utils';
|
|
|
|
/**
 * Builds the `ProgramInfo` for the naive (untiled) MatMul shader, optionally fused with a bias add and an
 * activation.
 *
 * With `a = inputs[0]` and `b = inputs[1]`, the kernel uses `M = a.dims[-2]`, `K = a.dims[-1]` and
 * `N = b.dims[-1]`. Each shader invocation produces `outputNumber` consecutive rows by `components`
 * consecutive columns of a single batch of the output.
 *
 * @param inputs - `[a, b]` or `[a, b, bias]`; the bias add is emitted only when a third input is present.
 * @param activationAttributes - activation fused after the (optional) bias add.
 * @param outputShape - shape of the produced tensor; also drives the dispatch size.
 * @param reshapedOutputShape - when given, its leading dims (all but the last two) are used as the batch
 *   dims inside the shader instead of those of `outputShape`.
 * @param isChannelsLast - selects how bias is indexed: by output column when `true`, by output row otherwise.
 * @param squeezeOutputShapeFunction - optional mapping applied to `outputShape` to obtain the dims reported
 *   for the output tensor.
 * @returns the program description (name, shader cache info, run data and shader source generator).
 */
export const createNaiveMatmulProgramInfo = (
  inputs: readonly TensorView[],
  activationAttributes: InternalActivationAttributes,
  outputShape: readonly number[],
  reshapedOutputShape?: readonly number[],
  isChannelsLast = false /* only used for conv2dByMatMul*/,
  squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
): ProgramInfo => {
  const aShape = inputs[0].dims;
  const bShape = inputs[1].dims;

  const M = aShape[aShape.length - 2];
  const N = bShape[bShape.length - 1];
  const K = aShape[aShape.length - 1];
  // Vector widths used for b/output (along N), for a (along K), and the number of rows per invocation (along M).
  const components = getMaxComponents(N);
  const aComponents = getMaxComponents(K);
  const outputNumber = getMaxComponents(M);
  // Number of shader invocations: every invocation covers `components * outputNumber` output elements.
  const outputSize = ShapeUtil.size(outputShape) / components / outputNumber;
  const hasBias = inputs.length > 2;
  const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2);
  const batchSize = ShapeUtil.size(outerDims);
  const outputShapeInShader = [batchSize, M, N];

  // Batch axes of a / b that are broadcast against the output batch dims. They are computed here (rather
  // than inside getShaderSource) because they also have to take part in the shader cache hint below.
  const outerDimsA = aShape.slice(0, -2);
  const outerDimsB = bShape.slice(0, -2);
  const broadCastADims = getBroadcastDims(outerDimsA, outerDims);
  const broadCastBDims = getBroadcastDims(outerDimsB, outerDims);

  const programUniforms: ProgramUniform[] = [
    { type: DataType.uint32, data: outputSize },
    { type: DataType.uint32, data: M },
    { type: DataType.uint32, data: N },
    { type: DataType.uint32, data: K },
  ];
  appendActivationUniformsData(activationAttributes, programUniforms);
  programUniforms.push(...createTensorShapeVariables(outerDims, aShape, bShape));
  if (hasBias) {
    programUniforms.push(...createTensorShapeVariables(inputs[2].dims));
  }
  programUniforms.push(...createTensorShapeVariables(outputShapeInShader));

  const getShaderSource = (shaderHelper: ShaderHelper) => {
    const batchDims = internalVariable('batch_dims', inputs[0].dataType, outerDims.length);
    const a = inputVariable('a', inputs[0].dataType, aShape.length, aComponents);
    const b = inputVariable('b', inputs[1].dataType, bShape.length, components);
    const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components);
    const baseType = tensorTypeToWsglStorageType(output.type.tensor);
    const applyActivation = getActivationSnippet(activationAttributes, output.type.value, baseType);
    const inputVariables = [a, b];
    let processBias = '';
    if (hasBias) {
      const biasComponents = isChannelsLast ? components : 1;
      inputVariables.push(inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, biasComponents));
      // Channels-last: bias is indexed by output column; otherwise it is indexed by output row.
      processBias = `${
        isChannelsLast ? `value += bias[col / ${biasComponents}];` : `value += ${output.type.value}(bias[row + i]);`
      }`;
    }

    const uniforms: UniformsArrayType = [
      { name: 'output_size', type: 'u32' },
      { name: 'M', type: 'u32' },
      { name: 'N', type: 'u32' },
      { name: 'K', type: 'u32' },
    ];
    appendActivationUniforms(activationAttributes, uniforms);

    // Emits WGSL declaring `<name>_indices` for the start of the current batch of `variable`: batch axes are
    // copied right-aligned from `batch_indices`, broadcast axes are forced to 0, and the two matrix axes are 0.
    const getIndices = (variable: IndicesHelper, broadCastDims: number[]) => {
      const rank = variable.rank;
      const name = variable.name;
      if (rank === 2) {
        return `var ${name}_indices = ${variable.type.indices}(0u, 0u);`;
      }
      const batchRank = batchDims.rank;
      let resStr = `var ${name}_indices: ${variable.type.indices};`;
      for (let i = rank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
        resStr += `\n${name}_indices[${i}] = ${batchRank > 1 ? `batch_indices[${j}]` : 'batch_indices'};`;
      }
      broadCastDims.forEach((i) => {
        resStr += `\n${name}_indices[${i}] = 0;`;
      });
      resStr += `${name}_indices[${rank - 2}] = 0u;
                   ${name}_indices[${rank - 1}] = 0u;`;
      return resStr;
    };

    // Emits the body of the k-loop: loads `aComponents` rows of b once, then for each of the `outputNumber`
    // output rows loads one (vector) element of a and accumulates with fma.
    const calcResult = (): string => {
      let calcStr = `var a_data: ${a.type.value};`;
      for (let i = 0; i < aComponents; i++) {
        calcStr += `
              let b_data${i} = b[(b_offset + (k + ${i}) * uniforms.N + col) / ${components}];`;
      }
      for (let i = 0; i < outputNumber; i++) {
        calcStr += `a_data = a[(a_offset + (row + ${i}) * uniforms.K + k) / ${aComponents}];`;

        for (let j = 0; j < aComponents; j++) {
          calcStr += `
            values[${i}] = fma(${b.type.value}(a_data${aComponents === 1 ? '' : `[${j}]`}), b_data${j}, values[${i}]);\n`;
        }
      }
      return calcStr;
    };

    return `
  ${shaderHelper
    .registerUniforms(uniforms)
    .registerInternalVariables(batchDims)
    .declareVariables(...inputVariables, output)}
  ${shaderHelper.mainStart()}
    ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
    let col = (global_idx % (uniforms.N / ${components})) * ${components};
    var index1 = global_idx / (uniforms.N / ${components});
    let stride1 = uniforms.M / ${outputNumber};
    let row = (index1 % stride1) * ${outputNumber};
    let batch = index1 / stride1;

    ${outputShape.length === 2 ? '' : `let batch_indices = ${batchDims.offsetToIndices('batch')};`}
    ${getIndices(a, broadCastADims)}
    let a_offset = ${a.indicesToOffset('a_indices')};
    ${getIndices(b, broadCastBDims)}
    let b_offset = ${b.indicesToOffset('b_indices')};
    var values: array<${output.type.value}, ${outputNumber}>;
    for (var k: u32 = 0u; k < uniforms.K; k = k + ${aComponents}) {
      ${calcResult()}
    }
    for (var i = 0u; i < ${outputNumber}u; i++) {
      var value = values[i];
      ${processBias}
      ${applyActivation}
      let cur_indices = ${output.type.indices}(batch, row + i, col);
      let offset = ${output.indicesToOffset('cur_indices')};
      ${output.setByOffset(`offset / ${components}`, 'value')};
    }
  }
  `;
  };
  return {
    name: 'MatMulNaive',
    shaderCache: {
      // The generated shader text hard-codes which batch axes of a / b are broadcast (see getIndices), and
      // that depends on dimension values, not only on rank. Since inputDependencies below is 'rank' only,
      // the broadcast axes are added to the hint so that two MatMuls with equal ranks but different
      // broadcast axes (e.g. b = [1, 3, K, N] vs. b = [2, 1, K, N]) cannot be served the same shader.
      // NOTE(review): presumes the cache key is derived from name + hint + inputDependencies — confirm.
      hint:
        `${activationAttributes.activation};${components};${aComponents};${outputNumber};${isChannelsLast};` +
        `${broadCastADims.join(',')};${broadCastBDims.join(',')}`,
      inputDependencies: hasBias ? ['rank', 'rank', 'rank'] : ['rank', 'rank'],
    },
    getRunData: () => ({
      outputs: [
        {
          dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape,
          dataType: inputs[0].dataType,
        },
      ],
      dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
      programUniforms,
    }),
    getShaderSource,
  };
};
|
|
|
|
/**
 * Sanity-checks the operands handed to MatMul.
 *
 * @param inputs - the operator inputs; exactly two tensors are expected.
 * @throws Error if the number of inputs is not two, or if the last dimension of the first input differs
 *   from the second-to-last dimension of the second input.
 */
const validateInputs = (inputs: readonly TensorView[]): void => {
  const hasTwoInputs = Boolean(inputs) && inputs.length === 2;
  if (!hasTwoInputs) {
    throw new Error('MatMul requires 2 inputs.');
  }

  const lhsDims = inputs[0].dims;
  const rhsDims = inputs[1].dims;
  // The contracted ("shared") dimension: last axis of the left operand, second-to-last of the right one.
  const lhsShared = lhsDims[lhsDims.length - 1];
  const rhsShared = rhsDims[rhsDims.length - 2];
  if (lhsShared !== rhsShared) {
    throw new Error('shared dimension does not match.');
  }
};
|
|
|
|
/**
 * Entry point of the MatMul operator: validates the inputs, derives the broadcast output shape and runs
 * either the naive program (when both N and K are below 8) or the packed MatMul program.
 *
 * @param context - compute context providing the input tensors and the `compute` dispatcher.
 * @throws Error if the inputs are invalid or their shapes cannot be combined for MatMul.
 */
export const matMul = (context: ComputeContext): void => {
  const inputs = context.inputs;
  validateInputs(inputs);

  const lhsDims = inputs[0].dims;
  const outputShape = BroadcastUtil.calcShape(lhsDims, inputs[1].dims, true);
  if (!outputShape) {
    throw new Error("Can't use matmul on the given tensors");
  }

  const N = outputShape[outputShape.length - 1];
  const K = lhsDims[lhsDims.length - 1];
  // Small problems (both N and K below 8) take the naive kernel; everything else takes the packed one.
  const isSmallProblem = N < 8 && K < 8;
  const programInfo = isSmallProblem
    ? createNaiveMatmulProgramInfo(inputs, { activation: '' }, outputShape)
    : createMatmulProgramInfo(inputs, { activation: '' }, outputShape);
  context.compute(programInfo);
};
|