mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
This CL makes the WebGPU backend support subgroup features and thus allows using subgroup optimizations in the future. ### Description With this CL, WebGPU backends will create devices with the subgroups and subgroups-f16 features (both are under origin trial in Chrome) or the chromium-experimental-subgroups feature enabled whenever available. ### Motivation and Context This CL allows WebGPU operator shaders to use subgroup optimizations in the future, which might yield significant speedups.
952 lines
35 KiB
TypeScript
952 lines
35 KiB
TypeScript
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
import { Env, Tensor, TRACE, TRACE_FUNC_BEGIN, TRACE_FUNC_END } from 'onnxruntime-common';
|
|
|
|
import { DataType, tensorDataTypeEnumToString } from '../wasm-common';
|
|
|
|
import { configureLogger, LOG_DEBUG } from './log';
|
|
import { createView, TensorView } from './tensor-view';
|
|
import { createGpuDataManager, downloadGpuData, GpuDataManager } from './webgpu/gpu-data-manager';
|
|
import { RunFunction, WEBGPU_OP_RESOLVE_RULES } from './webgpu/op-resolve-rules';
|
|
import { ProgramManager } from './webgpu/program-manager';
|
|
import {
|
|
AdapterInfo,
|
|
ComputeContext,
|
|
DeviceInfo,
|
|
GpuArchitecture,
|
|
GpuData,
|
|
GpuVendor,
|
|
ProgramInfo,
|
|
ProgramInputTensorInfoDependency,
|
|
SessionState,
|
|
TimestampQuery,
|
|
} from './webgpu/types';
|
|
|
|
/**
 * One recorded GPU compute dispatch. Instances are stored per session in `capturedCommandList` and re-issued by
 * `replay()`.
 */
interface CommandInfo {
  /** ID of the kernel that issued this command. */
  readonly kernelId: number;
  /** the compute pipeline to bind when the command is replayed. */
  readonly computePipeline: GPUComputePipeline;
  /** the bind group that is set at index 0 when the command is replayed. */
  readonly bindGroup: GPUBindGroup;
  /** the workgroup counts passed to `dispatchWorkgroups()` (x, y, z). */
  readonly dispatchGroup: [number, number, number];
}
|
|
|
|
/**
 * Information about a created kernel, stored in the `kernels` map under its kernel ID.
 */
interface KernelInfo {
  /** the operator type used to look up the implementation in WEBGPU_OP_RESOLVE_RULES. */
  readonly kernelType: string;
  /** the kernel name, used in log and error messages. */
  readonly kernelName: string;
  /** the function that is invoked to compute the kernel. */
  readonly kernelEntry: RunFunction;
  /**
   * a pair of [attribute parser, attribute value]. On the first compute the parser (if any) is applied to the value,
   * the parsed result replaces the value and the parser slot is cleared.
   */
  readonly attributes: [((attribute: unknown) => unknown) | undefined, unknown];
}
|
|
|
|
/**
 * Bookkeeping for a dispatched program whose timestamp query result has not been read back yet. It carries what is
 * needed to emit a profiling record for that dispatch.
 */
interface PendingKernelInfo {
  /** ID of the kernel that ran the program. */
  readonly kernelId: number;
  /** name of the program that was run. */
  readonly programName: string;
  /** the input tensor views the program was run with. */
  readonly inputTensorViews: readonly TensorView[];
  /** the output tensor views the program produced. */
  readonly outputTensorViews: readonly TensorView[];
}
|
|
|
|
/**
 * build the inputs part of a program cache key.
 *
 * Each input contributes one segment, depending on its declared dependency:
 *  - 'none': an empty segment
 *  - 'type': `<dataType>`
 *  - 'rank': `<dataType>;<rank>`
 *  - 'dims': `<dataType>;<d0>,<d1>,...`
 *
 * @param inputTensors the input tensor views.
 * @param inputDependencies the dependency kind of each input. Must have the same length as `inputTensors`.
 * @returns the segments joined by '|'.
 * @throws Error if the lengths differ or a dependency kind is not supported.
 */
const getProgramInputTensorInfoDependencyKey = (
  inputTensors: readonly TensorView[],
  inputDependencies: readonly ProgramInputTensorInfoDependency[],
): string => {
  if (inputDependencies.length !== inputTensors.length) {
    throw new Error(
      `inputDependencies length ${inputDependencies.length} is not equal to inputTensors length ${
        inputTensors.length
      }.`,
    );
  }

  return inputTensors
    .map((tensor, index) => {
      const type = tensor.dataType;
      const dependency = inputDependencies[index];
      if (dependency === 'none') {
        return '';
      }
      if (dependency === 'type') {
        return `${type}`;
      }
      if (dependency === 'rank') {
        return `${type};${tensor.dims.length}`;
      }
      if (dependency === 'dims') {
        return `${type};${tensor.dims.join(',')}`;
      }
      throw new Error(`unsupported input dependency: ${inputDependencies[index]}`);
    })
    .join('|');
};
|
|
|
|
/**
 * get a unique key representing the program from the program info, input shapes and types.
 *
 * The key has the format:
 *   <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:is1DimensionDispatch:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
 * The `[...]` part is only present when the program provides a non-empty cache hint. When the program does not
 * declare input dependencies, every input is treated as 'dims'.
 *
 * @param programInfo the program to build the key for.
 * @param inputTensors the input tensor views the program is run with.
 * @param is1DimensionDispatch whether the normalized dispatch group only uses the first dimension.
 * @returns a key that is shorter than the shader source but contains all the information to identify a program. If
 * the key is the same, the program shader source should be the same, so the program can be reused.
 */
const getProgramInfoUniqueKey = (
  programInfo: ProgramInfo,
  inputTensors: readonly TensorView[],
  is1DimensionDispatch: boolean,
): string => {
  const hint = programInfo.shaderCache?.hint;
  const hintPart = hint ? `[${hint}]` : '';

  const inputDependencies =
    programInfo.shaderCache?.inputDependencies ??
    new Array<ProgramInputTensorInfoDependency>(inputTensors.length).fill('dims');
  const inputsPart = getProgramInputTensorInfoDependencyKey(inputTensors, inputDependencies);

  return `${programInfo.name}${hintPart}:${is1DimensionDispatch}:${inputsPart}`;
};
|
|
|
|
/**
 * A snapshot of the `architecture` and `vendor` strings of a GPUAdapterInfo, with helpers to compare against them.
 */
class AdapterInfoImpl implements AdapterInfo {
  /** the adapter architecture string, or undefined when no adapter info was given. */
  readonly architecture?: string;
  /** the adapter vendor string, or undefined when no adapter info was given. */
  readonly vendor?: string;

  /**
   * @param adapterInfo the adapter info to copy from. A falsy value leaves both fields undefined.
   */
  constructor(adapterInfo: GPUAdapterInfo) {
    if (!adapterInfo) {
      return;
    }
    this.architecture = adapterInfo.architecture;
    this.vendor = adapterInfo.vendor;
  }

  /** @returns true if the adapter architecture is exactly `architecture`. */
  isArchitecture(architecture: GpuArchitecture): boolean {
    return architecture === this.architecture;
  }

  /** @returns true if the adapter vendor is exactly `vendor`. */
  isVendor(vendor: GpuVendor): boolean {
    return vendor === this.vendor;
  }
}
|
|
|
|
/**
 * A snapshot of the subgroup capabilities of a GPUDevice.
 */
class DeviceInfoImpl implements DeviceInfo {
  /** whether the device was created with the 'subgroups' feature. */
  readonly subgroupsSupported: boolean;
  /** whether the device was created with the 'subgroups-f16' feature. */
  readonly subgroupsF16Supported: boolean;
  /**
   * [minSubgroupSize, maxSubgroupSize] as reported by the device limits, or undefined when subgroups are not
   * supported or the limits are not reported.
   */
  readonly subgroupSizeRange?: readonly [number, number];

  /**
   * @param device the device to read the features and limits from.
   */
  constructor(device: GPUDevice) {
    this.subgroupsSupported = device.features.has('subgroups' as GPUFeatureName);
    // 'subgroups-f16' is a separate feature from 'subgroups'; it must be queried by its own name.
    this.subgroupsF16Supported = device.features.has('subgroups-f16' as GPUFeatureName);
    // Currently subgroups feature is still experimental and size attributes are not in the WebGPU IDL, so we have to
    // workaround the IDL type checks.
    // TODO: clean this after subgroups feature is settled in IDL.
    const deviceSubgroupsLimits = device.limits as { minSubgroupSize?: number; maxSubgroupSize?: number };
    if (!this.subgroupsSupported || !deviceSubgroupsLimits.minSubgroupSize || !deviceSubgroupsLimits.maxSubgroupSize) {
      this.subgroupSizeRange = undefined;
    } else {
      this.subgroupSizeRange = [deviceSubgroupsLimits.minSubgroupSize, deviceSubgroupsLimits.maxSubgroupSize];
    }
  }
}
|
|
|
|
/**
|
|
* this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
|
|
* the first parameter so that it is stored for future use.
|
|
*/
|
|
export class WebGpuBackend {
|
|
/**
 * vendor/architecture information of the adapter. assigned in initialize().
 */
adapterInfo: AdapterInfoImpl;
/**
 * the GPU device requested from the adapter. assigned in initialize().
 */
device: GPUDevice;
/**
 * subgroup capabilities of `device`. assigned in initialize().
 */
deviceInfo: DeviceInfoImpl;
/**
 * an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
 */
gpuDataManager: GpuDataManager;
/**
 * an instance of ProgramManager to build and run WebGPU compute shader program, and manage a ProgramKey -> Program
 * artifacts mapping
 */
programManager: ProgramManager;

/**
 * the ID of the session that is currently being run.
 * `null` means no session is being run.
 * only valid when session.run is executed.
 */
currentSessionId: number | null = null;

/**
 * the ID of the kernel that is currently being computed (CPU code perspective).
 * `null` means no kernel is being computed.
 * only one kernel can be computed at a moment.
 */
currentKernelId: number | null = null;
/**
 * a list of temporary GPU data for the current kernel. It is released when the kernel finishes computing.
 */
private temporaryData: GpuData[];
/**
 * a KernelID -> GPU data list mapping, which stores persistent GPU data owned by the specific kernel.
 */
private kernelPersistentData: Map<number, GpuData[]>;
/**
 * a KernelID -> custom data mapping, which stores custom data owned by the specific kernel.
 */
private kernelCustomData: Map<number, { [key: string]: unknown }>;
|
|
/**
 * get the custom data of the current kernel. An empty object is created and stored on first access.
 *
 * @throws Error if no kernel is currently being computed.
 */
get currentKernelCustomData(): { [key: string]: unknown } {
  const kernelId = this.currentKernelId;
  if (kernelId === null) {
    throw new Error('currentKernelCustomData(): currentKernelId is null. (should not happen)');
  }

  const existing = this.kernelCustomData.get(kernelId);
  if (existing) {
    return existing;
  }

  const created: { [key: string]: unknown } = {};
  this.kernelCustomData.set(kernelId, created);
  return created;
}
|
|
|
|
/**
 * a KernelID -> KernelInfo mapping for all created kernels.
 */
kernels: Map<number, KernelInfo>;
/**
 * the command encoder that collects commands until the next flush(). `null` when nothing is pending.
 */
private commandEncoder: GPUCommandEncoder | null = null;
/**
 * the compute pass that is currently open on `commandEncoder`. `null` when no pass is open.
 */
private computePassEncoder: GPUComputePassEncoder | null = null;
/**
 * the dispatch count at which replay() flushes. The timestamp query set holds 2 entries per dispatch up to this
 * number.
 */
maxDispatchNumber = 16;
/**
 * the number of dispatches recorded since the last flush().
 */
pendingDispatchNumber = 0;

/**
 * info of kernels pending submission for a single batch
 */
private pendingKernels: PendingKernelInfo[] = [];
/**
 * a queryReadBuffer -> pendingKernels mapping for all the batches
 */
private pendingQueries: Map<GPUBuffer, PendingKernelInfo[]> = new Map();
/**
 * the buffer that timestamp queries are resolved into. created together with `querySet`.
 */
private queryResolveBuffer?: GPUBuffer;
/**
 * the timestamp query set. created lazily by setQueryType() when timestamp queries are enabled.
 */
private querySet?: GPUQuerySet;
/**
 * the first start timestamp that was read back. reported times are relative to it.
 */
private queryTimeBase?: bigint;
/**
 * how timestamps are collected: 'none', 'at-passes' or 'inside-passes'. updated by setQueryType().
 */
queryType: TimestampQuery;

/**
 * the environment flags object passed to initialize().
 */
env: Env;
/**
 * the capture/replay status of the session: 'default', 'capturing' or 'replaying'.
 */
sessionStatus: SessionState = 'default';
/**
 * a SessionID -> CommandInfo[] mapping. It's used to record all GPU commands for corresponding session.
 */
capturedCommandList: Map<number, CommandInfo[]> = new Map();

/**
 * a SessionID -> PendingKernelInfo[] mapping for profiling.
 */
private capturedPendingKernels: Map<number, PendingKernelInfo[]> = new Map();

/**
 * a SessionID -> a Map of (InputOutputIndex -> [ID, GPUBuffer]) mapping.
 */
sessionExternalDataMapping: Map<number, Map<number, [number, GPUBuffer]>> = new Map();
|
|
|
|
/**
 * initialize the backend: request a device from the adapter and set up all the managers and state.
 *
 * The device is requested with the adapter's own values for a set of compute/buffer limits, and with every optional
 * feature from the list below that the adapter supports.
 *
 * @param env the environment flags object. `env.webgpu.device` and `env.webgpu.adapter` are defined on it as
 * read-only properties.
 * @param adapter the adapter to request the device from.
 */
async initialize(env: Env, adapter: GPUAdapter): Promise<void> {
  this.env = env;

  const adapterLimits = adapter.limits;
  const requiredFeatures: GPUFeatureName[] = [];
  const deviceDescriptor: GPUDeviceDescriptor = {
    requiredLimits: {
      maxComputeWorkgroupStorageSize: adapterLimits.maxComputeWorkgroupStorageSize,
      maxComputeWorkgroupsPerDimension: adapterLimits.maxComputeWorkgroupsPerDimension,
      maxStorageBufferBindingSize: adapterLimits.maxStorageBufferBindingSize,
      maxBufferSize: adapterLimits.maxBufferSize,
      maxComputeInvocationsPerWorkgroup: adapterLimits.maxComputeInvocationsPerWorkgroup,
      maxComputeWorkgroupSizeX: adapterLimits.maxComputeWorkgroupSizeX,
      maxComputeWorkgroupSizeY: adapterLimits.maxComputeWorkgroupSizeY,
      maxComputeWorkgroupSizeZ: adapterLimits.maxComputeWorkgroupSizeZ,
    },
    requiredFeatures,
  };

  // add the feature to the required list when the adapter has it. returns whether it was added.
  const requireFeatureIfAvailable = (feature: GPUFeatureName): boolean => {
    if (!adapter.features.has(feature)) {
      return false;
    }
    requiredFeatures.push(feature);
    return true;
  };

  // prefer chromium-experimental-timestamp-query-inside-passes; fall back to timestamp-query
  const hasInsidePassesQuery = requireFeatureIfAvailable(
    'chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName,
  );
  if (!hasInsidePassesQuery) {
    requireFeatureIfAvailable('timestamp-query');
  }
  requireFeatureIfAvailable('shader-f16');
  // subgroups-f16 is only tried when subgroups itself is available
  const hasSubgroups = requireFeatureIfAvailable('subgroups' as GPUFeatureName);
  if (hasSubgroups) {
    requireFeatureIfAvailable('subgroups-f16' as GPUFeatureName);
  }

  this.device = await adapter.requestDevice(deviceDescriptor);
  this.deviceInfo = new DeviceInfoImpl(this.device);
  this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo()));
  this.gpuDataManager = createGpuDataManager(this);
  this.programManager = new ProgramManager(this);
  this.kernels = new Map();
  this.kernelPersistentData = new Map();
  this.kernelCustomData = new Map();

  // set up flags for logger
  configureLogger(env.logLevel!, !!env.debug);

  // TODO: set up flags

  // only validation errors are reported; other uncaptured errors are ignored here.
  this.device.onuncapturederror = (ev) => {
    if (ev.error instanceof GPUValidationError) {
      // eslint-disable-next-line no-console
      console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`);
    }
  };

  // expose a value on env.webgpu as a read-only, non-configurable, enumerable property.
  const exposeOnEnv = (name: string, value: unknown): void => {
    Object.defineProperty(this.env.webgpu, name, {
      value,
      writable: false,
      enumerable: true,
      configurable: false,
    });
  };
  exposeOnEnv('device', this.device);
  exposeOnEnv('adapter', adapter);

  // init queryType, which is necessary for InferenceSession.create
  this.setQueryType();
}
|
|
|
|
/**
 * release the resources owned by the backend: the timestamp query set (if one was created) and the GPU data manager.
 */
dispose(): void {
  const querySet = this.querySet;
  if (typeof querySet !== 'undefined') {
    querySet.destroy();
  }
  this.gpuDataManager.dispose();
}
|
|
|
|
/**
 * get the command encoder that collects the pending commands, creating one if there is none.
 */
getCommandEncoder(): GPUCommandEncoder {
  const existing = this.commandEncoder;
  if (existing) {
    return existing;
  }

  const created = this.device.createCommandEncoder();
  this.commandEncoder = created;
  return created;
}
|
|
|
|
/**
 * get the currently open compute pass, beginning a new one on the command encoder if there is none.
 *
 * When the query type is 'at-passes', the new pass writes its begin/end timestamps into the query set at the slots
 * `pendingDispatchNumber * 2` and `pendingDispatchNumber * 2 + 1`.
 */
getComputePassEncoder(): GPUComputePassEncoder {
  if (this.computePassEncoder) {
    return this.computePassEncoder;
  }

  const commandEncoder = this.getCommandEncoder();
  const computePassDescriptor: GPUComputePassDescriptor = {};
  if (this.queryType === 'at-passes') {
    const firstSlot = this.pendingDispatchNumber * 2;
    computePassDescriptor.timestampWrites = {
      querySet: this.querySet!,
      beginningOfPassWriteIndex: firstSlot,
      endOfPassWriteIndex: firstSlot + 1,
    };
  }

  const passEncoder = commandEncoder.beginComputePass(computePassDescriptor);
  this.computePassEncoder = passEncoder;
  return passEncoder;
}
|
|
|
|
/**
 * end the currently open compute pass, if there is one.
 */
endComputePass(): void {
  const passEncoder = this.computePassEncoder;
  if (!passEncoder) {
    return;
  }
  passEncoder.end();
  this.computePassEncoder = null;
}
|
|
|
|
/**
 * submit all pending commands to the GPU queue. Does nothing when there is no command encoder.
 *
 * When timestamp queries are enabled, the query results of this batch are resolved and copied into a dedicated
 * read-back buffer. Once that buffer is mapped, one profiling record per pending kernel is reported, either to
 * `env.webgpu.profiling.ondata` or to the console. The read-back buffer is always unmapped, forgotten and destroyed
 * afterwards, even if reporting throws.
 */
flush(): void {
  const commandEncoder = this.commandEncoder;
  if (!commandEncoder) {
    return;
  }

  TRACE_FUNC_BEGIN();

  this.endComputePass();
  let queryReadBuffer: GPUBuffer | undefined;
  if (this.queryType !== 'none') {
    // each dispatch owns 2 timestamps (start, end) of 8 bytes each.
    const queryCount = this.pendingDispatchNumber * 2;
    const queryByteLength = queryCount * 8;

    commandEncoder.resolveQuerySet(this.querySet!, 0, queryCount, this.queryResolveBuffer!, 0);

    queryReadBuffer = this.device.createBuffer(
      // eslint-disable-next-line no-bitwise
      { size: queryByteLength, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST },
    );

    // hand the kernels of this batch over to the read-back buffer and start a new batch.
    this.pendingQueries.set(queryReadBuffer, this.pendingKernels);
    this.pendingKernels = [];
    commandEncoder.copyBufferToBuffer(this.queryResolveBuffer!, 0, queryReadBuffer, 0, queryByteLength);
  }

  this.device.queue.submit([commandEncoder.finish()]);
  this.gpuDataManager.refreshPendingBuffers();
  this.commandEncoder = null;
  this.pendingDispatchNumber = 0;

  if (queryReadBuffer) {
    const readBuffer = queryReadBuffer;
    void readBuffer.mapAsync(GPUMapMode.READ).then(() => {
      try {
        const mappedData = new BigUint64Array(readBuffer.getMappedRange());
        const pendingKernels = this.pendingQueries.get(readBuffer)!;
        for (let i = 0; i < mappedData.length / 2; i++) {
          const pendingKernelInfo = pendingKernels[i];
          const kernelId = pendingKernelInfo.kernelId;
          const kernelInfo = this.kernels.get(kernelId)!;
          const kernelType = kernelInfo.kernelType;
          const kernelName = kernelInfo.kernelName;
          const programName = pendingKernelInfo.programName;
          const inputTensorViews = pendingKernelInfo.inputTensorViews;
          const outputTensorViews = pendingKernelInfo.outputTensorViews;
          const startTimeU64 = mappedData[i * 2];
          const endTimeU64 = mappedData[i * 2 + 1];

          // the very first start timestamp becomes the time base of all reported times.
          if (typeof this.queryTimeBase === 'undefined') {
            this.queryTimeBase = startTimeU64;
          }

          const startTime = Number(startTimeU64 - this.queryTimeBase);
          const endTime = Number(endTimeU64 - this.queryTimeBase);

          if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
            throw new RangeError('incorrect timestamp range');
          }

          if (this.env.webgpu.profiling?.ondata) {
            this.env.webgpu.profiling.ondata({
              version: 1,
              inputsMetadata: inputTensorViews.map((value) => ({
                dims: value.dims,
                dataType: tensorDataTypeEnumToString(value.dataType),
              })),
              outputsMetadata: outputTensorViews.map((value) => ({
                dims: value.dims,
                dataType: tensorDataTypeEnumToString(value.dataType),
              })),
              kernelId,
              kernelType,
              kernelName,
              programName,
              startTime,
              endTime,
            });
          } else {
            // if no callback is provided, print the profiling message to console
            let inputShapes = '';
            inputTensorViews.forEach((value, inputIndex) => {
              inputShapes += `input[${inputIndex}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
            });
            let outputShapes = '';
            outputTensorViews.forEach((value, outputIndex) => {
              outputShapes += `output[${outputIndex}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
            });
            // eslint-disable-next-line no-console
            console.log(
              `[profiling] kernel "${kernelId}|${kernelType}|${kernelName}|${programName}" ${inputShapes}${
                outputShapes
              }execution time: ${endTime - startTime} ns`,
            );
          }
          TRACE('GPU', `${programName}::${startTimeU64}::${endTimeU64}`);
        }
      } finally {
        // the read-back buffer is only used for this batch: always release it, so that a throwing profiling callback
        // or an invalid timestamp does not leave it mapped and referenced forever.
        readBuffer.unmap();
        this.pendingQueries.delete(readBuffer);
        readBuffer.destroy();
      }
    });
  }
  TRACE_FUNC_END();
}
|
|
|
|
/**
 * run a WebGPU program.
 * @param program a ProgramInfo instance
 * @param inputTensorViews a TensorView array. each element represents a value already exists in GPU.
 * @param outputIndices an indices array. each element can be either -1 (temporary data), -2 (persistent data), -3
 * (placeholder, no output is created) or an index to the kernel's output. An empty array means the outputs map to
 * the kernel's outputs 0, 1, 2, ... in order.
 * @param createKernelOutput a callback function that create a value to kernel's output with the given index
 * @param createIntermediateOutput a callback function that create a value as a intermediate value, either temporary
 * or persistent (owned by the current kernel)
 * @param outputCount the exclusive upper bound of a valid kernel output index.
 * @returns a TensorView array representing the result.
 * @throws Error if an input or output has no GPU data, an output index is invalid, zero-sized and non-zero-sized
 * tensors are mixed, a uniform has an unsupported type, or the uniforms do not match the built program.
 */
run(
  program: ProgramInfo,
  inputTensorViews: readonly TensorView[],
  outputIndices: readonly number[],
  createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView,
  createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView,
  outputCount: number,
): TensorView[] {
  TRACE_FUNC_BEGIN(program.name);
  // create info for inputs
  const inputDatas: GpuData[] = [];
  for (let i = 0; i < inputTensorViews.length; ++i) {
    const data = inputTensorViews[i].data;
    // if tensor view data is 0, it means the input is zero-sized tensor, and there is no GPU data for it.
    if (data === 0) {
      continue;
    }
    const gpuData = this.gpuDataManager.get(data);
    if (!gpuData) {
      throw new Error(`no GPU data for input: ${data}`);
    }
    inputDatas.push(gpuData);
  }

  const { outputs, dispatchGroup, programUniforms } = program.getRunData(inputTensorViews);

  // check output indices
  const validatedOutputIndices = outputIndices.length === 0 ? outputs.map((_, i) => i) : outputIndices;
  if (validatedOutputIndices.length !== outputs.length) {
    throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${outputs.length}.`);
  }

  // create info for outputs
  const outputTensorViews: TensorView[] = [];
  const outputDatas: GpuData[] = [];
  for (let i = 0; i < outputs.length; ++i) {
    // value -1 and -2 are used for creating temporary and persistent outputs.
    // value -3 is used for placeholder output. So -3, -2, -1 and 0, 1, 2, ... are valid
    // output indices. see type definition of ComputeContextInputsOutputsMapping for more details.
    if (
      !Number.isInteger(validatedOutputIndices[i]) ||
      validatedOutputIndices[i] < -3 ||
      validatedOutputIndices[i] >= outputCount
    ) {
      throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`);
    }
    if (validatedOutputIndices[i] === -3) {
      continue;
    }
    const isTemporary = validatedOutputIndices[i] === -1;
    const isPersistent = validatedOutputIndices[i] === -2;
    const tensorView =
      isTemporary || isPersistent
        ? createIntermediateOutput(outputs[i].dataType, outputs[i].dims)
        : createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims);
    outputTensorViews.push(tensorView);
    // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it.
    if (tensorView.data === 0) {
      continue;
    }
    const gpuData = this.gpuDataManager.get(tensorView.data);
    if (!gpuData) {
      throw new Error(`no GPU data for output: ${tensorView.data}`);
    }
    if (isTemporary) {
      this.temporaryData.push(gpuData);
    }
    if (isPersistent) {
      let persistentData = this.kernelPersistentData.get(this.currentKernelId!);
      if (!persistentData) {
        persistentData = [];
        this.kernelPersistentData.set(this.currentKernelId!, persistentData);
      }
      persistentData.push(gpuData);
    }
    outputDatas.push(gpuData);
  }

  // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are
  // zero-sized tensors.
  if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) {
    // if all outputs are zero-sized tensors, there is no need to run the program.
    if (outputDatas.length === 0) {
      TRACE_FUNC_END(program.name);
      return outputTensorViews;
    }
    // if some outputs are zero-sized tensors, report an error.
    //
    // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors.
    // If we see such use case, we need to make a change here to support it.
    throw new Error(
      `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`,
    );
  }

  // load uniforms
  // TODO: add cache for uniform (is it necessary?)
  //
  let uniformBufferBinding: GPUBindingResource | undefined;
  if (programUniforms) {
    let currentOffset = 0;
    // byte offset of each uniform, index-aligned with `programUniforms`.
    const offsets: number[] = [];

    programUniforms.forEach((v) => {
      const data = typeof v.data === 'number' ? [v.data] : v.data;
      if (data.length === 0) {
        // an empty uniform takes no space, but it still needs an entry so that `offsets[i]` keeps matching
        // `programUniforms[i]` for the uniforms that follow it.
        offsets.push(currentOffset);
        return;
      }
      // https://www.w3.org/TR/WGSL/#alignof
      const sizeOfElement = v.type === DataType.float16 ? 2 : 4;
      let sizeOfVecOrMat;
      let baseAlignment;
      if (v.type === DataType.float16) {
        baseAlignment = data.length > 4 ? 16 : data.length > 2 ? 8 : data.length * sizeOfElement;
        sizeOfVecOrMat = data.length > 4 ? 16 : sizeOfElement * data.length;
      } else {
        baseAlignment = data.length <= 2 ? data.length * sizeOfElement : 16;
        sizeOfVecOrMat = 16;
      }
      currentOffset = Math.ceil(currentOffset / baseAlignment) * baseAlignment;
      offsets.push(currentOffset);
      // For non-float16 type, when data.length > 4, the uniform variable is of type array<vec4<i32|u32|f32>,N>, where
      // N = Math.ceil(data.length / 4) and SizeOf(vec4<i32|u32|f32>) = 16. The total byte length is N *
      // SizeOf(vec4<i32|u32|f32>). For float16 type, when data.length > 4, the uniform variable is of type
      // array<mat2x4<f16>,N>, where N = Math.ceil(data.length / 8) and SizeOf(mat2x4<f16>) = 16. The total byte
      // length is N * SizeOf(mat2x4<f16>).
      const elementPerVecOrMat = v.type === DataType.float16 ? 8 : 4;
      currentOffset +=
        data.length > 4 ? Math.ceil(data.length / elementPerVecOrMat) * sizeOfVecOrMat : data.length * sizeOfElement;
    });

    // Meet alignment of struct here: https://www.w3.org/TR/WGSL/#alignment-and-size. For simplicity, set
    // maxAlignmentOfField to 16 since the underlying buffer has been rounded up to 16.
    const maxAlignmentOfField = 16;
    currentOffset = Math.ceil(currentOffset / maxAlignmentOfField) * maxAlignmentOfField;
    const arrayBuffer = new ArrayBuffer(currentOffset);
    programUniforms.forEach((v, i) => {
      const data = typeof v.data === 'number' ? [v.data] : v.data;
      if (data.length === 0) {
        // nothing to write; the recorded offset of an empty uniform is not aligned for its type.
        return;
      }
      const offset = offsets[i];
      if (v.type === DataType.int32) {
        new Int32Array(arrayBuffer, offset, data.length).set(data);
      } else if (v.type === DataType.uint32) {
        new Uint32Array(arrayBuffer, offset, data.length).set(data);
      } else if (v.type === DataType.float16) {
        new Uint16Array(arrayBuffer, offset, data.length).set(data);
      } else if (v.type === DataType.float) {
        new Float32Array(arrayBuffer, offset, data.length).set(data);
      } else {
        throw new Error(`Unsupported uniform type: ${tensorDataTypeEnumToString(v.type)}`);
      }
    });

    const uniformBufferData =
      // eslint-disable-next-line no-bitwise
      this.gpuDataManager.create(currentOffset, GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM);
    this.device.queue.writeBuffer(uniformBufferData.buffer, 0, arrayBuffer, 0, currentOffset);
    this.gpuDataManager.release(uniformBufferData.id);
    uniformBufferBinding = { offset: 0, size: currentOffset, buffer: uniformBufferData.buffer };
  }

  const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(dispatchGroup);
  const is1DimensionDispatch = normalizedDispatchGroup[1] === 1 && normalizedDispatchGroup[2] === 1;
  // get program info
  const key = getProgramInfoUniqueKey(program, inputTensorViews, is1DimensionDispatch);
  let artifact = this.programManager.getArtifact(key);
  if (!artifact) {
    artifact = this.programManager.build(program, normalizedDispatchGroup);
    this.programManager.setArtifact(key, artifact);
    LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`);
  }

  // validate uniform variables
  if (programUniforms && artifact.uniformVariablesInfo) {
    if (programUniforms.length !== artifact.uniformVariablesInfo.length) {
      throw new Error(
        `Uniform variables count mismatch: expect ${artifact.uniformVariablesInfo.length}, got ${
          programUniforms.length
        } in program "${artifact.programInfo.name}".`,
      );
    }
    for (let i = 0; i < programUniforms.length; i++) {
      const uniform = programUniforms[i];
      const actualType = uniform.type;
      const actualLength = typeof uniform.data === 'number' ? 1 : uniform.data.length;
      const [type, length] = artifact.uniformVariablesInfo[i];
      if (actualType !== type || actualLength !== length) {
        throw new Error(
          `Uniform variable ${i} mismatch: expect type ${type} with size ${length}, got type ${
            actualType
          } with size ${actualLength} in program "${artifact.programInfo.name}".`,
        );
      }
    }
  }

  LOG_DEBUG(
    'info',
    () =>
      `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
        normalizedDispatchGroup[1]
      }x${normalizedDispatchGroup[2]}`,
  );

  // remember the dispatch for profiling and/or for the captured session.
  if (this.queryType !== 'none' || this.sessionStatus === 'capturing') {
    const pendingKernelInfo: PendingKernelInfo = {
      kernelId: this.currentKernelId!,
      programName: artifact.programInfo.name,
      inputTensorViews,
      outputTensorViews,
    };
    this.pendingKernels.push(pendingKernelInfo);

    if (this.sessionStatus === 'capturing') {
      const sessionPendingKernels = this.capturedPendingKernels.get(this.currentSessionId!);
      sessionPendingKernels!.push(pendingKernelInfo);
    }
  }

  this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup, uniformBufferBinding);

  TRACE_FUNC_END(program.name);
  return outputTensorViews;
}
|
|
|
|
/**
 * upload data to the GPU data with the given ID. Delegates to the GPU data manager.
 */
upload(gpuDataId: number, data: Uint8Array): void {
  const { gpuDataManager } = this;
  gpuDataManager.upload(gpuDataId, data);
}
|
|
|
|
/**
 * copy GPU data from the ID `src` to the ID `dst`. Delegates to the GPU data manager.
 */
memcpy(src: number, dst: number): void {
  const { gpuDataManager } = this;
  gpuDataManager.memcpy(src, dst);
}
|
|
|
|
/**
 * download the GPU data with the given ID. Delegates to the GPU data manager.
 *
 * @param gpuDataId the ID of the GPU data to download.
 * @param getTargetBuffer a getter of the target buffer. A getter is used instead of the buffer itself because the
 * underlying buffer may be changed after the async function is called; the getter makes sure the buffer is
 * up-to-date.
 */
async download(gpuDataId: number, getTargetBuffer: () => Uint8Array): Promise<void> {
  const { gpuDataManager } = this;
  await gpuDataManager.download(gpuDataId, getTargetBuffer);
}
|
|
|
|
/**
 * create GPU data of the given size.
 *
 * @returns the ID of the created GPU data.
 */
alloc(size: number): number {
  const gpuData = this.gpuDataManager.create(size);
  return gpuData.id;
}
|
|
|
|
/**
 * release the GPU data with the given ID.
 *
 * @returns the value returned by the GPU data manager's release().
 */
free(ptr: number): number {
  const { gpuDataManager } = this;
  return gpuDataManager.release(ptr);
}
|
|
|
|
/**
 * create a kernel and register it under the given kernel ID.
 *
 * @param kernelType the operator type, used to look up the implementation in WEBGPU_OP_RESOLVE_RULES.
 * @param kernelId the ID to register the kernel under.
 * @param attribute the raw attribute value. It is parsed lazily on the first compute.
 * @param kernelName the name of the kernel, used in log and error messages.
 * @throws Error if there is no implementation for the kernel type.
 */
createKernel(kernelType: string, kernelId: number, attribute: unknown, kernelName: string): void {
  const op = WEBGPU_OP_RESOLVE_RULES.get(kernelType);
  if (!op) {
    throw new Error(`kernel not implemented: ${kernelType}`);
  }

  // op[0] is the entry function, op[1] is the optional attribute parser.
  this.kernels.set(kernelId, {
    kernelType,
    kernelName,
    kernelEntry: op[0],
    attributes: [op[1], attribute],
  });
}
|
|
|
|
/**
 * release a kernel: its persistent GPU data, its custom data and its registration.
 */
releaseKernel(kernelId: number): void {
  const ownedData = this.kernelPersistentData.get(kernelId);
  if (ownedData) {
    ownedData.forEach((gpuData) => {
      this.gpuDataManager.release(gpuData.id);
    });
    this.kernelPersistentData.delete(kernelId);
  }

  this.kernelCustomData.delete(kernelId);
  this.kernels.delete(kernelId);
}
|
|
|
|
/**
 * compute a kernel.
 *
 * The kernel becomes the current kernel while its entry function runs. On the first compute, the attribute parser
 * (if any) is applied and the parsed attributes are cached. All temporary GPU data created during the computation is
 * released afterwards.
 *
 * @param kernelId the ID of the kernel to compute.
 * @param context the compute context passed to the kernel entry function.
 * @param errors a list that error messages are appended to: a resolved message when the kernel entry throws, and,
 * when `env.debug` is set, a promise of the GPU validation error (or null) of this kernel.
 * @returns 0 (ORT_OK) if the kernel entry returned normally, 1 (ORT_FAIL) if it threw.
 * @throws Error if the kernel is not created, if another kernel is currently being computed, or if the attribute
 * parser throws.
 */
computeKernel(kernelId: number, context: ComputeContext, errors: Array<Promise<string | null>>): number {
  const kernel = this.kernels.get(kernelId);
  if (!kernel) {
    throw new Error(`kernel not created: ${kernelId}`);
  }
  const { kernelType, kernelName, kernelEntry, attributes } = kernel;
  if (this.currentKernelId !== null) {
    throw new Error(`kernel "[${kernelType}] ${kernelName}" is not allowed to be called recursively`);
  }
  this.currentKernelId = kernelId;

  // parse attributes if necessary
  const parseAttributes = attributes[0];
  if (parseAttributes) {
    try {
      attributes[1] = parseAttributes(attributes[1]);
    } catch (e) {
      // no kernel is being computed anymore. Without this reset, every later computeKernel() call would be rejected
      // as a recursive call.
      this.currentKernelId = null;
      throw e;
    }
    attributes[0] = undefined;
  }

  LOG_DEBUG('info', () => `[WebGPU] Start to run kernel "[${kernelType}] ${kernelName}"...`);

  const useErrorScope = this.env.debug;

  this.temporaryData = [];
  try {
    if (useErrorScope) {
      this.device.pushErrorScope('validation');
    }

    kernelEntry(context, attributes[1]);
    return 0; // ORT_OK
  } catch (e) {
    errors.push(Promise.resolve(`[WebGPU] Kernel "[${kernelType}] ${kernelName}" failed. ${e}`));
    return 1; // ORT_FAIL
  } finally {
    if (useErrorScope) {
      errors.push(
        this.device
          .popErrorScope()
          .then((err) =>
            err ? `GPU validation error for kernel "[${kernelType}] ${kernelName}": ${err.message}` : null,
          ),
      );
    }

    for (const data of this.temporaryData) {
      this.gpuDataManager.release(data.id);
    }
    this.temporaryData = [];
    this.currentKernelId = null;
  }
}
|
|
|
|
// #region external buffer
|
|
/**
 * register an external GPU buffer as the input/output with the given index of a session.
 *
 * @param sessionId the ID of the session.
 * @param index the input/output index the buffer is used for.
 * @param buffer the GPU buffer to register.
 * @param size the size passed on to the GPU data manager together with the buffer.
 * @returns the GPU data ID of the registered buffer.
 */
registerBuffer(sessionId: number, index: number, buffer: GPUBuffer, size: number): number {
  let externalData = this.sessionExternalDataMapping.get(sessionId);
  if (!externalData) {
    externalData = new Map();
    this.sessionExternalDataMapping.set(sessionId, externalData);
  }

  // the buffer may be user created, or managed by GPU data manager.
  // The GPU data manager will not manage these buffers. we register them as external buffers.
  //
  // The map `externalData` is used to store the data ID and buffer for each input/output. Once a
  // specific input/output is registered, the data ID will not change.
  const previousEntry = externalData.get(index);
  const gpuDataId = this.gpuDataManager.registerExternalBuffer(buffer, size, previousEntry);
  externalData.set(index, [gpuDataId, buffer]);
  return gpuDataId;
}
|
|
/**
 * unregister all the external buffers of a session. Does nothing when the session has none.
 */
unregisterBuffers(sessionId: number): void {
  const externalData = this.sessionExternalDataMapping.get(sessionId);
  if (!externalData) {
    return;
  }

  // each entry is [GPU data ID, buffer]; the ID is what the GPU data manager needs.
  externalData.forEach((entry) => this.gpuDataManager.unregisterExternalBuffer(entry[0]));
  this.sessionExternalDataMapping.delete(sessionId);
}
|
|
/**
 * get the GPU buffer of the GPU data with the given ID.
 *
 * @throws Error if there is no GPU data with that ID.
 */
getBuffer(gpuDataId: number): GPUBuffer {
  const gpuData = this.gpuDataManager.get(gpuDataId);
  if (gpuData) {
    return gpuData.buffer;
  }
  throw new Error(`no GPU data for buffer: ${gpuDataId}`);
}
|
|
/**
 * create a function that downloads the content of a GPU buffer.
 *
 * @param gpuBuffer the GPU buffer to download from.
 * @param size the size passed on to downloadGpuData() together with the buffer.
 * @param type the tensor data type used to create the typed view of the downloaded data.
 * @returns an async function that downloads the data and resolves to a view of it.
 */
createDownloader(
  gpuBuffer: GPUBuffer,
  size: number,
  type: Tensor.GpuBufferDataTypes,
): () => Promise<Tensor.DataType> {
  const download = async (): Promise<Tensor.DataType> => {
    const data = await downloadGpuData(this, gpuBuffer, size);
    return createView(data.buffer, type);
  };
  return download;
}
|
|
// #endregion
|
|
/**
 * write a timestamp into the query set at the given index, from inside the current compute pass.
 * Only has an effect when the query type is 'inside-passes'.
 */
writeTimestamp(index: number): void {
  if (this.queryType === 'inside-passes') {
    // writeTimestamp() is not part of the typed compute pass encoder interface, so the type check is bypassed.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    (this.computePassEncoder as any).writeTimestamp(this.querySet, index);
  }
}
|
|
/**
 * update `queryType` from the current environment flags and device features.
 *
 * Timestamp queries are used when the profiling mode is 'default' or tracing is enabled (`env.trace`, or
 * `env.wasm.trace` when `env.trace` is undefined). 'inside-passes' is preferred over 'at-passes'; the type stays
 * 'none' when the device has neither feature. The query set and the resolve buffer are created on first use.
 */
setQueryType(): void {
  this.queryType = 'none';

  const timestampsRequested =
    this.env.webgpu.profiling?.mode === 'default' ||
    (typeof this.env.trace === 'undefined' ? this.env.wasm.trace : this.env.trace);
  if (!timestampsRequested) {
    return;
  }

  const deviceFeatures = this.device.features;
  if (deviceFeatures.has('chromium-experimental-timestamp-query-inside-passes')) {
    this.queryType = 'inside-passes';
  } else if (deviceFeatures.has('timestamp-query')) {
    this.queryType = 'at-passes';
  }

  if (this.queryType === 'none' || typeof this.querySet !== 'undefined') {
    return;
  }

  // 2 timestamps (start, end) of 8 bytes each for every dispatch of a batch.
  const timestampCount = this.maxDispatchNumber * 2;
  this.querySet = this.device.createQuerySet({
    type: 'timestamp',
    count: timestampCount,
  });
  this.queryResolveBuffer = this.device.createBuffer(
    // eslint-disable-next-line no-bitwise
    { size: timestampCount * 8, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE },
  );
}
|
|
|
|
/**
 * start capturing the GPU commands of the current session. The command list and the pending kernel list of the
 * session are created if they do not exist yet.
 */
captureBegin(): void {
  LOG_DEBUG('info', 'captureBegin');

  const sessionId = this.currentSessionId!;
  if (!this.capturedCommandList.get(sessionId)) {
    this.capturedCommandList.set(sessionId, []);
  }
  if (!this.capturedPendingKernels.get(sessionId)) {
    this.capturedPendingKernels.set(sessionId, []);
  }

  // flush the left commands before we change the status.
  this.flush();
  this.sessionStatus = 'capturing';
}
|
|
/**
 * stop capturing GPU commands and return to the 'default' session status.
 */
captureEnd(): void {
  LOG_DEBUG('info', 'captureEnd');

  // the commands recorded while capturing are submitted before the status changes.
  this.flush();

  this.sessionStatus = 'default';
}
|
|
/**
 * re-issue all the captured GPU commands of the current session.
 *
 * Each command is dispatched in a compute pass, surrounded by timestamp writes. The commands are flushed whenever
 * `maxDispatchNumber` dispatches are pending, and once more at the end.
 */
replay(): void {
  LOG_DEBUG('info', 'replay');
  this.sessionStatus = 'replaying';

  const commands = this.capturedCommandList.get(this.currentSessionId!);
  const capturedKernels = this.capturedPendingKernels.get(this.currentSessionId!);
  const commandCount = commands!.length;
  this.pendingKernels = [];

  for (let commandIndex = 0; commandIndex < commandCount; commandIndex++) {
    const passEncoder = this.getComputePassEncoder();
    const { computePipeline, bindGroup, dispatchGroup } = commands![commandIndex];

    this.writeTimestamp(this.pendingDispatchNumber * 2);
    passEncoder.setPipeline(computePipeline);
    passEncoder.setBindGroup(0, bindGroup);
    passEncoder.dispatchWorkgroups(...dispatchGroup);
    this.writeTimestamp(this.pendingDispatchNumber * 2 + 1);
    this.pendingDispatchNumber++;

    if (this.queryType !== 'none') {
      this.pendingKernels.push(capturedKernels![commandIndex]);
    }

    // with 'at-passes' every dispatch needs its own pass; otherwise the pass ends when the batch is full.
    const batchIsFull = this.pendingDispatchNumber >= this.maxDispatchNumber;
    if (batchIsFull || this.queryType === 'at-passes') {
      this.endComputePass();
    }
    if (batchIsFull) {
      this.flush();
    }
  }

  // flush the left commands before we change the status.
  this.flush();
  this.sessionStatus = 'default';
}
|
|
|
|
/**
 * notify the GPU data manager that a session is created.
 */
onCreateSession(): void {
  const { gpuDataManager } = this;
  gpuDataManager.onCreateSession();
}
|
|
|
|
/**
 * release everything that belongs to a session: its external buffers, its captured commands and pending kernels,
 * and the session data held by the GPU data manager.
 */
onReleaseSession(sessionId: number): void {
  this.unregisterBuffers(sessionId);

  // Map.delete() is a no-op for a missing key, so no existence check is needed.
  this.capturedCommandList.delete(sessionId);
  this.capturedPendingKernels.delete(sessionId);

  this.gpuDataManager.onReleaseSession(sessionId);
}
|
|
|
|
/**
 * called when a session run starts: remember the session as the current one and refresh the query type, because the
 * profiling/trace flags may have changed since the last run.
 */
onRunStart(sessionId: number): void {
  this.currentSessionId = sessionId;

  this.setQueryType();
}
|
|
}
|