onnxruntime/js/web/lib/wasm/jsep/backend-webgpu.ts

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

import { Env, Tensor, TRACE, TRACE_FUNC_BEGIN, TRACE_FUNC_END } from 'onnxruntime-common';

import { DataType, tensorDataTypeEnumToString } from '../wasm-common';

import { configureLogger, LOG_DEBUG } from './log';
import { createView, TensorView } from './tensor-view';
import { createGpuDataManager, downloadGpuData, GpuDataManager } from './webgpu/gpu-data-manager';
import { RunFunction, WEBGPU_OP_RESOLVE_RULES } from './webgpu/op-resolve-rules';
import { ProgramManager } from './webgpu/program-manager';
import {
  AdapterInfo,
  ComputeContext,
  DeviceInfo,
  GpuArchitecture,
  GpuData,
  GpuVendor,
  ProgramInfo,
  ProgramInputTensorInfoDependency,
  SessionState,
  TimestampQuery,
} from './webgpu/types';

interface CommandInfo {
  readonly kernelId: number;
  readonly computePipeline: GPUComputePipeline;
  readonly bindGroup: GPUBindGroup;
  readonly dispatchGroup: [number, number, number];
}

interface KernelInfo {
  readonly kernelType: string;
  readonly kernelName: string;
  readonly kernelEntry: RunFunction;
  readonly attributes: [((attribute: unknown) => unknown) | undefined, unknown];
}

interface PendingKernelInfo {
  readonly kernelId: number;
  readonly programName: string;
  readonly inputTensorViews: readonly TensorView[];
  readonly outputTensorViews: readonly TensorView[];
}

const getProgramInputTensorInfoDependencyKey = (
  inputTensors: readonly TensorView[],
  inputDependencies: readonly ProgramInputTensorInfoDependency[],
): string => {
  if (inputDependencies.length !== inputTensors.length) {
    throw new Error(
      `inputDependencies length ${inputDependencies.length} is not equal to inputTensors length ${
        inputTensors.length
      }.`,
    );
  }

  const inputInfos: string[] = [];
  for (let i = 0; i < inputTensors.length; ++i) {
    const type = inputTensors[i].dataType;
    switch (inputDependencies[i]) {
      case 'none': {
        inputInfos.push('');
        break;
      }
      case 'type': {
        inputInfos.push(`${type}`);
        break;
      }
      case 'rank': {
        const rank = inputTensors[i].dims.length;
        inputInfos.push(`${type};${rank}`);
        break;
      }
      case 'dims': {
        const dims = inputTensors[i].dims.join(',');
        inputInfos.push(`${type};${dims}`);
        break;
      }
      default:
        throw new Error(`unsupported input dependency: ${inputDependencies[i]}`);
    }
  }

  return inputInfos.join('|');
};

/**
 * get a unique key representing the program from the program info, input shapes and types.
 *
 * @returns a unique key is a shorter string than the shader source, which contains all the information to identify a
 * program. if the key is the same, the program shader source should be the same, so we can reuse the program.
 *
 */
const getProgramInfoUniqueKey = (
  programInfo: ProgramInfo,
  inputTensors: readonly TensorView[],
  is1DimensionDispatch: boolean,
): string => {
  // final key format:
  // <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:is1DimensionDispatch:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
  let key = programInfo.name;
  if (programInfo.shaderCache?.hint) {
    key += '[' + programInfo.shaderCache.hint + ']';
  }
  key +=
    ':' +
    is1DimensionDispatch +
    `:${getProgramInputTensorInfoDependencyKey(
      inputTensors,
      programInfo.shaderCache?.inputDependencies ??
        new Array<ProgramInputTensorInfoDependency>(inputTensors.length).fill('dims'),
    )}`;
  return key;
};

class AdapterInfoImpl implements AdapterInfo {
  readonly architecture?: string;
  readonly vendor?: string;

  constructor(adapterInfo: GPUAdapterInfo) {
    if (adapterInfo) {
      this.architecture = adapterInfo.architecture;
      this.vendor = adapterInfo.vendor;
    }
  }

  isArchitecture(architecture: GpuArchitecture): boolean {
    return this.architecture === architecture;
  }

  isVendor(vendor: GpuVendor): boolean {
    return this.vendor === vendor;
  }
}

class DeviceInfoImpl implements DeviceInfo {
  readonly subgroupsSupported: boolean;
  readonly subgroupsF16Supported: boolean;
  readonly subgroupSizeRange?: readonly [number, number];

  constructor(device: GPUDevice) {
    this.subgroupsSupported = device.features.has('subgroups' as GPUFeatureName);
    this.subgroupsF16Supported = device.features.has('subgroups' as GPUFeatureName);
    // Currently subgroups feature is still experimental and size attributes are not in the WebGPU IDL, so we have to
    // workaround the IDL type checks.
    // TODO: clean this after subgroups feature is settled in IDL.
    const deviceSubgroupsLimits = device.limits as { minSubgroupSize?: number; maxSubgroupSize?: number };
    if (!this.subgroupsSupported || !deviceSubgroupsLimits.minSubgroupSize || !deviceSubgroupsLimits.maxSubgroupSize) {
      this.subgroupSizeRange = undefined;
    } else {
      this.subgroupSizeRange = [deviceSubgroupsLimits.minSubgroupSize, deviceSubgroupsLimits.maxSubgroupSize];
    }
  }
}

/**
 * this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
 * the first parameter so that it is stored for future use.
 */
export class WebGpuBackend {
  adapterInfo: AdapterInfoImpl;
  device: GPUDevice;
  deviceInfo: DeviceInfoImpl;
  /**
   * an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
   */
  gpuDataManager: GpuDataManager;
  /**
   * an instance of ProgramManager to build and run WebGPU compute shader program, and manage a ProgramKey -> Program
   * artifacts mapping
   */
  programManager: ProgramManager;

  /**
   * representing the session ID of which is currently being run.
   * `null` means no session is being run.
   * only valid when session.run is executed.
   */
  currentSessionId: number | null = null;

  /**
   * representing the kernel ID of which is currently being computed (CPU code perspective).
   * `null` means no kernel is being computed.
   * only one kernel can be computed at a moment.
   */
  currentKernelId: number | null = null;
  /**
   * a list of temporary GPU data for the current kernel. should release when the kernel done computation.
   */
  private temporaryData: GpuData[];
  /**
   * a KernelID -> a GPU data list, which stores persistent GPU data owned by the specific kernel.
   */
  private kernelPersistentData: Map<number, GpuData[]>;
  /**
   * a KernelID -> a custom data, which stores custom data owned by the specific kernel.
   */
  private kernelCustomData: Map<number, { [key: string]: unknown }>;
  /**
   * get the custom data of the current kernel
   */
  get currentKernelCustomData(): { [key: string]: unknown } {
    if (this.currentKernelId === null) {
      throw new Error('currentKernelCustomData(): currentKernelId is null. (should not happen)');
    }

    let data = this.kernelCustomData.get(this.currentKernelId);
    if (!data) {
      data = {};
      this.kernelCustomData.set(this.currentKernelId, data);
    }

    return data;
  }

  // KernelID -> kernelInfo mapping
  kernels: Map<number, KernelInfo>;
  private commandEncoder: GPUCommandEncoder | null = null;
  private computePassEncoder: GPUComputePassEncoder | null = null;
  maxDispatchNumber = 16;
  pendingDispatchNumber = 0;

  // info of kernels pending submission for a single batch
  private pendingKernels: PendingKernelInfo[] = [];
  // queryReadBuffer -> pendingKernels mapping for all the batches
  private pendingQueries: Map<GPUBuffer, PendingKernelInfo[]> = new Map();
  private queryResolveBuffer?: GPUBuffer;
  private querySet?: GPUQuerySet;
  private queryTimeBase?: bigint;
  queryType: TimestampQuery;

  env: Env;
  sessionStatus: SessionState = 'default';
  /**
   * a SessionID -> CommandInfo[] mapping. It's used to record all GPU commands for corresponding session.
   */
  capturedCommandList: Map<number, CommandInfo[]> = new Map();

  /**
   * a SessionID -> PendingKernelInfo[] mapping for profiling.
   */
  private capturedPendingKernels: Map<number, PendingKernelInfo[]> = new Map();

  /**
   * a SessionID -> a Map of (InputOutputIndex -> [ID, GPUBuffer]) mapping.
   */
  sessionExternalDataMapping: Map<number, Map<number, [number, GPUBuffer]>> = new Map();

  async initialize(env: Env, adapter: GPUAdapter): Promise<void> {
    this.env = env;
    const requiredFeatures: GPUFeatureName[] = [];
    const deviceDescriptor: GPUDeviceDescriptor = {
      requiredLimits: {
        maxComputeWorkgroupStorageSize: adapter.limits.maxComputeWorkgroupStorageSize,
        maxComputeWorkgroupsPerDimension: adapter.limits.maxComputeWorkgroupsPerDimension,
        maxStorageBufferBindingSize: adapter.limits.maxStorageBufferBindingSize,
        maxBufferSize: adapter.limits.maxBufferSize,
        maxComputeInvocationsPerWorkgroup: adapter.limits.maxComputeInvocationsPerWorkgroup,
        maxComputeWorkgroupSizeX: adapter.limits.maxComputeWorkgroupSizeX,
        maxComputeWorkgroupSizeY: adapter.limits.maxComputeWorkgroupSizeY,
        maxComputeWorkgroupSizeZ: adapter.limits.maxComputeWorkgroupSizeZ,
      },
      requiredFeatures,
    };

    // Try requiring WebGPU features
    const requireFeatureIfAvailable = (feature: GPUFeatureName) =>
      adapter.features.has(feature) && requiredFeatures.push(feature) && true;
    // Try chromium-experimental-timestamp-query-inside-passes and fallback to timestamp-query
    if (!requireFeatureIfAvailable('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName)) {
      requireFeatureIfAvailable('timestamp-query');
    }
    requireFeatureIfAvailable('shader-f16');
    // Try subgroups
    if (requireFeatureIfAvailable('subgroups' as GPUFeatureName)) {
      // If subgroups feature is available, also try subgroups-f16
      requireFeatureIfAvailable('subgroups-f16' as GPUFeatureName);
    }

    this.device = await adapter.requestDevice(deviceDescriptor);
    this.deviceInfo = new DeviceInfoImpl(this.device);
    this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo()));
    this.gpuDataManager = createGpuDataManager(this);
    this.programManager = new ProgramManager(this);
    this.kernels = new Map();
    this.kernelPersistentData = new Map();
    this.kernelCustomData = new Map();

    // set up flags for logger
    configureLogger(env.logLevel!, !!env.debug);

    // TODO: set up flags

    this.device.onuncapturederror = (ev) => {
      if (ev.error instanceof GPUValidationError) {
        // eslint-disable-next-line no-console
        console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`);
      }
    };

    Object.defineProperty(this.env.webgpu, 'device', {
      value: this.device,
      writable: false,
      enumerable: true,
      configurable: false,
    });
    Object.defineProperty(this.env.webgpu, 'adapter', {
      value: adapter,
      writable: false,
      enumerable: true,
      configurable: false,
    });

    // init queryType, which is necessary for InferenceSession.create
    this.setQueryType();
  }

  dispose(): void {
    if (typeof this.querySet !== 'undefined') {
      this.querySet.destroy();
    }
    this.gpuDataManager.dispose();
  }

  getCommandEncoder(): GPUCommandEncoder {
    if (!this.commandEncoder) {
      this.commandEncoder = this.device.createCommandEncoder();
    }
    return this.commandEncoder;
  }

  getComputePassEncoder(): GPUComputePassEncoder {
    if (!this.computePassEncoder) {
      const commandEncoder = this.getCommandEncoder();
      const computePassDescriptor: GPUComputePassDescriptor = {};

      if (this.queryType === 'at-passes') {
        computePassDescriptor.timestampWrites = {
          querySet: this.querySet!,
          beginningOfPassWriteIndex: this.pendingDispatchNumber * 2,
          endOfPassWriteIndex: this.pendingDispatchNumber * 2 + 1,
        };
      }

      this.computePassEncoder = commandEncoder.beginComputePass(computePassDescriptor);
    }
    return this.computePassEncoder;
  }

  endComputePass(): void {
    if (this.computePassEncoder) {
      this.computePassEncoder.end();
      this.computePassEncoder = null;
    }
  }

  flush(): void {
    if (!this.commandEncoder) {
      return;
    }

    TRACE_FUNC_BEGIN();

    this.endComputePass();
    let queryReadBuffer: GPUBuffer;
    if (this.queryType !== 'none') {
      this.commandEncoder.resolveQuerySet(
        this.querySet!,
        0,
        this.pendingDispatchNumber * 2,
        this.queryResolveBuffer!,
        0,
      );

      queryReadBuffer = this.device.createBuffer(
        // eslint-disable-next-line no-bitwise
        { size: this.pendingDispatchNumber * 2 * 8, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST },
      );

      this.pendingQueries.set(queryReadBuffer, this.pendingKernels);
      this.pendingKernels = [];
      this.commandEncoder.copyBufferToBuffer(
        this.queryResolveBuffer!,
        0,
        queryReadBuffer,
        0,
        this.pendingDispatchNumber * 2 * 8,
      );
    }

    this.device.queue.submit([this.commandEncoder.finish()]);
    this.gpuDataManager.refreshPendingBuffers();
    this.commandEncoder = null;
    this.pendingDispatchNumber = 0;

    if (this.queryType !== 'none') {
      void queryReadBuffer!.mapAsync(GPUMapMode.READ).then(() => {
        const mappedData = new BigUint64Array(queryReadBuffer.getMappedRange());
        const pendingKernels = this.pendingQueries.get(queryReadBuffer)!;
        for (let i = 0; i < mappedData.length / 2; i++) {
          const pendingKernelInfo = pendingKernels[i];
          const kernelId = pendingKernelInfo.kernelId;
          const kernelInfo = this.kernels.get(kernelId)!;
          const kernelType = kernelInfo.kernelType;
          const kernelName = kernelInfo.kernelName;
          const programName = pendingKernelInfo.programName;
          const inputTensorViews = pendingKernelInfo.inputTensorViews;
          const outputTensorViews = pendingKernelInfo.outputTensorViews;
          const startTimeU64 = mappedData[i * 2];
          const endTimeU64 = mappedData[i * 2 + 1];

          if (typeof this.queryTimeBase === 'undefined') {
            this.queryTimeBase = startTimeU64;
          }

          const startTime = Number(startTimeU64 - this.queryTimeBase);
          const endTime = Number(endTimeU64 - this.queryTimeBase);

          if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
            throw new RangeError('incorrect timestamp range');
          }

          if (this.env.webgpu.profiling?.ondata) {
            this.env.webgpu.profiling.ondata({
              version: 1,
              inputsMetadata: inputTensorViews.map((value) => ({
                dims: value.dims,
                dataType: tensorDataTypeEnumToString(value.dataType),
              })),
              outputsMetadata: outputTensorViews.map((value) => ({
                dims: value.dims,
                dataType: tensorDataTypeEnumToString(value.dataType),
              })),
              kernelId,
              kernelType,
              kernelName,
              programName,
              startTime,
              endTime,
            });
          } else {
            // if no callback is provided, print the profiling message to console
            let inputShapes = '';
            inputTensorViews.forEach((value, i) => {
              inputShapes += `input[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
            });
            let outputShapes = '';
            outputTensorViews.forEach((value, i) => {
              outputShapes += `output[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
            });
            // eslint-disable-next-line no-console
            console.log(
              `[profiling] kernel "${kernelId}|${kernelType}|${kernelName}|${programName}" ${inputShapes}${
                outputShapes
              }execution time: ${endTime - startTime} ns`,
            );
          }
          TRACE('GPU', `${programName}::${startTimeU64}::${endTimeU64}`);
        }
        queryReadBuffer.unmap();
        this.pendingQueries.delete(queryReadBuffer);
      });
    }
    TRACE_FUNC_END();
  }

  /**
   * run a WebGPU program.
   * @param program a ProgramInfo instance
   * @param inputTensorViews a TensorView array. each element represents a value already exists in GPU.
   * @param outputIndices an indices array. each element can be either -1 (temporary data), -2 (persistent data) or an
   * index to the kernel's output.
   * @param createKernelOutput a callback function that create a value to kernel's output with the given index
   * @param createIntermediateOutput a callback function that create a value as a intermediate value, either temporary
   * or persistent (owned by the current kernel)
   * @returns a TensorView array representing the result.
   */
  run(
    program: ProgramInfo,
    inputTensorViews: readonly TensorView[],
    outputIndices: readonly number[],
    createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView,
    createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView,
    outputCount: number,
  ): TensorView[] {
    TRACE_FUNC_BEGIN(program.name);
    // create info for inputs
    const inputDatas: GpuData[] = [];
    for (let i = 0; i < inputTensorViews.length; ++i) {
      const data = inputTensorViews[i].data;
      // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it.
      if (data === 0) {
        continue;
      }
      const gpuData = this.gpuDataManager.get(data);
      if (!gpuData) {
        throw new Error(`no GPU data for input: ${data}`);
      }
      inputDatas.push(gpuData);
    }

    const { outputs, dispatchGroup, programUniforms } = program.getRunData(inputTensorViews);

    // check output indices
    const validatedOutputIndices = outputIndices.length === 0 ? outputs.map((_, i) => i) : outputIndices;
    if (validatedOutputIndices.length !== outputs.length) {
      throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${outputs.length}.`);
    }

    // create info for outputs
    const outputTensorViews: TensorView[] = [];
    const outputDatas: GpuData[] = [];
    for (let i = 0; i < outputs.length; ++i) {
      // value -1 and -2 are used for creating temporary and persistent outputs.
      // value -3 is used for placeholder output. So -3, -2, -1 and 0, 1, 2, ... are valid
      // output indices. see type definition of ComputeContextInputsOutputsMapping for more details.
      if (
        !Number.isInteger(validatedOutputIndices[i]) ||
        validatedOutputIndices[i] < -3 ||
        validatedOutputIndices[i] >= outputCount
      ) {
        throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`);
      }
      if (validatedOutputIndices[i] === -3) {
        continue;
      }
      const isTemporary = validatedOutputIndices[i] === -1;
      const isPersistent = validatedOutputIndices[i] === -2;
      const tensorView =
        isTemporary || isPersistent
          ? createIntermediateOutput(outputs[i].dataType, outputs[i].dims)
          : createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims);
      outputTensorViews.push(tensorView);
      // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it.
      if (tensorView.data === 0) {
        continue;
      }
      const gpuData = this.gpuDataManager.get(tensorView.data);
      if (!gpuData) {
        throw new Error(`no GPU data for output: ${tensorView.data}`);
      }
      if (isTemporary) {
        this.temporaryData.push(gpuData);
      }
      if (isPersistent) {
        let persistentData = this.kernelPersistentData.get(this.currentKernelId!);
        if (!persistentData) {
          persistentData = [];
          this.kernelPersistentData.set(this.currentKernelId!, persistentData);
        }
        persistentData.push(gpuData);
      }
      outputDatas.push(gpuData);
    }

    // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are
    // zero-sized tensors.
    if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) {
      // if all outputs are zero-sized tensors, there is no need to run the program.
      if (outputDatas.length === 0) {
        TRACE_FUNC_END(program.name);
        return outputTensorViews;
      }
      // if some outputs are zero-sized tensors, report an error.
      //
      // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors.
      // If we see such use case, we need to make a change here to support it.
      throw new Error(
        `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`,
      );
    }

    // load uniforms
    // TODO: add cache for uniform (is it necessary?)
    //
    let uniformBufferBinding: GPUBindingResource | undefined;
    if (programUniforms) {
      let currentOffset = 0;
      const offsets: number[] = [];

      programUniforms.forEach((v) => {
        const data = typeof v.data === 'number' ? [v.data] : v.data;
        if (data.length === 0) {
          return;
        }
        // https://www.w3.org/TR/WGSL/#alignof
        const sizeOfElement = v.type === DataType.float16 ? 2 : 4;
        let sizeOfVecOrMat;
        let baseAlignment;
        if (v.type === DataType.float16) {
          baseAlignment = data.length > 4 ? 16 : data.length > 2 ? 8 : data.length * sizeOfElement;
          sizeOfVecOrMat = data.length > 4 ? 16 : sizeOfElement * data.length;
        } else {
          baseAlignment = data.length <= 2 ? data.length * sizeOfElement : 16;
          sizeOfVecOrMat = 16;
        }
        currentOffset = Math.ceil(currentOffset / baseAlignment) * baseAlignment;
        offsets.push(currentOffset);
        // For non-float16 type, when data.length > 4, the uniform variable is of type array<vec4<i32|u32|f32>,N>, where
        // N = Math.ceil(data.length / 4) and SizeOf(vec4<i32|u32|f32>) = 16. The total byte length is N *
        // SizeOf(vec4<i32|u32|f32>). For float16 type, when data.length > 4, the uniform variable is of type
        // array<mat2x4<f16>,N>, where N = Math.ceil(data.length / 8) and SizeOf(mat2x4<f16>) = 16. The total byte
        // length is N * SizeOf(mat2x4<f16>).
        const elementPerVecOrMat = v.type === DataType.float16 ? 8 : 4;
        currentOffset +=
          data.length > 4 ? Math.ceil(data.length / elementPerVecOrMat) * sizeOfVecOrMat : data.length * sizeOfElement;
      });

      // Meet alignment of struct here: https://www.w3.org/TR/WGSL/#alignment-and-size. For simplicity, set
      // maxAlignmentOfField to 16 since the underlying buffer has been rounded up to 16.
      const maxAlignmentOfField = 16;
      currentOffset = Math.ceil(currentOffset / maxAlignmentOfField) * maxAlignmentOfField;
      const arrayBuffer = new ArrayBuffer(currentOffset);
      programUniforms.forEach((v, i) => {
        const offset = offsets[i];
        const data = typeof v.data === 'number' ? [v.data] : v.data;
        if (v.type === DataType.int32) {
          new Int32Array(arrayBuffer, offset, data.length).set(data);
        } else if (v.type === DataType.uint32) {
          new Uint32Array(arrayBuffer, offset, data.length).set(data);
        } else if (v.type === DataType.float16) {
          new Uint16Array(arrayBuffer, offset, data.length).set(data);
        } else if (v.type === DataType.float) {
          new Float32Array(arrayBuffer, offset, data.length).set(data);
        } else {
          throw new Error(`Unsupported uniform type: ${tensorDataTypeEnumToString(v.type)}`);
        }
      });

      const uniformBufferData =
        // eslint-disable-next-line no-bitwise
        this.gpuDataManager.create(currentOffset, GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM);
      this.device.queue.writeBuffer(uniformBufferData.buffer, 0, arrayBuffer, 0, currentOffset);
      this.gpuDataManager.release(uniformBufferData.id);
      uniformBufferBinding = { offset: 0, size: currentOffset, buffer: uniformBufferData.buffer };
    }

    const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(dispatchGroup);
    const is1DimensionDispatch = normalizedDispatchGroup[1] === 1 && normalizedDispatchGroup[2] === 1;
    // get program info
    const key = getProgramInfoUniqueKey(program, inputTensorViews, is1DimensionDispatch);
    let artifact = this.programManager.getArtifact(key);
    if (!artifact) {
      artifact = this.programManager.build(program, normalizedDispatchGroup);
      this.programManager.setArtifact(key, artifact);
      LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`);
    }

    // validate uniform variables
    if (programUniforms && artifact.uniformVariablesInfo) {
      if (programUniforms.length !== artifact.uniformVariablesInfo.length) {
        throw new Error(
          `Uniform variables count mismatch: expect ${artifact.uniformVariablesInfo.length}, got ${
            programUniforms.length
          } in program "${artifact.programInfo.name}".`,
        );
      }
      for (let i = 0; i < programUniforms.length; i++) {
        const uniform = programUniforms[i];
        const actualType = uniform.type;
        const actualLength = typeof uniform.data === 'number' ? 1 : uniform.data.length;
        const [type, length] = artifact.uniformVariablesInfo[i];
        if (actualType !== type || actualLength !== length) {
          throw new Error(
            `Uniform variable ${i} mismatch: expect type ${type} with size ${length}, got type ${
              actualType
            } with size ${actualLength} in program "${artifact.programInfo.name}".`,
          );
        }
      }
    }

    LOG_DEBUG(
      'info',
      () =>
        `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
          normalizedDispatchGroup[1]
        }x${normalizedDispatchGroup[2]}`,
    );

    if (this.queryType !== 'none' || this.sessionStatus === 'capturing') {
      const pendingKernelInfo: PendingKernelInfo = {
        kernelId: this.currentKernelId!,
        programName: artifact.programInfo.name,
        inputTensorViews,
        outputTensorViews,
      };
      this.pendingKernels.push(pendingKernelInfo);

      if (this.sessionStatus === 'capturing') {
        const sessionPendingKernels = this.capturedPendingKernels.get(this.currentSessionId!);
        sessionPendingKernels!.push(pendingKernelInfo);
      }
    }

    this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup, uniformBufferBinding);

    TRACE_FUNC_END(program.name);
    return outputTensorViews;
  }

  upload(gpuDataId: number, data: Uint8Array): void {
    this.gpuDataManager.upload(gpuDataId, data);
  }

  memcpy(src: number, dst: number): void {
    this.gpuDataManager.memcpy(src, dst);
  }

  async download(gpuDataId: number, getTargetBuffer: () => Uint8Array): Promise<void> {
    // the underlying buffer may be changed after the async function is called. so we use a getter function to make sure
    // the buffer is up-to-date.
    await this.gpuDataManager.download(gpuDataId, getTargetBuffer);
  }

  alloc(size: number): number {
    return this.gpuDataManager.create(size).id;
  }

  free(ptr: number): number {
    return this.gpuDataManager.release(ptr);
  }

  createKernel(kernelType: string, kernelId: number, attribute: unknown, kernelName: string): void {
    const op = WEBGPU_OP_RESOLVE_RULES.get(kernelType);
    if (!op) {
      throw new Error(`kernel not implemented: ${kernelType}`);
    }

    const kernelInfo: KernelInfo = {
      kernelType,
      kernelName,
      kernelEntry: op[0],
      attributes: [op[1], attribute],
    };
    this.kernels.set(kernelId, kernelInfo);
  }

  releaseKernel(kernelId: number): void {
    const persistentData = this.kernelPersistentData.get(kernelId);
    if (persistentData) {
      for (const data of persistentData) {
        this.gpuDataManager.release(data.id);
      }
      this.kernelPersistentData.delete(kernelId);
    }

    this.kernelCustomData.delete(kernelId);
    this.kernels.delete(kernelId);
  }

  computeKernel(kernelId: number, context: ComputeContext, errors: Array<Promise<string | null>>): number {
    const kernel = this.kernels.get(kernelId);
    if (!kernel) {
      throw new Error(`kernel not created: ${kernelId}`);
    }
    const kernelType = kernel.kernelType;
    const kernelName = kernel.kernelName;
    const kernelEntry = kernel.kernelEntry;
    const attributes = kernel.attributes;
    if (this.currentKernelId !== null) {
      throw new Error(`kernel "[${kernelType}] ${kernelName}" is not allowed to be called recursively`);
    }
    this.currentKernelId = kernelId;

    // parse attributes if necessary
    if (attributes[0]) {
      attributes[1] = attributes[0](attributes[1]);
      attributes[0] = undefined;
    }

    LOG_DEBUG('info', () => `[WebGPU] Start to run kernel "[${kernelType}] ${kernelName}"...`);

    const useErrorScope = this.env.debug;

    this.temporaryData = [];
    try {
      if (useErrorScope) {
        this.device.pushErrorScope('validation');
      }

      kernelEntry(context, attributes[1]);
      return 0; // ORT_OK
    } catch (e) {
      errors.push(Promise.resolve(`[WebGPU] Kernel "[${kernelType}] ${kernelName}" failed. ${e}`));
      return 1; // ORT_FAIL
    } finally {
      if (useErrorScope) {
        errors.push(
          this.device
            .popErrorScope()
            .then((err) =>
              err ? `GPU validation error for kernel "[${kernelType}] ${kernelName}": ${err.message}` : null,
            ),
        );
      }

      for (const data of this.temporaryData) {
        this.gpuDataManager.release(data.id);
      }
      this.temporaryData = [];
      this.currentKernelId = null;
    }
  }

  // #region external buffer
  registerBuffer(sessionId: number, index: number, buffer: GPUBuffer, size: number): number {
    let sessionInputOutputMapping = this.sessionExternalDataMapping.get(sessionId);
    if (!sessionInputOutputMapping) {
      sessionInputOutputMapping = new Map();
      this.sessionExternalDataMapping.set(sessionId, sessionInputOutputMapping);
    }

    // the buffer may be user created, or managed by GPU data manager.
    // The GPU data manager will not manage these buffers. we register them as external buffers.
    //
    // The map `sessionInputOutputMapping` is used to store the data ID and buffer for each input/output. Once a
    // specific input/output is registered, the data ID will not change.
    const previousBuffer = sessionInputOutputMapping.get(index);
    const id = this.gpuDataManager.registerExternalBuffer(buffer, size, previousBuffer);
    sessionInputOutputMapping.set(index, [id, buffer]);
    return id;
  }
  unregisterBuffers(sessionId: number): void {
    const sessionInputOutputMapping = this.sessionExternalDataMapping.get(sessionId);
    if (sessionInputOutputMapping) {
      sessionInputOutputMapping.forEach((bufferInfo) => this.gpuDataManager.unregisterExternalBuffer(bufferInfo[0]));
      this.sessionExternalDataMapping.delete(sessionId);
    }
  }
  getBuffer(gpuDataId: number): GPUBuffer {
    const gpuData = this.gpuDataManager.get(gpuDataId);
    if (!gpuData) {
      throw new Error(`no GPU data for buffer: ${gpuDataId}`);
    }
    return gpuData.buffer;
  }
  createDownloader(
    gpuBuffer: GPUBuffer,
    size: number,
    type: Tensor.GpuBufferDataTypes,
  ): () => Promise<Tensor.DataType> {
    return async () => {
      const data = await downloadGpuData(this, gpuBuffer, size);
      return createView(data.buffer, type);
    };
  }
  // #endregion
  writeTimestamp(index: number): void {
    if (this.queryType !== 'inside-passes') {
      return;
    }

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    (this.computePassEncoder as any).writeTimestamp(this.querySet, index);
  }
  setQueryType(): void {
    this.queryType = 'none';
    if (
      this.env.webgpu.profiling?.mode === 'default' ||
      (typeof this.env.trace === 'undefined' ? this.env.wasm.trace : this.env.trace)
    ) {
      if (this.device.features.has('chromium-experimental-timestamp-query-inside-passes')) {
        this.queryType = 'inside-passes';
      } else if (this.device.features.has('timestamp-query')) {
        this.queryType = 'at-passes';
      }

      if (this.queryType !== 'none' && typeof this.querySet === 'undefined') {
        this.querySet = this.device.createQuerySet({
          type: 'timestamp',
          count: this.maxDispatchNumber * 2,
        });
        this.queryResolveBuffer = this.device.createBuffer(
          // eslint-disable-next-line no-bitwise
          { size: this.maxDispatchNumber * 2 * 8, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE },
        );
      }
    }
  }

  captureBegin(): void {
    LOG_DEBUG('info', 'captureBegin');
    if (!this.capturedCommandList.get(this.currentSessionId!)) {
      this.capturedCommandList.set(this.currentSessionId!, []);
    }
    if (!this.capturedPendingKernels.get(this.currentSessionId!)) {
      this.capturedPendingKernels.set(this.currentSessionId!, []);
    }
    // flush the left commands before we change the status.
    this.flush();
    this.sessionStatus = 'capturing';
  }
  captureEnd(): void {
    LOG_DEBUG('info', 'captureEnd');
    // flush the left commands before we change the status.
    this.flush();
    this.sessionStatus = 'default';
  }
  replay(): void {
    LOG_DEBUG('info', 'replay');
    this.sessionStatus = 'replaying';
    const sessionCommandList = this.capturedCommandList.get(this.currentSessionId!);
    const sessionPendingKernels = this.capturedPendingKernels.get(this.currentSessionId!);
    const length = sessionCommandList!.length;
    this.pendingKernels = [];
    for (let i = 0; i < length; i++) {
      const computePassEncoder = this.getComputePassEncoder();
      const command = sessionCommandList![i];
      this.writeTimestamp(this.pendingDispatchNumber * 2);
      computePassEncoder.setPipeline(command.computePipeline);
      computePassEncoder.setBindGroup(0, command.bindGroup);
      computePassEncoder.dispatchWorkgroups(...command.dispatchGroup);
      this.writeTimestamp(this.pendingDispatchNumber * 2 + 1);
      this.pendingDispatchNumber++;
      if (this.queryType !== 'none') {
        this.pendingKernels.push(sessionPendingKernels![i]);
      }
      if (this.pendingDispatchNumber >= this.maxDispatchNumber || this.queryType === 'at-passes') {
        this.endComputePass();
      }
      if (this.pendingDispatchNumber >= this.maxDispatchNumber) {
        this.flush();
      }
    }
    // flush the left commands before we change the status.
    this.flush();
    this.sessionStatus = 'default';
  }

  onCreateSession(): void {
    this.gpuDataManager.onCreateSession();
  }

  onReleaseSession(sessionId: number): void {
    this.unregisterBuffers(sessionId);
    if (this.capturedCommandList.has(sessionId)) {
      this.capturedCommandList.delete(sessionId);
    }
    if (this.capturedPendingKernels.has(sessionId)) {
      this.capturedPendingKernels.delete(sessionId);
    }
    this.gpuDataManager.onReleaseSession(sessionId);
  }

  onRunStart(sessionId: number): void {
    this.currentSessionId = sessionId;
    this.setQueryType();
  }
}