mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
<del> **This PR is based on a few prerequisites PRs. They are listed as below:** - #17465 - #17469 - #17470 - #17472 - #17473 - #17484 Please review the current change by only looking at commit e2e6623e673ec6de55a5c1f8edcbd3a46b535a89 and later. </del> ### Description This PR introduces WebGPU IO binding. This new feature allows onnxruntime-web users to use tensors created from GPU as model input/output so that a model inferencing can be done without unnecessary data copy between CPU and GPU for model input/output. ### Examples An E2E demo/example is being worked on. Following is some simple demo with code snippet. Let's first check today how we do: ```js // STEP.1 - create an inference session: const mySession = await ort.InferenceSession.create('./my_model.onnx', { executionProviders: ['webgpu'] }); // STEP.2 - create model input: (supposing myImageCpuData is a Float32Array) const feeds = { 'input_image:0': new ort.Tensor('float32', myImageCpuData, [1, 224, 224, 3]) }; // STEP.3 - run model const myResults = await mySession.run(feeds); // STEP.4 - get output data const myData = myResults['output_image:0'].data; // Float32Array ``` #### for inputs (GPU tensor): Now, with IO binding, you can create a tensor from a GPU buffer, and feed it to the model: ```js // new STEP.2.A - create model input from a GPU buffer: (supposing myInputGpuBuffer is a `GPUBuffer` object with input data) const feeds = { 'input_image:0': ort.Tensor.fromGpuBuffer(myInputGpuBuffer, { dataType: 'float32', dims: [1, 224, 224, 3] }) }; ``` ### for outputs (pre-allocated GPU tensor) you can also do that for output, **if you know the output shape**: ```js // new STEP.2.B - create model output from a GPU buffer: (supposing myOutputGpuBuffer is a pre-allocated `GPUBuffer` object) const fetches = { 'output_image:0': ort.Tensor.fromGpuBuffer(myOutputGpuBuffer, { dataType: 'float32', dims: [1, 512, 512, 3] }) }; // new STEP.3 - run model with pre-allocated output (fetches) const myResults = await 
mySession.run(feeds, fetches); ``` ### for outputs (specify location) if you do not know the output shape, you can specify the output location when creating the session: ```js // new STEP.1 - create an inference session with an option "preferredOutputLocation": const mySession = await ort.InferenceSession.create('./my_model.onnx', { executionProviders: ['webgpu'], preferredOutputLocation: "gpu-buffer" }); ``` if the model has multiple outputs, you can specify them separately: ```js // new STEP.1 - create an inference session with an option "preferredOutputLocation": const mySession = await ort.InferenceSession.create('./my_model.onnx', { executionProviders: ['webgpu'], preferredOutputLocation: { "output_image:0": "gpu-buffer" } }); ``` now you don't need to prepare the `fetches` object and onnxruntime-web will prepare output data at the specified location. #### read data when you get the output tensor, you can: ```js // get the gpu buffer object: const gpuBuffer = myOutputTensor.gpuBuffer; // GPUBuffer // get the CPU data asynchronously const cpuData = await myOutputTensor.getData(); // get the CPU data asynchronously and release the underlying GPU resources const cpuData = await myOutputTensor.getData(true); // dispose the tensor (release the underlying GPU resources). This tensor object will be invalid after dispose() is called. myOutputTensor.dispose(); ``` #### resource management JavaScript has GC so you don't need to worry about managing JavaScript objects. But there are 2 types of resources that are not managed by GC: - GPU buffers that are used in tensors - Underlying ORT native resources To simplify, most of the unmanaged resources are handled inside ORT web. But there are a few resources that need users to manage: - All external GPU resources, including GPU buffers inside all tensors created by `Tensor.fromGpuBuffer()`, will not be managed by ORT. Users should manage those GPU buffers themselves. 
- When a session is created with `preferredOutputLocation` == "gpu-buffer" specified in session options, and the corresponding output is not pre-allocated, the user needs to call the output tensor's `dispose()` or `getData(true)` to manually release the underlying GPU buffers. - ORT internal errors (including providing a pre-allocated output tensor with wrong type/dims) will invalidate the whole wasm memory and are not recoverable. An exception is thrown in this situation.
244 lines
8.6 KiB
TypeScript
244 lines
8.6 KiB
TypeScript
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
import {Env, env, InferenceSession} from 'onnxruntime-common';
|
|
|
|
import {OrtWasmMessage, SerializableModeldata, SerializableSessionMetadata, SerializableTensorMetadata, TensorMetadata} from './proxy-messages';
|
|
import * as core from './wasm-core-impl';
|
|
import {initializeWebAssembly} from './wasm-factory';
|
|
|
|
/**
 * Tells whether calls should be routed through the proxy worker.
 *
 * @returns `true` only when `env.wasm.proxy` is truthy and a global `document` exists.
 */
const isProxy = (): boolean => {
  const proxyRequested = Boolean(env.wasm.proxy);
  return proxyRequested && typeof document !== 'undefined';
};

// The proxy worker instance; assigned when the WebAssembly instance is initialized in proxy mode.
let proxyWorker: Worker|undefined;

// Set when the 'init-wasm' request is sent; cleared when its response arrives.
let initializing = false;

// Set once an 'init-wasm' response without an error has been received.
let initialized = false;

// Set once an 'init-wasm' response carrying an error has been received.
let aborted = false;
|
|
|
|
/**
 * The settle functions of a pending promise, stored as a pair:
 * index 0 resolves the promise, index 1 rejects it.
 */
type PromiseCallbacks<T = void> = [(result: T) => void, (reason: unknown) => void];

// Single-slot holders for the two initialization requests ('init-wasm' and 'init-ort').
let initWasmCallbacks: PromiseCallbacks<void>;
let initOrtCallbacks: PromiseCallbacks<void>;

// One queue per request type. A pair is pushed when a request is posted to the worker and shifted off the front when
// a response of the matching type arrives.
const createSessionAllocateCallbacks: Array<PromiseCallbacks<SerializableModeldata>> = [];
const createSessionFinalizeCallbacks: Array<PromiseCallbacks<SerializableSessionMetadata>> = [];
const createSessionCallbacks: Array<PromiseCallbacks<SerializableSessionMetadata>> = [];
const releaseSessionCallbacks: Array<PromiseCallbacks> = [];
const runCallbacks: Array<PromiseCallbacks<SerializableTensorMetadata[]>> = [];
const endProfilingCallbacks: Array<PromiseCallbacks> = [];
|
|
|
|
/**
 * Guard to call before posting a message to the proxy worker.
 *
 * @throws Error with message 'worker not ready' when initialization is still in flight, has not completed, has
 *     aborted, or no worker instance exists.
 */
const ensureWorker = (): void => {
  const ready = !initializing && initialized && !aborted && !!proxyWorker;
  if (!ready) {
    throw new Error('worker not ready');
  }
};
|
|
|
|
/**
 * Handles a response message coming back from the proxy worker and settles the promise of the matching request.
 *
 * For the two initialization messages the single stored callback pair is used. For every other message type the
 * oldest pending callback pair is taken from the corresponding queue; the promise is rejected with `ev.data.err`
 * when an error is reported and resolved otherwise. Every dequeued pair is always invoked, so that no caller is left
 * waiting on a promise that never settles.
 *
 * NOTE(review): the result payload is read from `ev.data.out`, mirroring the `in` field used on requests - confirm
 * against the `OrtWasmMessage` definition in './proxy-messages'.
 *
 * @param ev - the message event received from the proxy worker.
 */
const onProxyWorkerMessage = (ev: MessageEvent<OrtWasmMessage>): void => {
  switch (ev.data.type) {
    case 'init-wasm':
      // whatever the outcome, initialization is no longer in flight
      initializing = false;
      if (ev.data.err) {
        aborted = true;
        initWasmCallbacks[1](ev.data.err);
      } else {
        initialized = true;
        initWasmCallbacks[0]();
      }
      break;
    case 'init-ort':
      if (ev.data.err) {
        initOrtCallbacks[1](ev.data.err);
      } else {
        initOrtCallbacks[0]();
      }
      break;
    case 'create_allocate':
      if (ev.data.err) {
        createSessionAllocateCallbacks.shift()![1](ev.data.err);
      } else {
        createSessionAllocateCallbacks.shift()![0](ev.data.out!);
      }
      break;
    case 'create_finalize':
      if (ev.data.err) {
        createSessionFinalizeCallbacks.shift()![1](ev.data.err);
      } else {
        createSessionFinalizeCallbacks.shift()![0](ev.data.out!);
      }
      break;
    case 'create':
      if (ev.data.err) {
        createSessionCallbacks.shift()![1](ev.data.err);
      } else {
        createSessionCallbacks.shift()![0](ev.data.out!);
      }
      break;
    case 'release':
      if (ev.data.err) {
        releaseSessionCallbacks.shift()![1](ev.data.err);
      } else {
        releaseSessionCallbacks.shift()![0]();
      }
      break;
    case 'run':
      if (ev.data.err) {
        runCallbacks.shift()![1](ev.data.err);
      } else {
        runCallbacks.shift()![0](ev.data.out!);
      }
      break;
    case 'end-profiling':
      if (ev.data.err) {
        endProfilingCallbacks.shift()![1](ev.data.err);
      } else {
        endProfilingCallbacks.shift()![0]();
      }
      break;
    default:
  }
};
|
|
|
|
// URL of the currently executing <script> element; undefined when there is no global `document`.
const scriptSrc =
    typeof document === 'undefined' ? undefined : (document?.currentScript as HTMLScriptElement)?.src;
|
|
|
|
/**
 * Initializes the WebAssembly instance, either inside a proxy worker or directly.
 *
 * In proxy mode this is a no-op when initialization already succeeded. Otherwise a new worker is created (any
 * existing one is terminated first), an 'init-wasm' message carrying `env.wasm` is posted to it, and the returned
 * promise settles when the worker's 'init-wasm' response is handled. Outside proxy mode the call is forwarded to
 * `initializeWebAssembly(env.wasm)`.
 *
 * @throws Error when called in proxy mode while an initialization is in flight, or after one has failed.
 */
export const initializeWebAssemblyInstance = async(): Promise<void> => {
  // The if/else shape is kept as-is so the worker `require` stays inside the proxy-only branch.
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    if (initialized) {
      return;
    }
    if (initializing) {
      throw new Error('multiple calls to \'initWasm()\' detected.');
    }
    if (aborted) {
      throw new Error('previous call to \'initWasm()\' failed.');
    }

    initializing = true;

    // overwrite wasm filepaths: when none is configured, default to the folder of the current script, unless that
    // script was loaded from a blob URL
    if (env.wasm.wasmPaths === undefined && scriptSrc && scriptSrc.indexOf('blob:') !== 0) {
      const folderEnd = scriptSrc.lastIndexOf('/') + 1;
      env.wasm.wasmPaths = scriptSrc.slice(0, folderEnd);
    }

    return new Promise<void>((onReady, onFailure) => {
      proxyWorker?.terminate();
      // eslint-disable-next-line @typescript-eslint/no-var-requires, @typescript-eslint/no-require-imports
      proxyWorker = require('worker-loader?inline=no-fallback!./proxy-worker/main').default() as Worker;
      proxyWorker.onmessage = onProxyWorkerMessage;
      initWasmCallbacks = [onReady, onFailure];
      const request: OrtWasmMessage = {type: 'init-wasm', in : env.wasm};
      proxyWorker.postMessage(request);
    });

  } else {
    return initializeWebAssembly(env.wasm);
  }
};
|
|
|
|
/**
 * Initializes the ORT runtime with the given environment.
 *
 * In proxy mode an 'init-ort' message carrying `env` is posted to the worker and the returned promise settles when
 * the matching response is handled. Otherwise `core.initRuntime(env)` is awaited directly.
 *
 * @param env - the environment object passed on to the runtime.
 * @throws Error('worker not ready') in proxy mode when the worker is not usable.
 */
export const initializeRuntime = async(env: Env): Promise<void> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<void>((onDone, onError) => {
      const request: OrtWasmMessage = {type: 'init-ort', in : env};
      initOrtCallbacks = [onDone, onError];
      proxyWorker!.postMessage(request);
    });
  } else {
    await core.initRuntime(env);
  }
};
|
|
|
|
/**
 * First step of two-step session creation: hands the model bytes over.
 *
 * In proxy mode a 'create_allocate' message is posted to the worker, with `model.buffer` in the transfer list, and
 * the returned promise is queued until the matching response is handled. Otherwise the call is forwarded to
 * `core.createSessionAllocate(model)`.
 *
 * @param model - the model bytes.
 * @returns the model data handle produced by the allocation step.
 * @throws Error('worker not ready') in proxy mode when the worker is not usable.
 */
export const createSessionAllocate = async(model: Uint8Array): Promise<SerializableModeldata> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<SerializableModeldata>((onDone, onError) => {
      const request: OrtWasmMessage = {type: 'create_allocate', in : {model}};
      createSessionAllocateCallbacks.push([onDone, onError]);
      proxyWorker!.postMessage(request, [model.buffer]);
    });
  } else {
    return core.createSessionAllocate(model);
  }
};
|
|
|
|
/**
 * Second step of two-step session creation: builds the session from previously allocated model data.
 *
 * In proxy mode a 'create_finalize' message is posted to the worker and the returned promise is queued until the
 * matching response is handled. Otherwise the call is forwarded to `core.createSessionFinalize(modeldata, options)`.
 *
 * NOTE(review): unlike `createSession`, this path does not reject `options.preferredOutputLocation` in proxy mode -
 * confirm whether that difference is intentional.
 *
 * @param modeldata - the model data handle returned by the allocation step.
 * @param options - optional session options.
 * @returns the metadata of the created session.
 * @throws Error('worker not ready') in proxy mode when the worker is not usable.
 */
export const createSessionFinalize = async(modeldata: SerializableModeldata, options?: InferenceSession.SessionOptions):
    Promise<SerializableSessionMetadata> => {
      if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
        ensureWorker();
        return new Promise<SerializableSessionMetadata>((onDone, onError) => {
          const request: OrtWasmMessage = {type: 'create_finalize', in : {modeldata, options}};
          createSessionFinalizeCallbacks.push([onDone, onError]);
          proxyWorker!.postMessage(request);
        });
      } else {
        return core.createSessionFinalize(modeldata, options);
      }
    };
|
|
|
|
/**
 * Creates a session from model bytes in a single step.
 *
 * In proxy mode a 'create' message is posted to the worker, with `model.buffer` in the transfer list, and the
 * returned promise is queued until the matching response is handled. Otherwise the call is forwarded to
 * `core.createSession(model, options)`.
 *
 * @param model - the model bytes.
 * @param options - optional session options; `preferredOutputLocation` is rejected in proxy mode.
 * @returns the metadata of the created session.
 * @throws Error in proxy mode when `options.preferredOutputLocation` is set, or when the worker is not usable.
 */
export const createSession =
    async(model: Uint8Array, options?: InferenceSession.SessionOptions): Promise<SerializableSessionMetadata> => {
      if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
        // check unsupported options (done before the worker readiness check)
        const requestedOutputLocation = options?.preferredOutputLocation;
        if (requestedOutputLocation) {
          throw new Error('session option "preferredOutputLocation" is not supported for proxy.');
        }
        ensureWorker();
        return new Promise<SerializableSessionMetadata>((onDone, onError) => {
          const request: OrtWasmMessage = {type: 'create', in : {model, options}};
          createSessionCallbacks.push([onDone, onError]);
          proxyWorker!.postMessage(request, [model.buffer]);
        });
      } else {
        return core.createSession(model, options);
      }
    };
|
|
|
|
/**
 * Releases the session identified by `sessionId`.
 *
 * In proxy mode a 'release' message is posted to the worker and the returned promise is queued until the matching
 * response is handled. Otherwise `core.releaseSession(sessionId)` is called directly (its result is not awaited).
 *
 * @param sessionId - the ID of the session to release.
 * @throws Error('worker not ready') in proxy mode when the worker is not usable.
 */
export const releaseSession = async(sessionId: number): Promise<void> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<void>((onDone, onError) => {
      const request: OrtWasmMessage = {type: 'release', in : sessionId};
      releaseSessionCallbacks.push([onDone, onError]);
      proxyWorker!.postMessage(request);
    });
  } else {
    core.releaseSession(sessionId);
  }
};
|
|
|
|
/**
 * Runs inference on a session.
 *
 * In proxy mode every input must have 'cpu' at index 3 of its metadata and no output may be pre-allocated. A 'run'
 * message is then posted to the worker, using the buffers returned by `core.extractTransferableBuffers()` as the
 * transfer list, and the returned promise is queued until the matching response is handled. `outputs` is not part of
 * the message. Otherwise the call is forwarded to `core.run()` with all arguments.
 *
 * @param sessionId - the ID of the session to run.
 * @param inputIndices - indices of the inputs being fed.
 * @param inputs - metadata of the input tensors.
 * @param outputIndices - indices of the requested outputs.
 * @param outputs - pre-allocated output tensors, or falsy entries where none is provided.
 * @param options - run options.
 * @returns metadata of the output tensors.
 * @throws Error in proxy mode when an input is not on CPU, when any output is pre-allocated, or when the worker is
 *     not usable.
 */
export const run = async(
    sessionId: number, inputIndices: number[], inputs: TensorMetadata[], outputIndices: number[],
    outputs: Array<TensorMetadata|null>, options: InferenceSession.RunOptions): Promise<TensorMetadata[]> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    // check inputs location
    const allInputsOnCpu = inputs.every(tensor => tensor[3] === 'cpu');
    if (!allInputsOnCpu) {
      throw new Error('input tensor on GPU is not supported for proxy.');
    }
    // check outputs location
    const hasPreallocatedOutput = outputs.some(Boolean);
    if (hasPreallocatedOutput) {
      throw new Error('pre-allocated output tensor is not supported for proxy.');
    }
    ensureWorker();
    return new Promise<SerializableTensorMetadata[]>((onDone, onError) => {
      runCallbacks.push([onDone, onError]);
      // every input is on CPU (verified above), so the inputs can be treated as serializable.
      const cpuInputs = inputs as SerializableTensorMetadata[];
      const request: OrtWasmMessage =
          {type: 'run', in : {sessionId, inputIndices, inputs: cpuInputs, outputIndices, options}};
      const transferList = core.extractTransferableBuffers(cpuInputs);
      proxyWorker!.postMessage(request, transferList);
    });
  } else {
    return core.run(sessionId, inputIndices, inputs, outputIndices, outputs, options);
  }
};
|
|
|
|
/**
 * Ends profiling for the session identified by `sessionId`.
 *
 * In proxy mode an 'end-profiling' message is posted to the worker and the returned promise is queued until the
 * matching response is handled. Otherwise `core.endProfiling(sessionId)` is called directly (its result is not
 * awaited).
 *
 * @param sessionId - the ID of the session being profiled.
 * @throws Error('worker not ready') in proxy mode when the worker is not usable.
 */
export const endProfiling = async(sessionId: number): Promise<void> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<void>((onDone, onError) => {
      const request: OrtWasmMessage = {type: 'end-profiling', in : sessionId};
      endProfilingCallbacks.push([onDone, onError]);
      proxyWorker!.postMessage(request);
    });
  } else {
    core.endProfiling(sessionId);
  }
};
|