onnxruntime/js/web/lib/wasm/proxy-wrapper.ts
Yulong Wang 561aca97cf
[js/webgpu] support IO binding (#17480)
<del>
**This PR is based on a few prerequisites PRs. They are listed as
below:**
- #17465
- #17469
- #17470
- #17472
- #17473
- #17484

Please review the current change by only looking at commit
e2e6623e673ec6de55a5c1f8edcbd3a46b535a89 and later.


</del>

### Description

This PR introduces WebGPU IO binding. This new feature allows
onnxruntime-web users to use tensors created from GPU as model
input/output so that a model inferencing can be done without unnecessary
data copy between CPU and GPU for model input/output.

### Examples

An E2E demo/example is being worked on.

Following is some simple demo with code snippet.

Let's first check today how we do:
```js
// STEP.1 - create an inference session:
const mySession = await ort.InferenceSession.create('./my_model.onnx', { executionProviders: ['webgpu'] });

// STEP.2 - create model input: (supposing myImageCpuData is a Float32Array)
const feeds = {
  'input_image:0': new ort.Tensor('float32', myImageCpuData, [1, 224, 224, 3])
};

// STEP.3 - run model
const myResults = await mySession.run(feeds);

// STEP.4 - get output data
const myData = myResults['output_image:0'].data; // Float32Array

```

#### for inputs (GPU tensor):

Now, with IO binding, you can create a tensor from a GPU buffer, and
feed it to the model:
```js
// new STEP.2.A - create model input from a GPU buffer: (supposing myInputGpuBuffer is a `GPUBuffer` object with input data)
const feeds = {
  'input_image:0': ort.Tensor.fromGpuBuffer(myInputGpuBuffer, { dataType: 'float32', dims: [1, 224, 224, 3] })
};
```

### for outputs (pre-allocated GPU tensor)

you can also do that for output, **if you know the output shape**:
```js
// new STEP.2.B - create model output from a GPU buffer: (supposing myOutputGpuBuffer is a pre-allocated `GPUBuffer` object)
const fetches = {
  'output_image:0': ort.Tensor.fromGpuBuffer(myOutputGpuBuffer, { dataType: 'float32', dims: [1, 512, 512, 3] })
};

// new STEP.3 - run model with pre-allocated output (fetches)
const myResults = await mySession.run(feeds, fetches);
```

### for outputs (specify location)

if you do not know the output shape, you can specify the output location
when creating the session:

```js
// new STEP.1 - create an inference session with an option "preferredOutputLocation":
const mySession = await ort.InferenceSession.create('./my_model.onnx', {
    executionProviders: ['webgpu'],
    preferredOutputLocation: "gpu-buffer"
});
```

if the model has multiple outputs, you can specify them seperately:
```js
// new STEP.1 - create an inference session with an option "preferredOutputLocation":
const mySession = await ort.InferenceSession.create('./my_model.onnx', {
    executionProviders: ['webgpu'],
    preferredOutputLocation: {
         "output_image:0": "gpu-buffer"
    }
});
```

now you don't need to prepare the `fetches` object and onnxruntime-web
will prepare output data on the location that specified.

#### read data

when you get the output tensor, you can:
```js
// get the gpu buffer object:
const gpuBuffer = myOutputTensor.gpuBuffer; // GPUBuffer

// get the CPU data asynchronizely
const cpuData = await myOutputTensor.getData();

// get the CPU data asynchronizely and release the underlying GPU resources
const cpuData = await myOutputTensor.getData(true);

// dispose the tensor (release the underlying GPU resources). This tensor object will be invalid after dispose() is called.
myOutputTensor.dispose();
```

#### resource management

JavaScript has GC so you don't need to worry about managing JavaScript
objects. But there are 2 types of resources that are not managed by GC:
- GPU buffer that used in tensors
- Underlying ORT native resources

To simplify, most of the unmanaged resources and handled inside ORT web.
But there are a few resources that need users to manage:
- All external GPU resources, including GPU buffers inside all tensors
created by `Tensor.fromGpuBuffer()`, will not be managed by ORT. User
should manage those GPU buffers themselves.
- When a session is created with `preferredOutputLocation` ==
"gpu-buffer" specified in session options, and the corresponding output
is not pre-allocated, user need to call the output tensor's `dispose()`
or `getData(true)` to manually release the underlying GPU buffers.
- ORT internal errors (including providing a pre-allocated output tensor
with wrong type/dims) will invalidate the whole wasm memory and is not
recoverable. An exception is thrown in this situation.
2023-09-29 11:24:42 -07:00

244 lines
8.6 KiB
TypeScript

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import {Env, env, InferenceSession} from 'onnxruntime-common';
import {OrtWasmMessage, SerializableModeldata, SerializableSessionMetadata, SerializableTensorMetadata, TensorMetadata} from './proxy-messages';
import * as core from './wasm-core-impl';
import {initializeWebAssembly} from './wasm-factory';
const isProxy = (): boolean => !!env.wasm.proxy && typeof document !== 'undefined';
let proxyWorker: Worker|undefined;
let initializing = false;
let initialized = false;
let aborted = false;
// resolve; reject
type PromiseCallbacks<T = void> = [(result: T) => void, (reason: unknown) => void];
let initWasmCallbacks: PromiseCallbacks;
let initOrtCallbacks: PromiseCallbacks;
const createSessionAllocateCallbacks: Array<PromiseCallbacks<SerializableModeldata>> = [];
const createSessionFinalizeCallbacks: Array<PromiseCallbacks<SerializableSessionMetadata>> = [];
const createSessionCallbacks: Array<PromiseCallbacks<SerializableSessionMetadata>> = [];
const releaseSessionCallbacks: Array<PromiseCallbacks<void>> = [];
const runCallbacks: Array<PromiseCallbacks<SerializableTensorMetadata[]>> = [];
const endProfilingCallbacks: Array<PromiseCallbacks<void>> = [];
const ensureWorker = (): void => {
if (initializing || !initialized || aborted || !proxyWorker) {
throw new Error('worker not ready');
}
};
const onProxyWorkerMessage = (ev: MessageEvent<OrtWasmMessage>): void => {
switch (ev.data.type) {
case 'init-wasm':
initializing = false;
if (ev.data.err) {
aborted = true;
initWasmCallbacks[1](ev.data.err);
} else {
initialized = true;
initWasmCallbacks[0]();
}
break;
case 'init-ort':
if (ev.data.err) {
initOrtCallbacks[1](ev.data.err);
} else {
initOrtCallbacks[0]();
}
break;
case 'create_allocate':
if (ev.data.err) {
createSessionAllocateCallbacks.shift()![1](ev.data.err);
} else {
createSessionAllocateCallbacks.shift()![0](ev.data.out!);
}
break;
case 'create_finalize':
if (ev.data.err) {
createSessionFinalizeCallbacks.shift()![1](ev.data.err);
} else {
createSessionFinalizeCallbacks.shift()![0](ev.data.out!);
}
break;
case 'create':
if (ev.data.err) {
createSessionCallbacks.shift()![1](ev.data.err);
} else {
createSessionCallbacks.shift()![0](ev.data.out!);
}
break;
case 'release':
if (ev.data.err) {
releaseSessionCallbacks.shift()![1](ev.data.err);
} else {
releaseSessionCallbacks.shift()![0]();
}
break;
case 'run':
if (ev.data.err) {
runCallbacks.shift()![1](ev.data.err);
} else {
runCallbacks.shift()![0](ev.data.out!);
}
break;
case 'end-profiling':
if (ev.data.err) {
endProfilingCallbacks.shift()![1](ev.data.err);
} else {
endProfilingCallbacks.shift()![0]();
}
break;
default:
}
};
const scriptSrc = typeof document !== 'undefined' ? (document?.currentScript as HTMLScriptElement)?.src : undefined;
export const initializeWebAssemblyInstance = async(): Promise<void> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
if (initialized) {
return;
}
if (initializing) {
throw new Error('multiple calls to \'initWasm()\' detected.');
}
if (aborted) {
throw new Error('previous call to \'initWasm()\' failed.');
}
initializing = true;
// overwrite wasm filepaths
if (env.wasm.wasmPaths === undefined) {
if (scriptSrc && scriptSrc.indexOf('blob:') !== 0) {
env.wasm.wasmPaths = scriptSrc.substr(0, +(scriptSrc).lastIndexOf('/') + 1);
}
}
return new Promise<void>((resolve, reject) => {
proxyWorker?.terminate();
// eslint-disable-next-line @typescript-eslint/no-var-requires, @typescript-eslint/no-require-imports
proxyWorker = require('worker-loader?inline=no-fallback!./proxy-worker/main').default() as Worker;
proxyWorker.onmessage = onProxyWorkerMessage;
initWasmCallbacks = [resolve, reject];
const message: OrtWasmMessage = {type: 'init-wasm', in : env.wasm};
proxyWorker.postMessage(message);
});
} else {
return initializeWebAssembly(env.wasm);
}
};
export const initializeRuntime = async(env: Env): Promise<void> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
ensureWorker();
return new Promise<void>((resolve, reject) => {
initOrtCallbacks = [resolve, reject];
const message: OrtWasmMessage = {type: 'init-ort', in : env};
proxyWorker!.postMessage(message);
});
} else {
await core.initRuntime(env);
}
};
export const createSessionAllocate = async(model: Uint8Array): Promise<SerializableModeldata> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
ensureWorker();
return new Promise<SerializableModeldata>((resolve, reject) => {
createSessionAllocateCallbacks.push([resolve, reject]);
const message: OrtWasmMessage = {type: 'create_allocate', in : {model}};
proxyWorker!.postMessage(message, [model.buffer]);
});
} else {
return core.createSessionAllocate(model);
}
};
export const createSessionFinalize = async(modeldata: SerializableModeldata, options?: InferenceSession.SessionOptions):
Promise<SerializableSessionMetadata> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
ensureWorker();
return new Promise<SerializableSessionMetadata>((resolve, reject) => {
createSessionFinalizeCallbacks.push([resolve, reject]);
const message: OrtWasmMessage = {type: 'create_finalize', in : {modeldata, options}};
proxyWorker!.postMessage(message);
});
} else {
return core.createSessionFinalize(modeldata, options);
}
};
export const createSession =
async(model: Uint8Array, options?: InferenceSession.SessionOptions): Promise<SerializableSessionMetadata> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
// check unsupported options
if (options?.preferredOutputLocation) {
throw new Error('session option "preferredOutputLocation" is not supported for proxy.');
}
ensureWorker();
return new Promise<SerializableSessionMetadata>((resolve, reject) => {
createSessionCallbacks.push([resolve, reject]);
const message: OrtWasmMessage = {type: 'create', in : {model, options}};
proxyWorker!.postMessage(message, [model.buffer]);
});
} else {
return core.createSession(model, options);
}
};
export const releaseSession = async(sessionId: number): Promise<void> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
ensureWorker();
return new Promise<void>((resolve, reject) => {
releaseSessionCallbacks.push([resolve, reject]);
const message: OrtWasmMessage = {type: 'release', in : sessionId};
proxyWorker!.postMessage(message);
});
} else {
core.releaseSession(sessionId);
}
};
export const run = async(
sessionId: number, inputIndices: number[], inputs: TensorMetadata[], outputIndices: number[],
outputs: Array<TensorMetadata|null>, options: InferenceSession.RunOptions): Promise<TensorMetadata[]> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
// check inputs location
if (inputs.some(t => t[3] !== 'cpu')) {
throw new Error('input tensor on GPU is not supported for proxy.');
}
// check outputs location
if (outputs.some(t => t)) {
throw new Error('pre-allocated output tensor is not supported for proxy.');
}
ensureWorker();
return new Promise<SerializableTensorMetadata[]>((resolve, reject) => {
runCallbacks.push([resolve, reject]);
const serializableInputs = inputs as SerializableTensorMetadata[]; // every input is on CPU.
const message: OrtWasmMessage =
{type: 'run', in : {sessionId, inputIndices, inputs: serializableInputs, outputIndices, options}};
proxyWorker!.postMessage(message, core.extractTransferableBuffers(serializableInputs));
});
} else {
return core.run(sessionId, inputIndices, inputs, outputIndices, outputs, options);
}
};
export const endProfiling = async(sessionId: number): Promise<void> => {
if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
ensureWorker();
return new Promise<void>((resolve, reject) => {
endProfilingCallbacks.push([resolve, reject]);
const message: OrtWasmMessage = {type: 'end-profiling', in : sessionId};
proxyWorker!.postMessage(message);
});
} else {
core.endProfiling(sessionId);
}
};