onnxruntime/js/web/lib/wasm/proxy-wrapper.ts


// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import {env, InferenceSession} from 'onnxruntime-common';
import {OrtWasmMessage, SerializableInternalBuffer, SerializableSessionMetadata, SerializableTensorMetadata, TensorMetadata} from './proxy-messages';
import * as core from './wasm-core-impl';
import {initializeWebAssembly} from './wasm-factory';
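
/**
 * This module wraps the calls into the core implementation in './wasm-core-impl'. When `env.wasm.proxy` is
 * enabled and a `document` object is available (i.e. running on the main thread of a browser), every call is
 * forwarded to a dedicated proxy web worker via `postMessage`; otherwise it calls the core implementation
 * directly in the current thread.
 */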
const isProxy = (): boolean => !!env.wasm.proxy && typeof document !== 'undefined';
let proxyWorker: Worker|undefined;
let initializing = false;  // an 'init-wasm' call is in progress
let initialized = false;   // 'init-wasm' completed successfully
let aborted = false;       // 'init-wasm' failed; no further calls are accepted
type PromiseCallbacks<T = void> = [resolve: (result: T) => void, reject: (reason: unknown) => void];
let initWasmCallbacks: PromiseCallbacks;
const queuedCallbacks: Map<OrtWasmMessage['type'], Array<PromiseCallbacks<unknown>>> = new Map();
const enqueueCallbacks = (type: OrtWasmMessage['type'], callbacks: PromiseCallbacks<unknown>): void => {
  const queue = queuedCallbacks.get(type);
  if (queue) {
    queue.push(callbacks);
  } else {
    queuedCallbacks.set(type, [callbacks]);
  }
};
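
/**
 * Ensure the proxy worker is ready to accept messages; throws if initialization has not completed successfully
 * or if the worker is missing.
 */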
const ensureWorker = (): void => {
  if (initializing || !initialized || aborted || !proxyWorker) {
    throw new Error('worker not ready');
  }
};
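
// Handler for messages posted back from the proxy worker: resolves or rejects the corresponding queued promise.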
const onProxyWorkerMessage = (ev: MessageEvent<OrtWasmMessage>): void => {
  switch (ev.data.type) {
    case 'init-wasm':
      initializing = false;
      if (ev.data.err) {
        aborted = true;
        initWasmCallbacks[1](ev.data.err);
      } else {
        initialized = true;
        initWasmCallbacks[0]();
      }
      break;
    case 'init-ep':
    case 'copy-from':
    case 'create':
    case 'release':
    case 'run':
    case 'end-profiling': {
      const callbacks = queuedCallbacks.get(ev.data.type)!;
      if (ev.data.err) {
        callbacks.shift()![1](ev.data.err);
      } else {
        callbacks.shift()![0](ev.data.out!);
      }
      break;
    }
    default:
  }
};
const scriptSrc = typeof document !== 'undefined' ? (document?.currentScript as HTMLScriptElement)?.src : undefined;
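
/**
 * Initialize the WebAssembly module and the ONNX Runtime wasm runtime.
 *
 * In proxy mode this spawns the proxy worker (terminating any existing one) and sends an 'init-wasm' message;
 * the returned promise settles when the worker replies. Otherwise the wasm module and the runtime are
 * initialized directly in the current thread.
 */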
export const initializeWebAssemblyAndOrtRuntime = async(): Promise<void> => {
  if (initialized) {
    return;
  }
  if (initializing) {
    throw new Error('multiple calls to \'initWasm()\' detected.');
  }
  if (aborted) {
    throw new Error('previous call to \'initWasm()\' failed.');
  }

  initializing = true;

  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    // overwrite wasm filepaths
    if (env.wasm.wasmPaths === undefined) {
      if (scriptSrc && scriptSrc.indexOf('blob:') !== 0) {
        env.wasm.wasmPaths = scriptSrc.substring(0, scriptSrc.lastIndexOf('/') + 1);
      }
    }

    return new Promise<void>((resolve, reject) => {
      proxyWorker?.terminate();

      const workerUrl = URL.createObjectURL(new Blob(
          [
            // This require() function is handled by esbuild plugin to load file content as string.
            // eslint-disable-next-line @typescript-eslint/no-require-imports
            require('./proxy-worker/main')
          ],
          {type: 'text/javascript'}));
      proxyWorker = new Worker(workerUrl, {name: 'ort-wasm-proxy-worker'});
      proxyWorker.onerror = (ev: ErrorEvent) => reject(ev);
      proxyWorker.onmessage = onProxyWorkerMessage;
      URL.revokeObjectURL(workerUrl);
      initWasmCallbacks = [resolve, reject];
      const message: OrtWasmMessage = {type: 'init-wasm', in : env};
      proxyWorker.postMessage(message);
    });
  } else {
    try {
      await initializeWebAssembly(env.wasm);
      await core.initRuntime(env);
      initialized = true;
    } catch (e) {
      aborted = true;
      throw e;
    } finally {
      initializing = false;
    }
  }
};
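
/**
 * Initialize the execution provider identified by `epName`, either through the proxy worker ('init-ep' message)
 * or by calling the core implementation directly.
 */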
export const initializeOrtEp = async(epName: string): Promise<void> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<void>((resolve, reject) => {
      enqueueCallbacks('init-ep', [resolve, reject]);
      const message: OrtWasmMessage = {type: 'init-ep', in : {epName, env}};
      proxyWorker!.postMessage(message);
    });
  } else {
    await core.initEp(env, epName);
  }
};
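
/**
 * Copy a model buffer from the caller into the wasm heap. In proxy mode the underlying ArrayBuffer is
 * transferred (not copied) to the worker, so the caller must not use `buffer` afterwards.
 */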
export const copyFromExternalBuffer = async(buffer: Uint8Array): Promise<SerializableInternalBuffer> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<SerializableInternalBuffer>((resolve, reject) => {
      enqueueCallbacks('copy-from', [resolve, reject]);
      const message: OrtWasmMessage = {type: 'copy-from', in : {buffer}};
      proxyWorker!.postMessage(message, [buffer.buffer]);
    });
  } else {
    return core.copyFromExternalBuffer(buffer);
  }
};
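
/**
 * Create an inference session from a model, given either as raw bytes or as an internal buffer handle returned
 * by `copyFromExternalBuffer()`. In proxy mode a raw `Uint8Array` model is transferred to the worker, and the
 * `preferredOutputLocation` session option is rejected because it is not supported there.
 */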
export const createSession = async(
    model: SerializableInternalBuffer|Uint8Array,
    options?: InferenceSession.SessionOptions): Promise<SerializableSessionMetadata> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    // check unsupported options
    if (options?.preferredOutputLocation) {
      throw new Error('session option "preferredOutputLocation" is not supported for proxy.');
    }
    ensureWorker();
    return new Promise<SerializableSessionMetadata>((resolve, reject) => {
      enqueueCallbacks('create', [resolve, reject]);
      const message: OrtWasmMessage = {type: 'create', in : {model, options}};
      const transferable: Transferable[] = [];
      if (model instanceof Uint8Array) {
        transferable.push(model.buffer);
      }
      proxyWorker!.postMessage(message, transferable);
    });
  } else {
    return core.createSession(model, options);
  }
};
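
/**
 * Release the native resources associated with the given session.
 */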
export const releaseSession = async(sessionId: number): Promise<void> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<void>((resolve, reject) => {
      enqueueCallbacks('release', [resolve, reject]);
      const message: OrtWasmMessage = {type: 'release', in : sessionId};
      proxyWorker!.postMessage(message);
    });
  } else {
    core.releaseSession(sessionId);
  }
};
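
/**
 * Run inference for the given session. `inputs` and `outputs` carry tensor metadata; in proxy mode all inputs
 * must be on CPU, outputs must not be pre-allocated, and transferable input buffers are moved to the worker.
 */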
export const run = async(
    sessionId: number, inputIndices: number[], inputs: TensorMetadata[], outputIndices: number[],
    outputs: Array<TensorMetadata|null>, options: InferenceSession.RunOptions): Promise<TensorMetadata[]> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    // check inputs location
    if (inputs.some(t => t[3] !== 'cpu')) {
      throw new Error('input tensor on GPU is not supported for proxy.');
    }
    // check outputs location
    if (outputs.some(t => t)) {
      throw new Error('pre-allocated output tensor is not supported for proxy.');
    }
    ensureWorker();
    return new Promise<SerializableTensorMetadata[]>((resolve, reject) => {
      enqueueCallbacks('run', [resolve, reject]);
      const serializableInputs = inputs as SerializableTensorMetadata[];  // every input is on CPU.
      const message: OrtWasmMessage =
          {type: 'run', in : {sessionId, inputIndices, inputs: serializableInputs, outputIndices, options}};
      proxyWorker!.postMessage(message, core.extractTransferableBuffers(serializableInputs));
    });
  } else {
    return core.run(sessionId, inputIndices, inputs, outputIndices, outputs, options);
  }
};
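
/**
 * End profiling for the given session.
 */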
export const endProfiling = async(sessionId: number): Promise<void> => {
  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
    ensureWorker();
    return new Promise<void>((resolve, reject) => {
      enqueueCallbacks('end-profiling', [resolve, reject]);
      const message: OrtWasmMessage = {type: 'end-profiling', in : sessionId};
      proxyWorker!.postMessage(message);
    });
  } else {
    core.endProfiling(sessionId);
  }
};
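
// A minimal usage sketch for the exports above (hypothetical caller code; the EP name 'wasm', the variable
// names, and the shape of the returned metadata are assumptions for illustration, not defined in this file):
//
//   await initializeWebAssemblyAndOrtRuntime();
//   await initializeOrtEp('wasm');
//   const sessionMetadata = await createSession(modelBytes, {});   // modelBytes: Uint8Array
//   const results = await run(sessionId, inputIndices, inputTensors, outputIndices, [null], {});
//   await releaseSession(sessionId);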