onnxruntime/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
Yulong Wang 561aca97cf
[js/webgpu] support IO binding (#17480)
<del>
**This PR is based on a few prerequisites PRs. They are listed as
below:**
- #17465
- #17469
- #17470
- #17472
- #17473
- #17484

Please review the current change by only looking at commit
e2e6623e673ec6de55a5c1f8edcbd3a46b535a89 and later.


</del>

### Description

This PR introduces WebGPU IO binding. This new feature allows
onnxruntime-web users to use tensors created from GPU as model
input/output so that a model inferencing can be done without unnecessary
data copy between CPU and GPU for model input/output.

### Examples

An E2E demo/example is being worked on.

Following is some simple demo with code snippet.

Let's first look at how this is done today:
```js
// STEP.1 - create an inference session:
const mySession = await ort.InferenceSession.create('./my_model.onnx', { executionProviders: ['webgpu'] });

// STEP.2 - create model input: (supposing myImageCpuData is a Float32Array)
const feeds = {
  'input_image:0': new ort.Tensor('float32', myImageCpuData, [1, 224, 224, 3])
};

// STEP.3 - run model
const myResults = await mySession.run(feeds);

// STEP.4 - get output data
const myData = myResults['output_image:0'].data; // Float32Array

```

#### for inputs (GPU tensor):

Now, with IO binding, you can create a tensor from a GPU buffer, and
feed it to the model:
```js
// new STEP.2.A - create model input from a GPU buffer: (supposing myInputGpuBuffer is a `GPUBuffer` object with input data)
const feeds = {
  'input_image:0': ort.Tensor.fromGpuBuffer(myInputGpuBuffer, { dataType: 'float32', dims: [1, 224, 224, 3] })
};
```

#### for outputs (pre-allocated GPU tensor)

you can also do that for output, **if you know the output shape**:
```js
// new STEP.2.B - create model output from a GPU buffer: (supposing myOutputGpuBuffer is a pre-allocated `GPUBuffer` object)
const fetches = {
  'output_image:0': ort.Tensor.fromGpuBuffer(myOutputGpuBuffer, { dataType: 'float32', dims: [1, 512, 512, 3] })
};

// new STEP.3 - run model with pre-allocated output (fetches)
const myResults = await mySession.run(feeds, fetches);
```

#### for outputs (specify location)

if you do not know the output shape, you can specify the output location
when creating the session:

```js
// new STEP.1 - create an inference session with an option "preferredOutputLocation":
const mySession = await ort.InferenceSession.create('./my_model.onnx', {
    executionProviders: ['webgpu'],
    preferredOutputLocation: "gpu-buffer"
});
```

if the model has multiple outputs, you can specify them separately:
```js
// new STEP.1 - create an inference session with an option "preferredOutputLocation":
const mySession = await ort.InferenceSession.create('./my_model.onnx', {
    executionProviders: ['webgpu'],
    preferredOutputLocation: {
         "output_image:0": "gpu-buffer"
    }
});
```

now you don't need to prepare the `fetches` object and onnxruntime-web
will prepare output data on the location that is specified.

#### read data

when you get the output tensor, you can:
```js
// get the gpu buffer object:
const gpuBuffer = myOutputTensor.gpuBuffer; // GPUBuffer

// get the CPU data asynchronously
const cpuData = await myOutputTensor.getData();

// get the CPU data asynchronously and release the underlying GPU resources
const cpuData = await myOutputTensor.getData(true);

// dispose the tensor (release the underlying GPU resources). This tensor object will be invalid after dispose() is called.
myOutputTensor.dispose();
```

#### resource management

JavaScript has GC so you don't need to worry about managing JavaScript
objects. But there are 2 types of resources that are not managed by GC:
- GPU buffers that are used in tensors
- Underlying ORT native resources

To simplify, most of the unmanaged resources are handled inside ORT web.
But there are a few resources that need users to manage:
- All external GPU resources, including GPU buffers inside all tensors
created by `Tensor.fromGpuBuffer()`, will not be managed by ORT. User
should manage those GPU buffers themselves.
- When a session is created with `preferredOutputLocation` ==
"gpu-buffer" specified in session options, and the corresponding output
is not pre-allocated, the user needs to call the output tensor's `dispose()`
or `getData(true)` to manually release the underlying GPU buffers.
- ORT internal errors (including providing a pre-allocated output tensor
with wrong type/dims) will invalidate the whole wasm memory and is not
recoverable. An exception is thrown in this situation.
2023-09-29 11:24:42 -07:00

337 lines
12 KiB
TypeScript

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import {WebGpuBackend} from '../backend-webgpu';
import {LOG_DEBUG} from '../log';
import {GpuData, GpuDataId, GpuDataType} from './types';
/**
 * Manages the mapping GpuDataId -> GPUBuffer.
 *
 * Callers refer to GPU-resident data by its numeric ID; the manager resolves the ID to the underlying buffer and
 * performs uploads, downloads and GPU-to-GPU copies on it.
 */
export interface GpuDataManager {
/**
 * copy data from CPU to GPU.
 *
 * @param id - ID of the destination GPU data. It must have been created or registered beforehand.
 * @param data - the bytes to upload. The implementation in this file requires its byte length to equal the size the
 * GPU data was created/registered with.
 */
upload(id: GpuDataId, data: Uint8Array): void;
/**
 * copy data from GPU to GPU.
 *
 * @param sourceId - ID of the GPU data to copy from
 * @param destinationId - ID of the GPU data to copy to. The implementation in this file requires source and
 * destination to have been created/registered with the same size.
 */
memcpy(sourceId: GpuDataId, destinationId: GpuDataId): void;
/**
 * create new data on GPU.
 *
 * @param size - the requested size of the data
 * @param usage - optional GPUBufferUsage flags for the underlying buffer
 * @return the newly created GPU data, including its ID and buffer
 */
create(size: number, usage?: number): GpuData;
/**
 * get GPU data by ID.
 *
 * @return the GPU data, or undefined if the ID is unknown
 */
get(id: GpuDataId): GpuData|undefined;
/**
 * release the data on GPU by ID.
 *
 * @return size of the data released
 */
release(id: GpuDataId): number;
/**
 * copy data from GPU to CPU.
 *
 * @param id - ID of the GPU data to read
 * @param getTargetBuffer - callback returning the CPU buffer that receives the data
 */
download(id: GpuDataId, getTargetBuffer: () => Uint8Array): Promise<void>;
/**
 * refresh the buffers that marked for release.
 *
 * when release() is called, the buffer is not released immediately. this is because we need to wait for the commands
 * to be submitted to the GPU. this function is called after the commands are submitted so that the buffers can be
 * actually released.
 */
refreshPendingBuffers(): void;
/**
 * register an external buffer for IO Binding.
 *
 * In the implementation in this file:
 * - when `previousBuffer` is omitted, a new GPU data ID is allocated for `buffer`;
 * - when `previousBuffer` is given, it must already be registered (otherwise an error is thrown) and its GPU data ID
 *   is re-pointed to `buffer`;
 * - when `buffer` and `previousBuffer` are the same object, the existing GPU data ID is returned unchanged.
 *
 * GPU data manager only manages a mapping between the buffer and the GPU data ID. It will not manage the lifecycle of
 * the external buffer.
 *
 * @param buffer - the external GPUBuffer to register
 * @param originalSize - the size of the data held by the buffer
 * @param previousBuffer - optional. The buffer previously registered in the same slot, whose ID should be reused.
 * @return the GPU data ID that now refers to `buffer`
 */
registerExternalBuffer(buffer: GPUBuffer, originalSize: number, previousBuffer?: GPUBuffer): number;
/**
 * unregister an external buffer for IO Binding.
 *
 * The implementation in this file silently ignores a buffer that is not registered.
 */
unregisterExternalBuffer(buffer: GPUBuffer): void;
/**
 * destroy the GPU buffers held by the manager. Call this when the session.release is called.
 */
dispose(): void;
}
/**
 * The value stored per GPU data ID in the manager's storage cache.
 */
interface StorageCacheValue {
// the tracked GPU data: its ID, type and underlying GPUBuffer
gpuData: GpuData;
// the size the data was created/registered with, i.e. before the size is normalized to a multiple of 16 for
// buffer allocation and copies
originalSize: number;
}
/**
 * Round a byte size up to the next multiple of 16, so that the resulting buffer size satisfies the 128-bit
 * (16 bytes) alignment.
 */
const calcNormalizedBufferSize = (size: number) => {
  const alignment = 16;
  const chunkCount = Math.ceil(size / alignment);
  return chunkCount * alignment;
};
// Module-wide counter backing the GPU data IDs. IDs start at 1 and grow by one per allocation.
let guid = 1;
// Hand out the current counter value as a fresh GPU data ID, then advance the counter.
const createNewGpuDataId = () => {
  const allocatedId = guid;
  guid += 1;
  return allocatedId;
};
/**
 * Standard GPU -> CPU download routine. It is exported so that both the session (to read results back) and the tensor
 * factory (to give GPU tensors the ability to download their data) can share it.
 *
 * The data is copied into a temporary mappable buffer, the pending commands are flushed, and the mapped content is
 * then copied to the CPU. The temporary buffer is always destroyed, even when an error is thrown.
 *
 * @param backend - the WebGPU backend
 * @param gpuBuffer - the GPU buffer to download
 * @param originalSize - the original size of the data
 * @param getTargetBuffer - optional. If provided, the data will be copied to the target buffer. Otherwise, a new buffer
 * will be created and returned.
 * @returns the CPU buffer holding the first `originalSize` bytes of the GPU buffer
 */
export const downloadGpuData =
    async(backend: WebGpuBackend, gpuBuffer: GPUBuffer, originalSize: number, getTargetBuffer?: () => Uint8Array):
        Promise<Uint8Array> => {
          // the copy is issued with the 16-byte aligned size; only `originalSize` bytes are handed back.
          const alignedSize = calcNormalizedBufferSize(originalSize);
          const stagingBuffer = backend.device.createBuffer(
              // eslint-disable-next-line no-bitwise
              {size: alignedSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ});
          try {
            const encoder = backend.getCommandEncoder();
            backend.endComputePass();
            encoder.copyBufferToBuffer(
                gpuBuffer /* source buffer */, 0 /* source offset */, stagingBuffer /* destination buffer */,
                0 /* destination offset */, alignedSize /* size */
            );
            backend.flush();

            await stagingBuffer.mapAsync(GPUMapMode.READ);
            const mappedRange = stagingBuffer.getMappedRange();

            if (!getTargetBuffer) {
              // the mapped ArrayBuffer goes away together with the staging buffer, so hand back a clone.
              return new Uint8Array(mappedRange.slice(0, originalSize));
            }

            // the caller supplies the CPU buffer to fill; no need to clone the ArrayBuffer.
            const destination = getTargetBuffer();
            destination.set(new Uint8Array(mappedRange, 0, originalSize));
            return destination;
          } finally {
            stagingBuffer.destroy();
          }
        };
/**
 * Default implementation of GpuDataManager.
 *
 * Buffers allocated through create() are owned by the manager: they are recycled through a free list (storage
 * buffers) or destroyed (everything else) once released. Buffers registered through registerExternalBuffer() are
 * owned by the caller: the manager only maps them to an ID and never recycles or destroys them.
 */
class GpuDataManagerImpl implements GpuDataManager {
  // GPU Data ID => GPU Data ( storage buffer )
  private storageCache: Map<GpuDataId, StorageCacheValue>;

  // staging buffers created by upload() ( data is unmapped ). Destroyed by the next refreshPendingBuffers().
  private buffersForUploadingPending: GPUBuffer[];

  // buffers handed to release(). Recycled or destroyed by the next refreshPendingBuffers().
  private buffersPending: GPUBuffer[];

  // The reusable storage buffers for computing, keyed by buffer size.
  private freeBuffers: Map<number, GPUBuffer[]>;

  // The external buffers registered by users for IO Binding => GPU Data ID.
  private externalBuffers: Map<GPUBuffer, GpuDataId>;

  constructor(private backend: WebGpuBackend) {
    this.storageCache = new Map();
    this.freeBuffers = new Map();
    this.buffersForUploadingPending = [];
    this.buffersPending = [];
    this.externalBuffers = new Map();
  }

  /**
   * Copy `data` from CPU into the GPU data identified by `id`.
   *
   * The copy goes through a temporary staging buffer and is only recorded on the command encoder here; the staging
   * buffer is kept until refreshPendingBuffers() is called.
   *
   * @throws Error if `id` is unknown or the byte length of `data` differs from the size of the GPU data.
   */
  upload(id: GpuDataId, data: Uint8Array): void {
    const srcArrayBuffer = data.buffer;
    const srcOffset = data.byteOffset;
    const srcLength = data.byteLength;
    const size = calcNormalizedBufferSize(srcLength);

    // get destination gpu buffer
    const gpuDataCache = this.storageCache.get(id);
    if (!gpuDataCache) {
      throw new Error('gpu data for uploading does not exist');
    }
    if (gpuDataCache.originalSize !== srcLength) {
      throw new Error(`inconsistent data size. gpu data size=${gpuDataCache.originalSize}, data size=${srcLength}`);
    }

    // create gpu buffer
    const gpuBufferForUploading = this.backend.device.createBuffer(
        // eslint-disable-next-line no-bitwise
        {mappedAtCreation: true, size, usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC});

    // copy (upload) data
    const arrayBuffer = gpuBufferForUploading.getMappedRange();
    new Uint8Array(arrayBuffer).set(new Uint8Array(srcArrayBuffer, srcOffset, srcLength));
    gpuBufferForUploading.unmap();

    // GPU copy
    const commandEncoder = this.backend.getCommandEncoder();
    this.backend.endComputePass();
    commandEncoder.copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size);

    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.upload(id=${id})`);

    this.buffersForUploadingPending.push(gpuBufferForUploading);
  }

  /**
   * Record a GPU-to-GPU copy from `sourceId` to `destinationId`.
   *
   * @throws Error if either ID is unknown or the two GPU data were created/registered with different sizes.
   */
  memcpy(sourceId: GpuDataId, destinationId: GpuDataId): void {
    // get source gpu buffer
    const sourceGpuDataCache = this.storageCache.get(sourceId);
    if (!sourceGpuDataCache) {
      throw new Error('source gpu data for memcpy does not exist');
    }
    // get destination gpu buffer
    const destinationGpuDataCache = this.storageCache.get(destinationId);
    if (!destinationGpuDataCache) {
      throw new Error('destination gpu data for memcpy does not exist');
    }
    if (sourceGpuDataCache.originalSize !== destinationGpuDataCache.originalSize) {
      throw new Error('inconsistent source and destination gpu data size');
    }

    const size = calcNormalizedBufferSize(sourceGpuDataCache.originalSize);

    // GPU copy
    const commandEncoder = this.backend.getCommandEncoder();
    this.backend.endComputePass();
    commandEncoder.copyBufferToBuffer(
        sourceGpuDataCache.gpuData.buffer, 0, destinationGpuDataCache.gpuData.buffer, 0, size);
  }

  /**
   * Register a caller-owned buffer and return the GPU data ID that refers to it.
   *
   * - Without `previousBuffer`, a new ID is allocated.
   * - With `previousBuffer`, the ID of that buffer is re-pointed to `buffer`. If both are the same object, nothing
   *   changes and the existing ID is returned.
   *
   * @throws Error if `previousBuffer` is given but is not registered.
   */
  registerExternalBuffer(buffer: GPUBuffer, originalSize: number, previousBuffer?: GPUBuffer): number {
    let id: number|undefined;
    if (previousBuffer) {
      id = this.externalBuffers.get(previousBuffer);
      if (id === undefined) {
        throw new Error('previous buffer is not registered');
      }
      if (buffer === previousBuffer) {
        LOG_DEBUG(
            'verbose',
            () => `[WebGPU] GpuDataManager.registerExternalBuffer(size=${originalSize}) => id=${id}, buffer is the same, skip.`);
        return id;
      }
      this.externalBuffers.delete(previousBuffer);
    } else {
      id = createNewGpuDataId();
    }

    this.storageCache.set(id, {gpuData: {id, type: GpuDataType.default, buffer}, originalSize});
    this.externalBuffers.set(buffer, id);
    LOG_DEBUG(
        'verbose',
        () => `[WebGPU] GpuDataManager.registerExternalBuffer(size=${originalSize}) => id=${id}, registered.`);
    return id;
  }

  /**
   * Drop the mapping of a caller-owned buffer. The buffer itself is left untouched. Unknown buffers are ignored.
   */
  unregisterExternalBuffer(buffer: GPUBuffer): void {
    const id = this.externalBuffers.get(buffer);
    if (id !== undefined) {
      this.storageCache.delete(id);
      this.externalBuffers.delete(buffer);
      LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.unregisterExternalBuffer() => id=${id}`);
    }
  }

  /**
   * Create new GPU data of `size`. The underlying buffer is allocated with the size rounded up to a multiple of 16.
   *
   * When `usage` includes STORAGE, a previously released buffer of the same rounded size is reused if one is
   * available; otherwise a new buffer is created.
   */
  // eslint-disable-next-line no-bitwise
  create(size: number, usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST): GpuData {
    const bufferSize = calcNormalizedBufferSize(size);

    let gpuBuffer: GPUBuffer;
    // Currently, only storage buffers are reused.
    // eslint-disable-next-line no-bitwise
    if ((usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) {
      let buffers = this.freeBuffers.get(bufferSize);
      if (!buffers) {
        buffers = [];
        this.freeBuffers.set(bufferSize, buffers);
      }
      if (buffers.length > 0) {
        gpuBuffer = buffers.pop() as GPUBuffer;
      } else {
        // create gpu buffer
        gpuBuffer = this.backend.device.createBuffer({size: bufferSize, usage});
      }
    } else {
      // create gpu buffer
      gpuBuffer = this.backend.device.createBuffer({size: bufferSize, usage});
    }

    const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer};
    this.storageCache.set(gpuData.id, {gpuData, originalSize: size});

    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.create(size=${size}) => id=${gpuData.id}`);
    return gpuData;
  }

  /**
   * Look up GPU data by ID. Returns undefined for an unknown ID.
   */
  get(id: GpuDataId): GpuData|undefined {
    return this.storageCache.get(id)?.gpuData;
  }

  /**
   * Release the GPU data identified by `id` and return the size it was created/registered with.
   *
   * The buffer is not recycled or destroyed here; it is queued until refreshPendingBuffers() is called, because
   * commands that use it may not have been submitted yet.
   *
   * @throws Error if `id` is unknown.
   */
  release(id: GpuDataId): number {
    const cachedData = this.storageCache.get(id);
    if (!cachedData) {
      throw new Error('releasing data does not exist');
    }

    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`);

    this.storageCache.delete(id);

    // A registered external buffer belongs to the caller. Queuing it would let refreshPendingBuffers() destroy it or
    // hand it out again from create(), so only the ID mapping is dropped for it.
    const buffer = cachedData.gpuData.buffer;
    if (!this.externalBuffers.has(buffer)) {
      this.buffersPending.push(buffer);
    }

    return cachedData.originalSize;
  }

  /**
   * Copy the GPU data identified by `id` into the CPU buffer returned by `getTargetBuffer`.
   *
   * @throws Error if `id` is unknown.
   */
  async download(id: GpuDataId, getTargetBuffer: () => Uint8Array): Promise<void> {
    const cachedData = this.storageCache.get(id);
    if (!cachedData) {
      throw new Error('data does not exist');
    }

    await downloadGpuData(this.backend, cachedData.gpuData.buffer, cachedData.originalSize, getTargetBuffer);
  }

  /**
   * Process the buffers queued by upload() and release(): staging buffers are destroyed, released storage buffers go
   * back to the free list, and any other released buffer is destroyed.
   */
  refreshPendingBuffers(): void {
    for (const buffer of this.buffersForUploadingPending) {
      // upload buffer is only useful in the session creation time. So we don't need to reuse them in session running.
      buffer.destroy();
    }
    this.buffersForUploadingPending = [];

    for (const buffer of this.buffersPending) {
      // eslint-disable-next-line no-bitwise
      if ((buffer.usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) {
        // Put the pending buffer to freeBuffers list instead of really destroying it for buffer reusing.
        // The free list for this size may be missing (e.g. the map was reset), so create it on demand instead of
        // asserting that it exists.
        let buffers = this.freeBuffers.get(buffer.size);
        if (!buffers) {
          buffers = [];
          this.freeBuffers.set(buffer.size, buffers);
        }
        buffers.push(buffer);
      } else {
        buffer.destroy();
      }
    }
    this.buffersPending = [];
  }

  /**
   * Destroy every buffer owned by the manager and reset all bookkeeping.
   *
   * Registered external buffers are not destroyed: they are owned by the caller, and only their mapping is dropped.
   */
  dispose(): void {
    this.freeBuffers.forEach((buffers) => {
      buffers.forEach(buffer => {
        buffer.destroy();
      });
    });

    this.storageCache.forEach((storage) => {
      if (!this.externalBuffers.has(storage.gpuData.buffer)) {
        storage.gpuData.buffer.destroy();
      }
    });

    // Buffers still waiting for refreshPendingBuffers() are no longer reachable through storageCache or freeBuffers,
    // so they have to be destroyed here or they are never released explicitly.
    for (const buffer of this.buffersForUploadingPending) {
      buffer.destroy();
    }
    for (const buffer of this.buffersPending) {
      buffer.destroy();
    }

    this.storageCache = new Map();
    this.freeBuffers = new Map();
    this.buffersForUploadingPending = [];
    this.buffersPending = [];
    this.externalBuffers = new Map();
  }
}
/**
 * Factory for the GPU data manager. Forwards its arguments to the constructor of the concrete implementation and
 * exposes the result through the GpuDataManager interface only.
 */
export const createGpuDataManager = (...args: ConstructorParameters<typeof GpuDataManagerImpl>): GpuDataManager => {
  const manager: GpuDataManager = new GpuDataManagerImpl(...args);
  return manager;
};