onnxruntime/js/web/script/test-runner-cli.ts

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
/* eslint-disable guard-for-in */
/* eslint-disable @typescript-eslint/no-use-before-define */
import { spawnSync } from 'child_process';
import * as fs from 'fs-extra';
import { default as minimatch } from 'minimatch';
import npmlog from 'npmlog';
import * as os from 'os';
import * as path from 'path';
import { inspect } from 'util';
import { onnx } from '../lib/onnxjs/ort-schema/protobuf/onnx';
import { bufferToBase64 } from '../test/test-shared';
import { Test } from '../test/test-types';
import { parseTestRunnerCliArgs, TestRunnerCliArgs } from './test-runner-cli-args';
async function main() {
// use dynamic import so that we can use ESM-only libraries in CommonJS.
const { globbySync } = await import('globby');
const stripJsonComments = (await import('strip-json-comments')).default;
npmlog.info('TestRunnerCli', 'Initializing...');
const args = parseTestRunnerCliArgs(process.argv.slice(2));
npmlog.verbose('TestRunnerCli.Init.Config', inspect(args));
const DIST_ROOT = path.join(__dirname, '..', 'dist');
const TEST_ROOT = path.join(__dirname, '..', 'test');
const TEST_DATA_MODEL_NODE_ROOT = path.join(TEST_ROOT, 'data', 'node');
const TEST_DATA_OP_ROOT = path.join(TEST_ROOT, 'data', 'ops');
const TEST_DATA_BASE = args.env === 'node' ? TEST_ROOT : '/base/test/';
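// in browser runs, Karma serves files under the virtual '/base/' path, so test data is addressed relative
// to '/base/test/'; in Node.js runs the files are read directly from TEST_ROOT.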
npmlog.verbose('TestRunnerCli.Init', 'Ensure test data folder...');
fs.ensureSymlinkSync(path.join(__dirname, '../../test/data/node'), TEST_DATA_MODEL_NODE_ROOT, 'junction');
npmlog.verbose('TestRunnerCli.Init', 'Ensure test data folder... DONE');
let testlist: Test.TestList;
const shouldLoadSuiteTestData = args.mode === 'suite0' || args.mode === 'suite1';
if (shouldLoadSuiteTestData) {
npmlog.verbose('TestRunnerCli.Init', 'Loading testlist...');
// The following is a list of unit tests for already-implemented operators.
// Modify this list to control which node tests to run.
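// Illustrative shape of suite-test-list.jsonc (not the actual content): each backend maps to three lists
// ('node', 'onnx', 'ops'), where an entry is either a test name (glob) or an object with a name and an
// optional platform condition, e.g.:
//   { "wasm": { "node": ["test_abs", { "name": "test_conv*", "platformCondition": "..." }], "onnx": [], "ops": [] } }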
const jsonWithComments = fs.readFileSync(path.resolve(TEST_ROOT, './suite-test-list.jsonc')).toString();
const json = stripJsonComments(jsonWithComments, { whitespace: true });
testlist = JSON.parse(json) as Test.TestList;
npmlog.verbose('TestRunnerCli.Init', 'Loading testlist... DONE');
}
// The default backend and opset version lists. These are used in suite tests.
const DEFAULT_BACKENDS: readonly TestRunnerCliArgs.Backend[] =
args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu', 'webnn'];
const DEFAULT_OPSET_VERSIONS = fs
.readdirSync(TEST_DATA_MODEL_NODE_ROOT, { withFileTypes: true })
.filter((dir) => dir.isDirectory() && dir.name.startsWith('opset'))
.map((dir) => dir.name.slice(5));
const MAX_OPSET_VERSION = Math.max(...DEFAULT_OPSET_VERSIONS.map((v) => Number.parseInt(v, 10)));
const FILE_CACHE_ENABLED = args.fileCache; // whether to enable file cache
const FILE_CACHE_MAX_FILE_SIZE = 1 * 1024 * 1024; // the maximum size of a file that will be put into the file cache
const FILE_CACHE_SPLIT_SIZE = 4 * 1024 * 1024; // the minimum size of each generated cache file
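// Files larger than FILE_CACHE_MAX_FILE_SIZE are not put into the cache; cached content is flushed into a
// new testdata-file-cache-<index>.json chunk once its accumulated size exceeds FILE_CACHE_SPLIT_SIZE
// (see saveFileCache below).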
const fileCache: Test.FileCache = {};
const nodeTests = new Map<string, Test.ModelTestGroup[]>();
const onnxTests = new Map<string, Test.ModelTestGroup>();
const opTests = new Map<string, Test.OperatorTestGroup[]>();
if (shouldLoadSuiteTestData) {
npmlog.verbose('TestRunnerCli.Init', 'Loading test groups for suite test...');
// collect all model test folders
const allNodeTestsFolders = DEFAULT_OPSET_VERSIONS.map((version) => {
const suiteRootFolder = path.join(TEST_DATA_MODEL_NODE_ROOT, `opset${version}`);
if (!fs.existsSync(suiteRootFolder) || !fs.statSync(suiteRootFolder).isDirectory()) {
throw new Error(`model test root folder '${suiteRootFolder}' does not exist.`);
}
return fs.readdirSync(suiteRootFolder).map((f) => `opset${version}/${f}`);
}).flat();
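// each entry is a folder path relative to TEST_DATA_MODEL_NODE_ROOT, e.g. 'opset17/test_abs' (illustrative)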
for (const backend of DEFAULT_BACKENDS) {
if (args.backends.indexOf(backend) !== -1) {
nodeTests.set(backend, loadNodeTests(backend, allNodeTestsFolders));
opTests.set(backend, loadOpTests(backend));
}
}
}
if (shouldLoadSuiteTestData) {
npmlog.verbose('TestRunnerCli.Init', 'Loading test groups for suite test... DONE');
npmlog.verbose('TestRunnerCli.Init', 'Validate testlist...');
validateTestList();
npmlog.verbose('TestRunnerCli.Init', 'Validate testlist... DONE');
}
const modelTestGroups: Test.ModelTestGroup[] = [];
const opTestGroups: Test.OperatorTestGroup[] = [];
let unittest = false;
npmlog.verbose('TestRunnerCli.Init', 'Preparing test config...');
switch (args.mode) {
case 'suite0':
case 'suite1':
for (const backend of DEFAULT_BACKENDS) {
if (args.backends.indexOf(backend) !== -1) {
modelTestGroups.push(...nodeTests.get(backend)!); // model test : node
opTestGroups.push(...opTests.get(backend)!); // operator test
}
}
if (args.mode === 'suite0') {
unittest = true;
}
break;
case 'model':
if (!args.param) {
throw new Error("the test folder should be specified in mode 'node'");
} else {
const testFolderSearchPattern = args.param;
const testFolder = tryLocateModelTestFolder(testFolderSearchPattern);
for (const b of args.backends) {
modelTestGroups.push({
name: testFolder,
tests: [modelTestFromFolder(testFolder, b, undefined, args.times)],
});
}
}
break;
case 'unittest':
unittest = true;
break;
case 'op':
if (!args.param) {
throw new Error("the test manifest should be specified in mode 'op'");
} else {
const manifestFileSearchPattern = args.param;
const manifestFile = tryLocateOpTestManifest(manifestFileSearchPattern);
for (const b of args.backends) {
opTestGroups.push(opTestFromManifest(manifestFile, b));
}
}
break;
default:
throw new Error(`unsupported mode '${args.mode}'`);
}
npmlog.verbose('TestRunnerCli.Init', 'Preparing test config... DONE');
npmlog.info('TestRunnerCli', 'Initialization completed. Start to run tests...');
run({
unittest,
model: modelTestGroups,
op: opTestGroups,
log: args.logConfig,
profile: args.profile,
downloadModel: args.downloadModel,
options: {
sessionOptions: {
graphOptimizationLevel: args.graphOptimizationLevel,
optimizedModelFilePath: args.optimizedModelFilePath,
},
debug: args.debug,
cpuOptions: args.cpuOptions,
webglOptions: args.webglOptions,
webnnOptions: args.webnnOptions,
wasmOptions: args.wasmOptions,
globalEnvFlags: args.globalEnvFlags,
},
});
npmlog.info('TestRunnerCli', 'Tests completed successfully');
function validateTestList() {
for (const backend of DEFAULT_BACKENDS) {
const nodeTest = nodeTests.get(backend);
if (nodeTest) {
for (const testCase of testlist[backend].node) {
const testCaseName = typeof testCase === 'string' ? testCase : testCase.name;
let found = false;
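// a test case is considered found if any non-empty model URL in a group matches the glob
// '**/<testCaseName>/*.onnx' or '**/<testCaseName>/*.ort'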
for (const testGroup of nodeTest) {
found ||=
minimatch.match(
testGroup.tests.map((test) => test.modelUrl).filter((url) => url !== ''),
path.join('**', testCaseName, '*.+(onnx|ort)').replace(/\\/g, '/'),
{ matchBase: true },
).length > 0;
}
if (!found) {
throw new Error(`node model test case '${testCaseName}' in test list does not exist.`);
}
}
}
const onnxTest = onnxTests.get(backend);
if (onnxTest) {
const onnxModelTests = onnxTest.tests.map((i) => i.name);
for (const testCase of testlist[backend].onnx) {
const testCaseName = typeof testCase === 'string' ? testCase : testCase.name;
if (onnxModelTests.indexOf(testCaseName) === -1) {
throw new Error(`onnx model test case '${testCaseName}' in test list does not exist.`);
}
}
}
const opTest = opTests.get(backend);
if (opTest) {
const opTests = opTest.map((i) => i.name);
for (const testCase of testlist[backend].ops) {
const testCaseName = typeof testCase === 'string' ? testCase : testCase.name;
if (opTests.indexOf(testCaseName) === -1) {
throw new Error(`operator test case '${testCaseName}' in test list does not exist.`);
}
}
}
}
}
function loadNodeTests(backend: string, allFolders: string[]): Test.ModelTestGroup[] {
const allTests = testlist[backend]?.node;
// key is folder name, value is test index array
const folderTestMatchCount = new Map<string, number[]>(allFolders.map((f) => [f, []]));
// key is test category, value is a list of model tests
const opsetTests = new Map<string, Test.ModelTest[]>();
allTests.forEach((test, i) => {
const testName = typeof test === 'string' ? test : test.name;
const matches = minimatch.match(allFolders, path.join('**', testName).replace(/\\/g, '/'));
matches.forEach((m) => folderTestMatchCount.get(m)!.push(i));
});
for (const folder of allFolders) {
const testIds = folderTestMatchCount.get(folder);
const times = testIds ? testIds.length : 0;
if (times > 1) {
throw new Error(`multiple testlist rules match test: ${path.join(TEST_DATA_MODEL_NODE_ROOT, folder)}`);
}
const test = testIds && testIds.length > 0 ? allTests[testIds[0]] : undefined;
const platformCondition = test && typeof test !== 'string' ? test.platformCondition : undefined;
const opsetVersion = folder.split('/')[0];
const category = `node-${opsetVersion}-${backend}`;
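// e.g. 'node-opset17-wasm' (the folder's first path segment already carries the 'opset' prefix)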
let modelTests = opsetTests.get(category);
if (!modelTests) {
modelTests = [];
opsetTests.set(category, modelTests);
}
modelTests.push(
modelTestFromFolder(path.resolve(TEST_DATA_MODEL_NODE_ROOT, folder), backend, platformCondition, times),
);
}
return Array.from(opsetTests.keys()).map((category) => ({ name: category, tests: opsetTests.get(category)! }));
}
function modelTestFromFolder(
testDataRootFolder: string,
backend: string,
platformCondition?: Test.PlatformCondition,
times?: number,
): Test.ModelTest {
if (times === 0) {
npmlog.verbose('TestRunnerCli.Init.Model', `Skip test data from folder: ${testDataRootFolder}`);
return {
name: path.basename(testDataRootFolder),
backend,
modelUrl: '',
cases: [],
ioBinding: args.ioBindingMode,
};
}
let modelUrl: string | null = null;
let cases: Test.ModelTestCase[] = [];
let externalData: Array<{ data: string; path: string }> | undefined;
npmlog.verbose('TestRunnerCli.Init.Model', `Start to prepare test data from folder: ${testDataRootFolder}`);
try {
const maybeExternalDataFiles: Array<[fileNameWithoutExtension: string, size: number]> = [];
for (const thisPath of fs.readdirSync(testDataRootFolder)) {
const thisFullPath = path.join(testDataRootFolder, thisPath);
const stat = fs.lstatSync(thisFullPath);
if (stat.isFile()) {
const ext = path.extname(thisPath);
if (ext.toLowerCase() === '.onnx' || ext.toLowerCase() === '.ort') {
if (modelUrl === null) {
modelUrl = path.join(TEST_DATA_BASE, path.relative(TEST_ROOT, thisFullPath));
if (FILE_CACHE_ENABLED && !fileCache[modelUrl] && stat.size <= FILE_CACHE_MAX_FILE_SIZE) {
fileCache[modelUrl] = bufferToBase64(fs.readFileSync(thisFullPath));
}
} else {
throw new Error('there are multiple model files under the folder specified');
}
} else {
maybeExternalDataFiles.push([path.parse(thisPath).name, stat.size]);
}
} else if (stat.isDirectory()) {
const dataFiles: string[] = [];
for (const dataFile of fs.readdirSync(thisFullPath)) {
const dataFileFullPath = path.join(thisFullPath, dataFile);
const ext = path.extname(dataFile);
if (ext.toLowerCase() === '.pb') {
const dataFileUrl = path.join(TEST_DATA_BASE, path.relative(TEST_ROOT, dataFileFullPath));
dataFiles.push(dataFileUrl);
if (
FILE_CACHE_ENABLED &&
!fileCache[dataFileUrl] &&
fs.lstatSync(dataFileFullPath).size <= FILE_CACHE_MAX_FILE_SIZE
) {
fileCache[dataFileUrl] = bufferToBase64(fs.readFileSync(dataFileFullPath));
}
}
}
if (dataFiles.length > 0) {
cases.push({ dataFiles, name: thisPath });
}
}
}
if (modelUrl === null) {
throw new Error('there is no model file under the folder specified');
}
// For performance reasons, we do not parse every model. We only parse a model when it is likely to have
// external data, which is the case when one of the following conditions is met:
// 1. any other file in the same folder has a file name similar to the model file's
// (e.g., the model file is "model_abc.onnx" and there is a file "model_abc.pb" or "model_abc.onnx.data")
// 2. any such file is 1GB or larger
const likelyToHaveExternalData = maybeExternalDataFiles.some(
([fileNameWithoutExtension, size]) =>
path.basename(modelUrl!).startsWith(fileNameWithoutExtension) || size >= 1 * 1024 * 1024 * 1024,
);
if (likelyToHaveExternalData) {
const model = onnx.ModelProto.decode(fs.readFileSync(path.join(testDataRootFolder, path.basename(modelUrl!))));
const externalDataPathSet = new Set<string>();
for (const initializer of model.graph!.initializer!) {
if (initializer.externalData) {
for (const data of initializer.externalData) {
if (data.key === 'location') {
externalDataPathSet.add(data.value!);
}
}
}
}
externalData = [];
const externalDataPaths = [...externalDataPathSet];
for (const dataPath of externalDataPaths) {
const fullPath = path.resolve(testDataRootFolder, dataPath);
const url = path.join(TEST_DATA_BASE, path.relative(TEST_ROOT, fullPath));
externalData.push({ data: url, path: dataPath });
}
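// each externalData entry maps the URL the test runner will fetch ('data') to the relative path
// referenced inside the model ('path')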
}
} catch (e) {
npmlog.error('TestRunnerCli.Init.Model', `Failed to prepare test data. Error: ${inspect(e)}`);
throw e;
}
const caseCount = cases.length;
if (times !== undefined) {
if (times > caseCount) {
for (let i = 0; cases.length < times; i++) {
const origin = cases[i % caseCount];
const duplicated = {
name: `${origin.name} - copy ${Math.floor(i / caseCount)}`,
dataFiles: origin.dataFiles,
};
cases.push(duplicated);
}
} else {
cases = cases.slice(0, times);
}
}
let ioBinding: Test.IOBindingMode;
if (!['webgpu', 'webnn'].includes(backend) && args.ioBindingMode !== 'none') {
npmlog.warn(
'TestRunnerCli.Init.Model',
`Ignoring IO Binding Mode "${args.ioBindingMode}" for backend "${backend}".`,
);
ioBinding = 'none';
} else {
ioBinding = args.ioBindingMode;
}
npmlog.verbose('TestRunnerCli.Init.Model', 'Finished preparing test data.');
npmlog.verbose('TestRunnerCli.Init.Model', '===============================================================');
npmlog.verbose('TestRunnerCli.Init.Model', ` Model file: ${modelUrl}`);
npmlog.verbose('TestRunnerCli.Init.Model', ` Backend: ${backend}`);
npmlog.verbose('TestRunnerCli.Init.Model', ` Test set(s): ${cases.length} (${caseCount})`);
if (externalData) {
npmlog.verbose('TestRunnerCli.Init.Model', ` External data: ${externalData.length}`);
for (const data of externalData) {
npmlog.verbose('TestRunnerCli.Init.Model', ` - ${data.path}`);
}
}
npmlog.verbose('TestRunnerCli.Init.Model', '===============================================================');
return {
name: path.basename(testDataRootFolder),
platformCondition,
modelUrl,
backend,
cases,
ioBinding,
externalData,
};
}
function tryLocateModelTestFolder(searchPattern: string): string {
const folderCandidates: string[] = [];
// 1 - check whether search pattern is a directory
if (fs.existsSync(searchPattern) && fs.lstatSync(searchPattern).isDirectory()) {
folderCandidates.push(searchPattern);
}
// 2 - check the globby result of searchPattern
// 3 - check the globby result of NODE root combined with searchPattern
const globbyPattern = [
searchPattern,
path.join(TEST_DATA_MODEL_NODE_ROOT, '**', searchPattern).replace(/\\/g, '/'),
];
// 4 - check the globby result of NODE root combined with opset versions and searchPattern
globbyPattern.push(
...DEFAULT_OPSET_VERSIONS.map((v) =>
path.join(TEST_DATA_MODEL_NODE_ROOT, `opset${v}`, '**', searchPattern).replace(/\\/g, '/'),
),
);
folderCandidates.push(...globbySync(globbyPattern, { onlyDirectories: true, absolute: true }));
// pick the first folder that matches the pattern
for (const folderCandidate of folderCandidates) {
const modelCandidates = globbySync('*.{onnx,ort}', { onlyFiles: true, cwd: folderCandidate });
if (modelCandidates && modelCandidates.length === 1) {
return folderCandidate;
}
}
throw new Error(`no model folder found: ${searchPattern}`);
}
function loadOpTests(backend: string): Test.OperatorTestGroup[] {
const groups: Test.OperatorTestGroup[] = [];
for (const thisPath of fs.readdirSync(TEST_DATA_OP_ROOT)) {
const thisFullPath = path.join(TEST_DATA_OP_ROOT, thisPath);
const stat = fs.lstatSync(thisFullPath);
const ext = path.extname(thisFullPath);
if (stat.isFile() && (ext === '.json' || ext === '.jsonc')) {
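// a manifest is "skipped" (registered with an empty test list) when its file name does not appear in
// testlist[backend].ops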
const skip = testlist[backend].ops.indexOf(thisPath) === -1;
groups.push(opTestFromManifest(thisFullPath, backend, skip));
}
}
return groups;
}
function opTestFromManifest(manifestFile: string, backend: string, skip = false): Test.OperatorTestGroup {
let tests: Test.OperatorTest[] = [];
const filePath = path.resolve(process.cwd(), manifestFile);
if (skip) {
npmlog.verbose('TestRunnerCli.Init.Op', `Skip test data from manifest file: ${filePath}`);
} else {
npmlog.verbose('TestRunnerCli.Init.Op', `Start to prepare test data from manifest file: ${filePath}`);
const jsonWithComments = fs.readFileSync(filePath).toString();
const json = stripJsonComments(jsonWithComments, { whitespace: true });
tests = JSON.parse(json) as Test.OperatorTest[];
// fields 'verbose' and 'backend' are not set in the manifest file
for (const test of tests) {
test.backend = backend;
test.opset = test.opset || { domain: '', version: MAX_OPSET_VERSION };
if (backend !== 'webgpu' && args.ioBindingMode !== 'none') {
npmlog.warn(
'TestRunnerCli.Init.Op',
`Ignoring IO Binding Mode "${args.ioBindingMode}" for backend "${backend}".`,
);
test.ioBinding = 'none';
} else {
test.ioBinding = args.ioBindingMode;
}
}
npmlog.verbose('TestRunnerCli.Init.Op', 'Finished preparing test data.');
npmlog.verbose('TestRunnerCli.Init.Op', '===============================================================');
npmlog.verbose('TestRunnerCli.Init.Op', ` Test Group: ${path.relative(TEST_DATA_OP_ROOT, filePath)}`);
npmlog.verbose('TestRunnerCli.Init.Op', ` Backend: ${backend}`);
npmlog.verbose('TestRunnerCli.Init.Op', ` Test case(s): ${tests.length}`);
npmlog.verbose('TestRunnerCli.Init.Op', '===============================================================');
}
return { name: path.relative(TEST_DATA_OP_ROOT, filePath), tests };
}
function tryLocateOpTestManifest(searchPattern: string): string {
for (const manifestCandidate of globbySync(
[
searchPattern,
path.join(TEST_DATA_OP_ROOT, '**', searchPattern).replace(/\\/g, '/'),
path.join(TEST_DATA_OP_ROOT, '**', searchPattern + '.json').replace(/\\/g, '/'),
path.join(TEST_DATA_OP_ROOT, '**', searchPattern + '.jsonc').replace(/\\/g, '/'),
],
{ onlyFiles: true, absolute: true, cwd: TEST_ROOT },
)) {
return manifestCandidate;
}
throw new Error(`no OP test manifest found: ${searchPattern}`);
}
function run(config: Test.Config) {
// STEP 1. write file cache to testdata-file-cache-*.json
npmlog.info('TestRunnerCli.Run', '(1/4) Writing file cache to file: testdata-file-cache-*.json ...');
const fileCacheUrls = saveFileCache(fileCache);
if (fileCacheUrls.length > 0) {
config.fileCacheUrls = fileCacheUrls;
}
npmlog.info(
'TestRunnerCli.Run',
`(1/4) Writing file cache to file: testdata-file-cache-*.json ... ${
fileCacheUrls.length > 0 ? `DONE, ${fileCacheUrls.length} file(s) generated` : 'SKIPPED'
}`,
);
// STEP 2. write the config to testdata-config.json
npmlog.info('TestRunnerCli.Run', '(2/4) Writing config to file: testdata-config.json ...');
saveConfig(config);
npmlog.info('TestRunnerCli.Run', '(2/4) Writing config to file: testdata-config.json ... DONE');
// STEP 3. generate bundle
npmlog.info('TestRunnerCli.Run', '(3/4) Running build to generate bundle...');
const buildCommand = `node ${path.join(__dirname, 'build')}`;
const buildArgs = [`--bundle-mode=${args.env === 'node' ? 'node' : args.bundleMode}`];
npmlog.info('TestRunnerCli.Run', `CMD: ${buildCommand} ${buildArgs.join(' ')}`);
const build = spawnSync(buildCommand, buildArgs, { shell: true, stdio: 'inherit' });
if (build.status !== 0) {
console.error(build.error);
process.exit(build.status === null ? undefined : build.status);
}
npmlog.info('TestRunnerCli.Run', '(3/4) Running build to generate bundle... DONE');
if (args.env === 'node') {
// STEP 4. run tsc and then run mocha (Node.js environment)
npmlog.info('TestRunnerCli.Run', '(4/4) Running tsc...');
const tsc = spawnSync('npx', ['tsc'], { shell: true, stdio: 'inherit' });
if (tsc.status !== 0) {
console.error(tsc.error);
process.exit(tsc.status === null ? undefined : tsc.status);
}
npmlog.info('TestRunnerCli.Run', '(4/4) Running tsc... DONE');
npmlog.info('TestRunnerCli.Run', '(4/4) Running mocha...');
const mochaArgs = [
'mocha',
'--timeout',
`${args.debug ? 9999999 : 60000}`,
'-r',
path.join(DIST_ROOT, 'ort.node.min.js'),
path.join(TEST_ROOT, 'test-main'),
];
npmlog.info('TestRunnerCli.Run', `CMD: npx ${mochaArgs.join(' ')}`);
const mocha = spawnSync('npx', mochaArgs, { shell: true, stdio: 'inherit' });
if (mocha.status !== 0) {
console.error(mocha.error);
process.exit(mocha.status === null ? undefined : mocha.status);
}
npmlog.info('TestRunnerCli.Run', '(4/4) Running mocha... DONE');
} else {
// STEP 4. use Karma to run the tests (browser environment)
npmlog.info('TestRunnerCli.Run', '(4/4) Running karma to start test runner...');
const webgpu = args.backends.indexOf('webgpu') > -1;
const webnn = args.backends.indexOf('webnn') > -1;
const browser = getBrowserNameFromEnv(args.env);
const karmaArgs = ['karma', 'start', `--browsers ${browser}`];
const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags];
if (args.bundleMode === 'dev' && !args.debug) {
// use headless for 'test' mode (when 'perf' and 'debug' are OFF)
chromiumFlags.push('--headless=new');
}
if (args.debug) {
karmaArgs.push('--log-level info --timeout-mocha 9999999');
chromiumFlags.push('--remote-debugging-port=9333');
} else {
karmaArgs.push('--single-run');
}
if (args.noSandbox) {
karmaArgs.push('--no-sandbox');
}
// When using BrowserStack with Safari, we must NOT use 'localhost' as the hostname.
if (!(browser.startsWith('BS_') && browser.includes('Safari'))) {
karmaArgs.push('--force-localhost');
}
if (webgpu) {
// flag 'allow_unsafe_apis' is required to enable experimental features like fp16 and profiling inside passes.
// flag 'use_dxc' is required to enable the DXC compiler.
chromiumFlags.push('--enable-dawn-features=allow_unsafe_apis,use_dxc');
}
if (webnn) {
chromiumFlags.push('--enable-features=WebMachineLearningNeuralNetwork');
}
if (process.argv.includes('--karma-debug')) {
karmaArgs.push('--log-level debug');
}
karmaArgs.push(`--bundle-mode=${args.bundleMode}`);
if (args.userDataDir) {
karmaArgs.push(`--user-data-dir="${args.userDataDir}"`);
}
karmaArgs.push(...chromiumFlags.map((flag) => `--chromium-flags=${flag}`));
if (browser.startsWith('Edge')) {
// There are currently 2 Edge browser launchers:
// - karma-edge-launcher: used to launch the old Edge browser
// - karma-chromium-edge-launcher: used to launch the new chromium-kernel Edge browser
//
// Those 2 plugins cannot be loaded at the same time, so we need to determine which launcher to use.
// - If we use 'karma-edge-launcher', no plugins config need to be set.
// - If we use 'karma-chromium-edge-launcher', we need to:
// - add plugin "@chiragrupani/karma-chromium-edge-launcher" explicitly, because it does not match the
// default plugins config "^karma-.*"
// - remove "karma-edge-launcher".
// check if we have the latest Edge installed:
if (
os.platform() === 'darwin' ||
(os.platform() === 'win32' &&
require('@chiragrupani/karma-chromium-edge-launcher/dist/Utilities').default.GetEdgeExe('Edge') !== '')
) {
// use "@chiragrupani/karma-chromium-edge-launcher"
karmaArgs.push(
'--karma-plugins=@chiragrupani/karma-chromium-edge-launcher',
'--karma-plugins=(?!karma-edge-launcher$)karma-*',
);
} else {
// use "karma-edge-launcher"
// == Special treatment for Microsoft Edge ==
//
// == Edge's Auto Recovery Feature ==
// when Edge starts, if it detects that it was terminated forcibly last time, it always recovers all previous
// pages. this always happens in Karma because `karma-edge-launcher` uses the `taskkill` command to kill Edge
// every time.
//
// == The Problem ==
// every time a test completes, its page is added to the recovery page list.
// if we run the test 100 times, there will be 100 previous tabs when we launch Edge again.
// this runs out of resources quickly and fails further tests.
// and it cannot recover by itself because every time it is either terminated forcibly or crashes.
// and the auto recovery feature cannot be disabled via configuration, command line, or the registry.
//
// == The Solution ==
// for Microsoft Edge, we should clean up the previously active pages before each run by
// deleting the files stored in the specific folder, which clears the recovery page list.
// see also: https://www.laptopmag.com/articles/edge-browser-stop-tab-restore
const deleteEdgeActiveRecoveryCommand =
// eslint-disable-next-line max-len
'del /F /Q %LOCALAPPDATA%\\Packages\\Microsoft.MicrosoftEdge_8wekyb3d8bbwe\\AC\\MicrosoftEdge\\User\\Default\\Recovery\\Active\\*';
npmlog.info('TestRunnerCli.Run', `CMD: ${deleteEdgeActiveRecoveryCommand}`);
spawnSync(deleteEdgeActiveRecoveryCommand, { shell: true, stdio: 'inherit' });
}
}
npmlog.info('TestRunnerCli.Run', `CMD: npx ${karmaArgs.join(' ')}`);
const karma = spawnSync('npx', karmaArgs, { shell: true, stdio: 'inherit' });
if (karma.status !== 0) {
console.error(karma.error);
process.exit(karma.status === null ? undefined : karma.status);
}
npmlog.info('TestRunnerCli.Run', '(4/4) Running karma to start test runner... DONE');
}
}
function saveFileCache(fileCache: Test.FileCache) {
const fileCacheUrls: string[] = [];
let currentIndex = 0;
let currentCache: Test.FileCache = {};
let currentContentTotalSize = 0;
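// accumulate entries into the current chunk; once the chunk exceeds FILE_CACHE_SPLIT_SIZE, write it out as
// testdata-file-cache-<index>.json and start a new chunk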
for (const key in fileCache) {
const content = fileCache[key];
if (currentContentTotalSize > FILE_CACHE_SPLIT_SIZE) {
fileCacheUrls.push(saveOneFileCache(currentIndex, currentCache));
currentContentTotalSize = 0;
currentIndex++;
currentCache = {};
}
currentCache[key] = content;
currentContentTotalSize += key.length + content.length;
}
if (currentContentTotalSize > 0) {
fileCacheUrls.push(saveOneFileCache(currentIndex, currentCache));
}
return fileCacheUrls;
}
function saveOneFileCache(index: number, fileCache: Test.FileCache) {
fs.writeFileSync(path.join(TEST_ROOT, `./testdata-file-cache-${index}.json`), JSON.stringify(fileCache));
return path.join(TEST_DATA_BASE, `./testdata-file-cache-${index}.json`);
}
function saveConfig(config: Test.Config) {
fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config);
}
function getBrowserNameFromEnv(env: TestRunnerCliArgs['env']) {
switch (env) {
case 'chrome':
return 'ChromeTest';
case 'chromecanary':
return 'ChromeCanaryTest';
case 'edge':
return 'EdgeTest';
case 'firefox':
return 'FirefoxTest';
case 'electron':
return 'Electron';
case 'safari':
return 'Safari';
case 'bs':
return process.env.ORT_WEB_TEST_BS_BROWSERS!;
default:
throw new Error(`env "${env}" not supported.`);
}
}
}
void main();