# onnxruntime/onnxruntime/test/python/onnxruntime_test_python_nuphar.py

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# -*- coding: UTF-8 -*-
import numpy as np
import onnx
from onnx import numpy_helper
import onnxruntime as onnxrt
import os
from onnxruntime.nuphar.rnn_benchmark import perf_test, generate_model
from pathlib import Path
import shutil
import sys
import subprocess
import tarfile
import unittest
import urllib.request


class TestNuphar(unittest.TestCase):

    def test_bidaf(self):
        """Run the BiDAF model through the Nuphar model tools end to end.

        The test downloads (or reuses a cached copy of) the BiDAF archive,
        extracts it into the current working directory, converts the model to
        Scan form, optimizes and quantizes it with the ``onnxruntime.nuphar``
        scripts, and verifies the results with ``onnx_test_runner``. Where
        ``os.name`` is ``'nt'`` or ``'posix'`` it then exercises the AOT path:
        JIT-cache the model, build a shared library from the cache, and run the
        model again with JIT disabled.

        ``onnx_test_runner`` is looked up in the current working directory.
        """
        cwd = os.getcwd()

        # download BiDAF model; the archive is cached under ~/.cache/onnxruntime
        bidaf_url = 'https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.tar.gz'
        download_dir = os.path.join(os.path.expanduser("~"), '.cache', 'onnxruntime')
        os.makedirs(download_dir, exist_ok=True)
        bidaf_local = os.path.join(download_dir, 'bidaf.tar.gz')
        if not os.path.exists(bidaf_local):
            # Download under a temporary name and rename afterwards, so that an
            # interrupted transfer never leaves a truncated archive at the cached
            # path (which would make every later run fail when opening it).
            partial_download = bidaf_local + '.part'
            urllib.request.urlretrieve(bidaf_url, partial_download)
            os.replace(partial_download, bidaf_local)
        # NOTE(review): extractall() trusts the member paths stored in the archive.
        # The archive comes from the fixed HTTPS URL above; revisit if that changes.
        with tarfile.open(bidaf_local, 'r') as f:
            f.extractall(cwd)

        # verify accuracy of quantized model
        bidaf_dir = os.path.join(cwd, 'bidaf')
        bidaf_model = os.path.join(bidaf_dir, 'bidaf.onnx')
        bidaf_scan_model = os.path.join(bidaf_dir, 'bidaf_scan.onnx')
        bidaf_opt_scan_model = os.path.join(bidaf_dir, 'bidaf_opt_scan.onnx')
        bidaf_int8_scan_only_model = os.path.join(bidaf_dir, 'bidaf_int8_scan_only.onnx')

        # bidaf.onnx -> bidaf_scan.onnx
        subprocess.run([
            sys.executable, '-m', 'onnxruntime.nuphar.model_editor',
            '--input', bidaf_model,
            '--output', bidaf_scan_model,
            '--mode', 'to_scan',
        ], check=True, cwd=cwd)
        # bidaf_scan.onnx -> bidaf_opt_scan.onnx
        subprocess.run([
            sys.executable, '-m', 'onnxruntime.nuphar.model_editor',
            '--input', bidaf_scan_model,
            '--output', bidaf_opt_scan_model,
            '--mode', 'opt_inproj',
        ], check=True, cwd=cwd)
        # bidaf_opt_scan.onnx -> bidaf_int8_scan_only.onnx
        subprocess.run([
            sys.executable, '-m', 'onnxruntime.nuphar.model_quantizer',
            '--input', bidaf_opt_scan_model,
            '--output', bidaf_int8_scan_only_model,
            '--only_for_scan',
        ], check=True, cwd=cwd)

        # run onnx_test_runner to verify results
        # use -M to disable memory pattern
        onnx_test_runner = os.path.join(cwd, 'onnx_test_runner')
        subprocess.run([onnx_test_runner, '-e', 'nuphar', '-M', '-n', 'bidaf', cwd], check=True, cwd=cwd)

        # test AOT on the quantized model
        if os.name not in ['nt', 'posix']:
            return  # don't run the rest of test if AOT is not supported

        # start from an empty JIT cache directory
        cache_dir = os.path.join(cwd, 'nuphar_cache')
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)

        # prepare feed from the four input tensors shipped with the model
        feed = {}
        for i in range(4):
            tp = onnx.load_tensor(os.path.join(bidaf_dir, 'test_data_set_0', 'input_{}.pb'.format(i)))
            feed[tp.name] = numpy_helper.to_array(tp)

        for model in [bidaf_opt_scan_model, bidaf_int8_scan_only_model]:
            jit_settings = 'nuphar_cache_path:{}'.format(cache_dir)
            for isa in ['avx', 'avx2', 'avx512']:
                onnxrt.capi._pybind_state.set_nuphar_settings(jit_settings + ', nuphar_codegen_target:' + isa)
                sess = onnxrt.InferenceSession(model)  # JIT cache happens when initializing session

            cache_dir_content = os.listdir(cache_dir)
            # unittest assertion instead of a bare assert: it is not stripped by
            # `python -O` and reports the unexpected directory content on failure
            self.assertEqual(len(cache_dir_content), 1,
                             'unexpected content in {}: {}'.format(cache_dir, cache_dir_content))
            cache_versioned_dir = os.path.join(cache_dir, cache_dir_content[0])
            so_name = os.path.basename(model) + '.so'
            subprocess.run([
                sys.executable, '-m', 'onnxruntime.nuphar.create_shared',
                '--input_dir', cache_versioned_dir,
                '--output_name', so_name,
            ], check=True)

            # run from the generated shared library with JIT disabled
            aot_settings = 'nuphar_cache_path:{}, nuphar_cache_so_name:{}, nuphar_cache_force_no_jit:{}'.format(
                cache_dir, so_name, 'on')
            onnxrt.capi._pybind_state.set_nuphar_settings(aot_settings)
            sess = onnxrt.InferenceSession(model)
            sess.run([], feed)

            # test avx
            onnxrt.capi._pybind_state.set_nuphar_settings(aot_settings + ', nuphar_codegen_target:{}'.format('avx'))
            sess = onnxrt.InferenceSession(model)
            sess.run([], feed)

    def test_bert_squad(self):
        """Run the BERT-SQuAD model through shape inference and the Nuphar runners.

        The test downloads (or reuses a cached copy of) the BERT-SQuAD archive,
        extracts it into the current working directory, rewrites the model in
        place with symbolic shape inference, and then runs ``onnx_test_runner``
        and ``onnxruntime_perf_test`` against it with the nuphar provider.

        Both executables are looked up in the current working directory.
        """
        cwd = os.getcwd()

        # download BERT_squad model; the archive is cached under ~/.cache/onnxruntime
        bert_squad_url = 'https://onnxzoo.blob.core.windows.net/models/opset_10/bert_squad/download_sample_10.tar.gz'
        cache_dir = os.path.join(os.path.expanduser("~"), '.cache', 'onnxruntime')
        os.makedirs(cache_dir, exist_ok=True)
        bert_squad_local = os.path.join(cache_dir, 'bert_squad.tar.gz')
        if not os.path.exists(bert_squad_local):
            # Download under a temporary name and rename afterwards, so that an
            # interrupted transfer never leaves a truncated archive at the cached
            # path (which would make every later run fail when opening it).
            partial_download = bert_squad_local + '.part'
            urllib.request.urlretrieve(bert_squad_url, partial_download)
            os.replace(partial_download, bert_squad_local)
        # NOTE(review): extractall() trusts the member paths stored in the archive.
        # The archive comes from the fixed HTTPS URL above; revisit if that changes.
        with tarfile.open(bert_squad_local, 'r') as f:
            f.extractall(cwd)

        # run symbolic shape inference on this model (input and output are the same file)
        # set int_max to 1,000,000 to simplify symbol computes for things like min(1000000, seq_len) -> seq_len
        bert_squad_dir = os.path.join(cwd, 'download_sample_10')
        bert_squad_model = os.path.join(bert_squad_dir, 'bertsquad10.onnx')
        subprocess.run([
            sys.executable, '-m', 'onnxruntime.nuphar.symbolic_shape_infer',
            '--input', bert_squad_model,
            '--output', bert_squad_model,
            '--auto_merge', '--int_max=1000000',
        ], check=True, cwd=cwd)

        # run onnx_test_runner to verify results
        onnx_test_runner = os.path.join(cwd, 'onnx_test_runner')
        subprocess.run([onnx_test_runner, '-e', 'nuphar', '-n', 'download_sample_10', cwd], check=True, cwd=cwd)

        # run onnxruntime_perf_test; nuphar is currently not integrated with the ORT
        # thread pool, so set -x 1 to avoid thread conflicts with OpenMP
        onnxruntime_perf_test = os.path.join(cwd, 'onnxruntime_perf_test')
        subprocess.run([onnxruntime_perf_test, '-e', 'nuphar', '-x', '1', '-t', '20', bert_squad_model, '1.txt'],
                       check=True,
                       cwd=cwd)

    def test_rnn_benchmark(self):
        """Smoke-test the RNN benchmarking script for the 'lstm', 'gru' and 'rnn' types."""
        # make sure benchmarking scripts works
        # note: quantized model requires AVX2, otherwise it might be slow

        # settings shared by every benchmark run
        common_kwargs = dict(num_threads=1, input_dim=128, hidden_dim=1024, seq_len=16, min_duration_seconds=1)

        # (rnn type, bidirectional, layers, batch_size)
        benchmark_cases = (
            ('lstm', True, 1, 1),
            ('gru', False, 2, 3),
            ('rnn', False, 3, 2),
        )

        for rnn_type, is_bidirectional, layer_count, batch in benchmark_cases:
            # the results are not inspected, but unpacking still requires
            # perf_test to return exactly three values
            avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type,
                                                    bidirectional=is_bidirectional,
                                                    layers=layer_count,
                                                    batch_size=batch,
                                                    **common_kwargs)

    def test_batch_scan(self):
        """Check that the Scan-converted LSTM model matches the original LSTM on batches.

        A 3-layer LSTM model is generated and run one batch entry at a time to
        build the baseline. The model is then converted with the model editor's
        ``to_scan`` mode, and the converted model is run with batch size 1, then
        2, then 1 again on the same session; each result must be close to the
        baseline.

        Model files are written to the current working directory. The inputs
        are random and unseeded.
        """
        input_dim = 3
        hidden_dim = 5
        bidirectional = False
        layers = 3

        lstm_model_name = 'test_batch_rnn_lstm.onnx'
        # create an LSTM model for generating baseline data
        generate_model('lstm',
                       input_dim,
                       hidden_dim,
                       bidirectional,
                       layers,
                       lstm_model_name,
                       batch_one=False,
                       has_seq_len=True)

        seq_len = 8
        batch_size = 2
        # prepare input: values in [-1, 1) and per-entry sequence lengths in [1, seq_len)
        data_input = (np.random.rand(seq_len, batch_size, input_dim) * 2 - 1).astype(np.float32)
        data_seq_len = np.random.randint(1, seq_len, size=(batch_size,), dtype=np.int32)

        def single_entry_feed(b):
            # feed holding only batch entry b; slicing keeps the batch axis
            return {'input': data_input[:, b:(b + 1), :], 'seq_len': data_seq_len[b:(b + 1)]}

        # run lstm as baseline, one batch entry per call
        sess = onnxrt.InferenceSession(lstm_model_name)
        first_lstm_data_output = sess.run([], single_entry_feed(0))

        # copy the list so the batch-size-1 baseline is left untouched
        per_entry_outputs = list(first_lstm_data_output)
        for b in range(1, batch_size):
            per_entry_outputs += sess.run([], single_entry_feed(b))
        lstm_data_output = np.concatenate(per_entry_outputs, axis=1)

        # generate a batch scan model
        scan_model_name = 'test_batch_rnn_scan.onnx'
        subprocess.run([
            sys.executable, '-m', 'onnxruntime.nuphar.model_editor', '--input', lstm_model_name, '--output',
            scan_model_name, '--mode', 'to_scan'
        ],
                       check=True)

        # unittest assertions are used below instead of bare asserts so the
        # checks are not stripped when Python runs with -O

        # run scan_batch with batch size 1
        sess = onnxrt.InferenceSession(scan_model_name)
        scan_batch_data_output = sess.run([], single_entry_feed(0))
        self.assertTrue(np.allclose(first_lstm_data_output, scan_batch_data_output),
                        'scan output differs from LSTM baseline (batch size 1)')

        # run scan_batch with batch size 2
        scan_batch_data_output = sess.run([], {'input': data_input, 'seq_len': data_seq_len})
        self.assertTrue(np.allclose(lstm_data_output, scan_batch_data_output),
                        'scan output differs from LSTM baseline (batch size 2)')

        # run scan_batch with batch size 1 again
        scan_batch_data_output = sess.run([], single_entry_feed(0))
        self.assertTrue(np.allclose(first_lstm_data_output, scan_batch_data_output),
                        'scan output differs from LSTM baseline (batch size 1, second run)')

    def test_symbolic_shape_infer(self):
        cwd = os.getcwd()
        test_model_dir = os.path.join(cwd, '..', 'models')
        for filename in Path(test_model_dir).rglob('*.onnx'):
            if filename.name.startswith('.'):
                continue  # skip some bad model files
            subprocess.run([
                sys.executable, '-m', 'onnxruntime.nuphar.symbolic_shape_infer', '--input',
                str(filename), '--auto_merge', '--int_max=100000', '--guess_output_rank'
            ],
                           check=True,
                           cwd=cwd)


# When this file is executed directly (not imported), run the tests via unittest.
if __name__ == '__main__':
    unittest.main()