onnxruntime/onnxruntime/test/python/transformers/test_generation.py
kunal-vaishnavi 4bea5ec513
Add Whisper export with beam search test cases (#17228)
### Description
This PR adds test cases for the custom export of [Whisper with beam
search](https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/python/tools/transformers/models/whisper).



### Motivation and Context
This PR checks that Whisper can be exported and runs with parity.
2023-08-20 00:58:08 -07:00

383 lines
14 KiB
Python

#!/usr/bin/env python
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import os
import shutil
import unittest
import onnx
import pytest
import torch
from parity_utilities import find_transformers_source
from onnxruntime import get_available_providers
# Pick where the conversion tools are imported from. When both
# find_transformers_source() calls are truthy, the tools are imported as
# top-level modules (presumably the in-repo sources were located and put on
# sys.path by parity_utilities -- TODO confirm); otherwise the copies bundled
# in the installed onnxruntime.transformers package are used.
if find_transformers_source() and find_transformers_source(["models", "t5"]):
    from benchmark_helper import Precision
    from convert_generation import main as run
    from models.t5.convert_to_onnx import export_onnx_models as export_t5_onnx_models
    from models.whisper.convert_to_onnx import main as run_whisper
else:
    from onnxruntime.transformers.benchmark_helper import Precision
    from onnxruntime.transformers.convert_generation import main as run
    from onnxruntime.transformers.models.t5.convert_to_onnx import export_onnx_models as export_t5_onnx_models
    from onnxruntime.transformers.models.whisper.convert_to_onnx import main as run_whisper
class TestBeamSearchGpt(unittest.TestCase):
    """Test BeamSearch for GPT-2 model.

    Every test drives the generation converter (imported as ``run``) with a
    list of command-line arguments and asserts that the returned result
    reports parity, then checks that the produced ONNX graph carries an
    ``init_decoder`` attribute on its BeamSearch/GreedySearch node.
    """

    def setUp(self):
        """Set up model paths, default converter arguments and prompts; remove stale models."""
        self.model_name = "gpt2"
        self.gpt2_onnx_path = os.path.join(".", "onnx_models", "gpt2_past_fp32_shape.onnx")
        self.beam_search_onnx_path = os.path.join(".", "onnx_models", "gpt2_beam_search.onnx")
        # Each entry is "<flag> <value>"; run_beam_search joins and re-splits them into argv tokens.
        self.default_arguments = [
            f"-m {self.model_name}",
            f"--decoder_onnx {self.gpt2_onnx_path}",
            f"--output {self.beam_search_onnx_path}",
            "--repetition_penalty 2.0",
        ]
        self.sentences = [
            "The product is released",
            "I enjoy walking in the park",
            "Test best way to invest",
            # "The AI community building the future",
            # "The selloff in tech shares deepened",
            # "Abortion rights take centre stage",
        ]
        # GPU coverage needs both a CUDA-capable torch and the ORT CUDA execution provider.
        self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers()
        self.remove_onnx_files()

    def tearDown(self):
        """Delete the models produced by the test."""
        self.remove_onnx_files()

    def remove_onnx_files(self):
        """Remove the GPT-2 decoder model and the beam search model if they exist."""
        for path in (self.gpt2_onnx_path, self.beam_search_onnx_path):
            if os.path.exists(path):
                os.remove(path)

    def _skip_unless_cuda(self):
        """Skip the running test when CUDA is unavailable.

        Reporting a skip is more honest than letting a GPU-only test pass
        without having executed anything.
        """
        if not self.enable_cuda:
            self.skipTest("requires torch CUDA support and the CUDAExecutionProvider")

    def check_for_init_decoder_attr(self, model_path: str):
        """Assert that a BeamSearch/GreedySearch node in ``model_path`` has an ``init_decoder`` attribute.

        Args:
            model_path: Path of the ONNX model to load and inspect.
        """
        graph_proto = onnx.load(model_path).graph
        init_decoder_found = any(
            attr.name == "init_decoder"
            for node in graph_proto.node
            if node.op_type in ("BeamSearch", "GreedySearch")
            for attr in node.attribute
        )
        self.assertTrue(
            init_decoder_found,
            f"No BeamSearch/GreedySearch node with an 'init_decoder' attribute found in {model_path}",
        )

    def run_beam_search(self, extra_arguments: str, sentences=None, append_arguments=True, is_greedy=False):
        """Run the converter, assert parity, and verify the exported graph.

        The converter is run once with the given arguments and, when CUDA is
        enabled, a second time with ``--use_gpu`` added. The beam search model
        is deleted at the end so the next invocation starts clean.

        Args:
            extra_arguments: Space-separated command-line arguments.
            sentences: Prompts to use; ``self.sentences`` when ``None``.
            append_arguments: When true, ``extra_arguments`` are appended to
                ``self.default_arguments``; otherwise they are used alone.
            is_greedy: When true, force ``--num_beams 1 --num_return_sequences 1``;
                otherwise make sure ``--output_sequences_score`` is present.
        """
        if append_arguments:
            arguments = " ".join([*self.default_arguments, extra_arguments]).split()
        else:
            arguments = extra_arguments.split()

        if is_greedy:
            arguments.extend(["--num_beams", "1", "--num_return_sequences", "1"])
        elif "--output_sequences_score" not in arguments:
            # Most callers already pass the flag; avoid listing it twice.
            arguments.append("--output_sequences_score")

        prompts = self.sentences if sentences is None else sentences

        # Test CPU
        result = run(arguments, sentences=prompts)
        self.assertTrue(result["parity"], f"ORT and PyTorch result is different on CPU for arguments {arguments}")

        # (CPU) Check for the presence of the "init_decoder" attribute
        self.check_for_init_decoder_attr(self.beam_search_onnx_path)

        # Test GPU
        if self.enable_cuda:
            if "--use_gpu" not in arguments:
                arguments.append("--use_gpu")
            result = run(arguments, sentences=prompts)
            self.assertTrue(result["parity"], f"ORT and PyTorch result is different on GPU for arguments {arguments}")

            # (GPU) Check for the presence of the "init_decoder" attribute
            self.check_for_init_decoder_attr(self.beam_search_onnx_path)

        os.remove(self.beam_search_onnx_path)

    @pytest.mark.slow
    def test_return_sequences(self):
        for return_sequences in [1, 2]:
            self.run_beam_search(f"--num_return_sequences {return_sequences} --output_sequences_score")

    @pytest.mark.slow
    def test_early_stopping(self):
        self.run_beam_search("--early_stopping --output_sequences_score")

    @pytest.mark.slow
    def test_length_penalty(self):
        for length_penalty in [0.5, 2.0]:
            self.run_beam_search(f"--length_penalty {length_penalty} --output_sequences_score")

    @pytest.mark.slow
    def test_no_repeat_ngram(self):
        for ngram_size in [1, 2]:
            self.run_beam_search(f"--no_repeat_ngram_size {ngram_size} --output_sequences_score")

    @pytest.mark.slow
    def test_greedy_search(self):
        self.run_beam_search("", is_greedy=True)

    @pytest.mark.slow
    def test_greedy_search_past_present_share_buffer(self):
        self._skip_unless_cuda()
        self.run_beam_search("--past_present_share_buffer --use_gpu", is_greedy=True)

    @pytest.mark.slow
    def test_greedy_search_past_present_share_buffer_fp16(self):
        self._skip_unless_cuda()
        self.run_beam_search("--past_present_share_buffer --use_gpu -p fp16", is_greedy=True)

    @pytest.mark.slow
    def test_greedy_search_use_decoder_masked_self_attention(self):
        self._skip_unless_cuda()
        self.run_beam_search(
            "--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu", is_greedy=True
        )

    @pytest.mark.slow
    def test_greedy_search_use_decoder_masked_self_attention_fp16(self):
        self._skip_unless_cuda()
        self.run_beam_search(
            "--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu -p fp16", is_greedy=True
        )

    @pytest.mark.slow
    def test_greedy_search_float16(self):
        # TODO: investigate fp16 parity issue for greedy/beam search with repetition_penalty != 1.0
        self._skip_unless_cuda()
        self.run_beam_search("--repetition_penalty 1.0 --use_gpu -p fp16", is_greedy=True)

    @pytest.mark.slow
    def test_beam_search_use_decoder_masked_self_attention(self):
        self._skip_unless_cuda()
        self.run_beam_search("--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu")

    @pytest.mark.slow
    def test_beam_search_use_decoder_masked_self_attention_fp16(self):
        self._skip_unless_cuda()
        self.run_beam_search("--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu -p fp16")

    @pytest.mark.slow
    def test_external_data(self):
        # Uses its own argument list (no default arguments) with the -e flag.
        self.run_beam_search(
            f"-m gpt2 --output_sequences_score -e --output {self.beam_search_onnx_path}",
            sentences=None,
            append_arguments=False,
        )
class TestBeamSearchT5(unittest.TestCase):
    """Beam search conversion and parity checks for the T5 model."""

    def setUp(self):
        """Export the t5-small encoder/decoder ONNX models and prepare converter arguments."""
        onnx_dir = os.path.join(".", "onnx_models")
        cache_dir = os.path.join(".", "cache_models")

        self.model_name = "t5-small"
        self.decoder_onnx_path = os.path.join(onnx_dir, "t5-small_decoder.onnx")
        self.encoder_onnx_path = os.path.join(onnx_dir, "t5-small_encoder_decoder_init.onnx")
        self.beam_search_onnx_path = os.path.join(onnx_dir, "t5_small_beam_search.onnx")

        # "<flag> <value>" strings; run_beam_search joins and re-splits them into argv tokens.
        self.default_arguments = [
            f"-m {self.model_name}",
            "--model_type t5",
            f"--decoder_onnx {self.decoder_onnx_path}",
            f"--encoder_decoder_init_onnx {self.encoder_onnx_path}",
            f"--output {self.beam_search_onnx_path}",
            "--output_sequences_score",
            "--repetition_penalty 2.0",
        ]

        cuda_in_torch = torch.cuda.is_available()
        self.enable_cuda = cuda_in_torch and "CUDAExecutionProvider" in get_available_providers()

        export_options = {
            "use_gpu": False,
            "use_external_data_format": False,
            "optimize_onnx": False,
            "precision": Precision.FLOAT32,
            "verbose": False,
            "use_decoder_start_token": False,
            "merge_encoder_and_decoder_init": True,
            "overwrite": True,
            "disable_auto_mixed_precision": False,
            "use_int32_inputs": True,
        }
        export_t5_onnx_models(self.model_name, cache_dir, onnx_dir, **export_options)

        self.sentences = [
            "translate English to French: The product is released",
            "summarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.",
        ]

        # Start every test without a leftover beam search model.
        if os.path.exists(self.beam_search_onnx_path):
            os.remove(self.beam_search_onnx_path)

    def tearDown(self):
        """Delete every model file the test created."""
        self.remove_onnx_files()

    def remove_onnx_files(self):
        """Remove the beam search, decoder and encoder models when present."""
        candidates = (
            self.beam_search_onnx_path,
            self.decoder_onnx_path,
            self.encoder_onnx_path,
        )
        for model_file in candidates:
            if os.path.exists(model_file):
                os.remove(model_file)

    def run_beam_search(self, extra_arguments: str, sentences=None, append_arguments=True):
        """Convert with the given arguments and assert parity on CPU and, if enabled, on GPU.

        Args:
            extra_arguments: Space-separated command-line arguments.
            sentences: Prompts to use; ``self.sentences`` when ``None``.
            append_arguments: When true, ``extra_arguments`` follow
                ``self.default_arguments``; otherwise they are used alone.
        """
        command_line = extra_arguments
        if append_arguments:
            command_line = " ".join(self.default_arguments + [extra_arguments])
        arguments = command_line.split()

        prompts = sentences if sentences is not None else self.sentences

        # CPU pass
        cpu_result = run(arguments, sentences=prompts)
        self.assertTrue(cpu_result["parity"], f"ORT and PyTorch result is different on CPU for arguments {arguments}")

        # GPU pass
        if self.enable_cuda:
            if "--use_gpu" not in arguments:
                arguments.append("--use_gpu")
            gpu_result = run(arguments, sentences=prompts)
            self.assertTrue(
                gpu_result["parity"], f"ORT and PyTorch result is different on GPU for arguments {arguments}"
            )

        os.remove(self.beam_search_onnx_path)

    @pytest.mark.slow
    def test_return_sequences(self):
        for count in (1, 2):
            self.run_beam_search(f"--num_return_sequences {count}")

    @pytest.mark.slow
    def test_early_stopping(self):
        self.run_beam_search("--early_stopping")

    @pytest.mark.slow
    def test_length_penalty(self):
        for penalty in (0.5, 2.0):
            self.run_beam_search(f"--length_penalty {penalty}")

    @pytest.mark.slow
    def test_no_repeat_ngram(self):
        for size in (1, 2):
            self.run_beam_search(f"--no_repeat_ngram_size {size}")

    @pytest.mark.slow
    def test_custom_attention_mask(self):
        self.run_beam_search("--custom_attention_mask")

    @pytest.mark.slow
    def test_external_data(self):
        # Stand-alone argument list (default arguments are not prepended).
        external_data_args = f"-m t5-small --model_type t5 -e --output {self.beam_search_onnx_path}"
        self.run_beam_search(external_data_args, sentences=None, append_arguments=False)
class TestBeamSearchWhisper(unittest.TestCase):
    """Test BeamSearch for Whisper.

    Each test exports ``openai/whisper-tiny`` through the Whisper converter
    (imported as ``run_whisper``) in several precision configurations and
    asserts that the value it returns, the maximum difference between ORT and
    PyTorch, is zero.
    """

    def setUp(self):
        """Define model/output locations and the per-configuration argument lists."""
        self.model_name = "openai/whisper-tiny"
        self.pytorch_folder = "cache_models"
        self.onnx_folder = "onnx_models"
        self.decoder_onnx_path = os.path.join(".", self.onnx_folder, "whisper-tiny_decoder.onnx")
        self.encoder_onnx_path = os.path.join(".", self.onnx_folder, "whisper-tiny_encoder_decoder_init.onnx")
        self.beam_search_onnx_path = os.path.join(".", self.onnx_folder, "whisper-tiny_beamsearch.onnx")
        # GPU coverage needs both a CUDA-capable torch and the ORT CUDA execution provider.
        self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers()

        # Arguments shared by every configuration.
        self.base_arguments = [
            "-m",
            self.model_name,
            "--output",
            self.onnx_folder,
            "--use_external_data_format",
        ]
        self.fp32_cpu_arguments = [
            "--precision",
            "fp32",
            "--optimize_onnx",
        ]
        self.fp16_cuda_arguments = [
            "--precision",
            "fp16",
            "--provider",
            "cuda",
            "--optimize_onnx",
            "--use_gpu",
        ]
        self.int8_cpu_arguments = [
            "--precision",
            "int8",
            "--quantize_embedding_layer",
        ]

    def tearDown(self):
        """Remove the PyTorch cache folder and the ONNX output folder."""
        for folder in (self.pytorch_folder, self.onnx_folder):
            directory = os.path.join(".", folder)
            if os.path.exists(directory):
                shutil.rmtree(directory)

    def remove_onnx_files(self):
        """Remove the exported models and their ``.data`` sidecar files.

        Every file is checked on its own: a missing sidecar must not raise
        ``FileNotFoundError``, because this runs inside ``run_export`` before
        the parity assertion and such an error would hide the real result.
        """
        model_paths = (
            self.beam_search_onnx_path,
            self.decoder_onnx_path,
            self.encoder_onnx_path,
        )
        for model_path in model_paths:
            for path in (model_path, model_path + ".data"):
                if os.path.exists(path):
                    os.remove(path)

    def run_export(self, arguments):
        """Export Whisper with ``arguments`` and assert the export exists and has parity.

        Args:
            arguments: Command-line tokens passed to the Whisper converter.
        """
        max_diff = run_whisper(arguments)
        self.assertTrue(os.path.exists(self.beam_search_onnx_path), "Whisper model was not exported")
        # Clean up before checking parity so the files are removed even when the check fails.
        self.remove_onnx_files()
        self.assertEqual(max_diff, 0, f"ORT and PyTorch have a parity mismatch of {max_diff}")

    def run_configs(self, optional_arguments):
        """Run the export for FP32 CPU, FP16 CUDA (only when CUDA is enabled) and INT8 CPU.

        Args:
            optional_arguments: Extra command-line tokens appended to every configuration.
        """
        # FP32 CPU
        self.run_export([*self.base_arguments, *self.fp32_cpu_arguments, *optional_arguments])

        if self.enable_cuda:
            # FP16 CUDA
            self.run_export([*self.base_arguments, *self.fp16_cuda_arguments, *optional_arguments])

        # INT8 CPU
        self.run_export([*self.base_arguments, *self.int8_cpu_arguments, *optional_arguments])

    @pytest.mark.slow
    def test_required_args(self):
        optional_args = []
        self.run_configs(optional_args)

    @pytest.mark.slow
    def test_forced_decoder_ids(self):
        decoder_input_ids = ["--use_forced_decoder_ids"]
        self.run_configs(decoder_input_ids)

    @pytest.mark.slow
    def test_logits_processor(self):
        logits_processor = ["--use_logits_processor"]
        self.run_configs(logits_processor)
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()