Setup EP Dashboard (#7321)

* setting up dashboard
* posting to ort dashboard
* creating separate docker file
* including common deps
* tracking latency over time
This commit is contained in:
Olivia Jain 2021-05-11 10:33:39 -07:00 committed by GitHub
parent ce8473a4ea
commit 29172d8f54
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 257 additions and 95 deletions

View file

@ -45,8 +45,8 @@ ep_to_provider_list = {
# latency gain headers
trt_cuda_gain = 'TRT_CUDA_gain(%)'
trt_cuda_fp16_gain = 'TRT_CUDA_fp16_gain(%)'
trt_native_gain = 'EP_Native_TRT_gain(%)'
trt_native_fp16_gain = 'EP_Native_TRT_fp16_gain(%)'
trt_native_gain = 'TRT_Standalone_gain(%)'
trt_native_fp16_gain = 'TRT_Standalone_fp16_gain(%)'
# metadata
FAIL_MODEL_FILE = ".fail_model_map"
@ -55,7 +55,7 @@ METRICS_FILE = ".metrics_map"
MEMORY_FILE = './temp_memory.csv'
def run_trt_standalone(trtexec, model_path, ort_inputs, all_inputs_shape, fp16):
logger.info("running native trt")
logger.info("running standalone trt")
model_path = "--onnx=" + model_path
input_shape = []
@ -1019,17 +1019,18 @@ def run_onnxruntime(args, models):
"sequence_length": 1,
"datetime": str(datetime.now()),}
# get standalone TensorRT perf
if trt in ep and args.trtexec:
# get standalone TensorRT perf
try:
ep = standalone_trt_fp16 if fp16 else standalone_trt
if args.track_memory:
ep = standalone_trt_fp16 if fp16 else standalone_trt
p = start_memory_tracking()
result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16)
mem_usage = end_memory_tracking(p, True)
if result and mem_usage:
result["memory"] = mem_usage
p = start_memory_tracking()
result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16)
mem_usage = end_memory_tracking(p, True)
if result and mem_usage:
result["memory"] = mem_usage
else:
result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16)
@ -1247,7 +1248,7 @@ def output_status(results, csv_filename):
standalone_trt + " fp32",
cuda + " fp16",
trt + " fp16",
standalone_trt + "fp16"
standalone_trt + " fp16"
]
csv_writer = csv.writer(csv_file)
@ -1300,30 +1301,30 @@ def output_latency(results, csv_filename):
with open(csv_filename, mode="a", newline='') as csv_file:
column_names = ["Model",
"CPU \nmean (ms)",
"CPU \n 90th percentile (ms)",
"CPU fp32 \nmean (ms)",
"CPU fp32 \n 90th percentile (ms)",
"CUDA fp32 \nmean (ms)",
"CUDA fp32 \n90th percentile (ms)",
"CUDA EP fp32 \nmemory usage (MiB)",
"CUDA EP fp32 \npeak memory usage (MiB)",
"TRT EP fp32 \nmean (ms)",
"TRT EP fp32 \n90th percentile (ms)",
"TRT EP fp32 \nmemory usage (MiB)",
"TRT EP fp32 \npeak memory usage (MiB)",
"Standalone TRT fp32 \nmean (ms)",
"Standalone TRT fp32 \n90th percentile (ms)",
"Standalone TRT fp32 \nmemory usage (MiB)",
"Standalone TRT fp32 \npeak memory usage (MiB)",
"TRT v CUDA EP fp32 \ngain (mean) (%)",
"EP v Native TRT fp32 \ngain (mean) (%)",
"EP v Standalone TRT fp32 \ngain (mean) (%)",
"CUDA fp16 \nmean (ms)",
"CUDA fp16 \n90th percentile (ms)",
"CUDA EP fp16 \nmemory usage (MiB)",
"CUDA EP fp16 \npeak memory usage (MiB)",
"TRT EP fp16 \nmean (ms)",
"TRT EP fp16 \n90 percentile (ms)",
"TRT EP fp16 \nmemory usage (MiB)",
"TRT EP fp16 \n90th percentile (ms)",
"TRT EP fp16 \npeak memory usage (MiB)",
"Standalone TRT fp16 \nmean (ms)",
"Standalone TRT fp16 \n90th percentile (ms)",
"Standalone TRT fp16 \nmemory usage (MiB)",
"Standalone TRT fp16 \npeak memory usage (MiB)",
"TRT v CUDA EP fp16 \ngain (mean) (%)",
"EP v Native TRT fp16 \ngain (mean) (%)"]
"EP v Standalone TRT fp16 \ngain (mean) (%)"]
csv_writer = csv.writer(csv_file)
if need_write_header:

View file

@ -9,4 +9,4 @@ i) IMAGE_NAME=${OPTARG};;
esac
done
sudo docker build --no-cache -t $IMAGE_NAME --build-arg ONNXRUNTIME_BRANCH=$ORT_BRANCH -f $ORT_DOCKERFILE_PATH ..
sudo docker build --no-cache -t $IMAGE_NAME --build-arg ONNXRUNTIME_BRANCH=$ORT_BRANCH -f $PERF_DOCKERFILE_PATH ..

View file

@ -68,49 +68,19 @@
{
"model_name": "squeezenet1.1-7",
"working_directory": "./models/squeezenet1.1-7/",
"model_path": "./squeezenet1.1/._squeezenet1.1.onnx",
"model_path": "./squeezenet1.1/squeezenet1.1.onnx",
"test_data_path": "./squeezenet1.1/"
},
{
"model_name": "squeezenet1.0-3",
"working_directory": "./models/squeezenet1.0-3/",
"model_path": "./squeezenet/model.onnx",
"test_data_path": "./squeezenet/"
},
{
"model_name": "squeezenet1.0-6",
"working_directory": "./models/squeezenet1.0-6/",
"model_path": "./squeezenet/model.onnx",
"test_data_path": "./squeezenet/"
},
{
"model_name": "squeezenet1.0-7",
"working_directory": "./models/squeezenet1.0-7/",
"model_path": "./squeezenet/model.onnx",
"test_data_path": "./squeezenet/"
},
{
"model_name": "squeezenet1.0-8",
"working_directory": "./models/squeezenet1.0-8/",
"model_path": "./squeezenet/model.onnx",
"test_data_path": "./squeezenet/"
},
{
"model_name": "squeezenet1.0-9",
"working_directory": "./models/squeezenet1.0-9/",
"model_path": "./squeezenet/model.onnx",
"test_data_path": "./squeezenet/"
},
{
"model_name": "vgg16-7",
"working_directory": "./models/vgg16-7/",
"model_path": "./vgg16/._vgg16.onnx",
"model_path": "./vgg16/vgg16.onnx",
"test_data_path": "./vgg16/"
},
{
"model_name": "vgg19-bn-7",
"working_directory": "./models/vgg19-bn-7/",
"model_path": "./vgg19-bn/._vgg19-bn.onnx",
"model_path": "./vgg19-bn/vgg19-bn.onnx",
"test_data_path": "./vgg19-bn/"
},
{
@ -127,18 +97,6 @@
},
{
"model_name": "caffenet-9",
"working_directory": "./models/caffenet-9/",
"model_path": "./bvlc_reference_caffenet/model.onnx",
"test_data_path": "./bvlc_reference_caffenet/"
},
{
"model_name": "rcnn-ilsvrc13-9",
"working_directory": "./models/rcnn-ilsvrc13-9/",
"model_path": "./bvlc_reference_rcnn_ilsvrc13/model.onnx",
"test_data_path": "./bvlc_reference_rcnn_ilsvrc13/"
},
{
"model_name": "densenet-9",
"working_directory": "./models/densenet-9/",
"model_path": "./densenet121/model.onnx",
"test_data_path": "./densenet121/"
@ -224,7 +182,7 @@
{
"model_name": "yolov4",
"working_directory": "./models/yolov4/",
"model_path": "./yolov4/yolov4.onnx",
"model_path": "./yolov4/yolov4_shape.onnx",
"test_data_path": "./yolov4/"
},
{
@ -251,12 +209,6 @@
"model_path": "./GPT-2-LM-HEAD/model.onnx",
"test_data_path": "./GPT-2-LM-HEAD/"
},
{
"model_name": "gpt2-10",
"working_directory": "./models/gpt2-10/",
"model_path": "./GPT2/model.onnx",
"test_data_path": "./GPT2/"
},
{
"model_name": "zfnet512-9",
"working_directory": "./models/zfnet512-9/",
@ -268,11 +220,5 @@
"working_directory": "./models/arcfaceresnet100-8/",
"model_path": "./resnet100/resnet100.onnx",
"test_data_path": "./resnet100/"
},
{
"model_name": "mosaic-9",
"working_directory": "./models/mosaic-9/",
"model_path": "./mosaic/mosaic.onnx",
"test_data_path": "./mosaic/"
}
]

View file

@ -0,0 +1,195 @@
import argparse
import mysql.connector
import sys
import os
import subprocess
import pandas as pd
from sqlalchemy import create_engine
# database connection strings
# Building blocks of the MySQL connection used throughout this script.
# The password is read from the DASHBOARD_MYSQL_ORT_PASSWORD environment
# variable; os.environ.get() yields None when that variable is not set.
sql_connector = 'mysql+mysqlconnector://'
user='powerbi@onnxruntimedashboard'
password=os.environ.get('DASHBOARD_MYSQL_ORT_PASSWORD')
host='onnxruntimedashboard.mysql.database.azure.com'
database='onnxruntime'
def parse_arguments():
    """Parse the command-line options of the dashboard upload script.

    All three options are required: the report folder (``-r``), the commit
    id (``-c``) and the report URL (``-u``).

    Returns:
        argparse.Namespace with the attributes ``report_folder``,
        ``commit_hash`` and ``report_url``.
    """
    required_options = (
        ("-r", "--report_folder", "Path to the local file report"),
        ("-c", "--commit_hash", "Commit id"),
        ("-u", "--report_url", "Report Url"),
    )

    arg_parser = argparse.ArgumentParser()
    for short_flag, long_flag, description in required_options:
        arg_parser.add_argument(short_flag, long_flag, help=description, required=True)
    return arg_parser.parse_args()
def parse_csv(report_file):
    """Load a CSV report into a pandas DataFrame.

    Args:
        report_file: Anything ``pandas.read_csv`` accepts as its first
            argument (this script passes file names).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(report_file)
def insert_latency(commit_hash, report_url, latency):
    """Record the current latency results in the latency-over-time table.

    Rows of ``onnxruntime.ep_latency_over_time`` whose ``UploadTime`` is
    more than 30 days old are deleted first.  When ``latency`` is not empty
    it is reshaped to one row per (model, EP) pair, stamped with the upload
    time, commit id and report URL, and inserted.

    Args:
        commit_hash: Commit id stored in the ``CommitId`` column.
        report_url: URL stored in the ``ReportUrl`` column.
        latency: DataFrame holding ``Model``, ``Group``, the per-EP latency
            columns and the four gain columns dropped below (the layout
            produced by ``get_latency``).

    Raises:
        Whatever ``mysql.connector`` or pandas raise; the cursor and the
        connection are closed before the error propagates.
    """
    import time

    # connect to database
    cnx = mysql.connector.connect(
        user=user,
        password=password,
        host=host,
        database=database)

    try:
        cursor = cnx.cursor()
        try:
            # delete old records
            delete_query = ('DELETE FROM onnxruntime.ep_latency_over_time '
                            'WHERE UploadTime < DATE_SUB(Now(), INTERVAL 30 DAY);'
                            )
            cursor.execute(delete_query)

            if not latency.empty:
                print('posting latency over time results to dashboard')

                # The gain columns are derived values, not latencies, so they
                # are left out of the over-time series.
                to_drop = ['TrtGain-CudaFp32', 'EpGain-TrtFp32', 'TrtGain-CudaFp16', 'EpGain-TrtFp16']
                over_time = latency.drop(to_drop, axis='columns')
                over_time = over_time.melt(id_vars=['Model', 'Group'], var_name='Ep', value_name='Latency')

                upload_time = time.strftime('%Y-%m-%d %H:%M:%S')
                over_time = over_time.assign(UploadTime=upload_time,
                                             CommitId=commit_hash,
                                             ReportUrl=report_url)
                over_time = over_time[['UploadTime', 'CommitId', 'Model', 'Ep', 'Latency', 'ReportUrl', 'Group']]
                over_time = over_time.fillna('')

                # Hand the driver plain Python values: numpy scalars expose
                # .item(), everything else is passed through unchanged.
                rows = [
                    tuple(value.item() if hasattr(value, 'item') else value for value in row)
                    for row in over_time.itertuples(index=False, name=None)
                ]

                # insert current record
                # Placeholders let the driver quote every value, so model
                # names or URLs containing quotes cannot break the statement.
                insert_query = ('INSERT INTO onnxruntime.ep_latency_over_time '
                                '(UploadTime, CommitId, Model, Ep, Latency, ReportUrl, ModelGroup) '
                                'VALUES (%s, %s, %s, %s, %s, %s, %s)')
                cursor.executemany(insert_query, rows)

            # Commit even when nothing was inserted so the clean-up of old
            # rows takes effect.
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()
def adjust_columns(table, columns, db_columns, model_group):
    """Select, rename and tag the columns of a report table.

    Args:
        table: Source DataFrame; it is not modified.
        columns: Labels of the columns to keep, in output order.
        db_columns: New labels for the kept columns, one per entry of
            ``columns``.
        model_group: Value written to every row of the added ``Group``
            column.

    Returns:
        A new DataFrame with the selected columns renamed to ``db_columns``
        followed by the ``Group`` column.
    """
    return (
        table[columns]
        .set_axis(db_columns, axis=1)
        .assign(Group=model_group)
    )
def get_failures(fail, model_group):
    """Rename all columns of a failure table to the database column names.

    Args:
        fail: DataFrame read from a failure CSV; every column is kept, so it
            must have exactly four columns.
        model_group: Value for the ``Group`` column added by
            ``adjust_columns``.

    Returns:
        DataFrame with the columns ``Model``, ``Ep``, ``ErrorType``,
        ``ErrorMessage`` and ``Group``.
    """
    db_names = ['Model', 'Ep', 'ErrorType', 'ErrorMessage']
    return adjust_columns(fail, fail.keys(), db_names, model_group)
def get_memory(memory, model_group):
    """Extract the peak-memory columns of a report table.

    Args:
        memory: DataFrame containing the ``Model`` column and the six
            "peak memory usage (MiB)" columns listed below.
        model_group: Value for the ``Group`` column added by
            ``adjust_columns``.

    Returns:
        DataFrame with the database column names given as the values of the
        mapping below, plus ``Group``.
    """
    # report header -> database column, in output order
    header_to_db = {
        'Model': 'Model',
        'CUDA EP fp32 \npeak memory usage (MiB)': 'CudaFp32',
        'TRT EP fp32 \npeak memory usage (MiB)': 'TrtFp32',
        'Standalone TRT fp32 \npeak memory usage (MiB)': 'StandaloneFp32',
        'CUDA EP fp16 \npeak memory usage (MiB)': 'CudaFp16',
        'TRT EP fp16 \npeak memory usage (MiB)': 'TrtFp16',
        'Standalone TRT fp16 \npeak memory usage (MiB)': 'StandaloneFp16',
    }
    return adjust_columns(memory, list(header_to_db), list(header_to_db.values()), model_group)
def get_latency(latency, model_group):
    """Extract the mean-latency and gain columns of a report table.

    Args:
        latency: DataFrame containing the ``Model`` column and the mean
            latency / gain columns listed below.
        model_group: Value for the ``Group`` column added by
            ``adjust_columns``.

    Returns:
        DataFrame with the database column names given as the values of the
        mapping below, plus ``Group``.
    """
    # report header -> database column, in output order
    header_to_db = {
        'Model': 'Model',
        'CPU fp32 \nmean (ms)': 'CpuFp32',
        'CUDA fp32 \nmean (ms)': 'CudaEpFp32',
        'TRT EP fp32 \nmean (ms)': 'TrtEpFp32',
        'Standalone TRT fp32 \nmean (ms)': 'StandaloneFp32',
        'TRT v CUDA EP fp32 \ngain (mean) (%)': 'TrtGain-CudaFp32',
        'EP v Standalone TRT fp32 \ngain (mean) (%)': 'EpGain-TrtFp32',
        'CUDA fp16 \nmean (ms)': 'CudaEpFp16',
        'TRT EP fp16 \nmean (ms)': 'TrtEpFp16',
        'Standalone TRT fp16 \nmean (ms)': 'StandaloneFp16',
        'TRT v CUDA EP fp16 \ngain (mean) (%)': 'TrtGain-CudaFp16',
        'EP v Standalone TRT fp16 \ngain (mean) (%)': 'EpGain-TrtFp16',
    }
    return adjust_columns(latency, list(header_to_db), list(header_to_db.values()), model_group)
def get_status(status, model_group):
    """Rename all columns of a status table to the database column names.

    Args:
        status: DataFrame read from a status CSV; every column is kept, so
            it must have exactly eight columns.
        model_group: Value for the ``Group`` column added by
            ``adjust_columns``.

    Returns:
        DataFrame with the eight database columns listed below plus
        ``Group``.
    """
    db_names = [
        'Model',
        'CpuFp32', 'CudaEpFp32', 'TrtEpFp32', 'StandaloneFp32',
        'CudaEpFp16', 'TrtEpFp16', 'StandaloneFp16',
    ]
    return adjust_columns(status, status.keys(), db_names, model_group)
def get_database_cert():
    """Return the file name of the CA certificate, downloading it if needed.

    The certificate lives in the current working directory.  When it is not
    there yet it is fetched with ``wget`` into a temporary ``.part`` file
    that is renamed only after a successful download, so a failed download
    never leaves a truncated certificate that later runs would reuse.

    Returns:
        The relative file name of the certificate.

    Raises:
        subprocess.CalledProcessError: ``wget`` exited with a non-zero
            status.
        OSError: ``wget`` could not be started or the file could not be
            moved into place.
    """
    cert = 'BaltimoreCyberTrustRoot.crt.pem'
    if not os.path.exists(cert):
        # NOTE(review): the local file name says "Baltimore CyberTrust" but
        # the URL fetches the DigiCert Global Root G2 certificate - confirm
        # which root the database server actually requires.
        partial = cert + '.part'
        try:
            subprocess.run(
                ["wget", "https://cacerts.digicert.com/DigiCertGlobalRootG2.crt.pem", "-O", partial],
                check=True)
            os.replace(partial, cert)
        finally:
            # wget -O creates its output file even when the download fails.
            if os.path.exists(partial):
                os.remove(partial)
    return cert
def write_table(engine, table, table_name):
    """Write ``table`` to the database table ``table_name``.

    An existing table of that name is replaced, the DataFrame index is not
    written, and rows are sent one per batch (``chunksize=1``).

    Args:
        engine: Connectable handed to ``DataFrame.to_sql`` as ``con``.
        table: DataFrame to store.
        table_name: Name of the target table.
    """
    to_sql_options = {
        'con': engine,
        'if_exists': 'replace',
        'index': False,
        'chunksize': 1,
    }
    table.to_sql(table_name, **to_sql_options)
def _concat_frames(frames):
    """Stack ``frames`` into one re-indexed DataFrame (empty list -> empty frame)."""
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


def main():
    """Upload the perf results of a report folder to the dashboard database.

    The report folder is expected to hold one sub-directory per model group.
    Every file in such a directory is read as CSV and routed by file name:
    names containing ``fail`` feed the failure table, ``latency`` feeds both
    the memory and the latency tables, and ``status`` feeds the status
    table.  The four combined tables replace their database counterparts and
    the latency results are additionally appended to the over-time table.

    Any error is printed and turns into exit status 1.
    """
    from urllib.parse import quote_plus

    try:
        args = parse_arguments()
        # Absolute paths make the run independent of the working directory,
        # both for the report files and for the certificate that is only
        # opened once the first connection is made.
        report_root = os.path.abspath(args.report_folder)

        if password is None:
            raise RuntimeError('DASHBOARD_MYSQL_ORT_PASSWORD environment variable is not set')

        # connect to database
        cert = os.path.abspath(get_database_cert())
        ssl_args = {'ssl_ca': cert}
        # URL layout: dialect+driver://user:password@host/database.  The
        # password is percent-encoded so special characters cannot be
        # mistaken for URL separators.
        connection_string = sql_connector + \
                            user + ':' + \
                            quote_plus(password) + \
                            '@' + host + '/' + \
                            database
        engine = create_engine(connection_string, connect_args=ssl_args)

        fail_frames = []
        memory_frames = []
        latency_frames = []
        status_frames = []

        for model_group in os.listdir(report_root):
            group_dir = os.path.join(report_root, model_group)
            if not os.path.isdir(group_dir):
                # only directories are model groups; skip stray files
                continue
            for csv_name in os.listdir(group_dir):
                table = parse_csv(os.path.join(group_dir, csv_name))
                if "fail" in csv_name:
                    fail_frames.append(get_failures(table, model_group))
                if "latency" in csv_name:
                    # the latency report carries the memory columns as well
                    memory_frames.append(get_memory(table, model_group))
                    latency_frames.append(get_latency(table, model_group))
                if "status" in csv_name:
                    status_frames.append(get_status(table, model_group))

        fail = _concat_frames(fail_frames)
        memory = _concat_frames(memory_frames)
        latency = _concat_frames(latency_frames)
        status = _concat_frames(status_frames)

        print('writing failures over time to database')
        write_table(engine, fail, 'ep_model_fails')
        print('writing memory to database')
        write_table(engine, memory, 'ep_model_memory')
        print('writing latency to database')
        write_table(engine, latency, 'ep_model_latency')
        print('writing status to database')
        write_table(engine, status, 'ep_models_status')
        print('writing latency over time to database')
        insert_latency(args.commit_hash, args.report_url, latency)

    except Exception as e:
        # Exception (not BaseException) so that Ctrl-C and argparse's own
        # exit codes are not rewritten.
        print(str(e))
        sys.exit(1)


if __name__ == "__main__":
    main()

View file

@ -1,36 +1,46 @@
#!/bin/bash
# Parse Arguments
while getopts d:o:m: parameter
while getopts d:o:m:p: parameter
do case "${parameter}"
in
d) DOCKER_IMAGE=${OPTARG};;
o) OPTION=${OPTARG};;
m) MODEL_PATH=${OPTARG};;
p) PERF_DIR=${OPTARG};;
esac
done
# Variables
MACHINE_PERF_DIR=/home/hcsuser/perf/
DOCKER_PERF_DIR=/usr/share/perf/
PERF_SCRIPT=$DOCKER_PERF_DIR'perf.sh'
VOLUME=$MACHINE_PERF_DIR:$DOCKER_PERF_DIR
VOLUME=$PERF_DIR:$DOCKER_PERF_DIR
ONNX_ZOO_VOLUME=' -v /home/hcsuser/perf/models:/usr/share/perf/models'
MANY_MODELS_VOLUME=' -v /home/hcsuser/mount/many-models:/usr/share/mount/many-models'
PARTNER_VOLUME=' -v /home/hcsuser/perf/partner:/usr/share/perf/partner'
# Add Remaining Variables
if [ $OPTION == "onnx-zoo-models" ]
then
MODEL_PATH=model_list.json
MODEL_PATH='model_list.json'
VOLUME=$VOLUME$ONNX_ZOO_VOLUME
fi
if [ $OPTION == "many-models" ]
then
MODEL_PATH=/usr/share/mount/many-models
VOLUME=$VOLUME' -v /home/hcsuser/mount/many-models:/usr/share/mount/many-models'
VOLUME=$VOLUME$MANY_MODELS_VOLUME
fi
if [ $OPTION == "partner-models" ]
then
MODEL_PATH=partner_model_list.json
MODEL_PATH='partner/partner_model_list.json'
VOLUME=$VOLUME$PARTNER_VOLUME
fi
if [ $OPTION == "selected-models" ]
then
VOLUME=$VOLUME$ONNX_ZOO_VOLUME$MANY_MODELS_VOLUME$PARTNER_VOLUME' -v /home/hcsuser/perf/subset_jsons/:/usr/share/perf/subset_jsons'
fi
sudo docker run --gpus all -v $VOLUME $DOCKER_IMAGE /bin/bash $PERF_SCRIPT -d $DOCKER_PERF_DIR -o $OPTION -m $MODEL_PATH

View file

@ -55,7 +55,7 @@ def get_model_info(link):
def write_json(models):
model_json = json.dumps(models, indent=4)
with open('model_list.json', 'w') as fp:
fp.write(models_json)
fp.write(model_json)
def main():
links = []

View file

@ -4,26 +4,26 @@ jobs:
variables:
ALLOW_RELEASED_ONNX_OPSET_ONLY: '1'
branch: 'master'
timeoutInMinutes: 4000
timeoutInMinutes: 3000
steps:
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.sh -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf -b master -i ort-$(branch)'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.sh -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf -b $(branch) -i ort-$(branch)'
displayName: 'Build latest ORT Images'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "onnx-zoo-models"'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "onnx-zoo-models" -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf'
displayName: 'Onnx Zoo Models Perf'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "many-models"'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "many-models" -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf'
displayName: 'Many Models Perf'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "partner-models"'
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "partner-models" -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf'
displayName: 'Partner Models Perf'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
- script: 'mkdir $(Build.SourcesDirectory)/Artifact && cp -r /home/hcsuser/perf/result/ $(Build.SourcesDirectory)/Artifact'
- script: 'mkdir $(Build.SourcesDirectory)/Artifact && cp -r $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/result/ $(Build.SourcesDirectory)/Artifact'
displayName: 'Prepare Artifacts'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
@ -31,5 +31,15 @@ jobs:
inputs:
pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
artifactName: 'result'
- template: templates/clean-agent-build-directory-step.yml
- script: 'python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/result -c $(Build.SourceVersion) -u "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" '
displayName: 'Post to Dashboard'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
env:
DASHBOARD_MYSQL_ORT_PASSWORD: $(dashboard-mysql-ort-password)
- script: sudo rm -rf $(Agent.BuildDirectory)
displayName: Clean build files (POSIX)
condition: not(eq(variables['Agent.OS'], 'Windows_NT')) # and always()
continueOnError: true # continuing on error for this step, since linux build folder is somehow getting permission issue