[EP Perf] Fix on EP Perf (#20683)

### Description
<!-- Describe your changes. -->
* Partially revert the [previous
change](https://github.com/microsoft/onnxruntime/pull/19804), and
   * Redo the concurrency_test_result parser outside of post.py
* Add support for syncing memtest results to the db


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
Fixes an error that occurs when CI runs on two model groups.
- When running on two model groups, the [previous
change](https://github.com/microsoft/onnxruntime/pull/19804) wrongly
navigates two levels up in the directory tree after processing one model group,
when only one level is needed. As a result, the script can't find the other
model group.
- The issue can't be reproduced when running on only one model group.
This commit is contained in:
Yifan Li 2024-05-15 21:38:52 -07:00 committed by GitHub
parent f5bfbd6d81
commit 47a178b518
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 173 additions and 59 deletions

View file

@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import csv
import datetime
import os
import re
import pandas as pd
from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.ingest import QueuedIngestClient
from post import get_identifier, parse_arguments, write_table
def parse_valgrind_log(input_path, output_path, keywords):
    """Extract keyword-related "definitely lost" leak records from a valgrind log into a CSV.

    A record starts at a line containing "blocks are definitely lost in loss" and
    ends at the next empty line (after the "==pid==" prefix is removed) or at the
    end of the file. A record is kept only if at least one of its lines contains
    one of ``keywords``.

    Args:
        input_path: Path of the valgrind log to read.
        output_path: Path of the CSV file to write (overwritten if it exists).
        keywords: Iterable of substrings; a record is kept when any of its lines
            contains one of them. The stored keyword is the first match on the
            last matching line of the record.

    The CSV has the columns Keyword, LeakBlock, LeakBytes, ValgrindMessage and is
    always written, even when no record matched (header only).
    """
    is_definitely_lost = False
    is_ort_trt_related = False
    buffer = []
    leak_block = None
    leak_bytes = None
    keyword = None
    results = []

    with open(input_path) as file:
        for raw_line in file:
            # Remove "==xxxxx==" pattern from the line.
            # NOTE: only the text after the last "==" is kept, so a message that
            # itself contains "==" loses everything before it.
            line = raw_line.strip().split("==")[-1].strip()

            if "blocks are definitely lost in loss" in line:
                is_definitely_lost = True
                # Extract LeakBlock and LeakBytes
                match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
                if match:
                    leak_bytes = match.group(1).replace(",", "")
                    leak_block = match.group(2).replace(",", "")
                # The header line itself is not part of the stored message.
                continue

            if not is_definitely_lost:
                continue

            if line:
                buffer.append(line)
                for word in keywords:
                    if word in line:
                        is_ort_trt_related = True
                        keyword = word
                        break
                continue

            # End of section: an empty line closes the current record.
            if is_ort_trt_related:
                results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))
            # Reset var
            is_definitely_lost = False
            is_ort_trt_related = False
            buffer = []
            leak_block = None
            leak_bytes = None
            keyword = None

    # A log that ends without a trailing empty line leaves the last record open;
    # flush it so it is not silently dropped.
    if is_definitely_lost and is_ort_trt_related:
        results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))

    # Writing results to CSV
    with open(output_path, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"])
        csvwriter.writerows(results)
def parse_concurrency_test_log(input_path, output_path):
    """Summarize a concurrency test log as a one-row CSV with columns Passed and Log.

    Args:
        input_path: Path of the concurrency test log to read.
        output_path: Path of the CSV file to write (overwritten if it exists).

    Passed is 1 when the text following the first "Failed Test Cases:" marker
    (up to the next marker, if any) is blank, and 0 otherwise. A log with no
    marker at all is recorded as 0 instead of raising IndexError, since no
    failure summary was found to confirm a pass. Log holds the full log text.
    """
    with open(input_path) as log_file:
        log_content = log_file.read()

    sections = log_content.split("Failed Test Cases:")
    if len(sections) < 2:
        # Marker missing: nothing confirms the run passed, so report a failure.
        passed = 0
    else:
        # passed = 1 if no failed test cases
        passed = 1 if sections[1].strip() == "" else 0

    with open(output_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["Passed", "Log"])
        csv_writer.writerow([passed, log_content])
if __name__ == "__main__":
    # Script entry point: parse the memory/concurrency test logs found in the
    # report folder into CSV files, then upload each CSV that exists to the db.
    args = parse_arguments()

    # connect to database (authenticates through the Azure CLI)
    ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn))
    identifier = get_identifier(
        args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser
    )
    upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)

    valgrind_log = "valgrind.log"
    # (log file, generated csv, destination db table)
    jobs = (
        (valgrind_log, "mem_test.csv", "ep_valgrind_record"),
        ("concurrency_test.log", "concurrency_test.csv", "ep_concurrencytest_record"),
    )

    try:
        # All file names above are relative to the report folder.
        os.chdir(args.report_folder)

        # Parse mem_test log
        for log_name, csv_name, _ in jobs:
            if not os.path.exists(log_name):
                continue
            print(f"{identifier}: Parsing {log_name}")
            if log_name == valgrind_log:
                parse_valgrind_log(log_name, csv_name, ["TensorrtExecutionProvider", "TensorRT"])
            else:
                parse_concurrency_test_log(log_name, csv_name)

        # Upload to db
        for _, csv_name, table_name in jobs:
            if not os.path.exists(csv_name):
                continue
            table = pd.read_csv(csv_name)
            write_table(
                ingest_client,
                args.database,
                table,
                table_name,
                upload_time,
                identifier,
                args.branch,
                args.commit_hash,
                args.commit_datetime,
            )
            print(f"{identifier}: {csv_name} is synced to db")
    except Exception as e:
        # Errors are reported but do not change the exit status.
        print(str(e))

View file

@ -3,7 +3,6 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import argparse
import csv
import datetime
import os
import sys
@ -421,11 +420,10 @@ def main():
upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
try:
# Load EP Perf test results from /result
result_file = args.report_folder
result_perf_test_path = os.path.join(result_file, "result")
folders = os.listdir(result_perf_test_path)
os.chdir(result_perf_test_path)
folders = os.listdir(result_file)
os.chdir(result_file)
tables = [
fail_name,
@ -448,13 +446,13 @@ def main():
for model_group in folders:
os.chdir(model_group)
csv_filenames = os.listdir()
for csv_file in csv_filenames:
table = pd.read_csv(csv_file)
if session_name in csv_file:
for csv in csv_filenames:
table = pd.read_csv(csv)
if session_name in csv:
table_results[session_name] = pd.concat(
[table_results[session_name], get_session(table, model_group)], ignore_index=True
)
elif specs_name in csv_file:
elif specs_name in csv:
table_results[specs_name] = pd.concat(
[
table_results[specs_name],
@ -462,12 +460,12 @@ def main():
],
ignore_index=True,
)
elif fail_name in csv_file:
elif fail_name in csv:
table_results[fail_name] = pd.concat(
[table_results[fail_name], get_failures(table, model_group)],
ignore_index=True,
)
elif latency_name in csv_file:
elif latency_name in csv:
table_results[memory_name] = pd.concat(
[table_results[memory_name], get_memory(table, model_group)],
ignore_index=True,
@ -477,11 +475,11 @@ def main():
[table_results[latency_name], get_latency(table, model_group)],
ignore_index=True,
)
elif status_name in csv_file:
elif status_name in csv:
table_results[status_name] = pd.concat(
[table_results[status_name], get_status(table, model_group)], ignore_index=True
)
elif op_metrics_name in csv_file:
elif op_metrics_name in csv:
table = table.assign(Group=model_group)
table_results[op_metrics_name] = pd.concat(
[table_results[op_metrics_name], table], ignore_index=True
@ -515,43 +513,6 @@ def main():
args.commit_datetime,
)
# Load concurrency test results
result_mem_test_path = os.path.join(result_file, "result_mem_test")
os.chdir(result_mem_test_path)
log_path = "concurrency_test.log"
if os.path.exists(log_path):
print("Generating concurrency test report")
with open(log_path) as log_file:
log_content = log_file.read()
failed_cases_section = log_content.split("Failed Test Cases:")[1]
# passed = 1 if no failed test cases
if failed_cases_section.strip() == "":
passed = 1
else:
passed = 0
csv_path = "concurrency_test.csv"
with open(csv_path, "w", newline="") as csv_file:
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Passed", "Log"])
csv_writer.writerow([passed, log_content])
db_table_name = "ep_concurrencytest_record"
table = pd.read_csv(csv_path)
write_table(
ingest_client,
args.database,
table,
db_table_name,
upload_time,
identifier,
args.branch,
args.commit_hash,
args.commit_datetime,
)
except BaseException as e:
print(str(e))
sys.exit(1)

View file

@ -1,7 +1,7 @@
parameters:
- name: PostToDashboard
displayName: Post to Dashboard
displayName: Post EP Perf results to Dashboard
type: boolean
default: true
@ -30,7 +30,7 @@ parameters:
- "partner-models"
- name: MemTest
displayName: Run Memory Test and Concurrency Test
displayName: Run Memory and Concurrency Test
type: boolean
default: true
@ -147,11 +147,27 @@ jobs:
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
condition: always()
- task: PublishBuildArtifacts@1
inputs:
pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
artifactName: 'result-$(Build.BuildNumber)'
- script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
displayName: 'Install dashboard dependencies'
- script: |
az --version || {
echo "Azure CLI not found, installing..."
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
}
displayName: 'Check and Install Azure CLI'
- task: AzureCLI@2
displayName: 'Parse Memory & Concurrency Test Records and Sync'
inputs:
azureSubscription: AIInfraBuildOnnxRuntimeOSS
scriptLocation: inlineScript
scriptType: bash
inlineScript: |
short_hash=$(git rev-parse --short HEAD) &&
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
- ${{ if eq(parameters.PostToDashboard, true) }}:
- script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
@ -165,7 +181,7 @@ jobs:
displayName: 'Check and Install Azure CLI'
- task: AzureCLI@2
displayName: 'Post EP Perf Results to Dashboard'
displayName: 'Azure CLI Post to Dashboard'
inputs:
azureSubscription: AIInfraBuildOnnxRuntimeOSS
scriptLocation: inlineScript
@ -173,7 +189,12 @@ jobs:
inlineScript: |
short_hash=$(git rev-parse --short HEAD) &&
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
- task: PublishBuildArtifacts@1
inputs:
pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
artifactName: 'result-$(Build.BuildNumber)'
- template: templates/component-governance-component-detection-steps.yml
parameters :