mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
[EP Perf] Fix on EP Perf (#20683)
### Description <!-- Describe your changes. --> * Partially revert the [previous change](https://github.com/microsoft/onnxruntime/pull/19804), * Move the concurrency_test_result parser out of post.py into a separate script, and * Add support for syncing memtest results to the database. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> This fixes the error that occurs when CI runs on two model groups. - When running on two model groups, the [previous change](https://github.com/microsoft/onnxruntime/pull/19804) wrongly navigates two levels up in the directory tree after processing one model group, whereas only one level is needed. As a result, the script cannot find the other model group. - The issue does not reproduce when running on a single model group.
This commit is contained in:
parent
f5bfbd6d81
commit
47a178b518
3 changed files with 173 additions and 59 deletions
|
|
@ -0,0 +1,132 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import csv
|
||||
import datetime
|
||||
import os
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
from azure.kusto.data import KustoConnectionStringBuilder
|
||||
from azure.kusto.ingest import QueuedIngestClient
|
||||
from post import get_identifier, parse_arguments, write_table
|
||||
|
||||
|
||||
def parse_valgrind_log(input_path, output_path, keywords):
    """Extract keyword-related "definitely lost" leak records from a valgrind log.

    The log is scanned line by line. A leak record starts at a line containing
    "blocks are definitely lost in loss" and ends at the next empty line (or at
    end of file). A record is kept only if at least one of its stack-frame lines
    contains one of ``keywords``.

    Args:
        input_path: Path of the valgrind log to read.
        output_path: Path of the CSV file to write. It always receives the header
            row ``Keyword, LeakBlock, LeakBytes, ValgrindMessage`` followed by one
            row per kept record.
        keywords: Iterable of substrings to look for in the stack-frame lines.
            For each line the first matching keyword wins; the keyword reported
            for a record is the one matched by the last matching line.
    """
    # Only a leading "==<pid>==" marker is removed. Splitting the line on "=="
    # would also cut frames such as "Foo::operator==(...)" in half and could
    # discard the very text that contains the keyword.
    pid_prefix = re.compile(r"^==\d+==")
    leak_summary = re.compile(r"([\d,]+) bytes? in ([\d,]+) blocks?")

    results = []
    in_leak_record = False
    keyword = None
    leak_block = None
    leak_bytes = None
    buffer = []

    with open(input_path) as file:
        for raw_line in file:
            line = pid_prefix.sub("", raw_line.strip(), count=1).strip()

            if "blocks are definitely lost in loss" in line:
                in_leak_record = True
                # Extract LeakBlock and LeakBytes, dropping thousands separators
                match = leak_summary.search(line)
                if match:
                    leak_bytes = match.group(1).replace(",", "")
                    leak_block = match.group(2).replace(",", "")
                continue

            if not in_leak_record:
                continue

            if line:
                buffer.append(line)
                for word in keywords:
                    if word in line:
                        keyword = word
                        break
                continue

            # An empty line terminates the current record
            if keyword is not None:
                results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))
            in_leak_record = False
            keyword = None
            leak_block = None
            leak_bytes = None
            buffer = []

    # The log may end without a blank line after the last record; keep that
    # record instead of silently dropping it.
    if in_leak_record and keyword is not None:
        results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))

    # Writing results to CSV
    with open(output_path, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"])
        csvwriter.writerows(results)
|
||||
|
||||
|
||||
def parse_concurrency_test_log(input_path, output_path):
    """Summarise a concurrency test log as a one-row CSV.

    The run counts as passed only when the log contains the marker
    "Failed Test Cases:" and nothing but whitespace follows it (up to the next
    occurrence of the marker, if any). A log without the marker is recorded as
    failed rather than raising, so the log text still ends up in the CSV.

    Args:
        input_path: Path of the concurrency test log to read.
        output_path: Path of the CSV file to write. It receives the header row
            ``Passed, Log`` and one data row holding the 0/1 flag and the full
            log content.
    """
    with open(input_path) as log_file:
        log_content = log_file.read()

    sections = log_content.split("Failed Test Cases:")

    # passed = 1 only if the summary exists and lists no failed test cases;
    # a missing summary means the run never reported its result.
    if len(sections) > 1 and sections[1].strip() == "":
        passed = 1
    else:
        passed = 0

    with open(output_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["Passed", "Log"])
        csv_writer.writerow([passed, log_content])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse whichever memory / concurrency test logs exist in
    # the report folder into CSV files, then upload every CSV that exists.
    args = parse_arguments()

    # connect to database
    ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn))
    identifier = get_identifier(
        args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser
    )
    upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)

    # One entry per test kind: (log file, generated CSV, destination table).
    # The first entry is the valgrind memory test, the second the concurrency test.
    jobs = (
        ("valgrind.log", "mem_test.csv", "ep_valgrind_record"),
        ("concurrency_test.log", "concurrency_test.csv", "ep_concurrencytest_record"),
    )

    try:
        # All file names above are resolved relative to the report folder
        os.chdir(args.report_folder)

        # Parse mem_test log
        for position, (log_name, csv_name, _) in enumerate(jobs):
            if not os.path.exists(log_name):
                continue
            print(f"{identifier}: Parsing {log_name}")
            if position == 0:
                parse_valgrind_log(log_name, csv_name, ["TensorrtExecutionProvider", "TensorRT"])
            else:
                parse_concurrency_test_log(log_name, csv_name)

        # Upload to db (runs only after every available log has been parsed)
        for _, csv_name, table_name in jobs:
            if not os.path.exists(csv_name):
                continue
            table = pd.read_csv(csv_name)
            write_table(
                ingest_client,
                args.database,
                table,
                table_name,
                upload_time,
                identifier,
                args.branch,
                args.commit_hash,
                args.commit_datetime,
            )
            print(f"{identifier}: {csv_name} is synced to db")

    except Exception as e:
        # Failures are reported on stdout only; the script does not exit non-zero
        print(str(e))
|
||||
|
|
@ -3,7 +3,6 @@
|
|||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import argparse
|
||||
import csv
|
||||
import datetime
|
||||
import os
|
||||
import sys
|
||||
|
|
@ -421,11 +420,10 @@ def main():
|
|||
upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
|
||||
|
||||
try:
|
||||
# Load EP Perf test results from /result
|
||||
result_file = args.report_folder
|
||||
result_perf_test_path = os.path.join(result_file, "result")
|
||||
folders = os.listdir(result_perf_test_path)
|
||||
os.chdir(result_perf_test_path)
|
||||
|
||||
folders = os.listdir(result_file)
|
||||
os.chdir(result_file)
|
||||
|
||||
tables = [
|
||||
fail_name,
|
||||
|
|
@ -448,13 +446,13 @@ def main():
|
|||
for model_group in folders:
|
||||
os.chdir(model_group)
|
||||
csv_filenames = os.listdir()
|
||||
for csv_file in csv_filenames:
|
||||
table = pd.read_csv(csv_file)
|
||||
if session_name in csv_file:
|
||||
for csv in csv_filenames:
|
||||
table = pd.read_csv(csv)
|
||||
if session_name in csv:
|
||||
table_results[session_name] = pd.concat(
|
||||
[table_results[session_name], get_session(table, model_group)], ignore_index=True
|
||||
)
|
||||
elif specs_name in csv_file:
|
||||
elif specs_name in csv:
|
||||
table_results[specs_name] = pd.concat(
|
||||
[
|
||||
table_results[specs_name],
|
||||
|
|
@ -462,12 +460,12 @@ def main():
|
|||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
elif fail_name in csv_file:
|
||||
elif fail_name in csv:
|
||||
table_results[fail_name] = pd.concat(
|
||||
[table_results[fail_name], get_failures(table, model_group)],
|
||||
ignore_index=True,
|
||||
)
|
||||
elif latency_name in csv_file:
|
||||
elif latency_name in csv:
|
||||
table_results[memory_name] = pd.concat(
|
||||
[table_results[memory_name], get_memory(table, model_group)],
|
||||
ignore_index=True,
|
||||
|
|
@ -477,11 +475,11 @@ def main():
|
|||
[table_results[latency_name], get_latency(table, model_group)],
|
||||
ignore_index=True,
|
||||
)
|
||||
elif status_name in csv_file:
|
||||
elif status_name in csv:
|
||||
table_results[status_name] = pd.concat(
|
||||
[table_results[status_name], get_status(table, model_group)], ignore_index=True
|
||||
)
|
||||
elif op_metrics_name in csv_file:
|
||||
elif op_metrics_name in csv:
|
||||
table = table.assign(Group=model_group)
|
||||
table_results[op_metrics_name] = pd.concat(
|
||||
[table_results[op_metrics_name], table], ignore_index=True
|
||||
|
|
@ -515,43 +513,6 @@ def main():
|
|||
args.commit_datetime,
|
||||
)
|
||||
|
||||
# Load concurrency test results
|
||||
result_mem_test_path = os.path.join(result_file, "result_mem_test")
|
||||
os.chdir(result_mem_test_path)
|
||||
log_path = "concurrency_test.log"
|
||||
if os.path.exists(log_path):
|
||||
print("Generating concurrency test report")
|
||||
with open(log_path) as log_file:
|
||||
log_content = log_file.read()
|
||||
|
||||
failed_cases_section = log_content.split("Failed Test Cases:")[1]
|
||||
|
||||
# passed = 1 if no failed test cases
|
||||
if failed_cases_section.strip() == "":
|
||||
passed = 1
|
||||
else:
|
||||
passed = 0
|
||||
|
||||
csv_path = "concurrency_test.csv"
|
||||
with open(csv_path, "w", newline="") as csv_file:
|
||||
csv_writer = csv.writer(csv_file)
|
||||
csv_writer.writerow(["Passed", "Log"])
|
||||
csv_writer.writerow([passed, log_content])
|
||||
|
||||
db_table_name = "ep_concurrencytest_record"
|
||||
table = pd.read_csv(csv_path)
|
||||
write_table(
|
||||
ingest_client,
|
||||
args.database,
|
||||
table,
|
||||
db_table_name,
|
||||
upload_time,
|
||||
identifier,
|
||||
args.branch,
|
||||
args.commit_hash,
|
||||
args.commit_datetime,
|
||||
)
|
||||
|
||||
except BaseException as e:
|
||||
print(str(e))
|
||||
sys.exit(1)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
parameters:
|
||||
|
||||
- name: PostToDashboard
|
||||
displayName: Post to Dashboard
|
||||
displayName: Post EP Perf results to Dashboard
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
|
|
@ -30,7 +30,7 @@ parameters:
|
|||
- "partner-models"
|
||||
|
||||
- name: MemTest
|
||||
displayName: Run Memory Test and Concurrency Test
|
||||
displayName: Run Memory and Concurrency Test
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
|
|
@ -147,11 +147,27 @@ jobs:
|
|||
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
|
||||
condition: always()
|
||||
|
||||
- task: PublishBuildArtifacts@1
|
||||
inputs:
|
||||
pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
|
||||
artifactName: 'result-$(Build.BuildNumber)'
|
||||
|
||||
- script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
|
||||
displayName: 'Install dashboard dependencies'
|
||||
|
||||
- script: |
|
||||
az --version || {
|
||||
echo "Azure CLI not found, installing..."
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
}
|
||||
displayName: 'Check and Install Azure CLI'
|
||||
|
||||
- task: AzureCLI@2
|
||||
displayName: 'Parse Memory & Concurrency Test Records and Sync'
|
||||
inputs:
|
||||
azureSubscription: AIInfraBuildOnnxRuntimeOSS
|
||||
scriptLocation: inlineScript
|
||||
scriptType: bash
|
||||
inlineScript: |
|
||||
short_hash=$(git rev-parse --short HEAD) &&
|
||||
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
|
||||
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
|
||||
|
||||
- ${{ if eq(parameters.PostToDashboard, true) }}:
|
||||
|
||||
- script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
|
||||
|
|
@ -165,7 +181,7 @@ jobs:
|
|||
displayName: 'Check and Install Azure CLI'
|
||||
|
||||
- task: AzureCLI@2
|
||||
displayName: 'Post EP Perf Results to Dashboard'
|
||||
displayName: 'Azure CLI Post to Dashboard'
|
||||
inputs:
|
||||
azureSubscription: AIInfraBuildOnnxRuntimeOSS
|
||||
scriptLocation: inlineScript
|
||||
|
|
@ -173,7 +189,12 @@ jobs:
|
|||
inlineScript: |
|
||||
short_hash=$(git rev-parse --short HEAD) &&
|
||||
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
|
||||
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
|
||||
python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
|
||||
|
||||
- task: PublishBuildArtifacts@1
|
||||
inputs:
|
||||
pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
|
||||
artifactName: 'result-$(Build.BuildNumber)'
|
||||
|
||||
- template: templates/component-governance-component-detection-steps.yml
|
||||
parameters :
|
||||
|
|
|
|||
Loading…
Reference in a new issue