Add Anubis metrics schema for local benchmark results uploading (#19018)

### Description
1. Add metrics.py to define the metrics schema used by Anubis
2. Add two examples (llama2 and whisper) of how to save local benchmark
results following the Anubis metrics schema


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

---------

Co-authored-by: Kyle Zhang <Xi.Zhang@microsoft.com>
Co-authored-by: ironman <bitzhangxi@outlook.com>
This commit is contained in:
gunandrose4u 2024-01-12 14:24:01 +08:00 committed by GitHub
parent 46dd0d3f52
commit e2c145d37f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 300 additions and 6 deletions

View file

@ -0,0 +1,164 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import datetime
import json
from typing import Optional
import pandas as pd
class BaseObject:
    """Base class for schema sections that can be serialized to a plain dict.

    Instance attributes are the schema fields. Extra, free-form fields can be
    attached through the ``customized`` dictionary; they are merged into the
    top level of the serialized output.
    """

    def __init__(self):
        # Free-form extra fields merged into the output of ``to_dict``.
        self.customized = {}

    def to_dict(self):
        """Return the fields of this object as a dictionary.

        The result contains every instance attribute except ``customized``
        itself, overlaid with the entries of ``customized`` (which win on a
        name clash). Values that are ``BaseObject`` instances are serialized
        recursively.

        Note that every falsy value is left out of the result: ``None``,
        ``0``, ``0.0``, ``False``, empty strings and empty containers
        (including nested objects that serialize to an empty dict).
        """
        merged = {name: value for name, value in self.__dict__.items() if name != "customized"}
        merged.update(self.customized)

        serialized = {}
        for name, value in merged.items():
            if isinstance(value, BaseObject):
                value = value.to_dict()
            # Falsy values are treated as "not set" and omitted.
            if value:
                serialized[name] = value
        return serialized
class ModelInfo(BaseObject):
    """Schema section describing the benchmarked model.

    Args:
        full_name: Full name of the model.
        is_huggingface: Flag recorded as ``is_huggingface``.
        is_text_generation: Flag recorded as ``is_text_generation``.
        short_name: Short name of the model.
    """

    def __init__(
        self,
        full_name: Optional[str] = None,
        is_huggingface: Optional[bool] = False,
        is_text_generation: Optional[bool] = False,
        short_name: Optional[str] = None,
    ):
        super().__init__()
        # NOTE: the assignment order below is the key order of ``to_dict``.
        self.full_name: Optional[str] = full_name
        self.is_huggingface: Optional[bool] = is_huggingface
        self.is_text_generation: Optional[bool] = is_text_generation
        self.short_name: Optional[str] = short_name
        # Not a constructor argument; starts empty and is filled by the caller.
        self.input_shape: list = []
class BackendOptions(BaseObject):
    """Schema section holding backend-specific run options.

    Args:
        enable_profiling: Flag recorded as ``enable_profiling``.
        execution_provider: Name recorded as ``execution_provider``.
        use_io_binding: Flag recorded as ``use_io_binding``.
    """

    def __init__(
        self,
        enable_profiling: Optional[bool] = False,
        execution_provider: Optional[str] = None,
        use_io_binding: Optional[bool] = False,
    ):
        super().__init__()
        # NOTE: the assignment order below is the key order of ``to_dict``.
        self.enable_profiling: Optional[bool] = enable_profiling
        self.execution_provider: Optional[str] = execution_provider
        self.use_io_binding: Optional[bool] = use_io_binding
class Config(BaseObject):
    """Schema section describing how a benchmark was configured.

    Besides the scalar settings passed to the constructor, every instance
    owns a fresh ``ModelInfo`` (``model_info``) and ``BackendOptions``
    (``backend_options``) sub-section, both created with their defaults.

    Args:
        backend: Backend name; defaults to ``"onnxruntime"``.
        batch_size: Batch size; defaults to 1.
        seq_length: Sequence length; defaults to 0.
        precision: Precision label; defaults to ``"fp32"``.
        warmup_runs: Number of warmup runs; defaults to 1.
        measured_runs: Number of measured runs; defaults to 10.
    """

    def __init__(
        self,
        backend: Optional[str] = "onnxruntime",
        batch_size: Optional[int] = 1,
        seq_length: Optional[int] = 0,
        precision: Optional[str] = "fp32",
        warmup_runs: Optional[int] = 1,
        measured_runs: Optional[int] = 10,
    ):
        super().__init__()
        # NOTE: the assignment order below is the key order of ``to_dict``.
        self.backend: Optional[str] = backend
        self.batch_size: Optional[int] = batch_size
        self.seq_length: Optional[int] = seq_length
        self.precision: Optional[str] = precision
        self.warmup_runs: Optional[int] = warmup_runs
        self.measured_runs: Optional[int] = measured_runs
        # Nested sections; serialized recursively by ``to_dict``.
        self.model_info: ModelInfo = ModelInfo()
        self.backend_options: BackendOptions = BackendOptions()
class Metadata(BaseObject):
    """Schema section describing the environment a benchmark ran in.

    Args:
        device: Device identifier.
        package_name: Name of the package under test.
        package_version: Version of the package under test.
        platform: Platform identifier.
        python_version: Python version string.
    """

    def __init__(
        self,
        device: Optional[str] = None,
        package_name: Optional[str] = None,
        package_version: Optional[str] = None,
        platform: Optional[str] = None,
        python_version: Optional[str] = None,
    ):
        super().__init__()
        # NOTE: the assignment order below is the key order of ``to_dict``.
        self.device: Optional[str] = device
        self.package_name: Optional[str] = package_name
        self.package_version: Optional[str] = package_version
        self.platform: Optional[str] = platform
        self.python_version: Optional[str] = python_version
class Metrics(BaseObject):
    """Schema section holding the measured results of a benchmark.

    Additional, benchmark-specific measurements can be stored in the
    inherited ``customized`` dictionary.

    Args:
        latency_ms_mean: Value recorded as ``latency_ms_mean``; defaults to 0.0.
        throughput_qps: Value recorded as ``throughput_qps``; defaults to 0.0.
        max_memory_usage_GB: Value recorded as ``max_memory_usage_GB``;
            defaults to 0.0.
    """

    def __init__(
        self,
        latency_ms_mean: Optional[float] = 0.0,
        throughput_qps: Optional[float] = 0.0,
        max_memory_usage_GB: Optional[float] = 0.0,
    ):
        super().__init__()
        # NOTE: the assignment order below is the key order of ``to_dict``.
        self.latency_ms_mean: Optional[float] = latency_ms_mean
        self.throughput_qps: Optional[float] = throughput_qps
        self.max_memory_usage_GB: Optional[float] = max_memory_usage_GB
class BenchmarkRecord:
    """One benchmark result, made of config, metadata, metrics and a date.

    The constructor fills in the fields it is given; everything else
    (for example ``config.seq_length`` or the ``metrics`` values) keeps its
    default and can be set on the sub-objects afterwards.

    Args:
        model_name: Stored as ``config.model_info.full_name``.
        precision: Stored as ``config.precision``.
        backend: Stored as ``config.backend``.
        device: Stored as ``metadata.device``.
        package_name: Stored as ``metadata.package_name``.
        package_version: Stored as ``metadata.package_version``.
        batch_size: Stored as ``config.batch_size``.
        warmup_runs: Stored as ``config.warmup_runs``.
        measured_runs: Stored as ``config.measured_runs``.
        trigger_date: Timestamp string for the record. When omitted or empty,
            the current local time formatted as ``%Y-%m-%d %H:%M:%S`` is used.
    """

    def __init__(
        self,
        model_name: str,
        precision: str,
        backend: str,
        device: str,
        package_name: str,
        package_version: str,
        batch_size: Optional[int] = 1,
        warmup_runs: Optional[int] = 1,
        measured_runs: Optional[int] = 10,
        trigger_date: Optional[str] = None,
    ):
        if not trigger_date:
            trigger_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.trigger_date = trigger_date

        # Benchmark configuration.
        self.config = Config()
        self.config.model_info.full_name = model_name
        self.config.precision = precision
        self.config.backend = backend
        self.config.batch_size = batch_size
        self.config.warmup_runs = warmup_runs
        self.config.measured_runs = measured_runs

        # Measurements start at their defaults and are filled in by the caller.
        self.metrics = Metrics()

        # Environment description.
        self.metadata = Metadata()
        self.metadata.device = device
        self.metadata.package_name = package_name
        self.metadata.package_version = package_version

    def to_dict(self) -> dict:
        """Return the record as a nested dictionary.

        The keys are ``config``, ``metadata``, ``metrics`` and
        ``trigger_date``; the first three hold the serialized sub-objects.
        """
        sections = {
            "config": self.config,
            "metadata": self.metadata,
            "metrics": self.metrics,
        }
        result = {name: section.to_dict() for name, section in sections.items()}
        result["trigger_date"] = self.trigger_date
        return result

    def to_json(self) -> str:
        """Return the record as a JSON string.

        Values that are not JSON-serializable are converted with ``str``.
        """
        return json.dumps(self.to_dict(), default=str)

    @staticmethod
    def _records_to_dicts(records: list) -> list:
        """Serialize ``records``; return an empty list for ``None`` or no records."""
        if records is None or len(records) == 0:
            return []
        return [item.to_dict() for item in records]

    @classmethod
    def save_as_csv(cls, file_name: str, records: list) -> None:
        """Write ``records`` to ``file_name`` as CSV.

        Each record is serialized with ``to_dict`` and flattened into columns
        by ``pandas.json_normalize``. Nothing is written when ``records`` is
        ``None`` or empty.
        """
        rows = cls._records_to_dicts(records)
        if rows:
            pd.json_normalize(rows).to_csv(file_name, index=False)

    @classmethod
    def save_as_json(cls, file_name: str, records: list) -> None:
        """Write ``records`` to ``file_name`` as an indented JSON array.

        Values that are not JSON-serializable are converted with ``str``.
        Nothing is written when ``records`` is ``None`` or empty.
        """
        rows = cls._records_to_dicts(records)
        if not rows:
            return
        with open(file_name, "w") as out_file:
            json.dump(rows, out_file, indent=4, default=str)

View file

@ -7,6 +7,7 @@ import subprocess
import torch
from benchmark_helper import setup_logger
from metrics import BenchmarkRecord
logger = logging.getLogger(__name__)
@ -121,11 +122,19 @@ def get_args():
help="Number of mins to attempt the benchmark before moving on",
)
parser.add_argument(
"--log-folder",
type=str,
default=None,
help="Path to folder to save logs and results",
)
args = parser.parse_args()
setattr(args, "model_size", args.model_name.split("/")[-1].replace(".", "-")) # noqa: B010
log_folder_name = f"./{args.model_size}_{args.precision}"
setattr(args, "log_folder", log_folder_name) # noqa: B010
if not args.log_folder:
args.log_folder = log_folder_name
os.makedirs(args.log_folder, exist_ok=True)
# Convert timeout value to secs
@ -197,6 +206,9 @@ def save_results(results, filename):
df = pd.DataFrame(
results,
columns=[
"Warmup Runs",
"Measured Runs",
"Model Name",
"Engine",
"Precision",
"Device",
@ -211,6 +223,8 @@ def save_results(results, filename):
)
# Set column types
df["Warmup Runs"] = df["Warmup Runs"].astype("int")
df["Measured Runs"] = df["Measured Runs"].astype("int")
df["Batch Size"] = df["Batch Size"].astype("int")
df["Sequence Length"] = df["Sequence Length"].astype("int")
df["Latency (s)"] = df["Latency (s)"].astype("float")
@ -218,7 +232,52 @@ def save_results(results, filename):
df["Throughput (tps)"] = df["Throughput (tps)"].astype("float")
df["Memory (GB)"] = df["Memory (GB)"].astype("float")
df.to_csv(filename, index=False)
# get package name and version
import pkg_resources
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(
[
f"{i.key}=={i.version}"
for i in installed_packages
if i.key in ["ort-nightly-gpu", "ort-nightly", "onnxruntime", "onnxruntime-gpu"]
]
)
ort_pkg_name = ""
ort_pkg_version = ""
if installed_packages_list:
ort_pkg_name = installed_packages_list[0].split("==")[0]
ort_pkg_version = installed_packages_list[0].split("==")[1]
# Save results to csv with standard format
records = []
for _, row in df.iterrows():
if row["Engine"] == "optimum-ort":
record = BenchmarkRecord(
row["Model Name"], row["Precision"], "onnxruntime", row["Device"], ort_pkg_name, ort_pkg_version
)
elif row["Engine"] in ["pytorch-eager", "pytorch-compile"]:
record = BenchmarkRecord(
row["Model Name"], row["Precision"], "pytorch", row["Device"], torch.__name__, torch.__version__
)
else:
record = BenchmarkRecord(row["Model Name"], row["Precision"], row["Engine"], row["Device"], "", "")
record.config.warmup_runs = row["Warmup Runs"]
record.config.measured_runs = row["Measured Runs"]
record.config.batch_size = row["Batch Size"]
record.config.seq_length = row["Sequence Length"]
record.config.customized["measure_step"] = row["Step"]
record.config.customized["engine"] = row["Engine"]
record.metrics.customized["latency_s_mean"] = row["Latency (s)"]
record.metrics.latency_ms_mean = row["Latency (ms)"]
record.metrics.customized["throughput_tps"] = row["Throughput (tps)"]
record.metrics.max_memory_usage_GB = row["Memory (GB)"]
records.append(record)
BenchmarkRecord.save_as_csv(filename, records)
BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records)
logger.info(f"Results saved in {filename}!")
@ -234,7 +293,7 @@ def benchmark(args, benchmark_cmd, engine):
# Create entries for csv
logger.info("Gathering data from log files...")
base_results = [engine, args.precision, args.device]
base_results = [args.warmup_runs, args.num_runs, args.model_name, engine, args.precision, args.device]
results = process_log_file(args.device_id, log_path, base_results)
return results

View file

@ -8,6 +8,7 @@ import subprocess
import librosa
import torch
from benchmark_helper import setup_logger
from metrics import BenchmarkRecord
from transformers import WhisperConfig, WhisperProcessor
logger = logging.getLogger(__name__)
@ -123,13 +124,21 @@ def get_args():
help="Number of mins to attempt the benchmark before moving on",
)
parser.add_argument(
"--log-folder",
type=str,
default=None,
help="Path to folder to save logs and results",
)
parser.add_argument("--tune", default=False, action="store_true")
args = parser.parse_args()
setattr(args, "model_size", args.model_name.split("/")[-1].replace(".", "-")) # noqa: B010
log_folder_name = f"./{args.model_size}-{args.precision}"
setattr(args, "log_folder", log_folder_name) # noqa: B010
if not args.log_folder:
args.log_folder = log_folder_name
os.makedirs(args.log_folder, exist_ok=True)
# Convert timeout value to secs
@ -235,6 +244,9 @@ def save_results(results, filename):
df = pd.DataFrame(
results,
columns=[
"Warmup Runs",
"Measured Runs",
"Model Name",
"Engine",
"Precision",
"Device",
@ -254,6 +266,8 @@ def save_results(results, filename):
)
# Set column types
df["Warmup Runs"] = df["Warmup Runs"].astype("int")
df["Measured Runs"] = df["Measured Runs"].astype("int")
df["Duration (s)"] = df["Duration (s)"].astype("float")
df["Token Length"] = df["Token Length"].astype("int")
df["Load Audio Latency (s)"] = df["Load Audio Latency (s)"].astype("float")
@ -266,7 +280,55 @@ def save_results(results, filename):
df["Memory (GB)"] = df["Memory (GB)"].astype("float")
df["Real Time Factor (RTF)"] = df["Real Time Factor (RTF)"].astype("float")
df.to_csv(filename, index=False)
# get package name and version
import pkg_resources
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(
[
f"{i.key}=={i.version}"
for i in installed_packages
if i.key in ["ort-nightly-gpu", "ort-nightly", "onnxruntime", "onnxruntime-gpu"]
]
)
ort_pkg_name = ""
ort_pkg_version = ""
if installed_packages_list:
ort_pkg_name = installed_packages_list[0].split("==")[0]
ort_pkg_version = installed_packages_list[0].split("==")[1]
# Save results to csv with standard format
records = []
for _, row in df.iterrows():
if row["Engine"] == "onnxruntime":
record = BenchmarkRecord(
row["Model Name"], row["Precision"], row["Engine"], row["Device"], ort_pkg_name, ort_pkg_version
)
else:
record = BenchmarkRecord(
row["Model Name"], row["Precision"], row["Engine"], row["Device"], torch.__name__, torch.__version__
)
record.config.customized["audio_file"] = row["Audio File"]
record.config.warmup_runs = row["Warmup Runs"]
record.config.measured_runs = row["Measured Runs"]
record.metrics.customized["duration"] = row["Duration (s)"]
record.metrics.customized["token_length"] = row["Token Length"]
record.metrics.customized["load_audio_latency"] = row["Load Audio Latency (s)"]
record.metrics.customized["load_audio_throughput"] = row["Load Audio Throughput (qps)"]
record.metrics.customized["feature_extractor_latency_s"] = row["Feature Extractor Latency (s)"]
record.metrics.customized["feature_extractor_throughput_qps"] = row["Feature Extractor Throughput (qps)"]
record.metrics.customized["per_token_latency_ms"] = row["Per Token Latency (ms/token)"]
record.metrics.customized["rtf"] = row["Real Time Factor (RTF)"]
record.metrics.latency_ms_mean = row["Latency (s)"] * 1000
record.metrics.throughput_qps = row["Throughput (qps)"]
record.metrics.max_memory_usage_GB = row["Memory (GB)"]
records.append(record)
BenchmarkRecord.save_as_csv(filename, records)
BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records)
logger.info(f"Results saved in {filename}!")
@ -282,7 +344,16 @@ def benchmark(args, benchmark_cmd, engine, audio_file, duration):
# Create entries for csv
logger.info("Gathering data from log files...")
base_results = [engine, args.precision, args.device, audio_file, duration]
base_results = [
args.warmup_runs,
args.num_runs,
args.model_name,
engine,
args.precision,
args.device,
audio_file,
duration,
]
results = process_log_file(args.device_id, log_path, base_results)
return results