# pytorch/benchmarks/dynamo/check_accuracy.py

import argparse
import os
import sys
import textwrap

import pandas as pd


# Hack to have something similar to DISABLED_TEST. These models are flaky.
# check_accuracy still prints a status line for a listed model whose accuracy
# differs from the expected value (PASS_BUT_FLAKY / FAIL_BUT_FLAKY), but it
# does not add the model to its failed or improved lists.

flaky_models = {
    "yolov3",
    "gluon_inception_v3",
    "detectron2_maskrcnn_r_101_c4",
    "XGLMForCausalLM",  # discovered in https://github.com/pytorch/pytorch/pull/128148
}


def get_field(csv, model_name: str, field: str):
    try:
        return csv.loc[csv["name"] == model_name][field].item()
    except Exception:
        return None


def check_accuracy(actual_csv, expected_csv, expected_filename):
    """Compare every model's accuracy status with the expected baseline.

    One status line is printed per entry of ``actual_csv["name"]``. Models
    listed in ``flaky_models`` are reported when they differ from the baseline
    but are never counted as failed or improved.

    Args:
        actual_csv: Table of this run's results; read through ``get_field``
            using its ``name`` and ``accuracy`` columns.
        expected_csv: Baseline table, read the same way.
        expected_filename: Name of the baseline file; only interpolated into
            the returned message.

    Returns:
        A ``(mismatches, msg)`` tuple. ``mismatches`` is the list of failed
        models when there are any, otherwise the list of improved models, so
        it is truthy exactly when some non-flaky model differs from the
        baseline. ``msg`` is a human-readable summary, or ``""`` when nothing
        differs.
    """
    failed = []
    improved = []

    for model in actual_csv["name"]:
        accuracy = get_field(actual_csv, model, "accuracy")
        expected_accuracy = get_field(expected_csv, model, "accuracy")

        # get_field returns None when a lookup fails. Two failed lookups would
        # compare equal, so require a real actual value before calling it a
        # match; otherwise an unreadable result would pass silently as XFAIL.
        if accuracy is not None and accuracy == expected_accuracy:
            status = "PASS" if expected_accuracy == "pass" else "XFAIL"
            print(f"{model:34}  {status}")
            continue
        elif model in flaky_models:
            if accuracy == "pass":
                # model passed although the baseline expected something else
                status = "PASS_BUT_FLAKY:"
            else:
                # model did not pass and differs from the baseline
                status = "FAIL_BUT_FLAKY:"
        elif accuracy != "pass":
            status = "FAIL:"
            failed.append(model)
        else:
            status = "IMPROVED:"
            improved.append(model)
        print(
            f"{model:34}  {status:9} accuracy={accuracy}, expected={expected_accuracy}"
        )

    msg = ""
    if failed or improved:
        if failed:
            msg += textwrap.dedent(
                f"""
            Error: {len(failed)} models have accuracy status regressed:
                {' '.join(failed)}

            """
            )
        if improved:
            msg += textwrap.dedent(
                f"""
            Improvement: {len(improved)} models have accuracy status improved:
                {' '.join(improved)}

            """
            )
        # SHA1 is read from the environment; a placeholder is shown otherwise.
        sha = os.getenv("SHA1", "{your CI commit sha}")
        msg += textwrap.dedent(
            f"""
        If this change is expected, you can update `{expected_filename}` to reflect the new baseline.
        from pytorch/pytorch root, run
        `python benchmarks/dynamo/ci_expected_accuracy/update_expected.py {sha}`
        and then `git add` the resulting local changes to expected CSVs to your commit.
        """
        )
    return failed or improved, msg


def main():
    """Command-line entry point.

    Reads the CSV files given by the required ``--actual`` and ``--expected``
    options, runs ``check_accuracy`` on them and, when it reports mismatches,
    prints the summary message and exits with status 1.
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ("--actual", "--expected"):
        arg_parser.add_argument(flag, type=str, required=True)
    options = arg_parser.parse_args()

    actual_table = pd.read_csv(options.actual)
    expected_table = pd.read_csv(options.expected)

    mismatches, report = check_accuracy(
        actual_table, expected_table, options.expected
    )
    if not mismatches:
        return
    print(report)
    sys.exit(1)


# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
[CI] Use expected accuracy csv files to check benchmark test status (#98839) Pull Request resolved: https://github.com/pytorch/pytorch/pull/98839 Approved by: https://github.com/ezyang 2023-04-14 23:37:08 +00:00			`import argparse`
			`import os`
			`import sys`
			`import textwrap`

			`import pandas as pd`


[torch.compile][ci] Flaky models in CI (similar to DISABLED_TEST) (#128715) These models are really flaky. I went into the CI machine and ran the model many times; sometimes it fails, sometimes it passes. Even PyTorch eager results change from run to run, so the accuracy comparison is fundamentally broken/non-deterministic. I am hitting these issues more frequently in inlining work. There is nothing wrong with inlining; I think these models are on the edge of an already-broken accuracy measurement, and inlining is just pushing it in a more broken direction. Pull Request resolved: https://github.com/pytorch/pytorch/pull/128715 Approved by: https://github.com/eellison 2024-06-14 16:41:11 +00:00			`# Hack to have something similar to DISABLED_TEST. These models are flaky.`

			`flaky_models = {`
			`"yolov3",`
			`"gluon_inception_v3",`
[CI] Add inductor cpu accuracy test running on AVX2 runners (#128682) Pull Request resolved: https://github.com/pytorch/pytorch/pull/128682 Approved by: https://github.com/jgong5, https://github.com/desertfire 2024-07-26 13:24:41 +00:00			`"detectron2_maskrcnn_r_101_c4",`
Add XGLMForCausalLM to the flaky model list (#129776) Not failing on devGPU. Went to CI machine ... flaky. So adding to the flaky list. Pull Request resolved: https://github.com/pytorch/pytorch/pull/129776 Approved by: https://github.com/mlazos ghstack dependencies: #129583, #129610, #129775 2024-06-28 22:07:51 +00:00			`"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148`
[torch.compile][ci] Flaky models in CI (similar to DISABLED_TEST) (#128715) These models are really flaky. I went into the CI machine and ran the model many times; sometimes it fails, sometimes it passes. Even PyTorch eager results change from run to run, so the accuracy comparison is fundamentally broken/non-deterministic. I am hitting these issues more frequently in inlining work. There is nothing wrong with inlining; I think these models are on the edge of an already-broken accuracy measurement, and inlining is just pushing it in a more broken direction. Pull Request resolved: https://github.com/pytorch/pytorch/pull/128715 Approved by: https://github.com/eellison 2024-06-14 16:41:11 +00:00			`}`


[CI] Use expected accuracy csv files to check benchmark test status (#98839) Pull Request resolved: https://github.com/pytorch/pytorch/pull/98839 Approved by: https://github.com/ezyang 2023-04-14 23:37:08 +00:00			`def get_field(csv, model_name: str, field: str):`
			`try:`
			`return csv.loc[csv["name"] == model_name][field].item()`
Fix unused Python variables outside torch/ and test/ (#136359) Pull Request resolved: https://github.com/pytorch/pytorch/pull/136359 Approved by: https://github.com/albanD 2024-12-11 14:00:52 +00:00			`except Exception:`
[CI] Use expected accuracy csv files to check benchmark test status (#98839) Pull Request resolved: https://github.com/pytorch/pytorch/pull/98839 Approved by: https://github.com/ezyang 2023-04-14 23:37:08 +00:00			`return None`


			`def check_accuracy(actual_csv, expected_csv, expected_filename):`
			`failed = []`
			`improved = []`

			`for model in actual_csv["name"]:`
			`accuracy = get_field(actual_csv, model, "accuracy")`
			`expected_accuracy = get_field(expected_csv, model, "accuracy")`

			`if accuracy == expected_accuracy:`
			`status = "PASS" if expected_accuracy == "pass" else "XFAIL"`
			`print(f"{model:34} {status}")`
			`continue`
[torch.compile][ci] Flaky models in CI (similar to DISABLED_TEST) (#128715) These models are really flaky. I went into the CI machine and ran the model many times; sometimes it fails, sometimes it passes. Even PyTorch eager results change from run to run, so the accuracy comparison is fundamentally broken/non-deterministic. I am hitting these issues more frequently in inlining work. There is nothing wrong with inlining; I think these models are on the edge of an already-broken accuracy measurement, and inlining is just pushing it in a more broken direction. Pull Request resolved: https://github.com/pytorch/pytorch/pull/128715 Approved by: https://github.com/eellison 2024-06-14 16:41:11 +00:00			`elif model in flaky_models:`
			`if accuracy == "pass":`
			`# model passed but marked xfailed`
			`status = "PASS_BUT_FLAKY:"`
			`else:`
			`# model failed but marked passed`
			`status = "FAIL_BUT_FLAKY:"`
[CI] Use expected accuracy csv files to check benchmark test status (#98839) Pull Request resolved: https://github.com/pytorch/pytorch/pull/98839 Approved by: https://github.com/ezyang 2023-04-14 23:37:08 +00:00			`elif accuracy != "pass":`
			`status = "FAIL:"`
			`failed.append(model)`
			`else:`
			`status = "IMPROVED:"`
			`improved.append(model)`
			`print(`
			`f"{model:34} {status:9} accuracy={accuracy}, expected={expected_accuracy}"`
			`)`

			`msg = ""`
			`if failed or improved:`
			`if failed:`
			`msg += textwrap.dedent(`
			`f"""`
			`Error: {len(failed)} models have accuracy status regressed:`
			`{' '.join(failed)}`

			`"""`
			`)`
			`if improved:`
			`msg += textwrap.dedent(`
			`f"""`
			`Improvement: {len(improved)} models have accuracy status improved:`
			`{' '.join(improved)}`

			`"""`
			`)`
			`sha = os.getenv("SHA1", "{your CI commit sha}")`
			`msg += textwrap.dedent(`
			`f"""`
			If this change is expected, you can update `{expected_filename}` to reflect the new baseline.
			`from pytorch/pytorch root, run`
			`python benchmarks/dynamo/ci_expected_accuracy/update_expected.py {sha}`
			and then `git add` the resulting local changes to expected CSVs to your commit.
			`"""`
			`)`
			`return failed or improved, msg`


			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--actual", type=str, required=True)`
			`parser.add_argument("--expected", type=str, required=True)`
			`args = parser.parse_args()`

			`actual = pd.read_csv(args.actual)`
			`expected = pd.read_csv(args.expected)`

			`failed, msg = check_accuracy(actual, expected, args.expected)`
			`if failed:`
			`print(msg)`
			`sys.exit(1)`


			`if __name__ == "__main__":`
			`main()`