From e96f10d27b5fc6426c019ffd57762ea7a26d886b Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Tue, 16 May 2023 13:10:02 +0800 Subject: [PATCH] [ROCm] reduce batch size to fix CI error (#15714) The ROCm CI batch size test occasionally fails. Try reducing the batch size to fix it. Error log: Non-zero status code returned while running FusedMatMul node. Name:'MatMul_2914_Grad/FusedMatMul_0' Status Message: HIP error hipErrorNotFound:named symbol not found Non-zero status code returned while running Gemm node. Name:'MatMul_2891_Grad/Gemm_5' Status Message: HIP error hipErrorNotFound:named symbol not found --- orttraining/tools/ci_test/run_batch_size_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orttraining/tools/ci_test/run_batch_size_test.py b/orttraining/tools/ci_test/run_batch_size_test.py index cd93c44cf7..ba2be03618 100755 --- a/orttraining/tools/ci_test/run_batch_size_test.py +++ b/orttraining/tools/ci_test/run_batch_size_test.py @@ -56,7 +56,7 @@ def main(): configs["MI100_32G"] = [ Config(True, 128, 192, 20, ""), Config(True, 512, 26, 80, ""), - Config(False, 128, 108, 20, ""), + Config(False, 128, 106, 20, ""), Config(False, 512, 16, 80, ""), ]