diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py
index f78caed2e11..4e5ae496df3 100644
--- a/test/inductor/test_cutlass_backend.py
+++ b/test/inductor/test_cutlass_backend.py
@@ -602,6 +602,39 @@ class TestCutlassBackend(TestCase):
         # Broadcast last dim.
         compare_results(4096, 25728, 2048, 2.0, 0.4, [4096, 1])
 
+    @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
+    def test_addmm_with_expanded_bias(self):
+        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
+
+        class MyModel(torch.nn.Module):
+            def forward(self, x, w):
+                bias = torch.zeros(
+                    size=(w.size(1), x.size(0)), dtype=torch.float16, device="cuda"
+                ).t()
+                return torch.addmm(bias, x, w)
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "autotune_in_subproc": False,
+                "max_autotune_gemm_backends": "ATEN,CUTLASS",
+                "cuda.cutlass_dir": _CUTLASS_DIR,
+                "cuda.cutlass_max_profiling_configs": 1,
+            }
+        ):
+            model = MyModel()
+            M, N, K = 2048, 3072, 6144
+            x = torch.randn(M, K).cuda().half()
+            w = torch.randn(K, N).cuda().half()
+
+            actual = AOTIRunnerUtil.run(
+                "cuda",
+                model,
+                (x, w),
+            )
+            expected = model(x, w)
+            torch.testing.assert_close(expected, actual)
+
     # TODO: Enable dynamic test cases when dynamic support is added.
     @unittest.skipIf(not SM80OrLater, "need sm_80")
     @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup")
diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py
index 14200943258..bfc18ffda02 100644
--- a/torch/_inductor/kernel/mm.py
+++ b/torch/_inductor/kernel/mm.py
@@ -557,6 +557,7 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
             [mat1, mat2, inp_expanded],
             alpha=alpha,
             beta=beta,
+            input_reorder=[2, 0, 1],
        )
 
    if is_nonzero and use_ck_gemm_template(layout, m, n, k):
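
For context (this note and sketch are not part of the patch): the test builds the addmm bias as a transposed zero tensor, so it reaches the kernel as a non-contiguous (M, N) view rather than a plain row-major operand. Reading `input_reorder=[2, 0, 1]` as mapping the template's `[mat1, mat2, inp_expanded]` input list back to `torch.addmm`'s `(bias, mat1, mat2)` argument order is an assumption, not something the diff states. A minimal eager-mode sketch of the bias layout being exercised (arbitrary small shapes, no Inductor or CUTLASS involved):

    # Standalone sketch, assuming only eager PyTorch. Reproduces the bias
    # layout used by test_addmm_with_expanded_bias at toy sizes.
    import torch

    M, N, K = 4, 6, 8
    x = torch.randn(M, K)
    w = torch.randn(K, N)

    # Building the bias as (N, M) and transposing yields an (M, N) view with
    # swapped strides, i.e. a non-contiguous bias that a GEMM epilogue cannot
    # treat as a plain row-major operand.
    bias = torch.zeros(N, M).t()
    assert bias.shape == (M, N) and not bias.is_contiguous()

    # With a zero bias, addmm matches a bare matmul (up to default tolerances).
    out = torch.addmm(bias, x, w)
    torch.testing.assert_close(out, x @ w)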