diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index aa36b35ebbc..3c5bba96053 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -2325,7 +2325,11 @@ static Tensor& linalg_vector_norm_impl(const Tensor& self, const Scalar& scalar_
   TORCH_CHECK(!result.defined() || out_dtype == result.scalar_type(),
       "linalg.vector_norm expected out tensor dtype ", out_dtype, " but got: ", result.scalar_type());

-  auto iter = make_reduction("vector_norm", result, self_, dim, keepdim, in_dtype, out_dtype);
+  // omit in_dtype in the following call, to avoid make_reduction explicitly casting input to out_dtype
+  auto iter = isComplexType(self.scalar_type()) ?
+      make_reduction("vector_norm", result, self_, dim, keepdim, in_dtype, out_dtype) :
+      make_reduction("vector_norm", result, self_, dim, keepdim, out_dtype);
+
   linalg_vector_norm_stub(iter.device_type(), iter, ord);
   return result;
 }
diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp
index 33046fc5497..93e91842e4a 100644
--- a/aten/src/ATen/native/ReduceOps.cpp
+++ b/aten/src/ATen/native/ReduceOps.cpp
@@ -1030,7 +1030,10 @@ static Tensor& norm_out(Tensor &result, const Tensor &self, const optional
diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu
--- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu
-    return norm_kernel_cuda_impl(iter, p);
-  } else if (iter.dtype(1) == kHalf && iter.input_dtype() == kFloat) {
+static void norm_dispatch(TensorIterator& iter, const Scalar& ord){
+  if (iter.dtype(0) == kHalf) {
+    return norm_kernel_cuda_impl(iter, ord);
+  } else if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
     // type promotion that does cast and reduction in a single kernel
-    return norm_kernel_cuda_impl(iter, p);
+    return norm_kernel_cuda_impl(iter, ord);
   }
-  else if(iter.input_dtype() == kBFloat16) {
-    return norm_kernel_cuda_impl(iter, p);
-  } else if (iter.dtype(1) == kBFloat16 && iter.input_dtype() == kFloat) {
+  else if(iter.dtype(0) == kBFloat16) {
+    return norm_kernel_cuda_impl(iter, ord);
+  } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
     // type promotion that does cast and reduction in a single kernel
-    return norm_kernel_cuda_impl(iter, p);
+    return norm_kernel_cuda_impl(iter, ord);
   }
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "norm_cuda", [&] {
-    norm_kernel_cuda_impl(iter, p);
+    norm_kernel_cuda_impl(iter, ord);
   });
 }
+static void norm_kernel_cuda(TensorIterator& iter, const Scalar& ord) {
+  norm_dispatch(iter, ord);
+}
+
 static void linalg_vector_norm_kernel_cuda(TensorIterator& iter, Scalar ord) {
   TORCH_CHECK(ord.isFloatingPoint(), "linalg.vector_norm expects ord to be float");
-  if (iter.output().scalar_type() == kHalf) {
-    return norm_kernel_cuda_impl(iter, ord);
-  } else if (iter.input_dtype() == kHalf && iter.output().scalar_type() == kFloat) {
-    // type promotion that does cast and reduction in a single kernel
-    return norm_kernel_cuda_impl(iter, ord);
-  }
-  else if(iter.output().scalar_type() == kBFloat16) {
-    return norm_kernel_cuda_impl(iter, ord);
-  } else if (iter.input_dtype() == kBFloat16 && iter.output().scalar_type() == kFloat) {
-    // type promotion that does cast and reduction in a single kernel
-    return norm_kernel_cuda_impl(iter, ord);
-  }
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "linalg_vector_norm_cuda", [&] {
-    norm_kernel_cuda_impl(iter, ord);
-  });
+  norm_dispatch(iter, ord);
 }
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 0fdd82218be..ed8d7145f00 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -1562,6 +1562,24 @@ class TestLinalg(TestCase):
         for ord in ord_settings:
             run_test_case(input, ord, dim, keepdim)
+
+    @onlyCUDA
+    @dtypes(torch.bfloat16, torch.float16)
+    def test_norm_fused_type_promotion(self, device, dtype):
+        x = torch.randn(10, device=device, dtype=dtype)
+
+        def profile_and_check(fn, x, kwargs, fn_name):
+            with torch.profiler.profile(activities=(torch.profiler.ProfilerActivity.CPU,)) as p:
+                fn(x, **kwargs, dtype=torch.float)
+            # smoke check that profiler returned some events
+            self.assertTrue(fn_name in map(lambda e: e.name, p.events()))
+            # test that there was no explicit copy
+            self.assertFalse("aten::to" in map(lambda e: e.name, p.events()))
+
+        for f, kwargs, fn_name in zip((torch.norm, torch.linalg.vector_norm), ({"p" : 2}, {}),
+                                      ("aten::norm", "aten::linalg_vector_norm")):
+            profile_and_check(f, x, kwargs, fn_name)
+

     @skipMeta # https://github.com/pytorch/pytorch/issues/53739
     @skipCPUIfNoLapack
     @skipCUDAIfNoMagma