Revert "Optimize transpose copy on CPU using fbgemm transpose (#83327)"

This reverts commit 04d8da88a6. Reverted https://github.com/pytorch/pytorch/pull/83327 on behalf of https://github.com/weiwangmeta due to breaking internal builds, causing out-of-bounds errors, and training accuracy issues.
2026-05-14 20:57:59 +00:00 · 2022-08-24 00:47:03 +00:00 · 2022-08-24 00:47:03 +00:00 · 84e45e7e90
commit 84e45e7e90
parent 591222f5d9
2 changed files with 0 additions and 67 deletions
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -16,7 +16,6 @@
 #include <ATen/Parallel.h>
 #include <c10/util/irange.h>
 #include <torch/library.h>
-#include <ATen/native/cpu/utils.h>

 #ifdef USE_FBGEMM
 #include <fbgemm/Fbgemm.h>
@@ -27,53 +26,6 @@ namespace {

 using namespace at;

-bool fbgemm_copy_transpose_valid(const Tensor& self, const Tensor& src) {
-  const int MIN_SZ = 16 * 32;
-  if ((self.device().is_cpu() && src.device().is_cpu()) &&
-      (self.layout() == c10::kStrided) && (src.layout() == c10::kStrided) &&
-      !self.is_sparse() && !src.is_sparse() && self.is_contiguous() &&
-      (self.is_conj() == src.is_conj()) && (self.is_neg() == src.is_neg()) &&
-      !self.is_complex() && !src.is_complex() &&
-      self.sizes().equals(src.sizes()) && self.dim() >= 2 &&
-      src.size(src.dim() - 1) * src.size(src.dim() - 2) >= MIN_SZ &&
-      src.stride(src.dim() - 2) == 1 && src.stride(src.dim() - 1) == src.size(src.dim() - 2) &&
-      !(src.size(src.dim() - 2) == 1 && src.size(src.dim() - 1) == 1)) {
-      // Check src is in contiguous block
-      for (long i = 0; i < src.dim() - 2; i++) {
-        if (!(src.stride(i) == ((i + 1) == (src.dim() - 2)) ?
-                src.stride(src.dim() - 1) * src.size(src.dim() - 1) :  src.stride(i + 1) * src.size(i + 1))){
-              return false;
-            }
-      }
-  } else {
-    return false;
-  }
-  return true;
-}
-
-void fbgemm_copy_transpose_same_type(Tensor& self, const Tensor& src) {
-  auto block_size = src.size(src.dim() - 1) * src.size(src.dim() - 2);
-  auto ntrans = src.numel() / block_size;
-  AT_DISPATCH_ALL_TYPES_AND(kBFloat16, src.scalar_type(),
-    "fbgemm_transpose_copy_same_type", [&] {
-    at::parallel_for(
-    0,
-    ntrans,
-    at::internal::GRAIN_SIZE / block_size,
-    [&](int64_t begin, int64_t end) {
-      for (int64_t i = begin; i < end; i++) {
-        native::utils::transpose(
-        src.size(src.dim() - 1),
-        src.size(src.dim() - 2),
-        src.data_ptr<scalar_t>() + i * block_size,
-        src.stride(src.dim() - 1),
-        self.data_ptr<scalar_t>() + i * block_size,
-        self.stride(self.dim() - 2));
-      }
-    });
-  });
-}
-
 bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
  const int MIN_SZ = 60 * 60;
  return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 &&
@@ -206,12 +158,6 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
      }
      return self;
    }
-
-    if (fbgemm::fbgemmSupportedCPU() && fbgemm_copy_transpose_valid(self, src) &&
-      src.dtype() == self.dtype() && (src.dtype() == at::kFloat || src.dtype() == at::kBFloat16)) {
-      fbgemm_copy_transpose_same_type(self, src);
-      return self;
-    }
  #endif

  if (self.is_same(src)) {
--- a/aten/src/ATen/native/cpu/utils.h
+++ b/aten/src/ATen/native/cpu/utils.h
@@ -93,19 +93,6 @@ inline void transpose<float>(int64_t M, int64_t N, const float* src, int64_t ld_
  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  fbgemm::transpose_simd<float>(M, N, src, ld_src, dst, ld_dst);
 }
-
-template <>
-inline void transpose<BFloat16>(int64_t M, int64_t N, const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst) {
-  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
-  fbgemm::transpose_simd<uint16_t>(M, N, reinterpret_cast<const uint16_t*>(src), ld_src, reinterpret_cast<uint16_t*>(dst), ld_dst);
-}
-
-template <>
-inline void transpose<uint8_t>(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) {
-  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
-  fbgemm::transpose_simd<uint8_t>(M, N, src, ld_src, dst, ld_dst);
-}
-
 #endif

 } // namespace utils