Revert "Optimize transpose copy on CPU using fbgemm transpose (#83327)"

This reverts commit 04d8da88a6.

Reverted https://github.com/pytorch/pytorch/pull/83327 on behalf of https://github.com/weiwangmeta due to breaking internal builds/causing out-of-bounds errors/training accuracy
This commit is contained in:
PyTorch MergeBot 2022-08-24 00:47:03 +00:00
parent 591222f5d9
commit 84e45e7e90
2 changed files with 0 additions and 67 deletions

View file

@ -16,7 +16,6 @@
#include <ATen/Parallel.h>
#include <c10/util/irange.h>
#include <torch/library.h>
#include <ATen/native/cpu/utils.h>
#ifdef USE_FBGEMM
#include <fbgemm/Fbgemm.h>
@ -27,53 +26,6 @@ namespace {
using namespace at;
bool fbgemm_copy_transpose_valid(const Tensor& self, const Tensor& src) {
const int MIN_SZ = 16 * 32;
if ((self.device().is_cpu() && src.device().is_cpu()) &&
(self.layout() == c10::kStrided) && (src.layout() == c10::kStrided) &&
!self.is_sparse() && !src.is_sparse() && self.is_contiguous() &&
(self.is_conj() == src.is_conj()) && (self.is_neg() == src.is_neg()) &&
!self.is_complex() && !src.is_complex() &&
self.sizes().equals(src.sizes()) && self.dim() >= 2 &&
src.size(src.dim() - 1) * src.size(src.dim() - 2) >= MIN_SZ &&
src.stride(src.dim() - 2) == 1 && src.stride(src.dim() - 1) == src.size(src.dim() - 2) &&
!(src.size(src.dim() - 2) == 1 && src.size(src.dim() - 1) == 1)) {
// Check src is in contiguous block
for (long i = 0; i < src.dim() - 2; i++) {
if (!(src.stride(i) == ((i + 1) == (src.dim() - 2)) ?
src.stride(src.dim() - 1) * src.size(src.dim() - 1) : src.stride(i + 1) * src.size(i + 1))){
return false;
}
}
} else {
return false;
}
return true;
}
void fbgemm_copy_transpose_same_type(Tensor& self, const Tensor& src) {
auto block_size = src.size(src.dim() - 1) * src.size(src.dim() - 2);
auto ntrans = src.numel() / block_size;
AT_DISPATCH_ALL_TYPES_AND(kBFloat16, src.scalar_type(),
"fbgemm_transpose_copy_same_type", [&] {
at::parallel_for(
0,
ntrans,
at::internal::GRAIN_SIZE / block_size,
[&](int64_t begin, int64_t end) {
for (int64_t i = begin; i < end; i++) {
native::utils::transpose(
src.size(src.dim() - 1),
src.size(src.dim() - 2),
src.data_ptr<scalar_t>() + i * block_size,
src.stride(src.dim() - 1),
self.data_ptr<scalar_t>() + i * block_size,
self.stride(self.dim() - 2));
}
});
});
}
bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
const int MIN_SZ = 60 * 60;
return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 &&
@ -206,12 +158,6 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
}
return self;
}
if (fbgemm::fbgemmSupportedCPU() && fbgemm_copy_transpose_valid(self, src) &&
src.dtype() == self.dtype() && (src.dtype() == at::kFloat || src.dtype() == at::kBFloat16)) {
fbgemm_copy_transpose_same_type(self, src);
return self;
}
#endif
if (self.is_same(src)) {

View file

@ -93,19 +93,6 @@ inline void transpose<float>(int64_t M, int64_t N, const float* src, int64_t ld_
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<float>(M, N, src, ld_src, dst, ld_dst);
}
template <>
inline void transpose<BFloat16>(int64_t M, int64_t N, const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst) {
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<uint16_t>(M, N, reinterpret_cast<const uint16_t*>(src), ld_src, reinterpret_cast<uint16_t*>(dst), ld_dst);
}
template <>
inline void transpose<uint8_t>(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) {
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
fbgemm::transpose_simd<uint8_t>(M, N, src, ld_src, dst, ld_dst);
}
#endif
} // namespace utils