mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Add back SLEEF and also use better cmake setup. (#7341)
This commit is contained in:
parent
7911a30081
commit
8dbeffab07
7 changed files with 275 additions and 37 deletions
|
|
@ -368,6 +368,19 @@ if (NOT TARGET cpuinfo)
|
|||
endif()
|
||||
TARGET_LINK_LIBRARIES(ATen cpuinfo)
|
||||
|
||||
# Build the vendored SLEEF vectorized math library inside this build tree and
# link it into ATen (the vec256 headers include <sleef.h>).
#
# SLEEF is configured through the cache entries set below: shared library with
# MSVC, static library everywhere else, DFT library and tests disabled.
# NOTE(review): BUILD_SHARED_LIBS, BUILD_DFT and BUILD_TESTS are unprefixed,
# build-wide cache entries and FORCE overwrites any value that was already
# there, so these settings remain in effect for everything configured after
# this point -- confirm that this is intended.
if(NOT MSVC)
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
else()
  set(BUILD_SHARED_LIBS ON CACHE BOOL "Build sleef shared" FORCE)
endif()
set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)

# The sources live outside this directory, so an explicit binary directory is
# required for add_subdirectory.
add_subdirectory(
  "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/sleef"
  "${CMAKE_BINARY_DIR}/sleef")
set_property(TARGET sleef PROPERTY FOLDER "dependencies")

# Presumably the location of the sleef.h header produced by the SLEEF build --
# TODO confirm.
include_directories(SYSTEM "${CMAKE_BINARY_DIR}/include")
link_directories("${CMAKE_BINARY_DIR}/sleef/lib")
target_link_libraries(ATen sleef)
|
||||
|
||||
IF(CUDA_FOUND)
|
||||
IF ($ENV{ATEN_STATIC_CUDA})
|
||||
# CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "intrinsics.h"
|
||||
#include "vec256_base.h"
|
||||
#include <sleef.h>
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
|
|
@ -54,6 +55,36 @@ public:
|
|||
auto mask = _mm256_set1_pd(-0.f);
|
||||
return _mm256_andnot_pd(mask, values);
|
||||
}
|
||||
// Element-wise arc cosine of the packed doubles, computed by SLEEF's
// Sleef_acosd4_u10.
Vec256<double> acos() const {
  const auto lanes = Sleef_acosd4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise arc sine of the packed doubles, computed by SLEEF's
// Sleef_asind4_u10.
Vec256<double> asin() const {
  const auto lanes = Sleef_asind4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise arc tangent of the packed doubles, computed by SLEEF's
// Sleef_atand4_u10.
Vec256<double> atan() const {
  const auto lanes = Sleef_atand4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise error function of the packed doubles, computed by SLEEF's
// Sleef_erfd4_u10.
Vec256<double> erf() const {
  const auto lanes = Sleef_erfd4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise exponential of the packed doubles, computed by SLEEF's
// Sleef_expd4_u10.
Vec256<double> exp() const {
  const auto lanes = Sleef_expd4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise expm1 (the vector counterpart of std::expm1) of the packed
// doubles, computed by SLEEF's Sleef_expm1d4_u10.
Vec256<double> expm1() const {
  const auto lanes = Sleef_expm1d4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise log (the vector counterpart of std::log) of the packed
// doubles, computed by SLEEF's Sleef_logd4_u10.
Vec256<double> log() const {
  const auto lanes = Sleef_logd4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise base-2 logarithm of the packed doubles, computed by SLEEF's
// Sleef_log2d4_u10.
Vec256<double> log2() const {
  const auto lanes = Sleef_log2d4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise base-10 logarithm of the packed doubles, computed by SLEEF's
// Sleef_log10d4_u10.
Vec256<double> log10() const {
  const auto lanes = Sleef_log10d4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
// Element-wise log1p (the vector counterpart of std::log1p) of the packed
// doubles, computed by SLEEF's Sleef_log1pd4_u10.
Vec256<double> log1p() const {
  const auto lanes = Sleef_log1pd4_u10(values);
  return Vec256<double>(lanes);
}
|
||||
Vec256<double> sin() const {
|
||||
return map(std::sin);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "intrinsics.h"
|
||||
#include "vec256_base.h"
|
||||
#include <sleef.h>
|
||||
#include <iostream>
|
||||
|
||||
namespace at {
|
||||
|
|
@ -55,6 +56,36 @@ public:
|
|||
auto mask = _mm256_set1_ps(-0.f);
|
||||
return _mm256_andnot_ps(mask, values);
|
||||
}
|
||||
// Element-wise arc cosine of the packed floats, computed by SLEEF's
// Sleef_acosf8_u10.
Vec256<float> acos() const {
  const auto lanes = Sleef_acosf8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise arc sine of the packed floats, computed by SLEEF's
// Sleef_asinf8_u10.
Vec256<float> asin() const {
  const auto lanes = Sleef_asinf8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise arc tangent of the packed floats, computed by SLEEF's
// Sleef_atanf8_u10.
Vec256<float> atan() const {
  const auto lanes = Sleef_atanf8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise error function of the packed floats, computed by SLEEF's
// Sleef_erff8_u10.
Vec256<float> erf() const {
  const auto lanes = Sleef_erff8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise exponential of the packed floats, computed by SLEEF's
// Sleef_expf8_u10.
Vec256<float> exp() const {
  const auto lanes = Sleef_expf8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise expm1 (the vector counterpart of std::expm1) of the packed
// floats, computed by SLEEF's Sleef_expm1f8_u10.
Vec256<float> expm1() const {
  const auto lanes = Sleef_expm1f8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise log (the vector counterpart of std::log) of the packed
// floats, computed by SLEEF's Sleef_logf8_u10.
Vec256<float> log() const {
  const auto lanes = Sleef_logf8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise base-2 logarithm of the packed floats, computed by SLEEF's
// Sleef_log2f8_u10.
Vec256<float> log2() const {
  const auto lanes = Sleef_log2f8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise base-10 logarithm of the packed floats, computed by SLEEF's
// Sleef_log10f8_u10.
Vec256<float> log10() const {
  const auto lanes = Sleef_log10f8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
// Element-wise log1p (the vector counterpart of std::log1p) of the packed
// floats, computed by SLEEF's Sleef_log1pf8_u10.
Vec256<float> log1p() const {
  const auto lanes = Sleef_log1pf8_u10(values);
  return Vec256<float>(lanes);
}
|
||||
Vec256<float> sin() const {
|
||||
return map(std::sin);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "ATen/CPUApplyUtils.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/native/cpu/UnaryOpsKernel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
|
@ -65,6 +66,36 @@ Tensor& fill_(Tensor& self, const Tensor& value) {
|
|||
return result; \
|
||||
}
|
||||
|
||||
// Defines the two CPU entry points of a unary op that has a vectorized kernel:
//
//   _<op>__cpu(self_)            in-place variant, returns self_
//   _<op>_out_cpu(result, self)  out variant; resizes result to self.sizes()
//                                and returns it
//
// Tensors with no elements are returned without further work. When the data
// is contiguous (for the in-place variant: after sort_strides) the work is
// handed to the op##Impl dispatch stub. Any other layout is processed element
// by element with `opfn` through CPU_tensor_parallel_apply1/2.
//
// The element-wise fallback dispatches over floating point types only,
// independently of the types the op##Impl kernel accepts.
//
// `opfn` is pasted textually in front of "(element)", which is why a prefix
// expression such as `1 / std::sqrt` can be passed.
#define IMPLEMENT_UNARY_OP_VEC(op, opfn)                                  \
  Tensor& _##op##__cpu(Tensor& self_) {                                   \
    if (self_.numel() <= 0) {                                             \
      return self_;                                                       \
    }                                                                     \
    Tensor self = sort_strides(self_);                                    \
    if (!self.is_contiguous()) {                                          \
      AT_DISPATCH_FLOATING_TYPES(self.type(), op, [&] {                   \
        CPU_tensor_parallel_apply1<scalar_t>(                             \
            self, [](scalar_t& elem) { elem = opfn(elem); });             \
      });                                                                 \
    } else {                                                              \
      op##Impl(self, self);                                               \
    }                                                                     \
    return self_;                                                         \
  }                                                                       \
  Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) {           \
    result.resize_(self.sizes());                                         \
    if (result.numel() <= 0) {                                            \
      return result;                                                      \
    }                                                                     \
    const bool both_contiguous =                                          \
        result.is_contiguous() && self.is_contiguous();                   \
    if (!both_contiguous) {                                               \
      AT_DISPATCH_FLOATING_TYPES(self.type(), op, [&] {                   \
        CPU_tensor_parallel_apply2<scalar_t, scalar_t>(                   \
            result,                                                       \
            self,                                                         \
            [](scalar_t& dst, scalar_t& src) { dst = opfn(src); });       \
      });                                                                 \
    } else {                                                              \
      op##Impl(result, self);                                             \
    }                                                                     \
    return result;                                                        \
  }
|
||||
|
||||
IMPLEMENT_UNARY_OP_PREQUEL(abs)
|
||||
IMPLEMENT_UNARY_OP_PREQUEL(acos)
|
||||
IMPLEMENT_UNARY_OP_PREQUEL(asin)
|
||||
|
|
@ -99,48 +130,28 @@ Tensor& _tanh_out_cuda(Tensor& result, const Tensor& self) {
|
|||
return at::_th_tanh_out(result, self);
|
||||
}
|
||||
|
||||
Tensor& _abs__cpu(Tensor& self_) {
|
||||
if (self_.numel() > 0) {
|
||||
Tensor self = sort_strides(self_);
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), abs, [&] {
|
||||
CPU_tensor_parallel_apply1<scalar_t>(
|
||||
self, [](scalar_t& y) { y = std::abs(y); });
|
||||
});
|
||||
}
|
||||
return self_;
|
||||
}
|
||||
Tensor& _abs_out_cpu(Tensor& result, const Tensor& self) {
|
||||
result.resize_(self.sizes());
|
||||
if (result.numel() > 0) {
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), abs, [&] {
|
||||
CPU_tensor_parallel_apply2<scalar_t, scalar_t>(
|
||||
result, self, [](scalar_t& y, scalar_t& x) { y = std::abs(x); });
|
||||
});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(acos, std::acos)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(asin, std::asin)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(atan, std::atan)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(ceil, std::ceil)
|
||||
IMPLEMENT_UNARY_OP_VEC(abs, std::abs)
|
||||
IMPLEMENT_UNARY_OP_VEC(acos, std::acos)
|
||||
IMPLEMENT_UNARY_OP_VEC(asin, std::asin)
|
||||
IMPLEMENT_UNARY_OP_VEC(atan, std::atan)
|
||||
IMPLEMENT_UNARY_OP_VEC(ceil, std::ceil)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(cos, std::cos)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(cosh, std::cosh)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(erf, std::erf)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(exp, std::exp)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(expm1, std::expm1)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(floor, std::floor)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log, std::log)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log10, std::log10)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log1p, std::log1p)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log2, std::log2)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(round, std::round)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(rsqrt, 1 / std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_VEC(erf, std::erf)
|
||||
IMPLEMENT_UNARY_OP_VEC(exp, std::exp)
|
||||
IMPLEMENT_UNARY_OP_VEC(expm1, std::expm1)
|
||||
IMPLEMENT_UNARY_OP_VEC(floor, std::floor)
|
||||
IMPLEMENT_UNARY_OP_VEC(log, std::log)
|
||||
IMPLEMENT_UNARY_OP_VEC(log10, std::log10)
|
||||
IMPLEMENT_UNARY_OP_VEC(log1p, std::log1p)
|
||||
IMPLEMENT_UNARY_OP_VEC(log2, std::log2)
|
||||
IMPLEMENT_UNARY_OP_VEC(round, std::round)
|
||||
IMPLEMENT_UNARY_OP_VEC(rsqrt, 1 / std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(sin, std::sin)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(sinh, std::sinh)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(sqrt, std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_VEC(sqrt, std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(tan, std::tan)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(tanh, std::tanh)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(trunc, std::trunc)
|
||||
IMPLEMENT_UNARY_OP_VEC(trunc, std::trunc)
|
||||
}
|
||||
} // namespace at
|
||||
|
|
|
|||
102
aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
Normal file
102
aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
#include "ATen/native/cpu/UnaryOpsKernel.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/cpu/vec256/vec256.h"
|
||||
#include "ATen/native/cpu/CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
namespace {
|
||||
|
||||
using namespace vec256;
|
||||
|
||||
template <typename scalar_t, typename F>
|
||||
static void
|
||||
unary_kernel(scalar_t* arr_out, const scalar_t* arr_in, int64_t size, F func) {
|
||||
using Vec = Vec256<scalar_t>;
|
||||
int64_t size_rounded = size - (size % Vec::size);
|
||||
int64_t k = 0;
|
||||
for (; k != size_rounded; k += Vec::size) {
|
||||
auto value = func(Vec::s_load(arr_in + k));
|
||||
value.store(arr_out + k);
|
||||
}
|
||||
auto leftover = size - k;
|
||||
if (leftover > 0) {
|
||||
Vec a;
|
||||
a.load_partial(arr_in + k, leftover);
|
||||
func(a).store_partial(arr_out + k, leftover);
|
||||
}
|
||||
}
|
||||
|
||||
template <class scalar_t, class F>
|
||||
static void parallel_apply(Tensor& result, const Tensor& self, F f) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
static tbb::affinity_partitioner ap;
|
||||
|
||||
auto arr_out = result.data<scalar_t>();
|
||||
auto arr_in = self.data<scalar_t>();
|
||||
int64_t size = self.numel();
|
||||
if (size < internal::TBB_GRAIN_SIZE) {
|
||||
unary_kernel(arr_out, arr_in, size, f);
|
||||
} else {
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<int64_t>(0, size, internal::TBB_GRAIN_SIZE),
|
||||
[&](const tbb::blocked_range<int64_t>& r) {
|
||||
auto size = r.end() - r.begin();
|
||||
unary_kernel(arr_out + r.begin(), arr_in + r.begin(), size, f);
|
||||
},
|
||||
ap);
|
||||
}
|
||||
}
|
||||
|
||||
// Vectorized abs: writes Vec256<scalar_t>::abs() of every element of `self`
// into `result`, dispatching over all scalar types of `self`.
static void abs_kernel(Tensor& result, const Tensor& self) {
  AT_DISPATCH_ALL_TYPES(self.type(), "abs", [&] {
    auto vec_abs = [](const Vec256<scalar_t>& v) { return v.abs(); };
    parallel_apply<scalar_t>(result, self, vec_abs);
  });
}
|
||||
|
||||
// Vectorized reciprocal square root: writes 1 / sqrt(x) for every element x
// of `self` into `result`, dispatching over the floating point types.
static void rsqrt_kernel(Tensor& result, const Tensor& self) {
  AT_DISPATCH_FLOATING_TYPES(self.type(), "rsqrt", [&] {
    auto vec_rsqrt = [](const Vec256<scalar_t>& v) {
      // A vector built from the scalar 1, used as the numerator.
      return Vec256<scalar_t>(static_cast<scalar_t>(1)) / v.sqrt();
    };
    parallel_apply<scalar_t>(result, self, vec_rsqrt);
  });
}
|
||||
|
||||
// Generates the kernel of a floating-point-only unary op and registers it:
//
//   static void <op>_kernel(Tensor& result, const Tensor& self)
//       applies Vec256<scalar_t>::<op>() to every element via parallel_apply
//   REGISTER_DISPATCH(<op>Impl, &<op>_kernel)
//       binds that kernel to the <op>Impl dispatch stub
//
// The expansion ends with the REGISTER_DISPATCH call and no trailing
// semicolon.
#define IMPLEMENT_FLOAT_KERNEL(op)                                      \
  static void op##_kernel(Tensor& result, const Tensor& self) {         \
    AT_DISPATCH_FLOATING_TYPES(self.type(), #op, [&] {                  \
      auto vec_op = [](const Vec256<scalar_t>& v) { return v.op(); };   \
      parallel_apply<scalar_t>(result, self, vec_op);                   \
    });                                                                 \
  }                                                                     \
  REGISTER_DISPATCH(op##Impl, &op##_kernel)
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
|
||||
// Bind the hand-written abs and rsqrt kernels to their dispatch stubs.
REGISTER_DISPATCH(absImpl, &abs_kernel);
|
||||
REGISTER_DISPATCH(rsqrtImpl, &rsqrt_kernel);
|
||||
|
||||
// Each line below expands to a static <op>_kernel that calls
// Vec256<scalar_t>::<op>() on floating point data, followed by the
// REGISTER_DISPATCH call that binds it to <op>Impl.
IMPLEMENT_FLOAT_KERNEL(acos)
|
||||
IMPLEMENT_FLOAT_KERNEL(asin)
|
||||
IMPLEMENT_FLOAT_KERNEL(atan)
|
||||
IMPLEMENT_FLOAT_KERNEL(erf)
|
||||
IMPLEMENT_FLOAT_KERNEL(exp)
|
||||
IMPLEMENT_FLOAT_KERNEL(expm1)
|
||||
IMPLEMENT_FLOAT_KERNEL(log)
|
||||
IMPLEMENT_FLOAT_KERNEL(log10)
|
||||
IMPLEMENT_FLOAT_KERNEL(log1p)
|
||||
IMPLEMENT_FLOAT_KERNEL(log2)
|
||||
IMPLEMENT_FLOAT_KERNEL(ceil)
|
||||
IMPLEMENT_FLOAT_KERNEL(floor)
|
||||
IMPLEMENT_FLOAT_KERNEL(round)
|
||||
IMPLEMENT_FLOAT_KERNEL(sqrt)
|
||||
IMPLEMENT_FLOAT_KERNEL(trunc)
|
||||
|
||||
}} // namespace at::native
|
||||
49
aten/src/ATen/native/cpu/UnaryOpsKernel.h
Normal file
49
aten/src/ATen/native/cpu/UnaryOpsKernel.h
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
#pragma once
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <stdexcept>
|
||||
#include "CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
// Signature shared by all unary kernels: (result, self), where the kernel
// reads from `self` and writes into `result`.
using unary_fn = void(*)(Tensor&, const Tensor&);
|
||||
|
||||
// One dispatch stub per unary op. Kernels are attached to these stubs with
// REGISTER_DISPATCH in UnaryOpsKernel.cpp.
extern DispatchStub<unary_fn> absImpl;
|
||||
extern DispatchStub<unary_fn> acosImpl;
|
||||
extern DispatchStub<unary_fn> asinImpl;
|
||||
extern DispatchStub<unary_fn> atanImpl;
|
||||
extern DispatchStub<unary_fn> ceilImpl;
|
||||
extern DispatchStub<unary_fn> erfImpl;
|
||||
extern DispatchStub<unary_fn> expImpl;
|
||||
extern DispatchStub<unary_fn> expm1Impl;
|
||||
// NOTE(review): no kernel is registered for fracImpl in the
// UnaryOpsKernel.cpp that accompanies this header (frac is also listed as
// missing in the TODO list of this header) -- confirm it is not used yet.
extern DispatchStub<unary_fn> fracImpl;
|
||||
extern DispatchStub<unary_fn> floorImpl;
|
||||
extern DispatchStub<unary_fn> logImpl;
|
||||
extern DispatchStub<unary_fn> log10Impl;
|
||||
extern DispatchStub<unary_fn> log1pImpl;
|
||||
extern DispatchStub<unary_fn> log2Impl;
|
||||
extern DispatchStub<unary_fn> roundImpl;
|
||||
extern DispatchStub<unary_fn> rsqrtImpl;
|
||||
extern DispatchStub<unary_fn> sqrtImpl;
|
||||
extern DispatchStub<unary_fn> truncImpl;
|
||||
|
||||
|
||||
// Missing unary functions
|
||||
// digamma
|
||||
// lgamma
|
||||
|
||||
// TODO: the functions listed below have no vectorized kernel yet either
|
||||
// erfinv
|
||||
// fill
|
||||
// frac
|
||||
// clone
|
||||
// contiguous
|
||||
// clamp/_min/_max
|
||||
// neg
|
||||
// reciprocal
|
||||
// sigmoid
|
||||
// sign
|
||||
// zero
|
||||
|
||||
|
||||
}} // namespace at::native
|
||||
1
third_party/sleef
vendored
Submodule
1
third_party/sleef
vendored
Submodule
|
|
@ -0,0 +1 @@
|
|||
Subproject commit 6ff7a135a1e31979d1e1844a2e7171dfbd34f54f
|
||||
Loading…
Reference in a new issue