diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp
index 48db240e807..5ecc0f15933 100644
--- a/aten/src/ATen/native/RangeFactories.cpp
+++ b/aten/src/ATen/native/RangeFactories.cpp
@@ -1,12 +1,12 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include
+#include <ATen/native/RangeUtils.h>
 #include
 #include
 #include
 #include
 #include
 #include
-#include
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include
@@ -195,38 +195,7 @@ Tensor& range_out_no_step(const Scalar& start, const Scalar& end, Tensor& result
 Tensor& arange_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
   AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, result.scalar_type(), "arange_cpu", [&]() {
-    using accscalar_t = at::acc_type<scalar_t, false>;
-    auto xstart = start.to<accscalar_t>();
-    auto xend = end.to<accscalar_t>();
-    auto xstep = step.to<accscalar_t>();
-
-    TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-    TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
-             std::isfinite(static_cast<double>(xend)),
-             "unsupported range: ", xstart, " -> ", xend);
-    TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
-             "upper bound and larger bound inconsistent with step sign");
-
-    // we use double precision for (start - end) / step
-    // to compute size_d for consistency across devices.
-    // The problem with using accscalar_t is that accscalar_t might be float32 on gpu for a float32 scalar_t,
-    // but double on cpu for the same,
-    // and the effective output size starts differing on CPU vs GPU because of precision issues, which
-    // we dont want.
-    // the corner-case we do want to take into account is int64_t, which has higher precision than double
-    double size_d;
-    if constexpr (std::is_same_v<scalar_t, int64_t>) {
-      int64_t sgn = (xstep > 0) - (xstep < 0);
-      size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
-    } else {
-      size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
-                         / step.to<double>());
-    }
-
-    TORCH_CHECK(size_d >= 0 && size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
-             "invalid size, possible overflow?");
-
-    int64_t size = static_cast<int64_t>(size_d);
+    int64_t size = compute_arange_size<scalar_t>(start, end, step);
 
     int64_t numel = result.numel();
 
     if (numel != size) {
diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h
new file mode 100644
index 00000000000..d1756db7501
--- /dev/null
+++ b/aten/src/ATen/native/RangeUtils.h
@@ -0,0 +1,45 @@
+#include
+#include
+#include
+
+namespace at {
+
+namespace native {
+
+template <typename scalar_t>
+int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar& step) {
+  using accscalar_t = at::acc_type<scalar_t, false>;
+  auto xstart = start.to<accscalar_t>();
+  auto xend = end.to<accscalar_t>();
+  auto xstep = step.to<accscalar_t>();
+
+  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+  TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
+           std::isfinite(static_cast<double>(xend)),
+           "unsupported range: ", xstart, " -> ", xend);
+  TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+           "upper bound and larger bound inconsistent with step sign");
+
+  // we use double precision for (start - end) / step
+  // to compute size_d for consistency across devices.
+  // The problem with using accscalar_t is that accscalar_t might be float32 on gpu for a float32 scalar_t,
+  // but double on cpu for the same,
+  // and the effective output size starts differing on CPU vs GPU because of precision issues, which
+  // we dont want.
+  // the corner-case we do want to take into account is int64_t, which has higher precision than double
+  double size_d;
+  if constexpr (std::is_same_v<scalar_t, int64_t>) {
+    int64_t sgn = (xstep > 0) - (xstep < 0);
+    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+  } else {
+    size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
+                       / step.to<double>());
+  }
+
+  TORCH_CHECK(size_d >= 0 && size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
+           "invalid size, possible overflow?");
+
+  return static_cast<int64_t>(size_d);
+}
+
+}} // namespace at::native
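For intuition, here is a minimal standalone sketch of the size rule that compute_arange_size encodes; it is not the ATen API, and the helper names are illustrative only. The generic path evaluates ceil((end - start) / step) in double, while the int64_t path reproduces the same ceiling with exact integer arithmetic, since 64-bit integers can exceed the precision of a double.

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Generic path: element count is ceil((end - start) / step), evaluated in double.
int64_t arange_size_fp(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// Integer path: adding (step - sign(step)) before the truncating division
// turns it into a ceiling, without ever leaving int64_t arithmetic.
int64_t arange_size_int(int64_t start, int64_t end, int64_t step) {
  int64_t sgn = (step > 0) - (step < 0);
  return (end - start + step - sgn) / step;
}

int main() {
  std::cout << arange_size_fp(0.0, 5.0, 2.0) << '\n';  // 3 -> {0, 2, 4}
  std::cout << arange_size_fp(0.0, 1.0, 0.3) << '\n';  // 4 -> {0, 0.3, 0.6, 0.9}
  std::cout << arange_size_int(0, 5, 2) << '\n';       // 3 -> {0, 2, 4}
  std::cout << arange_size_int(5, 0, -2) << '\n';      // 3 -> {5, 3, 1}
}
```

The sketch omits the nonzero-step, finiteness, and overflow checks that the real helper enforces with TORCH_CHECK.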