mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Summary:
This reverts commit e8c268746b.
It occasionally produces wrong results.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57299
Reviewed By: wat3rBro
Differential Revision: D28102706
Pulled By: ngimel
fbshipit-source-id: d7618e104d854c3b96aa502fb4e30041b9aab5df
This commit is contained in:
parent
49dbe1798f
commit
65968ab817
4 changed files with 54 additions and 247 deletions
|
|
@ -3,161 +3,81 @@
|
|||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/native/TensorFactories.h>
|
||||
#include <ATen/cuda/cub.cuh>
|
||||
#include <ATen/native/cuda/Randperm.cuh>
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
|
||||
// [Algorithm of randperm]
|
||||
//
|
||||
// randperm is implemented by sorting an arange tensor of size n with randomly
|
||||
// generated keys. When random keys are different from each other, all different
|
||||
// permutations have the same probability.
|
||||
//
|
||||
// However, there is a pitfall here:
|
||||
// For better performance, these N random keys are generated independently,
|
||||
// and there is no effort to make sure they are different at the time of generation.
|
||||
// When two keys are identical, stable sorting algorithms will not permute these two keys.
|
||||
// As a result, (0, 1) will appear more often than (1, 0).
|
||||
//
|
||||
// To overcome this pitfall we first carefully choose the number of bits in these keys,
|
||||
// so that the probability of having duplicate keys is under a threshold. Let q be the
|
||||
// threshold probability for having non-duplicate keys, then it can be proved that[1]
|
||||
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
|
||||
//
|
||||
// Then after sort, we lauch a separate kernel that additionally shuffles any islands
|
||||
// of values whose keys matched. The algorithm of this kernel is as follows:
|
||||
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
|
||||
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.
|
||||
// This thread considers itself the "island leader". The island leader then reads more indices to
|
||||
// the right to figure out how big the island is. Most likely, the island will be very small,
|
||||
// just a few values. The island leader then rolls that many RNG, uses them to additionally
|
||||
// shuffle values within the island using serial Fisher-Yates, and writes them out.
|
||||
//
|
||||
// Reference
|
||||
// [1] https://osf.io/af2hy/
|
||||
|
||||
Tensor& randperm_out_cuda(int64_t n, c10::optional<Generator> generator, Tensor& result) {
|
||||
TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
|
||||
TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'");
|
||||
TORCH_CHECK(n <= std::numeric_limits<int>::max(),
|
||||
"randperm of tensors larger than INT_MAX is not supported yet in pytorch");
|
||||
check_supported_max_int_with_precision(n, result);
|
||||
|
||||
result.resize_({n});
|
||||
|
||||
auto range = at::arange(n, result.options());
|
||||
|
||||
// shuffled_data points to the underlying data of the output tensor if the tensor is contiguous; otherwise it
|
||||
// points to a new tensor.
|
||||
Tensor shuffled;
|
||||
void *shuffled_data;
|
||||
if (result.is_contiguous()) {
|
||||
shuffled_data = result.data_ptr();
|
||||
} else {
|
||||
shuffled = at::empty(n, result.options());
|
||||
shuffled_data = shuffled.data_ptr();
|
||||
if (n < 30000) { // For small inputs, we offload it to CPU instead.
|
||||
auto result_cpu = at::empty({n}, result.options().device(kCPU));
|
||||
randperm_out(result_cpu, n, generator);
|
||||
return result.copy_(result_cpu);
|
||||
}
|
||||
|
||||
auto opt = TensorOptions().device(result.device());
|
||||
|
||||
// See note [Algorithm of randperm]
|
||||
const double log_threshold_12 = std::log(0.9) * 12;
|
||||
double nd = static_cast<double>(n);
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
constexpr bool is_reduced_bits = true;
|
||||
int bits = std::min(64,
|
||||
static_cast<int>(std::ceil(std::log2(nd - (6 * nd * nd + 1) / log_threshold_12))));
|
||||
#else
|
||||
// For some unknown reason, randperm_handle_duplicate_keys is causing test failures.
|
||||
// Without this additional permutation kernel, we should sort on as much bits as we can.
|
||||
constexpr bool is_reduced_bits = false;
|
||||
int bits = 64;
|
||||
#if 0
|
||||
// This if condition should never be true because if n >= 30000 and the tensor has a Half type,
|
||||
// check_supported_max_int_with_precision should have reported an error. This snippet is commented out but left here
|
||||
// for the sake of clarity, because Half in thrust is spotty, and we do not want future change unaware of this.
|
||||
if (result.scalar_type() == at::ScalarType::Half) { // Half in thrust is spotty. Avoid.
|
||||
auto result_float = at::empty({n}, initialTensorOptions().device(Device(DeviceType::CUDA)));
|
||||
return result.copy_(randperm_out_cuda(result_float, n, generator));
|
||||
}
|
||||
#endif
|
||||
|
||||
if (n == 0) {
|
||||
return result;
|
||||
} else if (bits <= 8) {
|
||||
auto keys = at::empty(result.sizes(), opt.dtype(kByte)).random_(generator);
|
||||
auto keys_tmp = at::empty_like(keys);
|
||||
auto keys_out = keys_tmp.data_ptr<uint8_t>();
|
||||
AT_DISPATCH_ALL_TYPES_AND(kHalf, result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
auto shuffled_data_ = reinterpret_cast<scalar_t*>(shuffled_data);
|
||||
at::cuda::cub::sort_pairs<uint8_t, scalar_t>(
|
||||
keys.data_ptr<uint8_t>(), keys_out,
|
||||
range.data_ptr<scalar_t>(), shuffled_data_,
|
||||
n, false, 0, bits);
|
||||
// Generate random values for the keys array
|
||||
AT_DISPATCH_ALL_TYPES(
|
||||
result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
TORCH_CHECK(n <= std::numeric_limits<int>::max(),
|
||||
"randperm of tensors larger than INT_MAX is not supported yet in pytorch");
|
||||
|
||||
if (is_reduced_bits) {
|
||||
// This causes failing tests on MSVC for unknown reason.
|
||||
randperm_handle_duplicate_keys(keys_out, shuffled_data_, bits, n, generator);
|
||||
auto keys = at::empty(result.sizes(), result.options()).random_(generator);
|
||||
auto range = at::arange(n, result.options());
|
||||
auto keys_tmp = at::empty_like(keys);
|
||||
|
||||
// shuffled_data points to the underlying data of the output tensor if the tensor is contiguous; otherwise it
|
||||
// points to a new tensor.
|
||||
Tensor shuffled;
|
||||
scalar_t *shuffled_data;
|
||||
if (result.is_contiguous()) {
|
||||
shuffled_data = result.data_ptr<scalar_t>();
|
||||
} else {
|
||||
shuffled = at::empty(n, result.options());
|
||||
shuffled_data = shuffled.data_ptr<scalar_t>();
|
||||
}
|
||||
|
||||
});
|
||||
} else if (bits <= 16) {
|
||||
auto keys = at::empty(result.sizes(), opt.dtype(kShort)).random_(
|
||||
std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max(), generator);
|
||||
auto keys_tmp = at::empty_like(keys);
|
||||
auto keys_out = keys_tmp.data_ptr<int16_t>();
|
||||
AT_DISPATCH_ALL_TYPES_AND(kHalf, result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
auto shuffled_data_ = reinterpret_cast<scalar_t*>(shuffled_data);
|
||||
at::cuda::cub::sort_pairs<int16_t, scalar_t>(
|
||||
keys.data_ptr<int16_t>(), keys_out,
|
||||
range.data_ptr<scalar_t>(), shuffled_data_,
|
||||
n, false, 0, bits);
|
||||
// Use the sorted order of keys to rearrange the result array
|
||||
size_t temp_storage_bytes = 0;
|
||||
|
||||
if (is_reduced_bits) {
|
||||
// This causes failing tests on MSVC for unknown reason.
|
||||
randperm_handle_duplicate_keys(keys_out, shuffled_data_, bits, n, generator);
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
nullptr, temp_storage_bytes,
|
||||
keys.data_ptr<scalar_t>(), keys_tmp.data_ptr<scalar_t>(),
|
||||
range.data_ptr<scalar_t>(), shuffled_data, n,
|
||||
0, sizeof(scalar_t) * 8, at::cuda::getCurrentCUDAStream());
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
auto dataPtr = allocator.allocate(temp_storage_bytes);
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
dataPtr.get(), temp_storage_bytes,
|
||||
keys.data_ptr<scalar_t>(), keys_tmp.data_ptr<scalar_t>(),
|
||||
range.data_ptr<scalar_t>(), shuffled_data, n,
|
||||
0, sizeof(scalar_t) * 8, at::cuda::getCurrentCUDAStream());
|
||||
|
||||
if (!result.is_contiguous()) {
|
||||
result.copy_(shuffled);
|
||||
}
|
||||
|
||||
});
|
||||
} else if (bits <= 32) {
|
||||
auto keys = at::empty(result.sizes(), opt.dtype(kInt)).random_(
|
||||
std::numeric_limits<int>::min(), std::numeric_limits<int>::max(), generator);
|
||||
auto keys_tmp = at::empty_like(keys);
|
||||
auto keys_out = keys_tmp.data_ptr<int>();
|
||||
AT_DISPATCH_ALL_TYPES_AND(kHalf, result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
auto shuffled_data_ = reinterpret_cast<scalar_t*>(shuffled_data);
|
||||
at::cuda::cub::sort_pairs<int, scalar_t>(
|
||||
keys.data_ptr<int>(), keys_out,
|
||||
range.data_ptr<scalar_t>(), shuffled_data_,
|
||||
n, false, 0, bits);
|
||||
|
||||
if (is_reduced_bits) {
|
||||
// This causes failing tests on MSVC for unknown reason.
|
||||
randperm_handle_duplicate_keys(keys_out, shuffled_data_, bits, n, generator);
|
||||
}
|
||||
|
||||
});
|
||||
} else {
|
||||
auto keys = at::empty(result.sizes(), opt.dtype(kLong)).random_(
|
||||
std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max(), generator);
|
||||
auto keys_tmp = at::empty_like(keys);
|
||||
auto keys_out = keys_tmp.data_ptr<int64_t>();
|
||||
AT_DISPATCH_ALL_TYPES_AND(kHalf, result.scalar_type(), "randperm_out_cuda", [&] {
|
||||
auto shuffled_data_ = reinterpret_cast<scalar_t*>(shuffled_data);
|
||||
at::cuda::cub::sort_pairs<int64_t, scalar_t>(
|
||||
keys.data_ptr<int64_t>(), keys_out,
|
||||
range.data_ptr<scalar_t>(), shuffled_data_,
|
||||
n, false, 0, bits);
|
||||
|
||||
if (is_reduced_bits) {
|
||||
// This causes failing tests on MSVC for unknown reason.
|
||||
randperm_handle_duplicate_keys(keys_out, shuffled_data_, bits, n, generator);
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
if (!result.is_contiguous()) {
|
||||
result.copy_(shuffled);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -1,56 +0,0 @@
|
|||
#include <ATen/CUDAGeneratorImpl.h>
|
||||
#include <ATen/cuda/CUDAGraphsUtils.cuh>
|
||||
#include <ATen/Utils.h>
|
||||
|
||||
#include <curand.h>
|
||||
#include <curand_kernel.h>
|
||||
#include <curand_philox4x32_x.h>
|
||||
|
||||
namespace {
|
||||
|
||||
// See note [Algorithm of randperm]
|
||||
template<typename T, typename scalar_t>
|
||||
__global__ void randperm_handle_duplicate_keys_kernel(T *keys, scalar_t *data, T mask, int n, at::PhiloxCudaState philox_args) {
|
||||
int tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
|
||||
// find the beginning of islands
|
||||
if (tid >= n - 1) return; // out of range
|
||||
if ((keys[tid] & mask) != (keys[tid + 1] & mask)) return; // not in an island
|
||||
if (tid != 0 && (keys[tid] & mask) == (keys[tid - 1] & mask)) return; // not the beginning of an island
|
||||
|
||||
// find the size of islands
|
||||
int island_size = 0;
|
||||
while ((keys[tid + ++island_size] & mask) == (keys[tid] & mask));
|
||||
|
||||
// do random permutation inside each island.
|
||||
data += tid;
|
||||
auto seeds = at::cuda::philox::unpack(philox_args);
|
||||
curandStatePhilox4_32_10_t state;
|
||||
curand_init(std::get<0>(seeds), tid, std::get<1>(seeds), &state);
|
||||
for (int i = island_size - 1; i > 0; i--) {
|
||||
unsigned int r = curand(&state) % (i + 1);
|
||||
if (i != r) {
|
||||
scalar_t tmp = data[i];
|
||||
data[i] = data[r];
|
||||
data[r] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// See note [Algorithm of randperm]
|
||||
template<typename T, typename scalar_t>
|
||||
void randperm_handle_duplicate_keys(T *keys, scalar_t *data, int bits, int64_t n, c10::optional<at::Generator> &gen_) {
|
||||
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
|
||||
int64_t counter_offset = n;
|
||||
at::PhiloxCudaState rng_engine_inputs;
|
||||
{
|
||||
// See Note [Acquire lock when using random generators]
|
||||
std::lock_guard<std::mutex> lock(gen->mutex_);
|
||||
rng_engine_inputs = gen->philox_cuda_state(counter_offset);
|
||||
}
|
||||
T mask = static_cast<T>((1UL << bits) - 1);
|
||||
randperm_handle_duplicate_keys_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
|
||||
keys, data, mask, n, rng_engine_inputs);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -2,11 +2,8 @@
|
|||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/native/cuda/Randperm.cuh>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <vector>
|
||||
|
||||
#include <curand.h>
|
||||
#include <curand_kernel.h>
|
||||
|
|
@ -151,57 +148,3 @@ TEST(DistributionsTest, TestPhiloxIncrementSmallMultinomialTensor) {
|
|||
// expected uniforms will start from counter offset of 4
|
||||
assert_with_expected_uniforms(4);
|
||||
}
|
||||
|
||||
__managed__ int keys[] = {
|
||||
1, (1 << 15) + 1, (1 << 16) + 1,
|
||||
2, (1 << 14) + 2
|
||||
};
|
||||
|
||||
__managed__ int values[] = { 1, 2, 3, 4, 5 };
|
||||
|
||||
std::vector<std::vector<int>> valid_perms1 = {
|
||||
{1, 2, 3}, {1, 3, 2}, {2, 1, 3}, {2, 3, 1}, {3, 1, 2}, {3, 2, 1}
|
||||
};
|
||||
std::vector<std::vector<int>> valid_perms2 = {
|
||||
{4, 5}, {5, 4}
|
||||
};
|
||||
|
||||
TEST(RandomPermutationTest, TestIslandShuffle) {
|
||||
if (!at::cuda::is_available()) return;
|
||||
at::manual_seed(123);
|
||||
|
||||
bool shuffled1 = false;
|
||||
bool shuffled2 = false;
|
||||
for (int i = 0; i < 100; i++) {
|
||||
cudaDeviceSynchronize();
|
||||
c10::optional<at::Generator> gen = c10::nullopt;
|
||||
randperm_handle_duplicate_keys(keys, values, 8, 5, gen);
|
||||
cudaDeviceSynchronize();
|
||||
std::vector<int> slice1 = {values[0], values[1], values[2]};
|
||||
std::vector<int> slice2 = {values[3], values[4]};
|
||||
if (slice1 != valid_perms1[0]) {
|
||||
shuffled1 = true;
|
||||
}
|
||||
if (slice2 != valid_perms2[0]) {
|
||||
shuffled2 = true;
|
||||
}
|
||||
bool passed1 = false;
|
||||
bool passed2 = false;
|
||||
for (auto &i : valid_perms1) {
|
||||
if (i == slice1) {
|
||||
passed1 = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (auto &i : valid_perms2) {
|
||||
if (i == slice2) {
|
||||
passed2 = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(passed1);
|
||||
ASSERT_TRUE(passed2);
|
||||
}
|
||||
ASSERT_TRUE(shuffled1);
|
||||
ASSERT_TRUE(shuffled2);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3262,9 +3262,9 @@ class TestRandomTensorCreation(TestCase):
|
|||
# see https://github.com/pytorch/pytorch/issues/54282
|
||||
rng_device = [device]
|
||||
|
||||
# Test core functionality. On CUDA, different value of n has different
|
||||
# code path
|
||||
for n in (5, 100, 50000, 100000):
|
||||
# Test core functionality. On CUDA, for small n, randperm is offloaded to CPU instead. For large n, randperm is
|
||||
# executed on GPU.
|
||||
for n in (100, 50000, 100000):
|
||||
# Ensure both integer and floating-point numbers are tested. Half follows an execution path that is
|
||||
# different from others on CUDA.
|
||||
for dtype in (torch.long, torch.half, torch.float):
|
||||
|
|
|
|||
Loading…
Reference in a new issue