Re-implement pin_memory to be device-agnostic by leveraging the Accelerator concept (#126376)

This PR re-implements pin memory with the goal of removing the optional `device` argument and making all related APIs device-agnostic. We add two new abstract APIs to [AcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/detail/AcceleratorHooksInterface.h#L12) and redefine pin memory as: "pinned memory is always pinned for the current accelerator device". Concretely, pin_memory/is_pinned now use [getAcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/Context.h#L61) to determine the appropriate device and invoke the backend's overridden hooks, instead of going through BackendSelect and then dispatching to CUDA or another backend's specific implementation.
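
For illustration, a minimal C++ sketch of the resulting user-visible flow (the `main` scaffolding is invented for this example; the Context/accelerator APIs it references are the ones introduced in the diff below):

```cpp
#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/DeviceAccelerator.h>

int main() {
  at::Tensor t = at::randn({3, 5});  // a dense CPU tensor

  // is_pinned() no longer needs a device hint: the current accelerator
  // (CUDA, XPU, MPS, ...) is detected via at::getAccelerator() and its
  // hooks answer the query through Context::isPinnedPtr().
  bool pinned_before = t.is_pinned();  // false for a freshly allocated tensor

  if (at::getAccelerator(/*checked=*/false).has_value()) {
    // pin_memory() obtains the pinned allocator from the accelerator's hooks
    // via Context::getPinnedMemoryAllocator() instead of BackendSelect.
    at::Tensor pinned = t.pin_memory();
    bool pinned_after = pinned.is_pinned();  // true
    (void)pinned_after;
  }
  (void)pinned_before;
  return 0;
}
```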

Note: a new backend that wants to implement and use pin memory only needs to inherit from AcceleratorHooksInterface and override the `isPinnedPtr` and `getPinnedMemoryAllocator` methods.
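
A minimal sketch of such a hook, assuming a hypothetical `MyBackendHooks` class with toy pointer bookkeeping (both invented for illustration); an out-of-tree device would additionally need to make the hooks reachable from `Context::getAcceleratorHooksInterface`, typically via the PrivateUse1 registration shown further down in the diff:

```cpp
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <c10/core/Allocator.h>
#include <c10/core/CPUAllocator.h>
#include <unordered_set>

// Toy bookkeeping so isPinnedPtr() has something to check; a real backend
// would ask its driver or pinned-memory allocator instead.
static std::unordered_set<const void*> g_my_backend_pinned_ptrs;

struct MyBackendHooks : public at::AcceleratorHooksInterface {
  // Required by AcceleratorHooksInterface; trivially true for this sketch.
  bool hasPrimaryContext(c10::DeviceIndex /*device_index*/) const override {
    return true;
  }

  // Report whether a host pointer came from this backend's pinned allocator.
  bool isPinnedPtr(const void* data) const override {
    return g_my_backend_pinned_ptrs.count(data) != 0;
  }

  // Return the allocator whose host memory the device can access directly.
  c10::Allocator* getPinnedMemoryAllocator() const override {
    // A real backend returns its page-locked host allocator; the plain CPU
    // allocator is only a stand-in so the sketch stays self-contained.
    return c10::GetCPUAllocator();
  }
};
```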

Additional context: to avoid breaking backward compatibility, this PR keeps the `device` argument of the related APIs and emits a deprecation warning whenever it is passed. A follow-up PR will update all PyTorch callers (`Tensor.is_pinned()`, `Tensor.pin_memory()`, ...) to stop passing this argument; after that, the `device` argument will be removed entirely.
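
For illustration, a small sketch of the transitional behavior (the function is hypothetical; the warning itself comes from the `TORCH_WARN_DEPRECATION` calls added in the diff below):

```cpp
#include <ATen/ATen.h>

void demo_deprecated_device_arg() {
  at::Tensor t = at::randn({2, 3});

  // Old call sites that still pass a device keep working, but now emit a
  // deprecation warning from the native is_pinned()/pin_memory() kernels.
  bool still_works = t.is_pinned(at::Device(at::kCUDA));

  // Recommended device-agnostic form going forward:
  bool preferred = t.is_pinned();

  (void)still_works;
  (void)preferred;
}
```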

Relates to #124908
Relates to #14560
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126376
Approved by: https://github.com/albanD
Authored by wizzniu on 2024-07-23 01:44:15 +00:00; committed by PyTorch MergeBot
parent 074b420641
commit 8963623494
25 changed files with 210 additions and 203 deletions

View file

@ -73,6 +73,8 @@ class TORCH_API Context {
return at::detail::getPrivateUse1Hooks();
} else if (device_type == at::kMTIA) {
return at::detail::getMTIAHooks();
} else if (device_type == at::kHIP) {
return at::detail::getHIPHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
@ -94,8 +96,22 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
static bool isPinnedPtr(const void* data) {
return detail::getCUDAHooks().isPinnedPtr(data);
bool isPinnedPtr(
const void* data,
std::optional<c10::DeviceType> device_type = std::nullopt) {
auto opt_device_type =
device_type.has_value() ? device_type.value() : at::getAccelerator();
if (!opt_device_type.has_value() || // there is no accelerator
!at::isAccelerator(
opt_device_type.value())) { // passed device not an accelerator
return false;
}
return getAcceleratorHooksInterface(opt_device_type.value())
.isPinnedPtr(data);
}
Allocator* getPinnedMemoryAllocator(
std::optional<c10::DeviceType> device_type = std::nullopt) {
return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
}
static bool hasOpenMP();
static bool hasMKL();

View file

@ -2,7 +2,7 @@
#include <ATen/DeviceAccelerator.h>
namespace at {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
std::optional<c10::DeviceType> getAccelerator(bool checked) {
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
if (at::has##device_name()) { \
device_type = k##device_name; \
@ -20,11 +20,13 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
// first.
return kPrivateUse1;
}
std::optional<DeviceType> device_type = std::nullopt;
std::optional<c10::DeviceType> device_type = std::nullopt;
bool is_accelerator_detected = false;
DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
DETECT_AND_ASSIGN_ACCELERATOR(XPU)
DETECT_AND_ASSIGN_ACCELERATOR(HIP)
DETECT_AND_ASSIGN_ACCELERATOR(MPS)
if (checked) {
TORCH_CHECK(
device_type, "Cannot access accelerator device when none is available.")
@ -34,4 +36,18 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#undef DETECT_AND_ASSIGN_ACCELERATOR
}
bool isAccelerator(c10::DeviceType d) {
switch (d) {
case at::kCUDA:
case at::kMTIA:
case at::kXPU:
case at::kHIP:
case at::kMPS:
case at::kPrivateUse1:
return true;
default:
return false;
}
}
} // namespace at

View file

@ -13,9 +13,7 @@
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, PrivateUse1
// We want to add once all the proper APIs are supported and tested:
// HIP, MPS
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
namespace at {
@ -24,4 +22,6 @@ namespace at {
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
TORCH_API bool isAccelerator(c10::DeviceType d);
} // namespace at

View file

@ -1,32 +0,0 @@
#include <ATen/cuda/PinnedMemoryAllocator.h>
#include <ATen/Context.h>
#include <ATen/Config.h>
#include <ATen/TensorUtils.h>
#include <c10/core/Storage.h>
#include <ATen/ATen.h>
#include <ATen/CPUFunctions.h>
namespace at::native {
bool is_pinned_cuda(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
// TODO: unhook this
return detail::getCUDAHooks().isPinnedPtr(self.storage().data());
}
Tensor _pin_memory_cuda(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
auto* allocator = at::cuda::getPinnedMemoryAllocator();
auto storage = Storage(
Storage::use_byte_size_t(),
detail::computeStorageNbytes(
self.sizes(), self.strides(), self.dtype().itemsize()),
allocator,
/*resizable=*/false);
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
tensor.copy_(self);
return tensor;
}
} // namespace at::native

View file

@ -2,6 +2,7 @@
#include <c10/core/Device.h>
#include <c10/core/Stream.h>
#include <c10/core/Allocator.h>
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
namespace at {
@ -40,6 +41,15 @@ struct TORCH_API AcceleratorHooksInterface {
TORCH_CHECK(false, "Backend doesn't support maybeExchangeDevice()");
return -1;
}
virtual bool isPinnedPtr(const void* data) const {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const {
TORCH_CHECK(false, "Backend doesn't support getPinnedMemoryAllocator()");
return nullptr;
}
};
} // namespace at

View file

@ -77,7 +77,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP);
}
virtual bool isPinnedPtr(const void* /*data*/) const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
@ -121,7 +121,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return -1;
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
}

View file

@ -6,6 +6,8 @@
#include <c10/util/Registry.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <memory>
namespace at {
@ -19,7 +21,7 @@ namespace at {
// which we may want to call into from CPU code (and thus must be dynamically
// dispatched, to allow for separate compilation of HIP code). See
// CUDAHooksInterface for more detailed motivation.
struct TORCH_API HIPHooksInterface {
struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
// This should never actually be implemented, but it is used to
// squelch -Werror=non-virtual-dtor
virtual ~HIPHooksInterface() = default;
@ -41,7 +43,11 @@ struct TORCH_API HIPHooksInterface {
return -1;
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
AT_ERROR("Pinned memory requires HIP.");
}
@ -52,6 +58,10 @@ struct TORCH_API HIPHooksInterface {
virtual int getNumGPUs() const {
return 0;
}
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
AT_ERROR("Cannot check primary context without ATen_hip library.");
}
};
// NB: dummy argument to suppress "ISO C++11 requires at least one argument

View file

@ -94,6 +94,12 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
bool hasPrimaryContext(DeviceIndex device_index) const override {
FAIL_MPSHOOKS_FUNC(__func__);
}
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
FAIL_MPSHOOKS_FUNC(__func__);
}
#undef FAIL_MPSHOOKS_FUNC
};

View file

@ -6,6 +6,8 @@
#include <c10/core/Stream.h>
#include <c10/util/Registry.h>
#include <c10/core/Allocator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <string>
@ -88,6 +90,15 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
virtual void setCurrentStream(const c10::Stream& stream) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
FAIL_MTIAHOOKS_FUNC(__func__);
return nullptr;
}
};
struct TORCH_API MTIAHooksArgs {};

View file

@ -24,7 +24,11 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(
false,
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");

View file

@ -58,15 +58,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library.");
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(false, "Cannot get XPU pinned memory allocator without ATen_xpu library.");
}
virtual bool isPinnedPtr(const void* /*data*/) const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual bool hasPrimaryContext(DeviceIndex /*device_index*/) const override{
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
TORCH_CHECK(false, "Cannot query primary context without ATen_xpu library.");
}
};

View file

@ -3,8 +3,6 @@
#include <ATen/CPUFunctions.h>
#include <ATen/EmptyTensor.h>
#include <ATen/mps/MPSAllocator.h>
#include <ATen/ops/_pin_memory_native.h>
#include <ATen/ops/is_pinned_native.h>
#include <c10/core/Allocator.h>
#include <c10/core/Storage.h>
@ -860,31 +858,12 @@ IMPSAllocator* getIMPSAllocator(bool sharedAllocator) {
return nullptr;
}
} // namespace at::mps
namespace at::native {
// torch.is_pinned() implementation
// Pinned memory will be helpful on Apple Silicon Macs with Unified memory as we
// will be able to use SharedStorageMode for MTLBuffer allocations. This will
// avoid extra copies on DataLoading operations.
bool is_pinned_mps(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
return at::mps::_getSharedAllocator().isSharedBuffer(self.storage().data());
bool isMPSPinnedPtr(const void* data) {
return at::mps::_getSharedAllocator().isSharedBuffer(data);
}
// torch.pin_memory() implementation
Tensor _pin_memory_mps(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
auto* shared_allocator = at::mps::getIMPSAllocator(true);
TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device");
const size_t storage_size = at::detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize());
std::cerr << "Pinning memory of size " << storage_size / 1024UL << " KB\n";
auto storage = Storage(Storage::use_byte_size_t(), storage_size, shared_allocator, false);
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
tensor.copy_(self);
return tensor;
}
} // namespace at::native
} // namespace at::mps

View file

@ -59,4 +59,6 @@ C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
bool isMPSPinnedPtr(const void* data);
} // namespace at::mps

View file

@ -68,6 +68,11 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
}
Stream getNewStream(Device, int priority = 0) const override {
(void)priority;
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
}
Stream getDefaultStream(Device d) const override {
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
}

View file

@ -34,6 +34,8 @@ struct MPSHooks : public at::MPSHooksInterface {
size_t getDriverAllocatedMemory() const override;
size_t getRecommendedMaxMemory() const override;
void setMemoryFraction(double ratio) const override;
bool isPinnedPtr(const void* data) const override;
Allocator* getPinnedMemoryAllocator() const override;
// MPSProfiler interface
void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override;

View file

@ -124,6 +124,14 @@ double MPSHooks::elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event
return at::mps::getMPSEventPool()->elapsedTime(start_event_id, end_event_id);
}
bool MPSHooks::isPinnedPtr(const void* data) const {
return at::mps::isMPSPinnedPtr(data);
}
Allocator* MPSHooks::getPinnedMemoryAllocator() const {
return at::mps::getIMPSAllocator(true);
}
using at::MPSHooksRegistry;
using at::RegistererMPSHooksRegistry;

View file

@ -1,15 +1,21 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/Context.h>
#include <c10/core/Storage.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#include <ATen/CPUFunctions.h>
#else
#include <ATen/ops/_debug_has_internal_overlap_native.h>
#include <ATen/ops/_pin_memory.h>
#include <ATen/ops/is_pinned_native.h>
#include <ATen/ops/pin_memory_native.h>
#include <ATen/ops/_pin_memory_native.h>
#include <ATen/ops/empty_cpu_dispatch.h>
#endif
namespace at::native {
@ -19,15 +25,28 @@ int64_t _debug_has_internal_overlap(const Tensor& self) {
return static_cast<int64_t>(at::has_internal_overlap(self));
}
// Technically, we could force backends to explicitly say "no, we don't support
// pinned memory, always return false", but this makes life a little easier when
// you haven't loaded the backend extension at all (which can happen, e.g., on a
// CPU build of PyTorch and you try to check if something is CUDA pinned)
bool is_pinned_default(const Tensor& self, std::optional<Device> device) {
return false;
bool is_pinned(const Tensor& self, std::optional<c10::Device> device) {
std::optional<c10::DeviceType> opt_device_type;
if (device.has_value()) {
TORCH_WARN_DEPRECATION(
"The argument 'device' of Tensor.is_pinned() ",
"is deprecated. Please do not pass this argument.")
opt_device_type = device.value().type();
}
// Only CPU tensors can be pinned
if (!self.is_cpu()) {
return false;
}
// Use getAcceleratorHooksInterface to make is_pinned device-agnostic
return at::globalContext().isPinnedPtr(self.storage().data(), opt_device_type);
}
Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
Tensor pin_memory(const Tensor& self, std::optional<c10::Device> device) {
if (device.has_value()) {
TORCH_WARN_DEPRECATION(
"The argument 'device' of Tensor.pin_memory() ",
"is deprecated. Please do not pass this argument.")
}
// Kind of mad that I have to do two dynamic dispatches here, pretty
// annoying
if (self.is_pinned(device)) {
@ -36,4 +55,21 @@ Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
return at::_pin_memory(self, device);
}
Tensor _pin_memory(const Tensor& self, std::optional<c10::Device> device) {
TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
// Use getAcceleratorHooksInterface to make pin_memory device-agnostic
auto* allocator = device.has_value()?
at::globalContext().getPinnedMemoryAllocator(device.value().type()):
at::globalContext().getPinnedMemoryAllocator();
auto storage = Storage(
Storage::use_byte_size_t(),
detail::computeStorageNbytes(
self.sizes(), self.strides(), self.dtype().itemsize()),
allocator,
/*resizable=*/false);
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
tensor.copy_(self);
return tensor;
}
} // namespace at::native

View file

@ -4545,9 +4545,10 @@
- func: is_pinned(Tensor self, Device? device=None) -> bool
variants: method
dispatch:
NestedTensorCUDA, CUDA: is_pinned_cuda
MPS: is_pinned_mps
CompositeExplicitAutograd: is_pinned_default
# the NestedTensor keys are necessary because NestedTensor has been removed
# from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
CompositeExplicitAutograd, NestedTensorCPU: is_pinned
SparseCPU, SparseCsrCPU: is_pinned_sparse
# TODO: add a copy kwarg that guarantees that the tensor is put into fresh
# pinned memory
@ -4557,9 +4558,9 @@
# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor
- func: _pin_memory(Tensor self, Device? device=None) -> Tensor
dispatch:
CUDA: _pin_memory_cuda
MPS: _pin_memory_mps
NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested
NestedTensorCPU: _pin_memory_nested
SparseCPU, SparseCsrCPU: _pin_memory_sparse
CompositeExplicitAutograd: _pin_memory
autogen: _pin_memory.out
- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor

View file

@ -71,6 +71,8 @@
#include <ATen/ops/threshold_backward_native.h>
#include <ATen/ops/trunc.h>
#include <ATen/ops/trunc_native.h>
#include <ATen/ops/is_pinned_native.h>
#include <ATen/ops/_pin_memory_native.h>
#endif
namespace at::native {
@ -280,4 +282,23 @@ Tensor& nan_to_num_sparse_(
return nan_to_num_sparse_out(self, nan, posinf, neginf, self);
}
bool is_pinned_sparse(const Tensor& self, std::optional<c10::Device> device) {
if (device.has_value()) {
TORCH_WARN_DEPRECATION(
"The argument 'device' of Tensor.is_pinned() ",
"is deprecated. Please do not pass this argument.")
}
// Currently, we don't support pin memory for sparse tensor.
// so always return false
return false;
}
Tensor _pin_memory_sparse(const Tensor& self, std::optional<c10::Device> device) {
// Here, we throw an error rather than return self tensor. This
// is because we always return the pinned memory tensor, while
// giving unpinned tensor might mislead users.
TORCH_CHECK_NOT_IMPLEMENTED(
false, "'aten::_pin_memory' is not implemented for sparse tensor.");
}
} // namespace at::native

View file

@ -11,8 +11,6 @@
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
#include <ATen/ops/is_pinned_ops.h>
#include <ATen/ops/_pin_memory_ops.h>
${ops_headers}
#endif
@ -23,31 +21,8 @@ namespace {
${backend_select_method_definitions}
bool is_pinned(const Tensor& self, std::optional<at::Device> device) {
// Only CPU tensors can be pinned
if (!self.is_cpu()) {
return false;
}
// TODO: fetch scalar type from Tensor? But it doesn't really matter...
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
return at::_ops::is_pinned::redispatch(_dk, self, device);
}
at::Tensor _pin_memory(const Tensor& self, std::optional<at::Device> device) {
TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
if (self.is_nested()) {
constexpr auto nested_key_set = c10::DispatchKeySet(
{c10::DispatchKey::NestedTensor, c10::DispatchKey::AutogradNestedTensor});
_dk = _dk.add(self.key_set() & nested_key_set);
}
return at::_ops::_pin_memory::redispatch(_dk, self, device);
}
TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
${backend_select_function_registrations};
m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned));
m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory));
}
} // namespace

View file

@ -418,38 +418,6 @@ at::Tensor& custom_set_source_Storage_storage_offset(at::Tensor& result,
return result;
}
// basic dummy functions related to pin_memory.
std::vector<void*> custom_pinned_data_ptr;
at::Tensor custom__pin_memory(const at::Tensor& self, std::optional<at::Device> device) {
TORCH_CHECK(
self.device().is_cpu(),
"cannot pin '",
self.toString(),
"' only dense CPU tensors can be pinned");
// record pinned data ptr
at::Tensor dump_pinned_tensor = self * 1.0;
custom_pinned_data_ptr.push_back(dump_pinned_tensor.storage().data_ptr().get());
return dump_pinned_tensor;
}
bool custom_is_pinned(const at::Tensor& self, std::optional<at::Device> device) {
// Only CPU tensors can be pinned
if (!self.is_cpu()) {
return false;
}
void* query_pinned_ptr = self.storage().data_ptr().get();
for (const auto& iter_ptr : custom_pinned_data_ptr) {
if (iter_ptr == query_pinned_ptr) {
return true;
}
}
return false;
}
const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size,
std::optional<at::MemoryFormat> optional_memory_format) {
at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl();
@ -545,8 +513,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
m.impl("empty_strided", &custom_empty_strided);
m.impl("set_.source_Storage", &custom_set_source_Storage);
m.impl("set_.source_Storage_storage_offset",&custom_set_source_Storage_storage_offset);
m.impl("_pin_memory", &custom__pin_memory);
m.impl("is_pinned", &custom_is_pinned);
m.impl("resize_", &custom_resize_);
m.impl("as_strided", at::native::as_strided_tensorimpl);
m.impl("quantize_per_tensor", at::native::quantize_per_tensor);
@ -612,6 +578,9 @@ void set_custom_device_index(c10::DeviceIndex device_index) {
custom_device_index = device_index;
}
// a global flag used for dummy pin_memory of custom device
bool custom_pinned_flag = false;
struct FooHooksArgs : public at::PrivateUse1HooksArgs {};
struct FooHooksInterface : public at::PrivateUse1HooksInterface {
@ -621,6 +590,16 @@ struct FooHooksInterface : public at::PrivateUse1HooksInterface {
static auto device_gen = make_generator_privateuse1(device_index);
return device_gen;
}
// this is a simple implementation, custom_pinned_flag will be set as true
// once tensor.pin_memory() is called. And then tensor.is_pinned()
// always return true no matter what tensor it's called on.
bool isPinnedPtr(const void* data) const override {
return custom_pinned_flag;
}
c10::Allocator* getPinnedMemoryAllocator() const override {
custom_pinned_flag = true;
return c10::GetCPUAllocator();
}
};
TORCH_DECLARE_REGISTRY(PrivateUse1HooksRegistry, FooHooksInterface, FooHooksArgs);

View file

@ -343,71 +343,24 @@ class TestCppExtensionOpenRgistration(common.TestCase):
cpu_tensor_pin = cpu_tensor.pin_memory("foo")
self.assertTrue(cpu_tensor_pin.is_pinned("foo"))
# Test storage pin_memory on custom device string
# Test storage pin_memory and is_pin
cpu_storage = cpu_tensor.storage()
foo_device = torch.device("foo")
self.assertFalse(cpu_storage.is_pinned("foo"))
# We implement a dummy pin_memory of no practical significance
# for custom device. Once tensor.pin_memory() has been called,
# then tensor.is_pinned() will always return true no matter
# what tensor it's called on.
self.assertTrue(cpu_storage.is_pinned("foo"))
cpu_storage_pin = cpu_storage.pin_memory("foo")
self.assertFalse(cpu_storage.is_pinned())
self.assertFalse(cpu_storage.is_pinned("foo"))
self.assertFalse(cpu_storage.is_pinned(foo_device))
self.assertFalse(cpu_storage_pin.is_pinned())
self.assertTrue(cpu_storage_pin.is_pinned("foo"))
self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
cpu_storage_pin_already = cpu_storage_pin.pin_memory("foo")
self.assertTrue(cpu_storage_pin.is_pinned("foo"))
self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
self.assertTrue(cpu_storage_pin_already.is_pinned("foo"))
self.assertTrue(cpu_storage_pin_already.is_pinned(foo_device))
self.assertFalse(cpu_storage.is_pinned("foo"))
cpu_storage_pinned = cpu_storage.pin_memory(foo_device)
self.assertFalse(cpu_storage.is_pinned())
self.assertFalse(cpu_storage.is_pinned("foo"))
self.assertFalse(cpu_storage.is_pinned(foo_device))
self.assertFalse(cpu_storage_pinned.is_pinned())
cpu_storage_pinned = cpu_storage.pin_memory("foo")
self.assertTrue(cpu_storage_pinned.is_pinned("foo"))
self.assertTrue(cpu_storage_pinned.is_pinned(foo_device))
# Test untyped storage pin_memory and is_pin
cpu_tensor = torch.randn([3, 2, 1, 4])
cpu_untyped_storage = cpu_tensor.untyped_storage()
self.assertFalse(cpu_untyped_storage.is_pinned())
self.assertFalse(cpu_untyped_storage.is_pinned("foo"))
self.assertTrue(cpu_untyped_storage.is_pinned("foo"))
cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory("foo")
self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory(foo_device)
self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
with self.assertRaisesRegex(TypeError, "positional arguments but 3 were given"):
cpu_untyped_storage_pinned.is_pinned("foo1", "foo2")
# Test storage pin_memory on error device
self.assertFalse(cpu_storage_pinned.is_pinned("hpu"))
self.assertFalse(cpu_untyped_storage_pinned.is_pinned("hpu"))
invalid_device = torch.device("hpu")
self.assertFalse(cpu_untyped_storage_pinned.is_pinned(invalid_device))
with self.assertRaisesRegex(
NotImplementedError, "with arguments from the 'HPU' backend"
):
cpu_storage.pin_memory("hpu")
with self.assertRaisesRegex(
NotImplementedError, "with arguments from the 'HPU' backend"
):
cpu_untyped_storage.pin_memory("hpu")
with self.assertRaisesRegex(
NotImplementedError, "with arguments from the 'HPU' backend"
):
cpu_untyped_storage.pin_memory(invalid_device)
@unittest.skip(
"Temporarily disable due to the tiny differences between clang++ and g++ in defining static variable in inline function"

View file

@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import (
IS_LINUX,
skipIfTorchDynamo,
TEST_CUDA,
TEST_MPS,
TEST_PRIVATEUSE1,
TEST_XPU,
)
@ -37,7 +38,13 @@ def remove_build_path():
# Since we use a fake MTIA device backend to test generic Stream/Event, device backends are mutual exclusive to each other.
# The test will be skipped if any of the following conditions are met:
@unittest.skipIf(
IS_ARM64 or not IS_LINUX or TEST_CUDA or TEST_XPU or TEST_PRIVATEUSE1 or TEST_ROCM,
IS_ARM64
or not IS_LINUX
or TEST_CUDA
or TEST_XPU
or TEST_MPS
or TEST_PRIVATEUSE1
or TEST_ROCM,
"Only on linux platform and mutual exclusive to other backends",
)
@torch.testing._internal.common_utils.markDynamoStrictTest

View file

@ -8432,9 +8432,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j],
def test_pin_memory(self):
x = torch.randn(3, 5)
self.assertFalse(x.is_pinned())
if not torch.cuda.is_available():
self.assertRaises(RuntimeError, lambda: x.pin_memory())
else:
if torch.cuda.is_available():
pinned = x.pin_memory()
self.assertTrue(pinned.is_pinned())
self.assertEqual(pinned, x)

View file

@ -29,7 +29,7 @@ static PyObject* THPStream_pynew(
int64_t priority = 0;
static torch::PythonArgParser parser({
"Steram(Device device=None, *, int64_t priority=0)",
"Stream(Device device=None, *, int64_t priority=0)",
"Stream(int64_t stream_id, int64_t device_index, int64_t device_type, *, int64_t priority=0)",
});