mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Re-implement pin_memory to be device-agnostic by leveraging the Accelerator concept (#126376)
This PR re-implements pin memory, aiming to get rid of the optional `device` argument and to make all related APIs device-agnostic. We add two new abstract APIs in [AcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/detail/AcceleratorHooksInterface.h#L12) and redefine pin memory as: "Pin memory is always pinned for the current accelerator device". In detail, it uses [getAcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/Context.h#L61) in pin_memory/is_pinned to get an appropriate device and invoke the corresponding overridden interfaces, instead of using BackendSelect and then dispatching to CUDA or other specific backends' implementations. Note: New backends that want to implement and use pin memory just need to inherit from AcceleratorHooksInterface and override the `isPinnedPtr` and `getPinnedMemoryAllocator` methods. Additional context: To avoid breaking backward compatibility, this PR preserves the `device` arg of the related APIs and emits a deprecation warning if the `device` arg is passed. Another PR, based on this one, will be submitted to update all PT callers (`Tensor.is_pinned()`, `Tensor.pin_memory()`...) so that they no longer pass this arg. In the future, the `device` arg will actually be removed. Relates #124908 Relates #14560 Pull Request resolved: https://github.com/pytorch/pytorch/pull/126376 Approved by: https://github.com/albanD
This commit is contained in:
parent
074b420641
commit
8963623494
25 changed files with 210 additions and 203 deletions
|
|
@ -73,6 +73,8 @@ class TORCH_API Context {
|
|||
return at::detail::getPrivateUse1Hooks();
|
||||
} else if (device_type == at::kMTIA) {
|
||||
return at::detail::getMTIAHooks();
|
||||
} else if (device_type == at::kHIP) {
|
||||
return at::detail::getHIPHooks();
|
||||
} else {
|
||||
AT_ERROR(
|
||||
c10::DeviceTypeName(device_type), " device type not an accelerator.");
|
||||
|
|
@ -94,8 +96,22 @@ class TORCH_API Context {
|
|||
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
|
||||
}
|
||||
}
|
||||
static bool isPinnedPtr(const void* data) {
|
||||
return detail::getCUDAHooks().isPinnedPtr(data);
|
||||
bool isPinnedPtr(
|
||||
const void* data,
|
||||
std::optional<c10::DeviceType> device_type = std::nullopt) {
|
||||
auto opt_device_type =
|
||||
device_type.has_value() ? device_type.value() : at::getAccelerator();
|
||||
if (!opt_device_type.has_value() || // there is no accelerator
|
||||
!at::isAccelerator(
|
||||
opt_device_type.value())) { // passed device not an accelerator
|
||||
return false;
|
||||
}
|
||||
return getAcceleratorHooksInterface(opt_device_type.value())
|
||||
.isPinnedPtr(data);
|
||||
}
|
||||
// Returns the pinned-memory allocator provided by the accelerator hooks that
// getAcceleratorHooksInterface() selects for `device_type`. When no device
// type is given (std::nullopt), the choice of hooks is left entirely to
// getAcceleratorHooksInterface().
Allocator* getPinnedMemoryAllocator(
    std::optional<c10::DeviceType> device_type = std::nullopt) {
  const auto& hooks = getAcceleratorHooksInterface(device_type);
  return hooks.getPinnedMemoryAllocator();
}
|
||||
static bool hasOpenMP();
|
||||
static bool hasMKL();
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
#include <ATen/DeviceAccelerator.h>
|
||||
namespace at {
|
||||
|
||||
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
|
||||
std::optional<c10::DeviceType> getAccelerator(bool checked) {
|
||||
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
|
||||
if (at::has##device_name()) { \
|
||||
device_type = k##device_name; \
|
||||
|
|
@ -20,11 +20,13 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
|
|||
// first.
|
||||
return kPrivateUse1;
|
||||
}
|
||||
std::optional<DeviceType> device_type = std::nullopt;
|
||||
std::optional<c10::DeviceType> device_type = std::nullopt;
|
||||
bool is_accelerator_detected = false;
|
||||
DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
|
||||
DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
|
||||
DETECT_AND_ASSIGN_ACCELERATOR(XPU)
|
||||
DETECT_AND_ASSIGN_ACCELERATOR(HIP)
|
||||
DETECT_AND_ASSIGN_ACCELERATOR(MPS)
|
||||
if (checked) {
|
||||
TORCH_CHECK(
|
||||
device_type, "Cannot access accelerator device when none is available.")
|
||||
|
|
@ -34,4 +36,18 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
|
|||
#undef DETECT_AND_ASSIGN_ACCELERATOR
|
||||
}
|
||||
|
||||
// Reports whether `d` is one of the device types treated as an accelerator
// here: CUDA, MTIA, XPU, HIP, MPS or PrivateUse1. Every other device type
// yields false.
bool isAccelerator(c10::DeviceType d) {
  const bool is_listed = d == at::kCUDA || d == at::kMTIA || d == at::kXPU ||
      d == at::kHIP || d == at::kMPS || d == at::kPrivateUse1;
  return is_listed;
}
|
||||
|
||||
} // namespace at
|
||||
|
|
|
|||
|
|
@ -13,9 +13,7 @@
|
|||
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
|
||||
//
|
||||
// As of today, accelerator devices are (in no particular order):
|
||||
// CUDA, MTIA, XPU, PrivateUse1
|
||||
// We want to add once all the proper APIs are supported and tested:
|
||||
// HIP, MPS
|
||||
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
|
||||
|
||||
namespace at {
|
||||
|
||||
|
|
@ -24,4 +22,6 @@ namespace at {
|
|||
// When checked is true, the returned optional always has a value.
|
||||
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
|
||||
|
||||
TORCH_API bool isAccelerator(c10::DeviceType d);
|
||||
|
||||
} // namespace at
|
||||
|
|
|
|||
|
|
@ -1,32 +0,0 @@
|
|||
#include <ATen/cuda/PinnedMemoryAllocator.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/Config.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <c10/core/Storage.h>
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/CPUFunctions.h>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
bool is_pinned_cuda(const Tensor& self, std::optional<Device> device) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
|
||||
// TODO: unhook this
|
||||
return detail::getCUDAHooks().isPinnedPtr(self.storage().data());
|
||||
}
|
||||
|
||||
Tensor _pin_memory_cuda(const Tensor& self, std::optional<Device> device) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
|
||||
auto* allocator = at::cuda::getPinnedMemoryAllocator();
|
||||
auto storage = Storage(
|
||||
Storage::use_byte_size_t(),
|
||||
detail::computeStorageNbytes(
|
||||
self.sizes(), self.strides(), self.dtype().itemsize()),
|
||||
allocator,
|
||||
/*resizable=*/false);
|
||||
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
|
||||
tensor.copy_(self);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
|
||||
} // namespace at::native
|
||||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/Stream.h>
|
||||
#include <c10/core/Allocator.h>
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
|
||||
namespace at {
|
||||
|
||||
|
|
@ -40,6 +41,15 @@ struct TORCH_API AcceleratorHooksInterface {
|
|||
TORCH_CHECK(false, "Backend doesn't support maybeExchangeDevice()");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Default hook: reports that `data` is not pinned. `data` is not inspected.
// Backends override this to give a real answer.
virtual bool isPinnedPtr(const void* data) const {
  constexpr bool kNotPinned = false;
  return kNotPinned;
}
|
||||
|
||||
// Default hook: always fails the TORCH_CHECK below, because the base
// interface has no pinned-memory allocator to offer. Backends override this.
virtual Allocator* getPinnedMemoryAllocator() const {
  TORCH_CHECK(false, "Backend doesn't support getPinnedMemoryAllocator()");
  // Follows the unconditional failure above; kept so the function has a
  // return statement.
  return nullptr;
}
|
||||
};
|
||||
|
||||
} // namespace at
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
|||
TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
virtual bool isPinnedPtr(const void* /*data*/) const {
|
||||
virtual bool isPinnedPtr(const void* data) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -121,7 +121,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
|||
return -1;
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const {
|
||||
virtual Allocator* getPinnedMemoryAllocator() const override {
|
||||
TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@
|
|||
|
||||
#include <c10/util/Registry.h>
|
||||
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace at {
|
||||
|
|
@ -19,7 +21,7 @@ namespace at {
|
|||
// which we may want to call into from CPU code (and thus must be dynamically
|
||||
// dispatched, to allow for separate compilation of HIP code). See
|
||||
// CUDAHooksInterface for more detailed motivation.
|
||||
struct TORCH_API HIPHooksInterface {
|
||||
struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
|
||||
// This should never actually be implemented, but it is used to
|
||||
// squelch -Werror=non-virtual-dtor
|
||||
virtual ~HIPHooksInterface() = default;
|
||||
|
|
@ -41,7 +43,11 @@ struct TORCH_API HIPHooksInterface {
|
|||
return -1;
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const {
|
||||
virtual bool isPinnedPtr(const void* data) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const override {
|
||||
AT_ERROR("Pinned memory requires HIP.");
|
||||
}
|
||||
|
||||
|
|
@ -52,6 +58,10 @@ struct TORCH_API HIPHooksInterface {
|
|||
virtual int getNumGPUs() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Stub used when the HIP library is not available: unconditionally raises
// through AT_ERROR, so `device_index` is never examined.
bool hasPrimaryContext(DeviceIndex device_index) const override {
  AT_ERROR("Cannot check primary context without ATen_hip library.");
}
|
||||
};
|
||||
|
||||
// NB: dummy argument to suppress "ISO C++11 requires at least one argument
|
||||
|
|
|
|||
|
|
@ -94,6 +94,12 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
|
|||
bool hasPrimaryContext(DeviceIndex device_index) const override {
|
||||
FAIL_MPSHOOKS_FUNC(__func__);
|
||||
}
|
||||
// Interface-level default for MPS: reports that `data` is not pinned without
// looking at it.
bool isPinnedPtr(const void* data) const override {
  constexpr bool kNotPinned = false;
  return kNotPinned;
}
|
||||
// Interface-level default for MPS: hands this function's name to
// FAIL_MPSHOOKS_FUNC. NOTE(review): the macro is defined outside this view;
// presumably it raises because the real MPS hooks are not loaded — confirm.
Allocator* getPinnedMemoryAllocator() const override {
  FAIL_MPSHOOKS_FUNC(__func__);
}
|
||||
#undef FAIL_MPSHOOKS_FUNC
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@
|
|||
#include <c10/core/Stream.h>
|
||||
#include <c10/util/Registry.h>
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
#include <string>
|
||||
|
|
@ -88,6 +90,15 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
|
|||
virtual void setCurrentStream(const c10::Stream& stream) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
// Interface-level default for MTIA: reports that `data` is not pinned without
// looking at it.
bool isPinnedPtr(const void* data) const override {
  constexpr bool kNotPinned = false;
  return kNotPinned;
}
|
||||
|
||||
// Interface-level default for MTIA: hands this function's name to
// FAIL_MTIAHOOKS_FUNC and then returns nullptr. NOTE(review): the macro is
// defined outside this view; presumably it raises, making the return
// unreachable — confirm.
Allocator* getPinnedMemoryAllocator() const override {
  FAIL_MTIAHOOKS_FUNC(__func__);
  return nullptr;
}
|
||||
};
|
||||
|
||||
struct TORCH_API MTIAHooksArgs {};
|
||||
|
|
|
|||
|
|
@ -24,7 +24,11 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
|
|||
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const {
|
||||
virtual bool isPinnedPtr(const void* data) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const override {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
|
||||
|
|
|
|||
|
|
@ -58,15 +58,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
|
|||
TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library.");
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const {
|
||||
virtual Allocator* getPinnedMemoryAllocator() const override {
|
||||
TORCH_CHECK(false, "Cannot get XPU pinned memory allocator without ATen_xpu library.");
|
||||
}
|
||||
|
||||
virtual bool isPinnedPtr(const void* /*data*/) const {
|
||||
virtual bool isPinnedPtr(const void* data) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool hasPrimaryContext(DeviceIndex /*device_index*/) const override{
|
||||
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
|
||||
TORCH_CHECK(false, "Cannot query primary context without ATen_xpu library.");
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -3,8 +3,6 @@
|
|||
#include <ATen/CPUFunctions.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
#include <ATen/mps/MPSAllocator.h>
|
||||
#include <ATen/ops/_pin_memory_native.h>
|
||||
#include <ATen/ops/is_pinned_native.h>
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/Storage.h>
|
||||
|
||||
|
|
@ -860,31 +858,12 @@ IMPSAllocator* getIMPSAllocator(bool sharedAllocator) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
} // namespace at::mps
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// torch.is_pinned() implementation
|
||||
// Pinned memory will be helpful on Apple Silicon Macs with Unified memory as we
|
||||
// will be able to use SharedStorageMode for MTLBuffer allocations. This will
|
||||
// avoid extra copies on DataLoading operations.
|
||||
bool is_pinned_mps(const Tensor& self, std::optional<Device> device) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
|
||||
return at::mps::_getSharedAllocator().isSharedBuffer(self.storage().data());
|
||||
bool isMPSPinnedPtr(const void* data) {
|
||||
return at::mps::_getSharedAllocator().isSharedBuffer(data);
|
||||
}
|
||||
|
||||
// torch.pin_memory() implementation
|
||||
Tensor _pin_memory_mps(const Tensor& self, std::optional<Device> device) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
|
||||
auto* shared_allocator = at::mps::getIMPSAllocator(true);
|
||||
TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device");
|
||||
|
||||
const size_t storage_size = at::detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize());
|
||||
std::cerr << "Pinning memory of size " << storage_size / 1024UL << " KB\n";
|
||||
auto storage = Storage(Storage::use_byte_size_t(), storage_size, shared_allocator, false);
|
||||
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
|
||||
tensor.copy_(self);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
} // namespace at::mps
|
||||
|
|
|
|||
|
|
@ -59,4 +59,6 @@ C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
|
|||
|
||||
IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
|
||||
|
||||
bool isMPSPinnedPtr(const void* data);
|
||||
|
||||
} // namespace at::mps
|
||||
|
|
|
|||
|
|
@ -68,6 +68,11 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface
|
|||
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
|
||||
}
|
||||
|
||||
// Returns the DEFAULT stream on MPS device 0. Both the requested device and
// `priority` are ignored.
Stream getNewStream(Device, int priority = 0) const override {
  static_cast<void>(priority); // intentionally unused
  const Device mps_device(c10::DeviceType::MPS, 0);
  return Stream(Stream::DEFAULT, mps_device);
}
|
||||
|
||||
Stream getDefaultStream(Device d) const override {
|
||||
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ struct MPSHooks : public at::MPSHooksInterface {
|
|||
size_t getDriverAllocatedMemory() const override;
|
||||
size_t getRecommendedMaxMemory() const override;
|
||||
void setMemoryFraction(double ratio) const override;
|
||||
bool isPinnedPtr(const void* data) const override;
|
||||
Allocator* getPinnedMemoryAllocator() const override;
|
||||
|
||||
// MPSProfiler interface
|
||||
void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override;
|
||||
|
|
|
|||
|
|
@ -124,6 +124,14 @@ double MPSHooks::elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event
|
|||
return at::mps::getMPSEventPool()->elapsedTime(start_event_id, end_event_id);
|
||||
}
|
||||
|
||||
bool MPSHooks::isPinnedPtr(const void* data) const {
|
||||
return at::mps::isMPSPinnedPtr(data);
|
||||
}
|
||||
|
||||
// Returns the allocator obtained from at::mps::getIMPSAllocator() with its
// `sharedAllocator` argument set to true.
Allocator* MPSHooks::getPinnedMemoryAllocator() const {
  constexpr bool kUseSharedAllocator = true;
  return at::mps::getIMPSAllocator(kUseSharedAllocator);
}
|
||||
|
||||
using at::MPSHooksRegistry;
|
||||
using at::RegistererMPSHooksRegistry;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,21 @@
|
|||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <c10/core/Storage.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/CPUFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_debug_has_internal_overlap_native.h>
|
||||
#include <ATen/ops/_pin_memory.h>
|
||||
#include <ATen/ops/is_pinned_native.h>
|
||||
#include <ATen/ops/pin_memory_native.h>
|
||||
#include <ATen/ops/_pin_memory_native.h>
|
||||
#include <ATen/ops/empty_cpu_dispatch.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
|
@ -19,15 +25,28 @@ int64_t _debug_has_internal_overlap(const Tensor& self) {
|
|||
return static_cast<int64_t>(at::has_internal_overlap(self));
|
||||
}
|
||||
|
||||
// Technically, we could force backends to explicitly say "no, we don't support
|
||||
// pinned memory, always return false", but this makes life a little easier when
|
||||
// you haven't loaded the backend extension at all (which can happen, e.g., on a
|
||||
// CPU build of PyTorch and you try to check if something is CUDA pinned)
|
||||
bool is_pinned_default(const Tensor& self, std::optional<Device> device) {
|
||||
return false;
|
||||
bool is_pinned(const Tensor& self, std::optional<c10::Device> device) {
|
||||
std::optional<c10::DeviceType> opt_device_type;
|
||||
if (device.has_value()) {
|
||||
TORCH_WARN_DEPRECATION(
|
||||
"The argument 'device' of Tensor.is_pinned() ",
|
||||
"is deprecated. Please do not pass this argument.")
|
||||
opt_device_type = device.value().type();
|
||||
}
|
||||
// Only CPU tensors can be pinned
|
||||
if (!self.is_cpu()) {
|
||||
return false;
|
||||
}
|
||||
// Use getAcceleratorHooksInterface to make is_pinned device-agnostic
|
||||
return at::globalContext().isPinnedPtr(self.storage().data(), opt_device_type);
|
||||
}
|
||||
|
||||
Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
|
||||
Tensor pin_memory(const Tensor& self, std::optional<c10::Device> device) {
|
||||
if (device.has_value()) {
|
||||
TORCH_WARN_DEPRECATION(
|
||||
"The argument 'device' of Tensor.pin_memory() ",
|
||||
"is deprecated. Please do not pass this argument.")
|
||||
}
|
||||
// Kind of mad that I have to do two dynamic dispatches here, pretty
|
||||
// annoying
|
||||
if (self.is_pinned(device)) {
|
||||
|
|
@ -36,4 +55,21 @@ Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
|
|||
return at::_pin_memory(self, device);
|
||||
}
|
||||
|
||||
// Copies `self` into a freshly allocated tensor whose storage comes from a
// pinned-memory allocator, and returns that new tensor.
//
// `self` must live on the CPU; otherwise the TORCH_CHECK below fails. When
// `device` is given, its device type selects the allocator; otherwise the
// global context picks one with no device type specified.
Tensor _pin_memory(const Tensor& self, std::optional<c10::Device> device) {
  TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");

  // Go through the global context (and thus the accelerator hooks) so that
  // pin_memory stays device-agnostic.
  auto& ctx = at::globalContext();
  auto* allocator = device.has_value()
      ? ctx.getPinnedMemoryAllocator(device.value().type())
      : ctx.getPinnedMemoryAllocator();

  // Size the new storage from the sizes, strides and element size of `self`.
  const auto nbytes = detail::computeStorageNbytes(
      self.sizes(), self.strides(), self.dtype().itemsize());
  auto storage = Storage(
      Storage::use_byte_size_t(), nbytes, allocator, /*resizable=*/false);

  // Start from an empty CPU tensor, point it at the new storage with the
  // sizes and strides of `self`, then copy the data across.
  auto pinned = at::cpu::empty({0}, self.options())
                    .set_(storage, 0, self.sizes(), self.strides());
  pinned.copy_(self);
  return pinned;
}
|
||||
|
||||
} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -4545,9 +4545,10 @@
|
|||
- func: is_pinned(Tensor self, Device? device=None) -> bool
|
||||
variants: method
|
||||
dispatch:
|
||||
NestedTensorCUDA, CUDA: is_pinned_cuda
|
||||
MPS: is_pinned_mps
|
||||
CompositeExplicitAutograd: is_pinned_default
|
||||
# the NestedTensor keys are necessary because NestedTensor has been removed
|
||||
# from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
|
||||
CompositeExplicitAutograd, NestedTensorCPU: is_pinned
|
||||
SparseCPU, SparseCsrCPU: is_pinned_sparse
|
||||
|
||||
# TODO: add a copy kwarg that guarantees that the tensor is put into fresh
|
||||
# pinned memory
|
||||
|
|
@ -4557,9 +4558,9 @@
|
|||
# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor
|
||||
- func: _pin_memory(Tensor self, Device? device=None) -> Tensor
|
||||
dispatch:
|
||||
CUDA: _pin_memory_cuda
|
||||
MPS: _pin_memory_mps
|
||||
NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested
|
||||
NestedTensorCPU: _pin_memory_nested
|
||||
SparseCPU, SparseCsrCPU: _pin_memory_sparse
|
||||
CompositeExplicitAutograd: _pin_memory
|
||||
autogen: _pin_memory.out
|
||||
|
||||
- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
|
||||
|
|
|
|||
|
|
@ -71,6 +71,8 @@
|
|||
#include <ATen/ops/threshold_backward_native.h>
|
||||
#include <ATen/ops/trunc.h>
|
||||
#include <ATen/ops/trunc_native.h>
|
||||
#include <ATen/ops/is_pinned_native.h>
|
||||
#include <ATen/ops/_pin_memory_native.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
|
@ -280,4 +282,23 @@ Tensor& nan_to_num_sparse_(
|
|||
return nan_to_num_sparse_out(self, nan, posinf, neginf, self);
|
||||
}
|
||||
|
||||
bool is_pinned_sparse(const Tensor& self, std::optional<c10::Device> device) {
|
||||
if (device.has_value()) {
|
||||
TORCH_WARN_DEPRECATION(
|
||||
"The argument 'device' of Tensor.is_pinned() ",
|
||||
"is deprecated. Please do not pass this argument.")
|
||||
}
|
||||
// Currently, we don't support pin memory for sparse tensor.
|
||||
// so always return false
|
||||
return false;
|
||||
}
|
||||
|
||||
// _pin_memory() for sparse tensors: always fails with a not-implemented
// error. Returning `self` instead would hand back an unpinned tensor from an
// API that is expected to return a pinned one, which could mislead callers.
Tensor _pin_memory_sparse(const Tensor& self, std::optional<c10::Device> device) {
  TORCH_CHECK_NOT_IMPLEMENTED(
      false, "'aten::_pin_memory' is not implemented for sparse tensor.");
}
|
||||
|
||||
} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -11,8 +11,6 @@
|
|||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Operators.h>
|
||||
#else
|
||||
#include <ATen/ops/is_pinned_ops.h>
|
||||
#include <ATen/ops/_pin_memory_ops.h>
|
||||
|
||||
${ops_headers}
|
||||
#endif
|
||||
|
|
@ -23,31 +21,8 @@ namespace {
|
|||
|
||||
${backend_select_method_definitions}
|
||||
|
||||
bool is_pinned(const Tensor& self, std::optional<at::Device> device) {
|
||||
// Only CPU tensors can be pinned
|
||||
if (!self.is_cpu()) {
|
||||
return false;
|
||||
}
|
||||
// TODO: fetch scalar type from Tensor? But it doesn't really matter...
|
||||
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
|
||||
return at::_ops::is_pinned::redispatch(_dk, self, device);
|
||||
}
|
||||
|
||||
at::Tensor _pin_memory(const Tensor& self, std::optional<at::Device> device) {
|
||||
TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
|
||||
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
|
||||
if (self.is_nested()) {
|
||||
constexpr auto nested_key_set = c10::DispatchKeySet(
|
||||
{c10::DispatchKey::NestedTensor, c10::DispatchKey::AutogradNestedTensor});
|
||||
_dk = _dk.add(self.key_set() & nested_key_set);
|
||||
}
|
||||
return at::_ops::_pin_memory::redispatch(_dk, self, device);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
|
||||
${backend_select_function_registrations};
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned));
|
||||
m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
|||
|
|
@ -418,38 +418,6 @@ at::Tensor& custom_set_source_Storage_storage_offset(at::Tensor& result,
|
|||
return result;
|
||||
}
|
||||
|
||||
// basic dummy functions related to pin_memory.
|
||||
std::vector<void*> custom_pinned_data_ptr;
|
||||
|
||||
at::Tensor custom__pin_memory(const at::Tensor& self, std::optional<at::Device> device) {
|
||||
TORCH_CHECK(
|
||||
self.device().is_cpu(),
|
||||
"cannot pin '",
|
||||
self.toString(),
|
||||
"' only dense CPU tensors can be pinned");
|
||||
|
||||
// record pinned data ptr
|
||||
at::Tensor dump_pinned_tensor = self * 1.0;
|
||||
custom_pinned_data_ptr.push_back(dump_pinned_tensor.storage().data_ptr().get());
|
||||
|
||||
return dump_pinned_tensor;
|
||||
}
|
||||
|
||||
bool custom_is_pinned(const at::Tensor& self, std::optional<at::Device> device) {
|
||||
// Only CPU tensors can be pinned
|
||||
if (!self.is_cpu()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void* query_pinned_ptr = self.storage().data_ptr().get();
|
||||
for (const auto& iter_ptr : custom_pinned_data_ptr) {
|
||||
if (iter_ptr == query_pinned_ptr) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size,
|
||||
std::optional<at::MemoryFormat> optional_memory_format) {
|
||||
at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl();
|
||||
|
|
@ -545,8 +513,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
|
|||
m.impl("empty_strided", &custom_empty_strided);
|
||||
m.impl("set_.source_Storage", &custom_set_source_Storage);
|
||||
m.impl("set_.source_Storage_storage_offset",&custom_set_source_Storage_storage_offset);
|
||||
m.impl("_pin_memory", &custom__pin_memory);
|
||||
m.impl("is_pinned", &custom_is_pinned);
|
||||
m.impl("resize_", &custom_resize_);
|
||||
m.impl("as_strided", at::native::as_strided_tensorimpl);
|
||||
m.impl("quantize_per_tensor", at::native::quantize_per_tensor);
|
||||
|
|
@ -612,6 +578,9 @@ void set_custom_device_index(c10::DeviceIndex device_index) {
|
|||
custom_device_index = device_index;
|
||||
}
|
||||
|
||||
// a global flag used for dummy pin_memory of custom device
|
||||
bool custom_pinned_flag = false;
|
||||
|
||||
struct FooHooksArgs : public at::PrivateUse1HooksArgs {};
|
||||
|
||||
struct FooHooksInterface : public at::PrivateUse1HooksInterface {
|
||||
|
|
@ -621,6 +590,16 @@ struct FooHooksInterface : public at::PrivateUse1HooksInterface {
|
|||
static auto device_gen = make_generator_privateuse1(device_index);
|
||||
return device_gen;
|
||||
}
|
||||
// this is a simple implementation, custom_pinned_flag will be set as true
|
||||
// once tensor.pin_memory() is called. And then tensor.is_pinned()
|
||||
// always return true no matter what tensor it's called on.
|
||||
bool isPinnedPtr(const void* data) const override {
|
||||
return custom_pinned_flag;
|
||||
}
|
||||
// Dummy implementation for the test backend: sets the global
// custom_pinned_flag to true and returns the allocator obtained from
// c10::GetCPUAllocator().
c10::Allocator* getPinnedMemoryAllocator() const override {
  custom_pinned_flag = true; // set before the allocator is fetched
  c10::Allocator* const cpu_allocator = c10::GetCPUAllocator();
  return cpu_allocator;
}
|
||||
};
|
||||
|
||||
TORCH_DECLARE_REGISTRY(PrivateUse1HooksRegistry, FooHooksInterface, FooHooksArgs);
|
||||
|
|
|
|||
|
|
@ -343,71 +343,24 @@ class TestCppExtensionOpenRgistration(common.TestCase):
|
|||
cpu_tensor_pin = cpu_tensor.pin_memory("foo")
|
||||
self.assertTrue(cpu_tensor_pin.is_pinned("foo"))
|
||||
|
||||
# Test storage pin_memory on custom device string
|
||||
# Test storage pin_memory and is_pin
|
||||
cpu_storage = cpu_tensor.storage()
|
||||
foo_device = torch.device("foo")
|
||||
self.assertFalse(cpu_storage.is_pinned("foo"))
|
||||
# We implement a dummy pin_memory of no practical significance
|
||||
# for custom device. Once tensor.pin_memory() has been called,
|
||||
# then tensor.is_pinned() will always return true no matter
|
||||
# what tensor it's called on.
|
||||
self.assertTrue(cpu_storage.is_pinned("foo"))
|
||||
|
||||
cpu_storage_pin = cpu_storage.pin_memory("foo")
|
||||
self.assertFalse(cpu_storage.is_pinned())
|
||||
self.assertFalse(cpu_storage.is_pinned("foo"))
|
||||
self.assertFalse(cpu_storage.is_pinned(foo_device))
|
||||
self.assertFalse(cpu_storage_pin.is_pinned())
|
||||
self.assertTrue(cpu_storage_pin.is_pinned("foo"))
|
||||
self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
|
||||
|
||||
cpu_storage_pin_already = cpu_storage_pin.pin_memory("foo")
|
||||
self.assertTrue(cpu_storage_pin.is_pinned("foo"))
|
||||
self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
|
||||
self.assertTrue(cpu_storage_pin_already.is_pinned("foo"))
|
||||
self.assertTrue(cpu_storage_pin_already.is_pinned(foo_device))
|
||||
self.assertFalse(cpu_storage.is_pinned("foo"))
|
||||
|
||||
cpu_storage_pinned = cpu_storage.pin_memory(foo_device)
|
||||
self.assertFalse(cpu_storage.is_pinned())
|
||||
self.assertFalse(cpu_storage.is_pinned("foo"))
|
||||
self.assertFalse(cpu_storage.is_pinned(foo_device))
|
||||
self.assertFalse(cpu_storage_pinned.is_pinned())
|
||||
cpu_storage_pinned = cpu_storage.pin_memory("foo")
|
||||
self.assertTrue(cpu_storage_pinned.is_pinned("foo"))
|
||||
self.assertTrue(cpu_storage_pinned.is_pinned(foo_device))
|
||||
|
||||
# Test untyped storage pin_memory and is_pin
|
||||
cpu_tensor = torch.randn([3, 2, 1, 4])
|
||||
cpu_untyped_storage = cpu_tensor.untyped_storage()
|
||||
self.assertFalse(cpu_untyped_storage.is_pinned())
|
||||
self.assertFalse(cpu_untyped_storage.is_pinned("foo"))
|
||||
self.assertTrue(cpu_untyped_storage.is_pinned("foo"))
|
||||
|
||||
cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory("foo")
|
||||
self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
|
||||
self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
|
||||
self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
|
||||
|
||||
cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory(foo_device)
|
||||
self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
|
||||
self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
|
||||
self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
|
||||
|
||||
with self.assertRaisesRegex(TypeError, "positional arguments but 3 were given"):
|
||||
cpu_untyped_storage_pinned.is_pinned("foo1", "foo2")
|
||||
|
||||
# Test storage pin_memory on error device
|
||||
self.assertFalse(cpu_storage_pinned.is_pinned("hpu"))
|
||||
self.assertFalse(cpu_untyped_storage_pinned.is_pinned("hpu"))
|
||||
invalid_device = torch.device("hpu")
|
||||
self.assertFalse(cpu_untyped_storage_pinned.is_pinned(invalid_device))
|
||||
|
||||
with self.assertRaisesRegex(
|
||||
NotImplementedError, "with arguments from the 'HPU' backend"
|
||||
):
|
||||
cpu_storage.pin_memory("hpu")
|
||||
with self.assertRaisesRegex(
|
||||
NotImplementedError, "with arguments from the 'HPU' backend"
|
||||
):
|
||||
cpu_untyped_storage.pin_memory("hpu")
|
||||
with self.assertRaisesRegex(
|
||||
NotImplementedError, "with arguments from the 'HPU' backend"
|
||||
):
|
||||
cpu_untyped_storage.pin_memory(invalid_device)
|
||||
|
||||
@unittest.skip(
|
||||
"Temporarily disable due to the tiny differences between clang++ and g++ in defining static variable in inline function"
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import (
|
|||
IS_LINUX,
|
||||
skipIfTorchDynamo,
|
||||
TEST_CUDA,
|
||||
TEST_MPS,
|
||||
TEST_PRIVATEUSE1,
|
||||
TEST_XPU,
|
||||
)
|
||||
|
|
@ -37,7 +38,13 @@ def remove_build_path():
|
|||
# Since we use a fake MTIA device backend to test generic Stream/Event, device backends are mutual exclusive to each other.
|
||||
# The test will be skipped if any of the following conditions are met:
|
||||
@unittest.skipIf(
|
||||
IS_ARM64 or not IS_LINUX or TEST_CUDA or TEST_XPU or TEST_PRIVATEUSE1 or TEST_ROCM,
|
||||
IS_ARM64
|
||||
or not IS_LINUX
|
||||
or TEST_CUDA
|
||||
or TEST_XPU
|
||||
or TEST_MPS
|
||||
or TEST_PRIVATEUSE1
|
||||
or TEST_ROCM,
|
||||
"Only on linux platform and mutual exclusive to other backends",
|
||||
)
|
||||
@torch.testing._internal.common_utils.markDynamoStrictTest
|
||||
|
|
|
|||
|
|
@ -8432,9 +8432,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j],
|
|||
def test_pin_memory(self):
|
||||
x = torch.randn(3, 5)
|
||||
self.assertFalse(x.is_pinned())
|
||||
if not torch.cuda.is_available():
|
||||
self.assertRaises(RuntimeError, lambda: x.pin_memory())
|
||||
else:
|
||||
if torch.cuda.is_available():
|
||||
pinned = x.pin_memory()
|
||||
self.assertTrue(pinned.is_pinned())
|
||||
self.assertEqual(pinned, x)
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ static PyObject* THPStream_pynew(
|
|||
int64_t priority = 0;
|
||||
|
||||
static torch::PythonArgParser parser({
|
||||
"Steram(Device device=None, *, int64_t priority=0)",
|
||||
"Stream(Device device=None, *, int64_t priority=0)",
|
||||
"Stream(int64_t stream_id, int64_t device_index, int64_t device_type, *, int64_t priority=0)",
|
||||
});
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue