diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index eb9d021fc21..7fad49ab661 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -73,6 +73,8 @@ class TORCH_API Context {
       return at::detail::getPrivateUse1Hooks();
     } else if (device_type == at::kMTIA) {
       return at::detail::getMTIAHooks();
+    } else if (device_type == at::kHIP) {
+      return at::detail::getHIPHooks();
     } else {
       AT_ERROR(
           c10::DeviceTypeName(device_type), " device type not an accelerator.");
@@ -94,8 +96,22 @@ class TORCH_API Context {
       AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
     }
   }
-  static bool isPinnedPtr(const void* data) {
-    return detail::getCUDAHooks().isPinnedPtr(data);
+  bool isPinnedPtr(
+      const void* data,
+      std::optional<c10::DeviceType> device_type = std::nullopt) {
+    auto opt_device_type =
+        device_type.has_value() ? device_type.value() : at::getAccelerator();
+    if (!opt_device_type.has_value() || // there is no accelerator
+        !at::isAccelerator(
+            opt_device_type.value())) { // passed device not an accelerator
+      return false;
+    }
+    return getAcceleratorHooksInterface(opt_device_type.value())
+        .isPinnedPtr(data);
+  }
+  Allocator* getPinnedMemoryAllocator(
+      std::optional<c10::DeviceType> device_type = std::nullopt) {
+    return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
+  }
   static bool hasOpenMP();
   static bool hasMKL();
diff --git a/aten/src/ATen/DeviceAccelerator.cpp b/aten/src/ATen/DeviceAccelerator.cpp
index 5b093cc9cbc..18025a9962a 100644
--- a/aten/src/ATen/DeviceAccelerator.cpp
+++ b/aten/src/ATen/DeviceAccelerator.cpp
@@ -2,7 +2,7 @@
 #include <ATen/DeviceAccelerator.h>
 
 namespace at {
-C10_API std::optional<c10::DeviceType> getAccelerator(bool checked) {
+std::optional<c10::DeviceType> getAccelerator(bool checked) {
 #define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
   if (at::has##device_name()) {                    \
     device_type = k##device_name;                  \
@@ -20,11 +20,13 @@ C10_API std::optional<c10::DeviceType> getAccelerator(bool checked) {
     // first.
     return kPrivateUse1;
   }
-  std::optional<DeviceType> device_type = std::nullopt;
+  std::optional<c10::DeviceType> device_type = std::nullopt;
   bool is_accelerator_detected = false;
   DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
   DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
   DETECT_AND_ASSIGN_ACCELERATOR(XPU)
+  DETECT_AND_ASSIGN_ACCELERATOR(HIP)
+  DETECT_AND_ASSIGN_ACCELERATOR(MPS)
   if (checked) {
     TORCH_CHECK(
         device_type, "Cannot access accelerator device when none is available.")
@@ -34,4 +36,18 @@ C10_API std::optional<c10::DeviceType> getAccelerator(bool checked) {
 #undef DETECT_AND_ASSIGN_ACCELERATOR
 }
 
+bool isAccelerator(c10::DeviceType d) {
+  switch (d) {
+    case at::kCUDA:
+    case at::kMTIA:
+    case at::kXPU:
+    case at::kHIP:
+    case at::kMPS:
+    case at::kPrivateUse1:
+      return true;
+    default:
+      return false;
+  }
+}
+
 } // namespace at
diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h
index 3eedb9945ef..7840911bd6b 100644
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@@ -13,9 +13,7 @@
 // - It provides a set of common APIs as defined by AcceleratorHooksInterface
 //
 // As of today, accelerator devices are (in no particular order):
-// CUDA, MTIA, XPU, PrivateUse1
-// We want to add once all the proper APIs are supported and tested:
-// HIP, MPS
+// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
 
 namespace at {
 
@@ -24,4 +22,6 @@ namespace at {
 // When checked is true, the returned optional always has a value.
 TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
+
+TORCH_API bool isAccelerator(c10::DeviceType d);
 
 } // namespace at
diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp
deleted file mode 100644
index 0c3e3782564..00000000000
--- a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace at::native {
-
-bool is_pinned_cuda(const Tensor& self, std::optional<Device> device) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
-  // TODO: unhook this
-  return detail::getCUDAHooks().isPinnedPtr(self.storage().data());
-}
-
-Tensor _pin_memory_cuda(const Tensor& self, std::optional<Device> device) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
-  auto* allocator = at::cuda::getPinnedMemoryAllocator();
-  auto storage = Storage(
-      Storage::use_byte_size_t(),
-      detail::computeStorageNbytes(
-          self.sizes(), self.strides(), self.dtype().itemsize()),
-      allocator,
-      /*resizable=*/false);
-  auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
-  tensor.copy_(self);
-  return tensor;
-}
-
-
-} // namespace at::native
diff --git a/aten/src/ATen/detail/AcceleratorHooksInterface.h b/aten/src/ATen/detail/AcceleratorHooksInterface.h
index d36e1c3f10c..7eefdfc7269 100644
--- a/aten/src/ATen/detail/AcceleratorHooksInterface.h
+++ b/aten/src/ATen/detail/AcceleratorHooksInterface.h
@@ -2,6 +2,7 @@
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>
+#include <c10/core/Allocator.h>
 
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
 namespace at {
@@ -40,6 +41,15 @@ struct TORCH_API AcceleratorHooksInterface {
     TORCH_CHECK(false, "Backend doesn't support maybeExchangeDevice()");
     return -1;
   }
+
+  virtual bool isPinnedPtr(const void* data) const {
+    return false;
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const {
+    TORCH_CHECK(false, "Backend doesn't support getPinnedMemoryAllocator()");
+    return nullptr;
+  }
 };
 
 } // namespace at
diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h
index 1349a580bca..9c37f6a82d7 100644
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@@ -77,7 +77,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
     TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP);
   }
 
-  virtual bool isPinnedPtr(const void* /*data*/) const {
+  virtual bool isPinnedPtr(const void* data) const override {
     return false;
   }
 
@@ -121,7 +121,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
     return -1;
   }
 
-  virtual Allocator* getPinnedMemoryAllocator() const {
+  virtual Allocator* getPinnedMemoryAllocator() const override {
     TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
   }
 
diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h
index 72a90ba5f9d..2d60a5f4048 100644
--- a/aten/src/ATen/detail/HIPHooksInterface.h
+++ b/aten/src/ATen/detail/HIPHooksInterface.h
@@ -6,6 +6,8 @@
 #include <c10/util/Registry.h>
 
+#include <ATen/detail/AcceleratorHooksInterface.h>
+
 #include <memory>
 
 namespace at {
@@ -19,7 +21,7 @@ namespace at {
 // which we may want to call into from CPU code (and thus must be dynamically
 // dispatched, to allow for separate compilation of HIP code). See
 // CUDAHooksInterface for more detailed motivation.
-struct TORCH_API HIPHooksInterface {
+struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
   // This should never actually be implemented, but it is used to
   // squelch -Werror=non-virtual-dtor
   virtual ~HIPHooksInterface() = default;
@@ -41,7 +43,11 @@ struct TORCH_API HIPHooksInterface {
     return -1;
   }
 
-  virtual Allocator* getPinnedMemoryAllocator() const {
+  virtual bool isPinnedPtr(const void* data) const override {
+    return false;
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const override {
     AT_ERROR("Pinned memory requires HIP.");
   }
 
@@ -52,6 +58,10 @@ struct TORCH_API HIPHooksInterface {
   virtual int getNumGPUs() const {
     return 0;
   }
+
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    AT_ERROR("Cannot check primary context without ATen_hip library.");
+  }
 };
 
 // NB: dummy argument to suppress "ISO C++11 requires at least one argument
diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h
index a06ee40c255..869bf8134ea 100644
--- a/aten/src/ATen/detail/MPSHooksInterface.h
+++ b/aten/src/ATen/detail/MPSHooksInterface.h
@@ -94,6 +94,12 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
   bool hasPrimaryContext(DeviceIndex device_index) const override {
     FAIL_MPSHOOKS_FUNC(__func__);
   }
+  virtual bool isPinnedPtr(const void* data) const override {
+    return false;
+  }
+  virtual Allocator* getPinnedMemoryAllocator() const override {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
 #undef FAIL_MPSHOOKS_FUNC
 };
 
diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h
index 9b93d30fcc8..c55ac3437e6 100644
--- a/aten/src/ATen/detail/MTIAHooksInterface.h
+++ b/aten/src/ATen/detail/MTIAHooksInterface.h
@@ -6,6 +6,8 @@
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>
+#include <c10/core/Allocator.h>
+
 #include <ATen/detail/AcceleratorHooksInterface.h>
 #include <string>
 
 namespace at {
@@ -88,6 +90,15 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
   virtual void setCurrentStream(const c10::Stream& stream) const {
     FAIL_MTIAHOOKS_FUNC(__func__);
   }
+
+  virtual bool isPinnedPtr(const void* data) const override {
+    return false;
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const override {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+    return nullptr;
+  }
 };
 
 struct TORCH_API MTIAHooksArgs {};
diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h
index b6310ec66b3..0b6b84f8bf8 100644
--- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h
+++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h
@@ -24,7 +24,11 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
         "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
   }
 
-  virtual Allocator* getPinnedMemoryAllocator() const {
+  virtual bool isPinnedPtr(const void* data) const override {
+    return false;
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const override {
     TORCH_CHECK(
         false,
         "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
diff --git a/aten/src/ATen/detail/XPUHooksInterface.h b/aten/src/ATen/detail/XPUHooksInterface.h
index b3e1f175c27..320808907f0 100644
--- a/aten/src/ATen/detail/XPUHooksInterface.h
+++ b/aten/src/ATen/detail/XPUHooksInterface.h
@@ -58,15 +58,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
     TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library.");
   }
 
-  virtual Allocator* getPinnedMemoryAllocator() const {
+  virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(false, "Cannot get XPU pinned memory allocator without ATen_xpu library."); } - virtual bool isPinnedPtr(const void* /*data*/) const { + virtual bool isPinnedPtr(const void* data) const override { return false; } - virtual bool hasPrimaryContext(DeviceIndex /*device_index*/) const override{ + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { TORCH_CHECK(false, "Cannot query primary context without ATen_xpu library."); } }; diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index 87a1e2c1b18..f546d986354 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -3,8 +3,6 @@ #include #include #include -#include -#include #include #include @@ -860,31 +858,12 @@ IMPSAllocator* getIMPSAllocator(bool sharedAllocator) { return nullptr; } -} // namespace at::mps - -namespace at::native { - // torch.is_pinned() implementation // Pinned memory will be helpful on Apple Silicon Macs with Unified memory as we // will be able to use SharedStorageMode for MTLBuffer allocations. This will // avoid extra copies on DataLoading operations. -bool is_pinned_mps(const Tensor& self, std::optional device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps()); - return at::mps::_getSharedAllocator().isSharedBuffer(self.storage().data()); +bool isMPSPinnedPtr(const void* data) { + return at::mps::_getSharedAllocator().isSharedBuffer(data); } -// torch.pin_memory() implementation -Tensor _pin_memory_mps(const Tensor& self, std::optional device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps()); - auto* shared_allocator = at::mps::getIMPSAllocator(true); - TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device"); - - const size_t storage_size = at::detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize()); - std::cerr << "Pinning memory of size " << storage_size / 1024UL << " KB\n"; - auto storage = Storage(Storage::use_byte_size_t(), storage_size, shared_allocator, false); - auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); - tensor.copy_(self); - return tensor; -} - -} // namespace at::native +} // namespace at::mps diff --git a/aten/src/ATen/mps/MPSAllocatorInterface.h b/aten/src/ATen/mps/MPSAllocatorInterface.h index cce232fd693..9aa4769f76e 100644 --- a/aten/src/ATen/mps/MPSAllocatorInterface.h +++ b/aten/src/ATen/mps/MPSAllocatorInterface.h @@ -59,4 +59,6 @@ C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback); IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false); +bool isMPSPinnedPtr(const void* data); + } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h index 1b57d296676..cb50df2faea 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.h +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -68,6 +68,11 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); } + Stream getNewStream(Device, int priority = 0) const override { + (void)priority; + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + Stream getDefaultStream(Device d) const override { return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); } diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h index 7a9a24e8017..4858c0609f5 100644 --- a/aten/src/ATen/mps/MPSHooks.h +++ b/aten/src/ATen/mps/MPSHooks.h 
@@ -34,6 +34,8 @@ struct MPSHooks : public at::MPSHooksInterface {
   size_t getDriverAllocatedMemory() const override;
   size_t getRecommendedMaxMemory() const override;
   void setMemoryFraction(double ratio) const override;
+  bool isPinnedPtr(const void* data) const override;
+  Allocator* getPinnedMemoryAllocator() const override;
 
   // MPSProfiler interface
   void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override;
diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm
index 285c0771c3c..e5bf149e5bd 100644
--- a/aten/src/ATen/mps/MPSHooks.mm
+++ b/aten/src/ATen/mps/MPSHooks.mm
@@ -124,6 +124,14 @@ double MPSHooks::elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const {
   return at::mps::getMPSEventPool()->elapsedTime(start_event_id, end_event_id);
 }
 
+bool MPSHooks::isPinnedPtr(const void* data) const {
+  return at::mps::isMPSPinnedPtr(data);
+}
+
+Allocator* MPSHooks::getPinnedMemoryAllocator() const {
+  return at::mps::getIMPSAllocator(true);
+}
+
 using at::MPSHooksRegistry;
 using at::RegistererMPSHooksRegistry;
 
diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp
index fefe9ab5a8d..dd69c2fd251 100644
--- a/aten/src/ATen/native/Memory.cpp
+++ b/aten/src/ATen/native/Memory.cpp
@@ -1,15 +1,21 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
 #include <ATen/MemoryOverlap.h>
+#include <ATen/Context.h>
+#include <ATen/DeviceAccelerator.h>
+#include <ATen/EmptyTensor.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
+#include <ATen/CPUFunctions.h>
 #else
 #include <ATen/ops/_debug_has_internal_overlap_native.h>
 #include <ATen/ops/_pin_memory.h>
 #include <ATen/ops/is_pinned_native.h>
 #include <ATen/ops/pin_memory_native.h>
+#include <ATen/ops/_pin_memory_native.h>
+#include <ATen/ops/empty_cpu_dispatch.h>
 #endif
 
 namespace at::native {
 
@@ -19,15 +25,28 @@ int64_t _debug_has_internal_overlap(const Tensor& self) {
   return static_cast<int64_t>(at::has_internal_overlap(self));
 }
 
-// Technically, we could force backends to explicitly say "no, we don't support
-// pinned memory, always return false", but this makes life a little easier when
-// you haven't loaded the backend extension at all (which can happen, e.g., on a
-// CPU build of PyTorch and you try to check if something is CUDA pinned)
-bool is_pinned_default(const Tensor& self, std::optional<Device> device) {
-  return false;
+bool is_pinned(const Tensor& self, std::optional<c10::Device> device) {
+  std::optional<c10::DeviceType> opt_device_type;
+  if (device.has_value()) {
+    TORCH_WARN_DEPRECATION(
+        "The argument 'device' of Tensor.is_pinned() ",
+        "is deprecated. Please do not pass this argument.")
+    opt_device_type = device.value().type();
+  }
+  // Only CPU tensors can be pinned
+  if (!self.is_cpu()) {
+    return false;
+  }
+  // Use getAcceleratorHooksInterface to make is_pinned device-agnostic
+  return at::globalContext().isPinnedPtr(self.storage().data(), opt_device_type);
 }
 
-Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
+Tensor pin_memory(const Tensor& self, std::optional<c10::Device> device) {
+  if (device.has_value()) {
+    TORCH_WARN_DEPRECATION(
+        "The argument 'device' of Tensor.pin_memory() ",
+        "is deprecated. Please do not pass this argument.")
+  }
   // Kind of mad that I have to do two dynamic dispatches here, pretty
   // annoying
   if (self.is_pinned(device)) {
@@ -36,4 +55,21 @@ Tensor pin_memory(const Tensor& self, std::optional<c10::Device> device) {
   return at::_pin_memory(self, device);
 }
 
+Tensor _pin_memory(const Tensor& self, std::optional<c10::Device> device) {
+  TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
+  // Use getAcceleratorHooksInterface to make pin_memory device-agnostic
+  auto* allocator = device.has_value() ?
+      at::globalContext().getPinnedMemoryAllocator(device.value().type()) :
+      at::globalContext().getPinnedMemoryAllocator();
+  auto storage = Storage(
+      Storage::use_byte_size_t(),
+      detail::computeStorageNbytes(
+          self.sizes(), self.strides(), self.dtype().itemsize()),
+      allocator,
+      /*resizable=*/false);
+  auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
+  tensor.copy_(self);
+  return tensor;
+}
+
 } // namespace at::native
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index d5138fe0e52..32fef6f23fd 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4545,9 +4545,10 @@
 - func: is_pinned(Tensor self, Device? device=None) -> bool
   variants: method
   dispatch:
-    NestedTensorCUDA, CUDA: is_pinned_cuda
-    MPS: is_pinned_mps
-    CompositeExplicitAutograd: is_pinned_default
+    # the NestedTensor keys are necessary because NestedTensor has been removed
+    # from the CompositeExplicitAutograd keyset; see Note [NestedTensor Not Included in Backend Keys]
+    CompositeExplicitAutograd, NestedTensorCPU: is_pinned
+    SparseCPU, SparseCsrCPU: is_pinned_sparse
 
 # TODO: add a copy kwarg that guarantees that the tensor is put into fresh
 # pinned memory
@@ -4557,9 +4558,9 @@
 # Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor
 - func: _pin_memory(Tensor self, Device? device=None) -> Tensor
   dispatch:
-    CUDA: _pin_memory_cuda
-    MPS: _pin_memory_mps
-    NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested
+    NestedTensorCPU: _pin_memory_nested
+    SparseCPU, SparseCsrCPU: _pin_memory_sparse
+    CompositeExplicitAutograd: _pin_memory
   autogen: _pin_memory.out
 
 - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
diff --git a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
index e1e1c05b567..71bb05d95de 100644
--- a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
+++ b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
@@ -71,6 +71,8 @@
 #include
 #include
 #include
+#include <ATen/ops/is_pinned_native.h>
+#include <ATen/ops/_pin_memory_native.h>
 #endif
 
 namespace at::native {
 
@@ -280,4 +282,23 @@ Tensor& nan_to_num_sparse_(
     return nan_to_num_sparse_out(self, nan, posinf, neginf, self);
 }
 
+bool is_pinned_sparse(const Tensor& self, std::optional<c10::Device> device) {
+  if (device.has_value()) {
+    TORCH_WARN_DEPRECATION(
+        "The argument 'device' of Tensor.is_pinned() ",
+        "is deprecated. Please do not pass this argument.")
+  }
+  // Currently, pinning memory is not supported for sparse tensors,
+  // so always return false.
+  return false;
+}
+
+Tensor _pin_memory_sparse(const Tensor& self, std::optional<c10::Device> device) {
+  // Here, we throw an error rather than returning the input tensor,
+  // because we always return a pinned-memory tensor and handing back
+  // an unpinned one could mislead users.
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "'aten::_pin_memory' is not implemented for sparse tensor.");
+}
+
 } // namespace at::native
diff --git a/aten/src/ATen/templates/RegisterBackendSelect.cpp b/aten/src/ATen/templates/RegisterBackendSelect.cpp
index aab49224895..018cf358f11 100644
--- a/aten/src/ATen/templates/RegisterBackendSelect.cpp
+++ b/aten/src/ATen/templates/RegisterBackendSelect.cpp
@@ -11,8 +11,6 @@
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Operators.h>
 #else
-#include <ATen/ops/is_pinned_ops.h>
-#include <ATen/ops/_pin_memory_ops.h>
 ${ops_headers}
 #endif
 
@@ -23,31 +21,8 @@ namespace {
 
 ${backend_select_method_definitions}
 
-bool is_pinned(const Tensor& self, std::optional<Device> device) {
-  // Only CPU tensors can be pinned
-  if (!self.is_cpu()) {
-    return false;
-  }
-  // TODO: fetch scalar type from Tensor? But it doesn't really matter...
-  DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
-  return at::_ops::is_pinned::redispatch(_dk, self, device);
-}
-
-at::Tensor _pin_memory(const Tensor& self, std::optional<Device> device) {
-  TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
-  DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
-  if (self.is_nested()) {
-    constexpr auto nested_key_set = c10::DispatchKeySet(
-        {c10::DispatchKey::NestedTensor, c10::DispatchKey::AutogradNestedTensor});
-    _dk = _dk.add(self.key_set() & nested_key_set);
-  }
-  return at::_ops::_pin_memory::redispatch(_dk, self, device);
-}
-
 TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
   ${backend_select_function_registrations};
-  m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned));
-  m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory));
 }
 
 } // namespace
diff --git a/test/cpp_extensions/open_registration_extension.cpp b/test/cpp_extensions/open_registration_extension.cpp
index 99bf97833cc..cf66c38976d 100644
--- a/test/cpp_extensions/open_registration_extension.cpp
+++ b/test/cpp_extensions/open_registration_extension.cpp
@@ -418,38 +418,6 @@ at::Tensor& custom_set_source_Storage_storage_offset(at::Tensor& result,
   return result;
 }
 
-// basic dummy functions related to pin_memory.
-std::vector<void*> custom_pinned_data_ptr;
-
-at::Tensor custom__pin_memory(const at::Tensor& self, std::optional<at::Device> device) {
-  TORCH_CHECK(
-      self.device().is_cpu(),
-      "cannot pin '",
-      self.toString(),
-      "' only dense CPU tensors can be pinned");
-
-  // record pinned data ptr
-  at::Tensor dump_pinned_tensor = self * 1.0;
-  custom_pinned_data_ptr.push_back(dump_pinned_tensor.storage().data_ptr().get());
-
-  return dump_pinned_tensor;
-}
-
-bool custom_is_pinned(const at::Tensor& self, std::optional<at::Device> device) {
-  // Only CPU tensors can be pinned
-  if (!self.is_cpu()) {
-    return false;
-  }
-
-  void* query_pinned_ptr = self.storage().data_ptr().get();
-  for (const auto& iter_ptr : custom_pinned_data_ptr) {
-    if (iter_ptr == query_pinned_ptr) {
-      return true;
-    }
-  }
-  return false;
-}
-
 const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size,
                                  std::optional<at::MemoryFormat> optional_memory_format) {
   at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl();
@@ -545,8 +513,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("empty_strided", &custom_empty_strided);
   m.impl("set_.source_Storage", &custom_set_source_Storage);
   m.impl("set_.source_Storage_storage_offset",&custom_set_source_Storage_storage_offset);
-  m.impl("_pin_memory", &custom__pin_memory);
-  m.impl("is_pinned", &custom_is_pinned);
   m.impl("resize_", &custom_resize_);
   m.impl("as_strided", at::native::as_strided_tensorimpl);
   m.impl("quantize_per_tensor", at::native::quantize_per_tensor);
@@ -612,6 +578,9 @@ void set_custom_device_index(c10::DeviceIndex device_index) {
   custom_device_index = device_index;
 }
 
+// a global flag used by the dummy pin_memory of the custom device
+bool custom_pinned_flag = false;
+
 struct FooHooksArgs : public at::PrivateUse1HooksArgs {};
 
 struct FooHooksInterface : public at::PrivateUse1HooksInterface {
@@ -621,6 +590,16 @@ struct FooHooksInterface : public at::PrivateUse1HooksInterface {
     static auto device_gen = make_generator_privateuse1(device_index);
     return device_gen;
   }
+  // This is a simple implementation: custom_pinned_flag is set to true
+  // once tensor.pin_memory() is called, and tensor.is_pinned() then
+  // always returns true no matter which tensor it is called on.
+  bool isPinnedPtr(const void* data) const override {
+    return custom_pinned_flag;
+  }
+  c10::Allocator* getPinnedMemoryAllocator() const override {
+    custom_pinned_flag = true;
+    return c10::GetCPUAllocator();
+  }
 };
 
 TORCH_DECLARE_REGISTRY(PrivateUse1HooksRegistry, FooHooksInterface, FooHooksArgs);
diff --git a/test/test_cpp_extensions_open_device_registration.py b/test/test_cpp_extensions_open_device_registration.py
index 23cd281ba56..4c9d36af1f0 100644
--- a/test/test_cpp_extensions_open_device_registration.py
+++ b/test/test_cpp_extensions_open_device_registration.py
@@ -343,71 +343,24 @@ class TestCppExtensionOpenRgistration(common.TestCase):
         cpu_tensor_pin = cpu_tensor.pin_memory("foo")
         self.assertTrue(cpu_tensor_pin.is_pinned("foo"))
 
-        # Test storage pin_memory on custom device string
+        # Test storage pin_memory and is_pinned
         cpu_storage = cpu_tensor.storage()
-        foo_device = torch.device("foo")
-        self.assertFalse(cpu_storage.is_pinned("foo"))
+        # We implement a dummy pin_memory for the custom device with no
+        # practical significance: once tensor.pin_memory() has been called,
+        # tensor.is_pinned() will always return True no matter which
+        # tensor it is called on.
+        self.assertTrue(cpu_storage.is_pinned("foo"))
 
-        cpu_storage_pin = cpu_storage.pin_memory("foo")
-        self.assertFalse(cpu_storage.is_pinned())
-        self.assertFalse(cpu_storage.is_pinned("foo"))
-        self.assertFalse(cpu_storage.is_pinned(foo_device))
-        self.assertFalse(cpu_storage_pin.is_pinned())
-        self.assertTrue(cpu_storage_pin.is_pinned("foo"))
-        self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
-
-        cpu_storage_pin_already = cpu_storage_pin.pin_memory("foo")
-        self.assertTrue(cpu_storage_pin.is_pinned("foo"))
-        self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
-        self.assertTrue(cpu_storage_pin_already.is_pinned("foo"))
-        self.assertTrue(cpu_storage_pin_already.is_pinned(foo_device))
-        self.assertFalse(cpu_storage.is_pinned("foo"))
-
-        cpu_storage_pinned = cpu_storage.pin_memory(foo_device)
-        self.assertFalse(cpu_storage.is_pinned())
-        self.assertFalse(cpu_storage.is_pinned("foo"))
-        self.assertFalse(cpu_storage.is_pinned(foo_device))
-        self.assertFalse(cpu_storage_pinned.is_pinned())
+        cpu_storage_pinned = cpu_storage.pin_memory("foo")
         self.assertTrue(cpu_storage_pinned.is_pinned("foo"))
-        self.assertTrue(cpu_storage_pinned.is_pinned(foo_device))
 
         # Test untyped storage pin_memory and is_pin
         cpu_tensor = torch.randn([3, 2, 1, 4])
         cpu_untyped_storage = cpu_tensor.untyped_storage()
-        self.assertFalse(cpu_untyped_storage.is_pinned())
-        self.assertFalse(cpu_untyped_storage.is_pinned("foo"))
+        self.assertTrue(cpu_untyped_storage.is_pinned("foo"))
         cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory("foo")
-        self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
         self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
-        self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
-
-        cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory(foo_device)
-        self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
-        self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
-        self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
-
-        with self.assertRaisesRegex(TypeError, "positional arguments but 3 were given"):
-            cpu_untyped_storage_pinned.is_pinned("foo1", "foo2")
-
-        # Test storage pin_memory on error device
-        self.assertFalse(cpu_storage_pinned.is_pinned("hpu"))
-        self.assertFalse(cpu_untyped_storage_pinned.is_pinned("hpu"))
-        invalid_device = torch.device("hpu")
-        self.assertFalse(cpu_untyped_storage_pinned.is_pinned(invalid_device))
-
-        with self.assertRaisesRegex(
-            NotImplementedError, "with arguments from the 'HPU' backend"
-        ):
-            cpu_storage.pin_memory("hpu")
-        with self.assertRaisesRegex(
-            NotImplementedError, "with arguments from the 'HPU' backend"
-        ):
-            cpu_untyped_storage.pin_memory("hpu")
-        with self.assertRaisesRegex(
-            NotImplementedError, "with arguments from the 'HPU' backend"
-        ):
-            cpu_untyped_storage.pin_memory(invalid_device)
 
     @unittest.skip(
         "Temporarily disable due to the tiny differences between clang++ and g++ in defining static variable in inline function"
diff --git a/test/test_cpp_extensions_stream_and_event.py b/test/test_cpp_extensions_stream_and_event.py
index 9f2290fea5a..c26e8b2b1a8 100644
--- a/test/test_cpp_extensions_stream_and_event.py
+++ b/test/test_cpp_extensions_stream_and_event.py
@@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import (
     IS_LINUX,
     skipIfTorchDynamo,
     TEST_CUDA,
+    TEST_MPS,
     TEST_PRIVATEUSE1,
     TEST_XPU,
 )
@@ -37,7 +38,13 @@ def remove_build_path():
 # Since we use a fake MTIA device backend to test generic Stream/Event, device backends are mutual exclusive to each other.
 # The test will be skipped if any of the following conditions are met:
 @unittest.skipIf(
-    IS_ARM64 or not IS_LINUX or TEST_CUDA or TEST_XPU or TEST_PRIVATEUSE1 or TEST_ROCM,
+    IS_ARM64
+    or not IS_LINUX
+    or TEST_CUDA
+    or TEST_XPU
+    or TEST_MPS
+    or TEST_PRIVATEUSE1
+    or TEST_ROCM,
     "Only on linux platform and mutual exclusive to other backends",
 )
 @torch.testing._internal.common_utils.markDynamoStrictTest
diff --git a/test/test_torch.py b/test/test_torch.py
index 6cb28f12307..3e782eabe92 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -8432,9 +8432,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j,  ..., 1.+1.j, 1.+1.j, 1.+1.j],
     def test_pin_memory(self):
         x = torch.randn(3, 5)
         self.assertFalse(x.is_pinned())
-        if not torch.cuda.is_available():
-            self.assertRaises(RuntimeError, lambda: x.pin_memory())
-        else:
+        if torch.cuda.is_available():
             pinned = x.pin_memory()
             self.assertTrue(pinned.is_pinned())
             self.assertEqual(pinned, x)
diff --git a/torch/csrc/Stream.cpp b/torch/csrc/Stream.cpp
index 179f4f1390a..cff0dce5194 100644
--- a/torch/csrc/Stream.cpp
+++ b/torch/csrc/Stream.cpp
@@ -29,7 +29,7 @@ static PyObject* THPStream_pynew(
   int64_t priority = 0;
 
   static torch::PythonArgParser parser({
-      "Steram(Device device=None, *, int64_t priority=0)",
+      "Stream(Device device=None, *, int64_t priority=0)",
       "Stream(int64_t stream_id, int64_t device_index, int64_t device_type, *, int64_t priority=0)",
   });
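
The net user-visible effect of this patch: Tensor.pin_memory() and Tensor.is_pinned() no longer hard-code CUDA, and instead route through the AcceleratorHooksInterface of whichever accelerator at::getAccelerator() detects. A minimal sketch of the expected Python-level behavior, assuming a build where CUDA is the detected accelerator:

    import torch

    x = torch.randn(3, 5)              # dense CPU tensor
    assert not x.is_pinned()           # pageable CPU memory is not pinned

    if torch.cuda.is_available():      # any detected accelerator behaves the same way
        pinned = x.pin_memory()        # allocated via the hooks' pinned-memory allocator
        assert pinned.is_pinned()      # checked through the same hooks, no CUDA-specific path
        assert torch.equal(pinned, x)  # a fresh, non-aliasing copy of the data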
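
The device argument of both methods is deprecated rather than removed: passing it still works but now goes through TORCH_WARN_DEPRECATION (see the Memory.cpp hunks above), which should surface as a Python warning. A sketch of that behavior, under the same CUDA-build assumption:

    import warnings

    import torch

    x = torch.randn(3, 5)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        x.is_pinned(torch.device("cuda"))  # deprecated; prefer x.is_pinned()
    assert any("deprecated" in str(w.message) for w in caught)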