Re-implement pin_memory to be device-agnostic by leveraging the Accelerator concept (#126376)

This PR re-implements pin memory with the goal of removing the optional `device` argument and making all related APIs device-agnostic. We add two new abstract APIs to [AcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/detail/AcceleratorHooksInterface.h#L12) and redefine pin memory as: "pinned memory is always pinned for the current accelerator device". Concretely, pin_memory/is_pinned now use [getAcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/Context.h#L61) to determine the appropriate device and invoke the backend's overridden hooks, instead of going through BackendSelect and then dispatching to CUDA or another backend's specific implementation.
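
For illustration, a minimal C++ sketch of the resulting user-visible flow (the `main` scaffolding is invented for this example; the Context/accelerator APIs it references are the ones introduced in the diff below):

```cpp
#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/DeviceAccelerator.h>

int main() {
  at::Tensor t = at::randn({3, 5});  // a dense CPU tensor

  // is_pinned() no longer needs a device hint: the current accelerator
  // (CUDA, XPU, MPS, ...) is detected via at::getAccelerator() and its
  // hooks answer the query through Context::isPinnedPtr().
  bool pinned_before = t.is_pinned();  // false for a freshly allocated tensor

  if (at::getAccelerator(/*checked=*/false).has_value()) {
    // pin_memory() obtains the pinned allocator from the accelerator's hooks
    // via Context::getPinnedMemoryAllocator() instead of BackendSelect.
    at::Tensor pinned = t.pin_memory();
    bool pinned_after = pinned.is_pinned();  // true
    (void)pinned_after;
  }
  (void)pinned_before;
  return 0;
}
```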

Note: a new backend that wants to implement and use pin memory only needs to inherit from AcceleratorHooksInterface and override the `isPinnedPtr` and `getPinnedMemoryAllocator` methods.
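
A minimal sketch of such a hook, assuming a hypothetical `MyBackendHooks` class with toy pointer bookkeeping (both invented for illustration); an out-of-tree device would additionally need to make the hooks reachable from `Context::getAcceleratorHooksInterface`, typically via the PrivateUse1 registration shown further down in the diff:

```cpp
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <c10/core/Allocator.h>
#include <c10/core/CPUAllocator.h>
#include <unordered_set>

// Toy bookkeeping so isPinnedPtr() has something to check; a real backend
// would ask its driver or pinned-memory allocator instead.
static std::unordered_set<const void*> g_my_backend_pinned_ptrs;

struct MyBackendHooks : public at::AcceleratorHooksInterface {
  // Required by AcceleratorHooksInterface; trivially true for this sketch.
  bool hasPrimaryContext(c10::DeviceIndex /*device_index*/) const override {
    return true;
  }

  // Report whether a host pointer came from this backend's pinned allocator.
  bool isPinnedPtr(const void* data) const override {
    return g_my_backend_pinned_ptrs.count(data) != 0;
  }

  // Return the allocator whose host memory the device can access directly.
  c10::Allocator* getPinnedMemoryAllocator() const override {
    // A real backend returns its page-locked host allocator; the plain CPU
    // allocator is only a stand-in so the sketch stays self-contained.
    return c10::GetCPUAllocator();
  }
};
```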

Additional context: to avoid breaking backward compatibility, this PR keeps the `device` argument of the related APIs and emits a deprecation warning whenever it is passed. A follow-up PR will update all PyTorch callers (`Tensor.is_pinned()`, `Tensor.pin_memory()`, ...) to stop passing this argument; after that, the `device` argument will be removed entirely.
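
For illustration, a small sketch of the transitional behavior (the function is hypothetical; the warning itself comes from the `TORCH_WARN_DEPRECATION` calls added in the diff below):

```cpp
#include <ATen/ATen.h>

void demo_deprecated_device_arg() {
  at::Tensor t = at::randn({2, 3});

  // Old call sites that still pass a device keep working, but now emit a
  // deprecation warning from the native is_pinned()/pin_memory() kernels.
  bool still_works = t.is_pinned(at::Device(at::kCUDA));

  // Recommended device-agnostic form going forward:
  bool preferred = t.is_pinned();

  (void)still_works;
  (void)preferred;
}
```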

Relates to #124908
Relates to #14560
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126376
Approved by: https://github.com/albanD
Authored by wizzniu on 2024-07-23 01:44:15 +00:00; committed by PyTorch MergeBot
parent 074b420641
commit 8963623494
25 changed files with 210 additions and 203 deletions

View file

@ -73,6 +73,8 @@ class TORCH_API Context {
return at::detail::getPrivateUse1Hooks();
} else if (device_type == at::kMTIA) {
return at::detail::getMTIAHooks();
} else if (device_type == at::kHIP) {
return at::detail::getHIPHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
@ -94,8 +96,22 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
static bool isPinnedPtr(const void* data) {
return detail::getCUDAHooks().isPinnedPtr(data);
bool isPinnedPtr(
const void* data,
std::optional<c10::DeviceType> device_type = std::nullopt) {
auto opt_device_type =
device_type.has_value() ? device_type.value() : at::getAccelerator();
if (!opt_device_type.has_value() || // there is no accelerator
!at::isAccelerator(
opt_device_type.value())) { // passed device not an accelerator
return false;
}
return getAcceleratorHooksInterface(opt_device_type.value())
.isPinnedPtr(data);
}
Allocator* getPinnedMemoryAllocator(
std::optional<c10::DeviceType> device_type = std::nullopt) {
return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
}
static bool hasOpenMP();
static bool hasMKL();

View file

@ -2,7 +2,7 @@
#include <ATen/DeviceAccelerator.h>
namespace at {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
std::optional<c10::DeviceType> getAccelerator(bool checked) {
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
if (at::has##device_name()) { \
device_type = k##device_name; \
@ -20,11 +20,13 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
// first.
return kPrivateUse1;
}
std::optional<DeviceType> device_type = std::nullopt;
std::optional<c10::DeviceType> device_type = std::nullopt;
bool is_accelerator_detected = false;
DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
DETECT_AND_ASSIGN_ACCELERATOR(XPU)
DETECT_AND_ASSIGN_ACCELERATOR(HIP)
DETECT_AND_ASSIGN_ACCELERATOR(MPS)
if (checked) {
TORCH_CHECK(
device_type, "Cannot access accelerator device when none is available.")
@ -34,4 +36,18 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#undef DETECT_AND_ASSIGN_ACCELERATOR
}
bool isAccelerator(c10::DeviceType d) {
switch (d) {
case at::kCUDA:
case at::kMTIA:
case at::kXPU:
case at::kHIP:
case at::kMPS:
case at::kPrivateUse1:
return true;
default:
return false;
}
}
} // namespace at

View file

@ -13,9 +13,7 @@
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, PrivateUse1
// We want to add once all the proper APIs are supported and tested:
// HIP, MPS
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
namespace at {
@ -24,4 +22,6 @@ namespace at {
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
TORCH_API bool isAccelerator(c10::DeviceType d);
} // namespace at

View file

@ -1,32 +0,0 @@
#include <ATen/cuda/PinnedMemoryAllocator.h>
#include <ATen/Context.h>
#include <ATen/Config.h>
#include <ATen/TensorUtils.h>
#include <c10/core/Storage.h>
#include <ATen/ATen.h>
#include <ATen/CPUFunctions.h>
namespace at::native {
bool is_pinned_cuda(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
// TODO: unhook this
return detail::getCUDAHooks().isPinnedPtr(self.storage().data());
}
Tensor _pin_memory_cuda(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda());
auto* allocator = at::cuda::getPinnedMemoryAllocator();
auto storage = Storage(
Storage::use_byte_size_t(),
detail::computeStorageNbytes(
self.sizes(), self.strides(), self.dtype().itemsize()),
allocator,
/*resizable=*/false);
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
tensor.copy_(self);
return tensor;
}
} // namespace at::native

View file

@ -2,6 +2,7 @@
#include <c10/core/Device.h>
#include <c10/core/Stream.h>
#include <c10/core/Allocator.h>
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
namespace at {
@ -40,6 +41,15 @@ struct TORCH_API AcceleratorHooksInterface {
TORCH_CHECK(false, "Backend doesn't support maybeExchangeDevice()");
return -1;
}
virtual bool isPinnedPtr(const void* data) const {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const {
TORCH_CHECK(false, "Backend doesn't support getPinnedMemoryAllocator()");
return nullptr;
}
};
} // namespace at

View file

@ -77,7 +77,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP);
}
virtual bool isPinnedPtr(const void* /*data*/) const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
@ -121,7 +121,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return -1;
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
}

View file

@ -6,6 +6,8 @@
#include <c10/util/Registry.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <memory>
namespace at {
@ -19,7 +21,7 @@ namespace at {
// which we may want to call into from CPU code (and thus must be dynamically
// dispatched, to allow for separate compilation of HIP code). See
// CUDAHooksInterface for more detailed motivation.
struct TORCH_API HIPHooksInterface {
struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
// This should never actually be implemented, but it is used to
// squelch -Werror=non-virtual-dtor
virtual ~HIPHooksInterface() = default;
@ -41,7 +43,11 @@ struct TORCH_API HIPHooksInterface {
return -1;
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
AT_ERROR("Pinned memory requires HIP.");
}
@ -52,6 +58,10 @@ struct TORCH_API HIPHooksInterface {
virtual int getNumGPUs() const {
return 0;
}
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
AT_ERROR("Cannot check primary context without ATen_hip library.");
}
};
// NB: dummy argument to suppress "ISO C++11 requires at least one argument

View file

@ -94,6 +94,12 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
bool hasPrimaryContext(DeviceIndex device_index) const override {
FAIL_MPSHOOKS_FUNC(__func__);
}
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
FAIL_MPSHOOKS_FUNC(__func__);
}
#undef FAIL_MPSHOOKS_FUNC
};

View file

@ -6,6 +6,8 @@
#include <c10/core/Stream.h>
#include <c10/util/Registry.h>
#include <c10/core/Allocator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <string>
@ -88,6 +90,15 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
virtual void setCurrentStream(const c10::Stream& stream) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
FAIL_MTIAHOOKS_FUNC(__func__);
return nullptr;
}
};
struct TORCH_API MTIAHooksArgs {};

View file

@ -24,7 +24,11 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(
false,
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");

View file

@ -58,15 +58,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library.");
}
virtual Allocator* getPinnedMemoryAllocator() const {
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(false, "Cannot get XPU pinned memory allocator without ATen_xpu library.");
}
virtual bool isPinnedPtr(const void* /*data*/) const {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}
virtual bool hasPrimaryContext(DeviceIndex /*device_index*/) const override{
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
TORCH_CHECK(false, "Cannot query primary context without ATen_xpu library.");
}
};

View file

@ -3,8 +3,6 @@
#include <ATen/CPUFunctions.h>
#include <ATen/EmptyTensor.h>
#include <ATen/mps/MPSAllocator.h>
#include <ATen/ops/_pin_memory_native.h>
#include <ATen/ops/is_pinned_native.h>
#include <c10/core/Allocator.h>
#include <c10/core/Storage.h>
@ -860,31 +858,12 @@ IMPSAllocator* getIMPSAllocator(bool sharedAllocator) {
return nullptr;
}
} // namespace at::mps
namespace at::native {
// torch.is_pinned() implementation
// Pinned memory will be helpful on Apple Silicon Macs with Unified memory as we
// will be able to use SharedStorageMode for MTLBuffer allocations. This will
// avoid extra copies on DataLoading operations.
bool is_pinned_mps(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
return at::mps::_getSharedAllocator().isSharedBuffer(self.storage().data());
bool isMPSPinnedPtr(const void* data) {
return at::mps::_getSharedAllocator().isSharedBuffer(data);
}
// torch.pin_memory() implementation
Tensor _pin_memory_mps(const Tensor& self, std::optional<Device> device) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
auto* shared_allocator = at::mps::getIMPSAllocator(true);
TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device");
const size_t storage_size = at::detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize());
std::cerr << "Pinning memory of size " << storage_size / 1024UL << " KB\n";
auto storage = Storage(Storage::use_byte_size_t(), storage_size, shared_allocator, false);
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
tensor.copy_(self);
return tensor;
}
} // namespace at::native
} // namespace at::mps

View file

@ -59,4 +59,6 @@ C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
bool isMPSPinnedPtr(const void* data);
} // namespace at::mps

View file

@ -68,6 +68,11 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
}
Stream getNewStream(Device, int priority = 0) const override {
(void)priority;
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
}
Stream getDefaultStream(Device d) const override {
return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
}

View file

@ -34,6 +34,8 @@ struct MPSHooks : public at::MPSHooksInterface {
size_t getDriverAllocatedMemory() const override;
size_t getRecommendedMaxMemory() const override;
void setMemoryFraction(double ratio) const override;
bool isPinnedPtr(const void* data) const override;
Allocator* getPinnedMemoryAllocator() const override;
// MPSProfiler interface
void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override;

View file

@ -124,6 +124,14 @@ double MPSHooks::elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event
return at::mps::getMPSEventPool()->elapsedTime(start_event_id, end_event_id);
}
bool MPSHooks::isPinnedPtr(const void* data) const {
return at::mps::isMPSPinnedPtr(data);
}
Allocator* MPSHooks::getPinnedMemoryAllocator() const {
return at::mps::getIMPSAllocator(true);
}
using at::MPSHooksRegistry;
using at::RegistererMPSHooksRegistry;

View file

@ -1,15 +1,21 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/Context.h>
#include <c10/core/Storage.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#include <ATen/CPUFunctions.h>
#else
#include <ATen/ops/_debug_has_internal_overlap_native.h>
#include <ATen/ops/_pin_memory.h>
#include <ATen/ops/is_pinned_native.h>
#include <ATen/ops/pin_memory_native.h>
#include <ATen/ops/_pin_memory_native.h>
#include <ATen/ops/empty_cpu_dispatch.h>
#endif
namespace at::native {
@ -19,15 +25,28 @@ int64_t _debug_has_internal_overlap(const Tensor& self) {
return static_cast<int64_t>(at::has_internal_overlap(self));
}
// Technically, we could force backends to explicitly say "no, we don't support
// pinned memory, always return false", but this makes life a little easier when
// you haven't loaded the backend extension at all (which can happen, e.g., on a
// CPU build of PyTorch and you try to check if something is CUDA pinned)
bool is_pinned_default(const Tensor& self, std::optional<Device> device) {
return false;
bool is_pinned(const Tensor& self, std::optional<c10::Device> device) {
std::optional<c10::DeviceType> opt_device_type;
if (device.has_value()) {
TORCH_WARN_DEPRECATION(
"The argument 'device' of Tensor.is_pinned() ",
"is deprecated. Please do not pass this argument.")
opt_device_type = device.value().type();
}
// Only CPU tensors can be pinned
if (!self.is_cpu()) {
return false;
}
// Use getAcceleratorHooksInterface to make is_pinned device-agnostic
return at::globalContext().isPinnedPtr(self.storage().data(), opt_device_type);
}
Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
Tensor pin_memory(const Tensor& self, std::optional<c10::Device> device) {
if (device.has_value()) {
TORCH_WARN_DEPRECATION(
"The argument 'device' of Tensor.pin_memory() ",
"is deprecated. Please do not pass this argument.")
}
// Kind of mad that I have to do two dynamic dispatches here, pretty
// annoying
if (self.is_pinned(device)) {
@ -36,4 +55,21 @@ Tensor pin_memory(const Tensor& self, std::optional<Device> device) {
return at::_pin_memory(self, device);
}
Tensor _pin_memory(const Tensor& self, std::optional<c10::Device> device) {
TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
// Use getAcceleratorHooksInterface to make pin_memory device-agnostic
auto* allocator = device.has_value()?
at::globalContext().getPinnedMemoryAllocator(device.value().type()):
at::globalContext().getPinnedMemoryAllocator();
auto storage = Storage(
Storage::use_byte_size_t(),
detail::computeStorageNbytes(
self.sizes(), self.strides(), self.dtype().itemsize()),
allocator,
/*resizable=*/false);
auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides());
tensor.copy_(self);
return tensor;
}
} // namespace at::native

View file

@ -4545,9 +4545,10 @@
- func: is_pinned(Tensor self, Device? device=None) -> bool
variants: method
dispatch:
NestedTensorCUDA, CUDA: is_pinned_cuda
MPS: is_pinned_mps
CompositeExplicitAutograd: is_pinned_default
# the NestedTensor keys are necessary because NestedTensor has been removed
# from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
CompositeExplicitAutograd, NestedTensorCPU: is_pinned
SparseCPU, SparseCsrCPU: is_pinned_sparse
# TODO: add a copy kwarg that guarantees that the tensor is put into fresh
# pinned memory
@ -4557,9 +4558,9 @@
# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor
- func: _pin_memory(Tensor self, Device? device=None) -> Tensor
dispatch:
CUDA: _pin_memory_cuda
MPS: _pin_memory_mps
NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested
NestedTensorCPU: _pin_memory_nested
SparseCPU, SparseCsrCPU: _pin_memory_sparse
CompositeExplicitAutograd: _pin_memory
autogen: _pin_memory.out
- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor

View file

@ -71,6 +71,8 @@
#include <ATen/ops/threshold_backward_native.h>
#include <ATen/ops/trunc.h>
#include <ATen/ops/trunc_native.h>
#include <ATen/ops/is_pinned_native.h>
#include <ATen/ops/_pin_memory_native.h>
#endif
namespace at::native {
@ -280,4 +282,23 @@ Tensor& nan_to_num_sparse_(
return nan_to_num_sparse_out(self, nan, posinf, neginf, self);
}
bool is_pinned_sparse(const Tensor& self, std::optional<c10::Device> device) {
if (device.has_value()) {
TORCH_WARN_DEPRECATION(
"The argument 'device' of Tensor.is_pinned() ",
"is deprecated. Please do not pass this argument.")
}
// Currently, we don't support pin memory for sparse tensor.
// so always return false
return false;
}
Tensor _pin_memory_sparse(const Tensor& self, std::optional<c10::Device> device) {
// Here, we throw an error rather than return self tensor. This
// is because we always return the pinned memory tensor, while
// giving unpinned tensor might mislead users.
TORCH_CHECK_NOT_IMPLEMENTED(
false, "'aten::_pin_memory' is not implemented for sparse tensor.");
}
} // namespace at::native

View file

@ -11,8 +11,6 @@
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
#include <ATen/ops/is_pinned_ops.h>
#include <ATen/ops/_pin_memory_ops.h>
${ops_headers}
#endif
@ -23,31 +21,8 @@ namespace {
${backend_select_method_definitions}
bool is_pinned(const Tensor& self, std::optional<at::Device> device) {
// Only CPU tensors can be pinned
if (!self.is_cpu()) {
return false;
}
// TODO: fetch scalar type from Tensor? But it doesn't really matter...
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
return at::_ops::is_pinned::redispatch(_dk, self, device);
}
at::Tensor _pin_memory(const Tensor& self, std::optional<at::Device> device) {
TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned");
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(std::nullopt, self.layout(), device.value_or(at::kCUDA)));
if (self.is_nested()) {
constexpr auto nested_key_set = c10::DispatchKeySet(
{c10::DispatchKey::NestedTensor, c10::DispatchKey::AutogradNestedTensor});
_dk = _dk.add(self.key_set() & nested_key_set);
}
return at::_ops::_pin_memory::redispatch(_dk, self, device);
}
TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
${backend_select_function_registrations};
m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned));
m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory));
}
} // namespace

View file

@ -418,38 +418,6 @@ at::Tensor& custom_set_source_Storage_storage_offset(at::Tensor& result,
return result;
}
// basic dummy functions related to pin_memory.
std::vector<void*> custom_pinned_data_ptr;
at::Tensor custom__pin_memory(const at::Tensor& self, std::optional<at::Device> device) {
TORCH_CHECK(
self.device().is_cpu(),
"cannot pin '",
self.toString(),
"' only dense CPU tensors can be pinned");
// record pinned data ptr
at::Tensor dump_pinned_tensor = self * 1.0;
custom_pinned_data_ptr.push_back(dump_pinned_tensor.storage().data_ptr().get());
return dump_pinned_tensor;
}
bool custom_is_pinned(const at::Tensor& self, std::optional<at::Device> device) {
// Only CPU tensors can be pinned
if (!self.is_cpu()) {
return false;
}
void* query_pinned_ptr = self.storage().data_ptr().get();
for (const auto& iter_ptr : custom_pinned_data_ptr) {
if (iter_ptr == query_pinned_ptr) {
return true;
}
}
return false;
}
const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size,
std::optional<at::MemoryFormat> optional_memory_format) {
at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl();
@ -545,8 +513,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
m.impl("empty_strided", &custom_empty_strided);
m.impl("set_.source_Storage", &custom_set_source_Storage);
m.impl("set_.source_Storage_storage_offset",&custom_set_source_Storage_storage_offset);
m.impl("_pin_memory", &custom__pin_memory);
m.impl("is_pinned", &custom_is_pinned);
m.impl("resize_", &custom_resize_);
m.impl("as_strided", at::native::as_strided_tensorimpl);
m.impl("quantize_per_tensor", at::native::quantize_per_tensor);
@ -612,6 +578,9 @@ void set_custom_device_index(c10::DeviceIndex device_index) {
custom_device_index = device_index;
}
// a global flag used for dummy pin_memory of custom device
bool custom_pinned_flag = false;
struct FooHooksArgs : public at::PrivateUse1HooksArgs {};
struct FooHooksInterface : public at::PrivateUse1HooksInterface {
@ -621,6 +590,16 @@ struct FooHooksInterface : public at::PrivateUse1HooksInterface {
static auto device_gen = make_generator_privateuse1(device_index);
return device_gen;
}
// this is a simple implementation, custom_pinned_flag will be set as true
// once tensor.pin_memory() is called. And then tensor.is_pinned()
// always return true no matter what tensor it's called on.
bool isPinnedPtr(const void* data) const override {
return custom_pinned_flag;
}
c10::Allocator* getPinnedMemoryAllocator() const override {
custom_pinned_flag = true;
return c10::GetCPUAllocator();
}
};
TORCH_DECLARE_REGISTRY(PrivateUse1HooksRegistry, FooHooksInterface, FooHooksArgs);

View file

@ -343,71 +343,24 @@ class TestCppExtensionOpenRgistration(common.TestCase):
cpu_tensor_pin = cpu_tensor.pin_memory("foo")
self.assertTrue(cpu_tensor_pin.is_pinned("foo"))
# Test storage pin_memory on custom device string
# Test storage pin_memory and is_pin
cpu_storage = cpu_tensor.storage()
foo_device = torch.device("foo")
self.assertFalse(cpu_storage.is_pinned("foo"))
# We implement a dummy pin_memory of no practical significance
# for custom device. Once tensor.pin_memory() has been called,
# then tensor.is_pinned() will always return true no matter
# what tensor it's called on.
self.assertTrue(cpu_storage.is_pinned("foo"))
cpu_storage_pin = cpu_storage.pin_memory("foo")
self.assertFalse(cpu_storage.is_pinned())
self.assertFalse(cpu_storage.is_pinned("foo"))
self.assertFalse(cpu_storage.is_pinned(foo_device))
self.assertFalse(cpu_storage_pin.is_pinned())
self.assertTrue(cpu_storage_pin.is_pinned("foo"))
self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
cpu_storage_pin_already = cpu_storage_pin.pin_memory("foo")
self.assertTrue(cpu_storage_pin.is_pinned("foo"))
self.assertTrue(cpu_storage_pin.is_pinned(foo_device))
self.assertTrue(cpu_storage_pin_already.is_pinned("foo"))
self.assertTrue(cpu_storage_pin_already.is_pinned(foo_device))
self.assertFalse(cpu_storage.is_pinned("foo"))
cpu_storage_pinned = cpu_storage.pin_memory(foo_device)
self.assertFalse(cpu_storage.is_pinned())
self.assertFalse(cpu_storage.is_pinned("foo"))
self.assertFalse(cpu_storage.is_pinned(foo_device))
self.assertFalse(cpu_storage_pinned.is_pinned())
cpu_storage_pinned = cpu_storage.pin_memory("foo")
self.assertTrue(cpu_storage_pinned.is_pinned("foo"))
self.assertTrue(cpu_storage_pinned.is_pinned(foo_device))
# Test untyped storage pin_memory and is_pin
cpu_tensor = torch.randn([3, 2, 1, 4])
cpu_untyped_storage = cpu_tensor.untyped_storage()
self.assertFalse(cpu_untyped_storage.is_pinned())
self.assertFalse(cpu_untyped_storage.is_pinned("foo"))
self.assertTrue(cpu_untyped_storage.is_pinned("foo"))
cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory("foo")
self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
cpu_untyped_storage_pinned = cpu_untyped_storage.pin_memory(foo_device)
self.assertFalse(cpu_untyped_storage_pinned.is_pinned())
self.assertTrue(cpu_untyped_storage_pinned.is_pinned("foo"))
self.assertTrue(cpu_untyped_storage_pinned.is_pinned(foo_device))
with self.assertRaisesRegex(TypeError, "positional arguments but 3 were given"):
cpu_untyped_storage_pinned.is_pinned("foo1", "foo2")
# Test storage pin_memory on error device
self.assertFalse(cpu_storage_pinned.is_pinned("hpu"))
self.assertFalse(cpu_untyped_storage_pinned.is_pinned("hpu"))
invalid_device = torch.device("hpu")
self.assertFalse(cpu_untyped_storage_pinned.is_pinned(invalid_device))
with self.assertRaisesRegex(
NotImplementedError, "with arguments from the 'HPU' backend"
):
cpu_storage.pin_memory("hpu")
with self.assertRaisesRegex(
NotImplementedError, "with arguments from the 'HPU' backend"
):
cpu_untyped_storage.pin_memory("hpu")
with self.assertRaisesRegex(
NotImplementedError, "with arguments from the 'HPU' backend"
):
cpu_untyped_storage.pin_memory(invalid_device)
@unittest.skip(
"Temporarily disable due to the tiny differences between clang++ and g++ in defining static variable in inline function"

View file

@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import (
IS_LINUX,
skipIfTorchDynamo,
TEST_CUDA,
TEST_MPS,
TEST_PRIVATEUSE1,
TEST_XPU,
)
@ -37,7 +38,13 @@ def remove_build_path():
# Since we use a fake MTIA device backend to test generic Stream/Event, device backends are mutual exclusive to each other.
# The test will be skipped if any of the following conditions are met:
@unittest.skipIf(
IS_ARM64 or not IS_LINUX or TEST_CUDA or TEST_XPU or TEST_PRIVATEUSE1 or TEST_ROCM,
IS_ARM64
or not IS_LINUX
or TEST_CUDA
or TEST_XPU
or TEST_MPS
or TEST_PRIVATEUSE1
or TEST_ROCM,
"Only on linux platform and mutual exclusive to other backends",
)
@torch.testing._internal.common_utils.markDynamoStrictTest

View file

@ -8432,9 +8432,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j],
def test_pin_memory(self):
x = torch.randn(3, 5)
self.assertFalse(x.is_pinned())
if not torch.cuda.is_available():
self.assertRaises(RuntimeError, lambda: x.pin_memory())
else:
if torch.cuda.is_available():
pinned = x.pin_memory()
self.assertTrue(pinned.is_pinned())
self.assertEqual(pinned, x)

View file

@ -29,7 +29,7 @@ static PyObject* THPStream_pynew(
int64_t priority = 0;
static torch::PythonArgParser parser({
"Steram(Device device=None, *, int64_t priority=0)",
"Stream(Device device=None, *, int64_t priority=0)",
"Stream(int64_t stream_id, int64_t device_index, int64_t device_type, *, int64_t priority=0)",
});