From b4a35632f9cf5bb76fb7576d6f3e0052c124fcf5 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 10 Jan 2024 15:34:16 +0000 Subject: [PATCH] Add function to materialize COW storages (#117053) Summary: From Kurt Mohler, see https://github.com/pytorch/pytorch/pull/113396 (manually imported due to ghimport problems) Test Plan: sandcastle, OSS CI Differential Revision: D52610522 Pull Request resolved: https://github.com/pytorch/pytorch/pull/117053 Approved by: https://github.com/malfet, https://github.com/kurtamohler --- aten/src/ATen/EmptyTensor.cpp | 1 + aten/src/ATen/cuda/CachingHostAllocator.cpp | 8 ++ .../hip/impl/HIPAllocatorMasqueradingAsCUDA.h | 3 + aten/src/ATen/mps/MPSAllocator.mm | 4 + aten/src/ATen/native/Resize.cpp | 7 +- aten/src/ATen/native/TensorFactories.h | 1 + aten/src/ATen/test/CMakeLists.txt | 2 + aten/src/ATen/test/allocator_clone_test.h | 35 ++++++++ aten/src/ATen/test/cpu_allocator_test.cpp | 10 +++ aten/src/ATen/test/cuda_allocator_test.cpp | 10 +++ aten/src/ATen/test/xla_tensor_test.cpp | 13 +++ c10/CMakeLists.txt | 2 - c10/core/Allocator.cpp | 17 ++++ c10/core/Allocator.h | 31 +++++++ c10/core/CPUAllocator.cpp | 14 ++++ c10/core/StorageImpl.h | 30 ++++++- c10/core/build.bzl | 18 ---- c10/core/impl/{cow => }/COW.cpp | 65 ++++++++++----- c10/core/impl/{cow => }/COW.h | 3 + c10/core/impl/{cow => }/COWDeleter.cpp | 2 +- c10/core/impl/{cow => }/COWDeleter.h | 0 .../impl/{cow/README.md => README-cow.md} | 0 c10/cuda/CUDACachingAllocator.cpp | 4 + c10/cuda/CUDAMallocAsyncAllocator.cpp | 4 + c10/test/build.bzl | 1 - c10/test/core/impl/cow_test.cpp | 82 ++++++++++++++++++- caffe2/core/context_gpu.cu | 8 ++ setup.py | 1 - .../open_registration_extension.cpp | 4 + .../extension_backends/extension_device.cpp | 4 + torch/csrc/cuda/CUDAPluggableAllocator.cpp | 8 ++ torch/csrc/cuda/CUDAPluggableAllocator.h | 1 + 32 files changed, 343 insertions(+), 50 deletions(-) create mode 100644 aten/src/ATen/test/allocator_clone_test.h create mode 100644 aten/src/ATen/test/cpu_allocator_test.cpp create mode 100644 aten/src/ATen/test/cuda_allocator_test.cpp rename c10/core/impl/{cow => }/COW.cpp (64%) rename c10/core/impl/{cow => }/COW.h (87%) rename c10/core/impl/{cow => }/COWDeleter.cpp (96%) rename c10/core/impl/{cow => }/COWDeleter.h (100%) rename c10/core/impl/{cow/README.md => README-cow.md} (100%) diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 459960bbf86..e2518dca8c5 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -311,6 +311,7 @@ struct MetaAllocator final : public at::Allocator { DeleterFnPtr raw_deleter() const override { return deleter; } + void copy_data(void* dest, const void* src, std::size_t count) const final {} }; static MetaAllocator g_meta_alloc; diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 22dbb661f18..3c63ed71c0a 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -301,6 +301,10 @@ class CUDAHostAllocator { } } + void copy_data(void* dest, const void* src, std::size_t count) const { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for CUDAHostAllocator"); + } + private: void process_events() { while (true) { @@ -496,6 +500,10 @@ struct CUDAHostAllocatorWrapper final : public at::Allocator { &CUDAHostAllocatorDeleter, at::DeviceType::CPU}; } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + getCUDAHostAllocator().copy_data(dest, src, count); + } }; static CUDAHostAllocatorWrapper cuda_host_allocator; diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index c5a607c51b3..e4c0cec950a 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -23,6 +23,9 @@ public: DeleterFnPtr raw_deleter() const override { return allocator_->raw_deleter(); } + void copy_data(void* dest, const void* src, std::size_t count) const final { + allocator_->copy_data(dest, src, count); + } }; }} // namespace c10::hip diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index 5b59cd5d1dd..ac9bf85f4cd 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -819,6 +819,10 @@ struct TORCH_API MPSAllocator final : public IMPSAllocator { return _getAllocImpl().format_size(size); } + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } + private: bool m_has_unified_memory; uint32_t m_usage; diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index 415d3d65bef..89f27935c19 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -94,13 +94,14 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { if (size_bytes != 0) { new_data = storage->allocator()->allocate(size_bytes); } - at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); + const at::DataPtr& old_data = storage->data_ptr(); const auto old_capacity = storage->nbytes(); - storage->set_nbytes(size_bytes); const auto copy_capacity = std::min(size_bytes, old_capacity); if (old_data != nullptr && copy_capacity > 0) { - memcpy(storage->mutable_data(), old_data.get(), copy_capacity); + memcpy(new_data.get(), old_data.get(), copy_capacity); } + storage->set_data_ptr_noswap(std::move(new_data)); + storage->set_nbytes(size_bytes); } // Call the sparse implementation in SparseTensor.cpp directly. diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 26b57397911..09bbde077e8 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -130,6 +130,7 @@ struct ZeroTensorAllocator final : public at::Allocator { DeleterFnPtr raw_deleter() const override { return deleter; } + void copy_data(void* dest, const void* src, std::size_t count) const final {} at::Device device_; }; diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index aa976778617..d81ba78f4c2 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -13,6 +13,7 @@ list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/atest.cpp ${CMAKE_CURRENT_SOURCE_DIR}/basic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/broadcast_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpu_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_rng_test.cpp @@ -54,6 +55,7 @@ list(APPEND ATen_CPU_TEST_SRCS ) list(APPEND ATen_CUDA_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_atomic_ops_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_caching_host_allocator_test.cpp diff --git a/aten/src/ATen/test/allocator_clone_test.h b/aten/src/ATen/test/allocator_clone_test.h new file mode 100644 index 00000000000..79a1f5f79c8 --- /dev/null +++ b/aten/src/ATen/test/allocator_clone_test.h @@ -0,0 +1,35 @@ +#pragma once +#include +#include + +void test_allocator_clone(c10::Allocator* allocator) { + ASSERT_TRUE(allocator != nullptr); + + c10::Storage a_storage(c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + 0, + allocator, + /*resizable=*/true)); + + c10::Storage b_storage(c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + 0, + allocator, + /*resizable=*/true)); + + at::Tensor a = at::empty({0}, at::TensorOptions().device(a_storage.device())).set_(a_storage); + at::Tensor b = at::empty({0}, at::TensorOptions().device(b_storage.device())).set_(b_storage); + + std::vector sizes({13, 4, 5}); + + at::rand_out(a, sizes); + at::rand_out(b, sizes); + + ASSERT_TRUE(a_storage.nbytes() == static_cast(a.numel() * a.element_size())); + ASSERT_TRUE(a_storage.nbytes() == b_storage.nbytes()); + + void* a_data_ptr = a_storage.mutable_data(); + b_storage.set_data_ptr(allocator->clone(a_data_ptr, a_storage.nbytes())); + + ASSERT_TRUE((a == b).all().item()); +} diff --git a/aten/src/ATen/test/cpu_allocator_test.cpp b/aten/src/ATen/test/cpu_allocator_test.cpp new file mode 100644 index 00000000000..db98522c57c --- /dev/null +++ b/aten/src/ATen/test/cpu_allocator_test.cpp @@ -0,0 +1,10 @@ +#include + +#include +#include + +#include + +TEST(AllocatorTestCPU, test_clone) { + test_allocator_clone(c10::GetDefaultCPUAllocator()); +} diff --git a/aten/src/ATen/test/cuda_allocator_test.cpp b/aten/src/ATen/test/cuda_allocator_test.cpp new file mode 100644 index 00000000000..27a352e7d5a --- /dev/null +++ b/aten/src/ATen/test/cuda_allocator_test.cpp @@ -0,0 +1,10 @@ +#include + +#include +#include + +#include + +TEST(AllocatorTestCUDA, test_clone) { + test_allocator_clone(c10::cuda::CUDACachingAllocator::get()); +} diff --git a/aten/src/ATen/test/xla_tensor_test.cpp b/aten/src/ATen/test/xla_tensor_test.cpp index 21f911ec1a4..1c9e392f9fe 100644 --- a/aten/src/ATen/test/xla_tensor_test.cpp +++ b/aten/src/ATen/test/xla_tensor_test.cpp @@ -2,6 +2,8 @@ #include +#include + using namespace at; void XLAFree(void *ptr) { @@ -22,6 +24,9 @@ struct XLAAllocator final : public at::Allocator { at::DeleterFnPtr raw_deleter() const override { return &XLAFree; } + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } }; TEST(XlaTensorTest, TestNoStorage) { @@ -33,3 +38,11 @@ TEST(XlaTensorTest, TestNoStorage) { at::Tensor t(std::move(tensor_impl)); ASSERT_TRUE(t.device() == at::Device(DeviceType::XLA, 0)); } + +TEST(XlaTensorTest, test_allocator_clone) { + if (!at::hasXLA()) { + return; + } + XLAAllocator allocator; + test_allocator_clone(&allocator); +} diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 68396a654d2..b120dc2b74b 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -30,7 +30,6 @@ file(GLOB C10_SRCS *.cpp core/*.cpp core/impl/*.cpp - core/impl/cow/*.cpp mobile/*.cpp macros/*.cpp util/*.cpp @@ -39,7 +38,6 @@ file(GLOB C10_HEADERS *.h core/*.h core/impl/*.h - core/impl/cow/*.h mobile/*.h macros/*.h util/*.h diff --git a/c10/core/Allocator.cpp b/c10/core/Allocator.cpp index dada5bb0eac..c95fdec2400 100644 --- a/c10/core/Allocator.cpp +++ b/c10/core/Allocator.cpp @@ -4,6 +4,23 @@ namespace c10 { +DataPtr Allocator::clone(const void* data, std::size_t n) const { + DataPtr new_data = allocate(n); + copy_data(new_data.mutable_get(), data, n); + return new_data; +} + +void Allocator::default_copy_data( + void* dest, + const void* src, + std::size_t count) const { + std::memcpy(dest, src, count); +} + +bool Allocator::is_simple_data_ptr(const DataPtr& data_ptr) const { + return data_ptr.get() == data_ptr.get_context(); +} + static void deleteInefficientStdFunctionContext(void* ptr) { delete static_cast(ptr); } diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h index fb7f6c0b4a7..1019643ee87 100644 --- a/c10/core/Allocator.h +++ b/c10/core/Allocator.h @@ -162,6 +162,21 @@ struct C10_API Allocator { virtual DataPtr allocate(size_t n) const = 0; + // Clones an allocation that came from this allocator. + // + // To perform the copy, this function calls `copy_data`, which + // must be implemented by derived classes. + // + // Note that this explicitly ignores any context that may have been + // attached to the input data. + // + // Requires: input data was allocated by the same allocator. + DataPtr clone(const void* data, std::size_t n) const; + + // Checks if DataPtr has a simple context, not wrapped with any out of the + // ordinary contexts. + virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const; + // If this returns a non nullptr, it means that allocate() // is guaranteed to return a unique_ptr with this deleter attached; // it means the rawAllocate and rawDeallocate APIs are safe to use. @@ -179,6 +194,22 @@ struct C10_API Allocator { AT_ASSERT(d); d(ptr); } + + // Copies data from one allocation to another. + // Pure virtual, so derived classes must define behavior. + // Derived class implementation can simply call `default_copy_data` + // to use `std::memcpy`. + // + // Requires: src and dest were allocated by this allocator + // Requires: src and dest both have length >= count + virtual void copy_data(void* dest, const void* src, std::size_t count) + const = 0; + + protected: + // Uses `std::memcpy` to copy data. + // Child classes can use this as `copy_data` when an alternative copy + // API is not needed. + void default_copy_data(void* dest, const void* src, std::size_t count) const; }; // This context is used to generate DataPtr which have arbitrary diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index c103c42a282..04759047b40 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -40,6 +40,10 @@ struct C10_API DefaultCPUAllocator final : at::Allocator { at::DeleterFnPtr raw_deleter() const override { return &ReportAndDelete; } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } }; ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() { @@ -142,6 +146,16 @@ class DefaultMobileCPUAllocator final : public at::Allocator { DeleterFnPtr raw_deleter() const override { return deleter; } + + bool is_simple_data_ptr(const c10::DataPtr& data_ptr) const final { + return reinterpret_cast(data_ptr.get()) == + reinterpret_cast(data_ptr.get_context()) + + PreGuardBytes; + } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } }; void NoDelete(void*) {} diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index 328286338a0..ba1cef6f0e1 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -117,6 +119,7 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { } at::DataPtr& mutable_data_ptr() { + maybe_materialize_cow(); return data_ptr_; } @@ -126,9 +129,10 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { // Returns the previous data_ptr at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { - at::DataPtr old_data_ptr(std::move(data_ptr_)); - data_ptr_ = std::move(data_ptr); - return old_data_ptr; + // We need to materialize the old COW DataPtr because it is + // being returned as mutable. + maybe_materialize_cow(); + return set_data_ptr_no_materialize_cow(std::move(data_ptr)); } void set_data_ptr_noswap(at::DataPtr&& data_ptr) { @@ -140,6 +144,7 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { } void* mutable_data() { + maybe_materialize_cow(); return data_ptr_.mutable_get(); } @@ -217,7 +222,26 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { return &pyobj_slot_; } + protected: + // materialize_cow_storage needs to call set_data_ptr_no_materlize_cow + friend void c10::impl::cow::materialize_cow_storage(StorageImpl& storage); + + // Returns the previous data_ptr. If the old data_ptr was COW, + // this avoids materializing it + at::DataPtr set_data_ptr_no_materialize_cow(at::DataPtr&& data_ptr) { + at::DataPtr old_data_ptr(std::move(data_ptr_)); + data_ptr_ = std::move(data_ptr); + return old_data_ptr; + } + private: + // Triggers a copy if this is a copy-on-write tensor. + void maybe_materialize_cow() { + if (data_ptr_.get_deleter() == impl::cow::cow_deleter) { + impl::cow::materialize_cow_storage(*this); + } + } + DataPtr data_ptr_; SymInt size_bytes_; bool size_bytes_is_heap_allocated_; diff --git a/c10/core/build.bzl b/c10/core/build.bzl index 781c740d4c7..45fc5ea3390 100644 --- a/c10/core/build.bzl +++ b/c10/core/build.bzl @@ -62,7 +62,6 @@ def define_targets(rules): exclude = [ "CPUAllocator.cpp", "impl/alloc_cpu.cpp", - "impl/cow/*.cpp", ], ), hdrs = rules.glob( @@ -73,7 +72,6 @@ def define_targets(rules): exclude = [ "CPUAllocator.h", "impl/alloc_cpu.h", - "impl/cow/*.h", ], ), linkstatic = True, @@ -92,22 +90,6 @@ def define_targets(rules): alwayslink = True, ) - rules.cc_library( - name = "impl_cow", - srcs = rules.glob([ - "impl/cow/*.cpp", - ]), - hdrs = rules.glob([ - "impl/cow/*.h", - ]), - deps = [ - ":base", - ":CPUAllocator", - ], - visibility = ["//c10/test:__pkg__"], - - ) - rules.filegroup( name = "headers", srcs = rules.glob( diff --git a/c10/core/impl/cow/COW.cpp b/c10/core/impl/COW.cpp similarity index 64% rename from c10/core/impl/cow/COW.cpp rename to c10/core/impl/COW.cpp index f32e0ea6680..ad92a8d6717 100644 --- a/c10/core/impl/cow/COW.cpp +++ b/c10/core/impl/COW.cpp @@ -1,10 +1,9 @@ -#include +#include #include -#include #include #include -#include +#include #include #include @@ -30,24 +29,18 @@ at::DataPtr copy_data_ptr(at::DataPtr const& data_ptr) { return make_data_ptr(data_ptr, *ctx); } -bool is_simple_context( - const void* context, - const void* data, - const at::Allocator* allocator) { - if (allocator == c10::GetDefaultMobileCPUAllocator()) { - return reinterpret_cast(data) == - reinterpret_cast(context) + c10::gAlignment; - } else { - return data == context; - } -} - } // namespace bool has_simple_data_ptr(const c10::StorageImpl& storage) { const c10::DataPtr& data_ptr = storage.data_ptr(); - return is_simple_context( - data_ptr.get_context(), data_ptr.get(), storage.allocator()); + const void* ctx = data_ptr.get_context(); + const void* data = data_ptr.get(); + const c10::Allocator* allocator = storage.allocator(); + if (allocator != nullptr) { + return allocator->is_simple_data_ptr(data_ptr); + } else { + return ctx == data; + } } bool is_cow_data_ptr(const c10::DataPtr& data_ptr) { @@ -88,8 +81,6 @@ c10::intrusive_ptr lazy_clone_storage(StorageImpl& storage) { // Case 1) We have a simple data pointer: wrap it. std::unique_ptr original_ctx = storage.mutable_data_ptr().move_context(); - TORCH_INTERNAL_ASSERT(is_simple_context( - original_ctx.get(), data_ptr.get(), storage.allocator())); // Save this for the result. new_data_ptr = make_data_ptr( @@ -117,4 +108,40 @@ c10::intrusive_ptr lazy_clone_storage(StorageImpl& storage) { storage.resizable()); } +C10_API void materialize_cow_storage(StorageImpl& storage) { + const at::DataPtr& data_ptr = storage.data_ptr(); + + auto* ctx = data_ptr.cast_context(cow::cow_deleter); + TORCH_INTERNAL_ASSERT(ctx != nullptr); + + auto result = ctx->decrement_refcount(); + + // This must be set by each branch below. + std::optional new_data_ptr; + + if (std::holds_alternative(result)) { + // This is the only reference to the data. If there were any racing writes, + // the context ensured they finished before giving us the result. + std::unique_ptr data = + std::get(std::move(result)); + TORCH_INTERNAL_ASSERT(data.get() == data_ptr.get()); + new_data_ptr = DataPtr( + data.release(), data_ptr.get(), data.get_deleter(), data_ptr.device()); + } else { + TORCH_INTERNAL_ASSERT( + std::holds_alternative( + result)); + // We don't need to consume the result, it's just a shared lock ensuring + // that the data will remain while we copy it. + new_data_ptr = storage.allocator()->clone(data_ptr.get(), storage.nbytes()); + } + + TORCH_INTERNAL_ASSERT(new_data_ptr.has_value()); + DataPtr old_data_ptr = + storage.set_data_ptr_no_materialize_cow(*std::move(new_data_ptr)); + // The refcount of the context was already decremented above. Release the + // reference to the context so the refcount doesn't get decremented again + old_data_ptr.release_context(); +} + } // namespace c10::impl::cow diff --git a/c10/core/impl/cow/COW.h b/c10/core/impl/COW.h similarity index 87% rename from c10/core/impl/cow/COW.h rename to c10/core/impl/COW.h index 07ef5e4fe90..1cf81eda1ca 100644 --- a/c10/core/impl/cow/COW.h +++ b/c10/core/impl/COW.h @@ -26,4 +26,7 @@ C10_API bool has_simple_data_ptr(const c10::StorageImpl& storage); // Check if a DataPtr is COW C10_API bool is_cow_data_ptr(const c10::DataPtr& data_ptr); +// Eagerly copies a COW storage's data, turning it into a non-COW storage. +C10_API void materialize_cow_storage(StorageImpl& storage); + } // namespace c10::impl::cow diff --git a/c10/core/impl/cow/COWDeleter.cpp b/c10/core/impl/COWDeleter.cpp similarity index 96% rename from c10/core/impl/cow/COWDeleter.cpp rename to c10/core/impl/COWDeleter.cpp index 9da38f5cf1b..bd8b4963cf1 100644 --- a/c10/core/impl/cow/COWDeleter.cpp +++ b/c10/core/impl/COWDeleter.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/c10/core/impl/cow/COWDeleter.h b/c10/core/impl/COWDeleter.h similarity index 100% rename from c10/core/impl/cow/COWDeleter.h rename to c10/core/impl/COWDeleter.h diff --git a/c10/core/impl/cow/README.md b/c10/core/impl/README-cow.md similarity index 100% rename from c10/core/impl/cow/README.md rename to c10/core/impl/README-cow.md diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 393651e9b58..4e71e9f5dcd 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -3243,6 +3243,10 @@ class NativeCachingAllocator : public CUDAAllocator { std::string name() override { return "native"; } + void copy_data(void* dest, const void* src, std::size_t count) const final { + C10_CUDA_CHECK( + cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice)); + } }; NativeCachingAllocator allocator; diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp index dbc6e8626be..a4503e38303 100644 --- a/c10/cuda/CUDAMallocAsyncAllocator.cpp +++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp @@ -875,6 +875,10 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator { std::string name() override { return "cudaMallocAsync"; } + void copy_data(void* dest, const void* src, std::size_t count) const final { + C10_CUDA_CHECK( + cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice)); + } }; CudaMallocAsyncAllocator device_allocator; diff --git a/c10/test/build.bzl b/c10/test/build.bzl index d9b58971b2e..2f54c8a2faa 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -21,7 +21,6 @@ def define_targets(rules): "//c10/core:base", "//c10/util:base", "//c10/core:CPUAllocator", - "//c10/core:impl_cow", "@com_google_googletest//:gtest_main", ], ) diff --git a/c10/test/core/impl/cow_test.cpp b/c10/test/core/impl/cow_test.cpp index 5fd30f509e5..f00243adee2 100644 --- a/c10/test/core/impl/cow_test.cpp +++ b/c10/test/core/impl/cow_test.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -167,6 +167,84 @@ TEST(lazy_clone_storage_test, already_copy_on_write) { ASSERT_THAT(new_storage->data(), testing::Eq(original_storage.data())); } +TEST(materialize_test, not_copy_on_write_context) { + StorageImpl storage( + {}, /*size_bytes=*/6, GetCPUAllocator(), /*resizable=*/false); + ASSERT_THAT(storage, testing::Not(is_copy_on_write())); + + void const* original_data = storage.data(); + + // Nothing to materialize. + ASSERT_THAT(storage.mutable_data(), testing::Eq(original_data)); +} + +TEST(materialize_test, copy_on_write_single_reference) { + // A copy-on-write storage with only a single reference can just + // drop the copy-on-write context upon materialization. + std::unique_ptr data( + new std::byte[4], + +[](void* bytes) { delete[] static_cast(bytes); }); + void* data_ptr = data.get(); + StorageImpl storage( + {}, + /*size_bytes=*/4, + at::DataPtr( + /*data=*/data_ptr, + /*ctx=*/new cow::COWDeleterContext(std::move(data)), + cow::cow_deleter, + Device(Device::Type::CPU)), + /*allocator=*/nullptr, + /*resizable=*/false); + + ASSERT_THAT(storage, is_copy_on_write()); + + ASSERT_THAT(storage.data(), testing::Eq(data_ptr)); + + void const* original_data = storage.data(); + + // Materializes storage. Only reference, so no new allocation. + ASSERT_THAT(storage.mutable_data(), testing::Eq(original_data)); + // But it is no longer copy-on-write. + ASSERT_THAT(storage, testing::Not(is_copy_on_write())); +} + +bool buffers_are_equal(const void* a, const void* b, size_t nbytes) { + const char* a_ = static_cast(a); + const char* b_ = static_cast(b); + + for (size_t idx = 0; idx < nbytes; idx++) { + if (a_[idx] != b_[idx]) { + return false; + } + } + return true; +} + +TEST(materialize_test, copy_on_write) { + StorageImpl original_storage( + {}, /*size_bytes=*/6, GetCPUAllocator(), /*resizable=*/false); + std::memcpy(original_storage.mutable_data(), "abcd", 4); + void const* original_data = original_storage.data(); + + auto new_storage = cow::lazy_clone_storage(original_storage); + ASSERT_THAT(new_storage, testing::NotNull()); + + auto context = new_storage->data_ptr().cast_context( + cow::cow_deleter); + ASSERT_THAT(context, testing::NotNull()); + + // Materialized storage has new copy of data. + ASSERT_THAT(new_storage->mutable_data(), testing::Ne(original_data)); + + // But the original storage still has the original copy. + ASSERT_THAT(original_storage.data(), testing::Eq(original_data)); + + // And their data is the same + ASSERT_TRUE(new_storage->nbytes() == original_storage.nbytes()); + ASSERT_TRUE(buffers_are_equal( + new_storage->data(), original_storage.data(), new_storage->nbytes())); +} + } // namespace } // namespace c10::impl // NOLINTEND(clang-analyzer-cplusplus*) diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 3359e88bcba..6555b9732c9 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -336,6 +336,10 @@ struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator { return &Delete; } + void copy_data(void* dest, const void* src, std::size_t count) const final { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for PinnedCPUAllocator"); + } + private: static void Delete(void* data) { if (!data) { @@ -581,6 +585,10 @@ struct DefaultCUDAAllocator final : public at::Allocator { return &Delete; } + void copy_data(void* dest, const void* src, std::size_t count) const final { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for DefaultCUDAAllocator"); + } + private: static void Delete(void* ptr) { // lock the mutex diff --git a/setup.py b/setup.py index 3ca6691309c..88067c7077d 100644 --- a/setup.py +++ b/setup.py @@ -1177,7 +1177,6 @@ def main(): "include/ATen/core/dispatch/*.h", "include/ATen/core/op_registration/*.h", "include/c10/core/impl/*.h", - "include/c10/core/impl/cow/*.h", "include/c10/util/*.h", "include/c10/cuda/*.h", "include/c10/cuda/impl/*.h", diff --git a/test/cpp_extensions/open_registration_extension.cpp b/test/cpp_extensions/open_registration_extension.cpp index 5818d647bc4..cfe49e02a48 100644 --- a/test/cpp_extensions/open_registration_extension.cpp +++ b/test/cpp_extensions/open_registration_extension.cpp @@ -188,6 +188,10 @@ struct DummyCustomAllocator final : at::Allocator { at::DeleterFnPtr raw_deleter() const override { return &ReportAndDelete; } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } }; // Register our dummy allocator diff --git a/test/inductor/extension_backends/extension_device.cpp b/test/inductor/extension_backends/extension_device.cpp index fa2922afd3d..2e86cde2dd1 100644 --- a/test/inductor/extension_backends/extension_device.cpp +++ b/test/inductor/extension_backends/extension_device.cpp @@ -81,6 +81,10 @@ struct DummyCustomAllocator final : at::Allocator { at::DeleterFnPtr raw_deleter() const override { return &ReportAndDelete; } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } }; // Register our dummy allocator diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.cpp b/torch/csrc/cuda/CUDAPluggableAllocator.cpp index 4dfdcd3d280..34e00674ed8 100644 --- a/torch/csrc/cuda/CUDAPluggableAllocator.cpp +++ b/torch/csrc/cuda/CUDAPluggableAllocator.cpp @@ -330,6 +330,14 @@ std::string CUDAPluggableAllocator::name() { return "pluggable"; } +void CUDAPluggableAllocator::copy_data( + void* dest, + const void* src, + std::size_t count) const { + C10_CUDA_CHECK( + cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice)); +} + std::shared_ptr current_custom_allocator; diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.h b/torch/csrc/cuda/CUDAPluggableAllocator.h index 4f901c5efb8..a5c9e4acad1 100644 --- a/torch/csrc/cuda/CUDAPluggableAllocator.h +++ b/torch/csrc/cuda/CUDAPluggableAllocator.h @@ -122,6 +122,7 @@ struct CUDAPluggableAllocator cudaStream_t stream, bool p2p_enabled) override; std::string name() override; + void copy_data(void* dest, const void* src, std::size_t count) const final; protected: std::function alloc_fn_;