Enable external CUDA allocator in ORTModule. (#6745)

* Enable external CUDA allocator in ORTModule.

* Fix assert after unification of allocators.

* Update no grad memory test.

* update comments.

* fix provider options array when not sharing allocator.
This commit is contained in:
M. Zeeshan Siddiqui 2021-02-18 20:01:13 -08:00 committed by GitHub
parent 39d182f7fc
commit 1a2f1bd23a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 11 deletions

View file

@ -10,6 +10,7 @@ import numpy as np
from inspect import signature
from torch.utils.dlpack import from_dlpack
from torch.utils.cpp_extension import load_inline
from collections import abc
# Needed to re-implement PyTorch's cpu(), cuda() and to() methods
@ -157,6 +158,22 @@ def _ort_output_to_torch_tensor(ort_output):
tensor = from_dlpack(ort_output.to_dlpack())
return tensor.to(torch.bool) if tensor.dtype == torch.uint8 else tensor
def _load_torch_allocator_cpp_extension():
    """Compile and load an inline C++ extension exposing torch's CUDA allocator entry points.

    The generated extension defines two functions,
    ``cuda_caching_allocator_raw_alloc_address`` and
    ``cuda_caching_allocator_raw_delete_address``. Each one returns, as a
    ``size_t``, the address of the matching
    ``c10::cuda::CUDACachingAllocator`` function (``raw_alloc`` and
    ``raw_delete`` respectively).

    Returns:
        The extension module object produced by ``load_inline``.
    """
    # Names of the C++ functions that load_inline should generate bindings for.
    exported_functions = [
        'cuda_caching_allocator_raw_alloc_address',
        'cuda_caching_allocator_raw_delete_address',
    ]

    # C++ source handed verbatim to the inline compiler.
    cpp_source = """
#include <torch/extension.h>
#include <c10/cuda/CUDACachingAllocator.h>
size_t cuda_caching_allocator_raw_alloc_address() {
return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_alloc);
}
size_t cuda_caching_allocator_raw_delete_address() {
return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_delete);
}
"""

    extension = load_inline(
        name='inline_extension',
        cpp_sources=[cpp_source],
        functions=exported_functions,
        verbose=True,
        with_cuda=True,
    )
    return extension
class ORTModule(torch.nn.Module):
def __init__(self, module):
@ -194,6 +211,13 @@ class ORTModule(torch.nn.Module):
self._save_onnx = False
self._save_onnx_prefix = ''
# CPP extension to get torch CUDA allocator's alloc and free function addresses
self._use_external_cuda_allocator = True
if self._use_external_cuda_allocator:
self._torch_cuda_allocator = _load_torch_allocator_cpp_extension()
self._torch_alloc = self._torch_cuda_allocator.cuda_caching_allocator_raw_alloc_address()
self._torch_free = self._torch_cuda_allocator.cuda_caching_allocator_raw_delete_address()
def _initialize_module_gradient_graph_builder(self):
# TODO: PyTorch exporter bug: changes the initializer order
initializer_names = [p[0] for p in self._original_module.named_parameters()]
@ -219,7 +243,10 @@ class ORTModule(torch.nn.Module):
if self._device.type == 'cuda':
# Configure the InferenceSessions to use the specific GPU on which the model is placed.
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
provider_options = [{"device_id": str(self._device.index)}, {}]
if self._use_external_cuda_allocator:
provider_options = [{"device_id": str(self._device.index), "cuda_external_alloc": str(self._torch_alloc), "cuda_external_free": str(self._torch_free)}, {}]
else:
provider_options = [{"device_id": str(self._device.index)}, {}]
elif self._device.type == 'cpu':
providers = ["CPUExecutionProvider"]
provider_options = [{}]
@ -461,8 +488,4 @@ class ORTModule(torch.nn.Module):
except RuntimeError as e:
raise RuntimeError('There was an error while exporting the PyTorch model to ONNX: {}'.format(e))
# TODO: this step might not be needed when we use the torch external allocator
# clear cache after model export
torch.cuda.empty_cache()
return onnx.load_model_from_string(f.getvalue())

View file

@ -337,22 +337,18 @@ def test_gpu_reserved_memory_with_torch_no_grad():
model_with_no_grad(x, y, None, None, None, None, z)
mem_reserved_after_export_with_torch_no_grad = torch.cuda.memory_reserved(device)
del model_with_no_grad
torch.cuda.empty_cache()
mem_reserved_after_cache_empty = torch.cuda.memory_reserved(device)
assert mem_reserved_before_export == mem_reserved_after_cache_empty
# Create another model and get the memory_reserved when torch.no_grad and torch.cuda.empty_cache
# has not been enabled after export
# Create another model and get the memory_reserved when torch.no_grad has not been enabled after export.
model_without_no_grad = _get_bert_for_sequence_classification_model(device)
model_without_no_grad = ORTModule(model_without_no_grad)
mem_reserved_after_export_without_torch_no_grad = 0
with patch('torch.no_grad'), patch('torch.cuda.empty_cache'):
with patch('torch.no_grad'):
model_without_no_grad(x, y, None, None, None, None, z)
mem_reserved_after_export_without_torch_no_grad = torch.cuda.memory_reserved(device)
assert mem_reserved_after_export_with_torch_no_grad < mem_reserved_after_export_without_torch_no_grad
assert mem_reserved_before_export == mem_reserved_after_export_with_torch_no_grad
@pytest.mark.parametrize("return_type, device", [
(dict, 'cpu'),