mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-27 03:11:28 +00:00
Enable external CUDA allocator in ORTModule. (#6745)
* Enable external CUDA allocator in ORTModule. * Fix assert after unification of allocators. * Update no-grad memory test. * Update comments. * Fix provider options array when not sharing allocator.
This commit is contained in:
parent
39d182f7fc
commit
1a2f1bd23a
2 changed files with 30 additions and 11 deletions
|
|
@ -10,6 +10,7 @@ import numpy as np
|
|||
from inspect import signature
|
||||
|
||||
from torch.utils.dlpack import from_dlpack
|
||||
from torch.utils.cpp_extension import load_inline
|
||||
from collections import abc
|
||||
|
||||
# Needed to re-implement PyTorch's cpu,cuda,to methods
|
||||
|
|
@ -157,6 +158,22 @@ def _ort_output_to_torch_tensor(ort_output):
|
|||
tensor = from_dlpack(ort_output.to_dlpack())
|
||||
return tensor.to(torch.bool) if tensor.dtype == torch.uint8 else tensor
|
||||
|
||||
def _load_torch_allocator_cpp_extension():
    """Load the inline C++ extension that reports torch's CUDA allocator addresses.

    The embedded C++ source defines two functions:

    * ``cuda_caching_allocator_raw_alloc_address`` returns the address of
      ``c10::cuda::CUDACachingAllocator::raw_alloc`` cast to ``size_t``.
    * ``cuda_caching_allocator_raw_delete_address`` returns the address of
      ``c10::cuda::CUDACachingAllocator::raw_delete`` cast to ``size_t``.

    Returns:
        Whatever ``load_inline`` returns for the extension named
        ``'inline_extension'`` with the two functions above exported.
    """
    # C++ source handed to load_inline; the text is passed through unchanged.
    allocator_addresses_cpp = """
#include <torch/extension.h>
#include <c10/cuda/CUDACachingAllocator.h>
size_t cuda_caching_allocator_raw_alloc_address() {
return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_alloc);
}
size_t cuda_caching_allocator_raw_delete_address() {
return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_delete);
}
"""

    # Names of the C++ functions to expose on the loaded extension.
    exported_functions = [
        'cuda_caching_allocator_raw_alloc_address',
        'cuda_caching_allocator_raw_delete_address',
    ]

    extension = load_inline(
        name='inline_extension',
        cpp_sources=[allocator_addresses_cpp],
        functions=exported_functions,
        verbose=True,
        with_cuda=True,
    )
    return extension
|
||||
|
||||
class ORTModule(torch.nn.Module):
|
||||
|
||||
def __init__(self, module):
|
||||
|
|
@ -194,6 +211,13 @@ class ORTModule(torch.nn.Module):
|
|||
self._save_onnx = False
|
||||
self._save_onnx_prefix = ''
|
||||
|
||||
# CPP extension to get torch CUDA allocator's alloc and free function addresses
|
||||
self._use_external_cuda_allocator = True
|
||||
if self._use_external_cuda_allocator:
|
||||
self._torch_cuda_allocator = _load_torch_allocator_cpp_extension()
|
||||
self._torch_alloc = self._torch_cuda_allocator.cuda_caching_allocator_raw_alloc_address()
|
||||
self._torch_free = self._torch_cuda_allocator.cuda_caching_allocator_raw_delete_address()
|
||||
|
||||
def _initialize_module_gradient_graph_builder(self):
|
||||
# TODO: PyTorch exporter bug: changes the initializer order
|
||||
initializer_names = [p[0] for p in self._original_module.named_parameters()]
|
||||
|
|
@ -219,7 +243,10 @@ class ORTModule(torch.nn.Module):
|
|||
if self._device.type == 'cuda':
|
||||
# Configure the InferenceSessions to use the specific GPU on which the model is placed.
|
||||
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
provider_options = [{"device_id": str(self._device.index)}, {}]
|
||||
if self._use_external_cuda_allocator:
|
||||
provider_options = [{"device_id": str(self._device.index), "cuda_external_alloc": str(self._torch_alloc), "cuda_external_free": str(self._torch_free)}, {}]
|
||||
else:
|
||||
provider_options = [{"device_id": str(self._device.index)}, {}]
|
||||
elif self._device.type == 'cpu':
|
||||
providers = ["CPUExecutionProvider"]
|
||||
provider_options = [{}]
|
||||
|
|
@ -461,8 +488,4 @@ class ORTModule(torch.nn.Module):
|
|||
except RuntimeError as e:
|
||||
raise RuntimeError('There was an error while exporting the PyTorch model to ONNX: {}'.format(e))
|
||||
|
||||
# TODO: this step might not be needed when we use the torch external allocator
|
||||
# clear cache after model export
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return onnx.load_model_from_string(f.getvalue())
|
||||
|
|
|
|||
|
|
@ -337,22 +337,18 @@ def test_gpu_reserved_memory_with_torch_no_grad():
|
|||
model_with_no_grad(x, y, None, None, None, None, z)
|
||||
mem_reserved_after_export_with_torch_no_grad = torch.cuda.memory_reserved(device)
|
||||
del model_with_no_grad
|
||||
torch.cuda.empty_cache()
|
||||
mem_reserved_after_cache_empty = torch.cuda.memory_reserved(device)
|
||||
assert mem_reserved_before_export == mem_reserved_after_cache_empty
|
||||
|
||||
# Create another model and get the memory_reserved when torch.no_grad and torch.cuda.empty_cache
|
||||
# has not been enabled after export
|
||||
# Create another model and get the memory_reserved when torch.no_grad has not been enabled after export.
|
||||
model_without_no_grad = _get_bert_for_sequence_classification_model(device)
|
||||
model_without_no_grad = ORTModule(model_without_no_grad)
|
||||
mem_reserved_after_export_without_torch_no_grad = 0
|
||||
|
||||
with patch('torch.no_grad'), patch('torch.cuda.empty_cache'):
|
||||
with patch('torch.no_grad'):
|
||||
model_without_no_grad(x, y, None, None, None, None, z)
|
||||
mem_reserved_after_export_without_torch_no_grad = torch.cuda.memory_reserved(device)
|
||||
|
||||
assert mem_reserved_after_export_with_torch_no_grad < mem_reserved_after_export_without_torch_no_grad
|
||||
assert mem_reserved_before_export == mem_reserved_after_export_with_torch_no_grad
|
||||
|
||||
@pytest.mark.parametrize("return_type, device", [
|
||||
(dict, 'cpu'),
|
||||
|
|
|
|||
Loading…
Reference in a new issue