From ff608a9ff3edded33764c8631427e92c7288bafb Mon Sep 17 00:00:00 2001
From: Junjie Bai <jbai@fb.com>
Date: Mon, 1 Oct 2018 21:44:08 -0700
Subject: [PATCH] Back out "Revert D10123245: Back out "codemod cuda_gpu_id to
 device_id"" (#12232)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12232

Original commit changeset: fca91fea58b7

This adds the proper modifications to the DeviceType <-> DeviceOption conversion code added in D10033396.

Reviewed By: jerryzh168

Differential Revision: D10132473

fbshipit-source-id: 801ef777e2950982cb47b48051b1471a0a91e64b
---
 caffe2/contrib/nccl/cuda_nccl_op_gpu.cc       |  4 +-
 caffe2/contrib/nccl/nccl_ops_test.py          |  2 +-
 caffe2/contrib/prof/prof_dag_net.cc           |  4 +-
 .../tensorboard/tensorboard_exporter.py       |  2 +-
 caffe2/contrib/warpctc/ctc_ops_test.py        |  8 +-
 caffe2/core/blob_gpu_test.cc                  |  4 +-
 caffe2/core/context_gpu.cu                    |  2 +-
 caffe2/core/context_gpu.h                     |  6 +-
 caffe2/core/cudnn_wrappers.h                  |  6 +-
 caffe2/core/event_gpu.cc                      | 16 ++--
 caffe2/core/hip/event_hip.cc                  |  2 +-
 caffe2/core/memonger.cc                       |  4 +-
 caffe2/core/net_async_base.cc                 |  4 +-
 caffe2/core/net_async_dag_gpu.cc              |  2 +-
 caffe2/core/net_gpu_test.cc                   |  2 +-
 caffe2/core/operator.cc                       |  2 +-
 caffe2/mkl/utils/mkl_memory.cc                |  2 +-
 caffe2/observers/profile_observer_gpu.cc      |  4 +-
 caffe2/onnx/backend.cc                        |  2 +-
 caffe2/operators/load_save_op_gpu.cc          |  2 +-
 .../rnn/recurrent_network_executor_gpu.cc     |  4 +-
 caffe2/proto/caffe2.proto                     |  2 +-
 caffe2/proto/caffe2_pb.h                      | 46 ++++++++++-
 caffe2/python/cnn.py                          |  2 +-
 caffe2/python/core.py                         | 16 ++--
 caffe2/python/core_test.py                    | 82 +++++++++----------
 caffe2/python/data_parallel_model.py          |  6 +-
 caffe2/python/hypothesis_test_util.py         |  2 +-
 caffe2/python/model_helper.py                 |  4 +-
 caffe2/python/muji.py                         |  2 +-
 caffe2/python/net_printer.py                  |  4 +-
 caffe2/python/numa_test.py                    |  2 +-
 caffe2/python/onnx/backend_rep.py             |  2 +-
 caffe2/python/operator_test/load_save_test.py |  2 +-
 caffe2/python/operator_test/rnn_cell_test.py  |  2 +-
 caffe2/python/optimizer.py                    | 10 +--
 .../predictor/predictor_exporter_test.py      |  2 +-
 caffe2/python/pybind_state_dlpack.h           |  4 +-
 caffe2/utils/proto_utils.cc                   |  4 +-
 caffe2/utils/proto_utils_test.cc              |  4 +-
 .../pyHIPIFY/cuda_to_hip_mappings.py          |  2 +-
 41 files changed, 163 insertions(+), 121 deletions(-)
diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
index 4c5313ff4b3..ea8b3494c6a 100644
--- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
+++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
@@ -11,7 +11,7 @@ nccl::NCCLExecution getNCCLElements(
   // We either do an N-N op, or an N-1 op.
   CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1);
   nccl::NCCLExecution ex;
-  ex.stream_gpu_id = context.device_id();
+  ex.stream_gpu_id = context.cuda_gpu_id();
   ex.stream = context.cuda_stream();
   ex.root = op->template GetSingleArgument<int>("root", 0);
   ex.elements.resize(op->InputSize());
@@ -204,7 +204,7 @@ std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>> ncclOpDevInfer(
   for (int i = 0; i < def.input().size(); ++i) {
     DeviceOption dev;
     dev.set_device_type(1);
-    dev.set_device_id(i);
+    dev.set_cuda_gpu_id(i);
     opt.push_back(dev);
   }
   return std::make_pair(opt, opt);
diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py
index f6c22a7d750..7e8a61e9de2 100644
--- a/caffe2/contrib/nccl/nccl_ops_test.py
+++ b/caffe2/contrib/nccl/nccl_ops_test.py
@@ -21,7 +21,7 @@ dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/nccl:nccl_ops')
 def gpu_device(i):
     device_option = caffe2_pb2.DeviceOption()
     device_option.device_type = caffe2_pb2.CUDA
-    device_option.device_id = i
+    device_option.cuda_gpu_id = i
     return device_option
 
 
diff --git a/caffe2/contrib/prof/prof_dag_net.cc b/caffe2/contrib/prof/prof_dag_net.cc
index c8678652c31..16917ddc154 100644
--- a/caffe2/contrib/prof/prof_dag_net.cc
+++ b/caffe2/contrib/prof/prof_dag_net.cc
@@ -33,9 +33,9 @@ void ProfDAGNet::ValidateOpTensorDevices() {
       had_mismatches = true;
       LOG(INFO) << "== PERFORMANCE WARNING == \n"
                 << " Operator " << node.operator_->debug_def().type()
-                << " expects GPU " << mismatch.second.first.device_id()
+                << " expects GPU " << mismatch.second.first.cuda_gpu_id()
                 << " but tensor [" << mismatch.first << "] is on GPU "
-                << mismatch.second.second.device_id();
+                << mismatch.second.second.cuda_gpu_id();
     }
   }
   if (!had_mismatches) {
diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py
index cc2c3d85c96..93ade48e7d2 100644
--- a/caffe2/contrib/tensorboard/tensorboard_exporter.py
+++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py
@@ -177,7 +177,7 @@ def _tf_device(device_option):
     if device_option.device_type == caffe2_pb2.CPU:
         return "/cpu:*"
     if device_option.device_type == caffe2_pb2.CUDA:
-        return "/gpu:{}".format(device_option.device_id)
+        return "/gpu:{}".format(device_option.cuda_gpu_id)
     raise Exception("Unhandled device", device_option)
 
 
diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py
index 3b21c8b6674..25bb0a39e3a 100644
--- a/caffe2/contrib/warpctc/ctc_ops_test.py
+++ b/caffe2/contrib/warpctc/ctc_ops_test.py
@@ -79,11 +79,11 @@ class CTCOpsTest(test_util.TestCase):
     def test_ctc_cost_gpu(self):
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
-                                    device_id=0),
+                                    cuda_gpu_id=0),
             is_test=False)
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
-                                    device_id=0),
+                                    cuda_gpu_id=0),
             is_test=False,
             skip_input_lengths=True)
 
@@ -99,10 +99,10 @@ class CTCOpsTest(test_util.TestCase):
     def test_ctc_forward_only_gpu(self):
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
-                                    device_id=0),
+                                    cuda_gpu_id=0),
             is_test=True)
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
-                                    device_id=0),
+                                    cuda_gpu_id=0),
             is_test=True,
             skip_input_lengths=True)
diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc
index 8b4127e403a..55eafdede72 100644
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@@ -195,7 +195,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
     }
     EXPECT_TRUE(tensor_proto.has_device_detail());
     EXPECT_EQ(tensor_proto.device_detail().device_type(), PROTO_CUDA);
-    EXPECT_EQ(tensor_proto.device_detail().device_id(), gpu_id);
+    EXPECT_EQ(tensor_proto.device_detail().cuda_gpu_id(), gpu_id);
     // Test if the restored blob is still of the same device.
     blob.Reset();
     EXPECT_NO_THROW(DeserializeBlob(serialized, &blob));
@@ -205,7 +205,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
     // Test if we force the restored blob on a different device, we
     // can still get so.
     blob.Reset();
-    proto.mutable_tensor()->mutable_device_detail()->set_device_id(0);
+    proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0);
     EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob));
     EXPECT_TRUE(BlobIsTensorType(blob, CUDA));
     EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()), 0);
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index f10fe067ac7..0d9e2686212 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -256,7 +256,7 @@ CUDAContext::CUDAContext(const int gpu_id)
 
 CUDAContext::CUDAContext(const DeviceOption& option)
     : gpu_id_(
-          option.has_device_id() ? RectifyGPUID(option.device_id())
+          option.has_cuda_gpu_id() ? RectifyGPUID(option.cuda_gpu_id())
                                    : CaffeCudaGetDevice()),
       random_seed_(
           option.has_random_seed() ? option.random_seed()
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index 65ba4a006a9..ce73f5f9428 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -184,7 +184,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
     }
   }
 
-  inline int device_id() const {
+  inline int cuda_gpu_id() const {
     return gpu_id_;
   }
 
@@ -283,7 +283,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
   }
 
   static bool IsStreamFree(const DeviceOption& option, int stream_id) {
-    auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
+    auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id);
     return cudaStreamQuery(stream) == cudaSuccess;
   }
 
@@ -393,7 +393,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext {
 
   void ExtractDeviceOption(DeviceOption* device, const void* data) override {
     device->set_device_type(TypeToProto(GetDeviceType()));
-    device->set_device_id(GetGPUIDForPointer(data));
+    device->set_cuda_gpu_id(GetGPUIDForPointer(data));
   }
 
  protected:
diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h
index dea138e9ad5..1bd39fa62a3 100644
--- a/caffe2/core/cudnn_wrappers.h
+++ b/caffe2/core/cudnn_wrappers.h
@@ -122,9 +122,9 @@ class CuDNNWrapper {
   void with_cudnn_state(size_t state_idx, F&& f) {
     CAFFE_ENFORCE(
         state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
-    auto& sync_state = cudnn_states()[context_->device_id()][state_idx];
+    auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];
 
-    DeviceGuard dg(context_->device_id());
+    DeviceGuard dg(context_->cuda_gpu_id());
 
     // We need to serialize execution on the CuDNNState as we can't
     // allow multiple threads to race through the cudaEventRecord
@@ -132,7 +132,7 @@ class CuDNNWrapper {
     // execution)
     std::lock_guard<std::mutex> g(sync_state.mutex);
     if (!sync_state.state.get()) {
-      sync_state.state.reset(new CuDNNState(context_->device_id()));
+      sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
     }
     CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
   }
diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc
index 44aec8d3f2b..6253ca19c9a 100644
--- a/caffe2/core/event_gpu.cc
+++ b/caffe2/core/event_gpu.cc
@@ -9,21 +9,21 @@ namespace caffe2 {
 struct CudaEventWrapper {
   explicit CudaEventWrapper(const DeviceOption& option)
       : cuda_stream_(nullptr),
-        device_id_(option.device_id()),
+        cuda_gpu_id_(option.cuda_gpu_id()),
         status_(EventStatus::EVENT_INITIALIZED) {
     CAFFE_ENFORCE(option.device_type(), PROTO_CUDA);
-    DeviceGuard g(device_id_);
+    DeviceGuard g(cuda_gpu_id_);
     CUDA_ENFORCE(cudaEventCreate(
         &cuda_event_, cudaEventDefault | cudaEventDisableTiming));
   }
   ~CudaEventWrapper() {
-    DeviceGuard g(device_id_);
+    DeviceGuard g(cuda_gpu_id_);
     CUDA_CHECK(cudaEventDestroy(cuda_event_));
   }
 
   cudaEvent_t cuda_event_;
   cudaStream_t cuda_stream_;
-  int device_id_;
+  int cuda_gpu_id_;
 
   std::atomic<int> status_;
   std::mutex mutex_recorded_;
@@ -65,12 +65,12 @@ void EventRecordCUDA(Event* event, const void* context, const char* err_msg) {
       const auto& current_device = CaffeCudaGetDevice();
       CAFFE_ENFORCE_EQ(
           current_device,
-          wrapper->device_id_,
+          wrapper->cuda_gpu_id_,
           "When you call EventRecordCUDA, your current device should be the same "
           "as the device specified by the event.");
       CAFFE_ENFORCE_EQ(
           current_device,
-          static_cast<const CUDAContext*>(context)->device_id());
+          static_cast<const CUDAContext*>(context)->cuda_gpu_id());
       CUDA_ENFORCE(cudaEventRecord(
           wrapper->cuda_event_,
           static_cast<const CUDAContext*>(context)->cuda_stream()));
@@ -96,7 +96,7 @@ void EventFinishCUDA(const Event* event) {
 
   if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
     // ok, even if event is already completed and status was not yet updated
-    DeviceGuard g(wrapper->device_id_);
+    DeviceGuard g(wrapper->cuda_gpu_id_);
     auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_);
     if (cudaResult == cudaSuccess) {
       wrapper->status_ = EventStatus::EVENT_SUCCESS;
@@ -127,7 +127,7 @@ void EventWaitCUDACUDA(const Event* event, void* context) {
     if (context_stream != event_stream) {
       // CAFFE_ENFORCE_EQ(
       //    CaffeCudaGetDevice(),
-      //    static_cast<const CUDAContext*>(context)->device_id());
+      //    static_cast<const CUDAContext*>(context)->cuda_gpu_id());
       CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0));
     }
   }
diff --git a/caffe2/core/hip/event_hip.cc b/caffe2/core/hip/event_hip.cc
index ebec9c593e6..6f0db4642dd 100644
--- a/caffe2/core/hip/event_hip.cc
+++ b/caffe2/core/hip/event_hip.cc
@@ -138,7 +138,7 @@ void EventWaitHIPHIP(const Event* event, void* context)
         {
             // CAFFE_ENFORCE_EQ(
             //    CaffeCudaGetDevice(),
-            //    static_cast<const CUDAContext*>(context)->device_id());
+            //    static_cast<const CUDAContext*>(context)->cuda_gpu_id());
             HIP_CHECK(hipStreamWaitEvent(context_stream, wrapper->hip_event_, 0));
         }
     }
diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc
index 87633fadebe..d9816e787ba 100644
--- a/caffe2/core/memonger.cc
+++ b/caffe2/core/memonger.cc
@@ -176,7 +176,7 @@ class ComputeBlobRecyclingForDag {
         // cuda device option but whose inputs/outputs are on CPU
         if (net.op(op_index).type() == "CopyGPUToCPU") {
           blob_device_[output].set_device_type(0);
-          blob_device_[output].set_device_id(0);
+          blob_device_[output].set_cuda_gpu_id(0);
         }
       }
     }
@@ -478,7 +478,7 @@ class ComputeBlobRecyclingForDag {
       const DeviceOption& device_option) {
     const DeviceOption& blob_device = blob_device_[blob_name];
     if (device_option.device_type() != blob_device.device_type() ||
-        device_option.device_id() != blob_device.device_id()) {
+        device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) {
       return false;
     }
     for (const int token : req_tokens_[blob_name]) {
diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc
index acc30e56517..fe4b57cd332 100644
--- a/caffe2/core/net_async_base.cc
+++ b/caffe2/core/net_async_base.cc
@@ -157,7 +157,7 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) {
         numa_node_id);
     return poolGetter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_);
   } else if (device_option.device_type() == PROTO_CUDA) {
-    auto gpu_id = device_option.device_id();
+    auto gpu_id = device_option.cuda_gpu_id();
     CAFFE_ENFORCE(
         gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus,
         "Invalid GPU id: " + caffe2::to_string(gpu_id));
@@ -173,7 +173,7 @@ int AsyncNetBase::stream(int task_id) {
   const auto& device_option = event(task_id).GetDeviceOption();
   int stream_id = 0;
   if (device_option.device_type() == PROTO_CUDA) {
-    int gpu_id = device_option.device_id();
+    int gpu_id = device_option.cuda_gpu_id();
     CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id));
     if ((unsigned)gpu_id >= getStreamCounters().size()) {
       getStreamCounters().resize(gpu_id + 1, 0);
diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc
index 86d0b4d1d27..550a760826e 100644
--- a/caffe2/core/net_async_dag_gpu.cc
+++ b/caffe2/core/net_async_dag_gpu.cc
@@ -112,7 +112,7 @@ AsyncDAGNet::AsyncDAGNet(
 int AsyncDAGNet::stream(const DeviceOption& device_option) {
   int stream_id = 0;
   if (device_option.device_type() == PROTO_CUDA) {
-    int gpu_id = device_option.device_id();
+    int gpu_id = device_option.cuda_gpu_id();
     CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id));
     if ((unsigned)gpu_id >= stream_counters_.size()) {
       stream_counters_.resize(gpu_id + 1, 0);
diff --git a/caffe2/core/net_gpu_test.cc b/caffe2/core/net_gpu_test.cc
index fab56112ec2..eaea9377f9b 100644
--- a/caffe2/core/net_gpu_test.cc
+++ b/caffe2/core/net_gpu_test.cc
@@ -124,7 +124,7 @@ TEST(NetTest, DISABLED_ChainingForDifferentDevices) {
           type: "NetTestDummy"
           device_option {
             device_type: 1
-            device_id: 1
+            cuda_gpu_id: 1
           }
         }
 )DOC";
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index 8115ae3aab6..79be08c03b2 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -649,7 +649,7 @@ std::map<string, std::pair<DeviceOption, DeviceOption>> ValidateTensorDevices(
           &blob_device);
 
       if (blob_device.device_type() == PROTO_CUDA &&
-          blob_device.device_id() != op_device.device_id()) {
+          blob_device.cuda_gpu_id() != op_device.cuda_gpu_id()) {
         mismatches[blob_name] = std::make_pair(op_device, blob_device);
       } else if (
           blob_device.device_type() == PROTO_HIP &&
diff --git a/caffe2/mkl/utils/mkl_memory.cc b/caffe2/mkl/utils/mkl_memory.cc
index 9d4f347a13c..3f05f9c5d24 100644
--- a/caffe2/mkl/utils/mkl_memory.cc
+++ b/caffe2/mkl/utils/mkl_memory.cc
@@ -26,7 +26,7 @@ static vector<int64_t> GetMKLTensorInfo(
   const mkl::MKLMemory<T>* tc = static_cast<const mkl::MKLMemory<T>*>(c);
   *capacity = tc->size() * sizeof(T);
   device->set_device_type(PROTO_MKLDNN);
-  device->set_device_id(0);
+  device->set_cuda_gpu_id(0);
   return tc->dims();
 }
 
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc
index 5bd9b0a11b0..bf4e20b7904 100644
--- a/caffe2/observers/profile_observer_gpu.cc
+++ b/caffe2/observers/profile_observer_gpu.cc
@@ -70,7 +70,7 @@ void ProfileOperatorObserver::Start() {
     int device;
     cudaGetDevice(&device);
 
-    cudaSetDevice(context->device_id());
+    cudaSetDevice(context->cuda_gpu_id());
     cudaEventCreate(&start_);
     cudaEventRecord(start_, context->cuda_stream());
 
@@ -92,7 +92,7 @@ void ProfileOperatorObserver::Stop() {
     int device;
     cudaGetDevice(&device);
 
-    cudaSetDevice(context->device_id());
+    cudaSetDevice(context->cuda_gpu_id());
     cudaEventCreate(&stop_);
     cudaEventRecord(stop_, context->cuda_stream());
     cudaEventSynchronize(stop_);
diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc
index 8a21fa0acf6..2350910febf 100644
--- a/caffe2/onnx/backend.cc
+++ b/caffe2/onnx/backend.cc
@@ -65,7 +65,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) {
       {DeviceType::CUDA, caffe2::DeviceType::CUDA}};
   caffe2::DeviceOption d;
   d.set_device_type(static_cast<int32_t>(m.at(onnx_device.type)));
-  d.set_device_id(onnx_device.device_id);
+  d.set_cuda_gpu_id(onnx_device.device_id);
   return d;
 }
 
diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc
index f81b7789699..eaa90b3dcdb 100644
--- a/caffe2/operators/load_save_op_gpu.cc
+++ b/caffe2/operators/load_save_op_gpu.cc
@@ -9,7 +9,7 @@ void LoadOp<CUDAContext>::SetCurrentDevice(BlobProto* proto) {
     proto->mutable_tensor()->clear_device_detail();
     auto* device_detail = proto->mutable_tensor()->mutable_device_detail();
     device_detail->set_device_type(PROTO_CUDA);
-    device_detail->set_device_id(CaffeCudaGetDevice());
+    device_detail->set_cuda_gpu_id(CaffeCudaGetDevice());
   }
 }
 
diff --git a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc
index 061f54d3a4c..e16e2073f7f 100644
--- a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc
+++ b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc
@@ -72,11 +72,11 @@ void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) {
       if (gpu_id == -1 &&
           rnn_op.op->device_option().device_type() ==
               DeviceTypeProto::PROTO_CUDA) {
-        gpu_id = rnn_op.op->device_option().device_id();
+        gpu_id = rnn_op.op->device_option().cuda_gpu_id();
       } else {
         CAFFE_ENFORCE(
             rnn_op.op->device_option().device_type() == 0 ||
-                rnn_op.op->device_option().device_id() == gpu_id,
+                rnn_op.op->device_option().cuda_gpu_id() == gpu_id,
             "RNN Executor only supports ops on one GPU");
       }
 
diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto
index 63a2a256ded..9dc745edbdf 100644
--- a/caffe2/proto/caffe2.proto
+++ b/caffe2/proto/caffe2.proto
@@ -183,7 +183,7 @@ message DeviceOption {
   // optional DeviceType device_type = 1 [ default = CPU ];
   optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU.
   // [CUDA specific] the cuda gpu id.
-  optional int32 device_id = 2;
+  optional int32 cuda_gpu_id = 2;
   // [general] The random seed to start the device random number generator with.
   optional uint32 random_seed = 3;
   // [general] What node this op should execute on.
diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h
index e0eb8e8dcdc..ded59d52b21 100644
--- a/caffe2/proto/caffe2_pb.h
+++ b/caffe2/proto/caffe2_pb.h
@@ -86,12 +86,54 @@ inline CAFFE2_API caffe2::DeviceOption DeviceToOption(
   caffe2::DeviceOption option;
   auto type = device.type();
   option.set_device_type(TypeToProto(type));
-  option.set_device_id(device.index());
+
+  switch (type) {
+    case DeviceType::CPU:
+      if (device.index() != -1) {
+        option.set_numa_node_id(device.index());
+      }
+      break;
+    case DeviceType::CUDA:
+      option.set_cuda_gpu_id(device.index());
+      break;
+    case DeviceType::HIP:
+      option.set_hip_gpu_id(device.index());
+      break;
+    case DeviceType::OPENGL:
+    case DeviceType::OPENCL:
+    case DeviceType::MKLDNN:
+    case DeviceType::IDEEP:
+    case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
+    case DeviceType::ONLY_FOR_TEST:
+      break;
+    default:
+      AT_ERROR(
+          "Unknown device:",
+          static_cast<int32_t>(type),
+          ". If you have recently updated the caffe2.proto file to add a new "
+          "device type, did you forget to update the ProtoToType() and TypeToProto"
+          "function to reflect such recent changes?");
+  }
   return option;
 }
 
 inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) {
-  return at::Device(ProtoToType(option.device_type()), option.device_id());
+  auto type = option.device_type();
+  int32_t id = -1;
+  switch (type) {
+    case caffe2::PROTO_CPU:
+      if (option.has_numa_node_id()) {
+        id = option.numa_node_id();
+      }
+      break;
+    case caffe2::PROTO_CUDA:
+      id = option.cuda_gpu_id();
+      break;
+    case caffe2::PROTO_HIP:
+      id = option.hip_gpu_id();
+      break;
+  }
+  return at::Device(ProtoToType(type), id);
 }
 
 } // namespace caffe2
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index f9ccf92d750..f927020e6ae 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -236,5 +236,5 @@ class CNNModelHelper(ModelHelper):
     def GPU(self, gpu_id=0):
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = gpu_id
+        device_option.cuda_gpu_id = gpu_id
         return device_option
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 4f683daa368..6850c02fc13 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -84,7 +84,7 @@ def IsOperatorWithEngine(op_type, engine):
 
 def DeviceOption(
     device_type,
-    device_id=0,
+    cuda_gpu_id=0,
     random_seed=None,
     node_name=None,
     numa_node_id=None,
@@ -92,7 +92,7 @@ def DeviceOption(
 ):
     option = caffe2_pb2.DeviceOption()
     option.device_type = device_type
-    option.device_id = device_id
+    option.cuda_gpu_id = cuda_gpu_id
     if node_name is not None:
         option.node_name = node_name
     if random_seed is not None:
@@ -115,7 +115,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr
     if not opt1.device_type or not opt2.device_type:
         # At least one option is for CPU, check if both are for CPU.
         return not opt1.device_type and not opt2.device_type
-    return opt1.device_id == opt2.device_id
+    return opt1.cuda_gpu_id == opt2.cuda_gpu_id
 
 
 def InferBlobDevices(net):
@@ -2111,7 +2111,7 @@ class Net(object):
         """A convenient function to run everything on the GPU."""
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = gpu_id
+        device_option.cuda_gpu_id = gpu_id
         self._net.device_option.CopyFrom(device_option)
         if use_cudnn:
             for op in self._net.op:
@@ -2286,7 +2286,7 @@ def copy_func_between_devices(src, dst):
         return None
 
     if src.device_type == CUDA and dst.device_type == CUDA:
-        if src.device_id == dst.device_id:
+        if src.cuda_gpu_id == dst.cuda_gpu_id:
             return None
         else:
             def fun(net, *args, **kw):
@@ -2312,10 +2312,10 @@ def copy_func_between_devices(src, dst):
 def device_equal(src, dst):
     '''
     We are using this fucntion instead of == operator because optional-value
-    comparison between empty device_options and {device_type:0, device_id:0}
+    comparison between empty device_options and {device_type:0, cuda_gpu_id:0}
     returns not equal in some cases.
     '''
-    return src.device_type == dst.device_type and src.device_id == dst.device_id
+    return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id
 
 
 def update_placeholder_op_output(op, blob_to_device):
@@ -2429,7 +2429,7 @@ def InjectCrossDeviceCopies(net, blob_to_device=None, blob_remap=None,
                         if device_option.device_type == CPU:
                             suffix = '_cpu'
                         elif device_option.device_type == CUDA:
-                            suffix = '_cuda_' + str(device_option.device_id)
+                            suffix = '_cuda_' + str(device_option.cuda_gpu_id)
                         else:
                             raise RuntimeError(
                                 "Unknown device type: {}".
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py
index 2f6dedbfd80..7120843f331 100644
--- a/caffe2/python/core_test.py
+++ b/caffe2/python/core_test.py
@@ -83,17 +83,17 @@ class TestScopes(test_util.TestCase):
         # explicitly setting a device
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         op = core.CreateOperator("Relu", "x", "y", device_option=device_option)
         self.assertTrue(op.HasField('device_option'))
         self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         with core.DeviceScope(device_option):
             # from device scope
             op = core.CreateOperator("Relu", "x", "y")
             self.assertTrue(op.HasField('device_option'))
             self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-            self.assertEqual(op.device_option.device_id, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
             # from an overridden device option
             override_device = caffe2_pb2.DeviceOption()
             override_device.device_type = caffe2_pb2.CPU
@@ -109,13 +109,13 @@ class TestScopes(test_util.TestCase):
     def testNameAndDeviceScopeTogether(self):
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         with core.DeviceScope(device_option):
             with core.NameScope("foo"):
                 op = core.CreateOperator("Relu", "x", "y")
                 self.assertTrue(op.HasField('device_option'))
                 self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-                self.assertEqual(op.device_option.device_id, 1)
+                self.assertEqual(op.device_option.cuda_gpu_id, 1)
                 self.assertEqual(len(op.input), 1)
                 self.assertEqual(op.input[0], "foo/x")
                 self.assertEqual(len(op.output), 1)
@@ -255,7 +255,7 @@ class TestCreateOperator(test_util.TestCase):
     def testCreate(self):
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         op = core.CreateOperator(
             "Ludicrous", "x", "y", name="ludicrous",
             control_input="z", device_option=device_option,
@@ -271,7 +271,7 @@ class TestCreateOperator(test_util.TestCase):
         self.assertEqual(op.control_input[0], "z")
         self.assertTrue(op.HasField('device_option'))
         self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertTrue(len(op.arg), 3)
 
         # can't guarantee ordering of kwargs, so generate a set of args
@@ -574,7 +574,7 @@ class TestDeviceOption(test_util.TestCase):
         opt2 = caffe2_pb2.DeviceOption()
         opt1.device_type = 0
         self.assertTrue(core.device_option_equal(opt1, opt2))
-        opt1.device_id = 5
+        opt1.cuda_gpu_id = 5
         # opt1 still is on CPU, so the options should be equal
         self.assertTrue(core.device_option_equal(opt1, opt2))
         opt2.device_type = 0
@@ -649,7 +649,7 @@ class TestInferDevice(test_util.TestCase):
     def setUp(self):
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         self.cuda_option = device_option
         self.cpu_option = caffe2_pb2.DeviceOption()
 
@@ -748,7 +748,7 @@ class TestInferDevice(test_util.TestCase):
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
         bias = init_net.ConstantFill([], 'fc_b', shape=[10, ])
 
@@ -765,7 +765,7 @@ class TestInferDevice(test_util.TestCase):
         self.assertEqual(op.input[1], "fc_w_cuda_1")
         self.assertEqual(op.input[2], "fc_b_cuda_1")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU")
         self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU")
         self.assertNotEqual(blob_to_device["fc_w"], device_option)
@@ -775,7 +775,7 @@ class TestInferDevice(test_util.TestCase):
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
         bias = init_net.ConstantFill([], 'fc_b', shape=[10, ])
         const = init_net.ConstantFill([], 'const', shape=[], value=1.)
@@ -791,12 +791,12 @@ class TestInferDevice(test_util.TestCase):
         op = nets[1]._net.op[0]
         self.assertEqual(op.type, "CopyCPUToGPU")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.output[0], "fc_w_cuda_1")
         op = nets[1]._net.op[1]
         self.assertEqual(op.type, "CopyCPUToGPU")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.output[0], "fc_b_cuda_1")
         op = nets[1]._net.op[2]
         self.assertEqual(op.type, "FC")
@@ -804,7 +804,7 @@ class TestInferDevice(test_util.TestCase):
         self.assertEqual(op.input[1], "fc_w_cuda_1")
         self.assertEqual(op.input[2], "fc_b_cuda_1")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         op = nets[1]._net.op[3]
         self.assertEqual(op.type, "Add")
         self.assertEqual(op.input[0], "fc1")
@@ -822,7 +822,7 @@ op {
   type: "CopyCPUToGPU"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 op {
@@ -832,7 +832,7 @@ op {
   type: "CopyCPUToGPU"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 op {
@@ -844,7 +844,7 @@ op {
   type: "FC"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 op {
@@ -855,7 +855,7 @@ op {
   type: "Add"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 external_input: "data"
@@ -870,7 +870,7 @@ external_input: "const_cuda_1"
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
 
         with core.DeviceScope(device_option):
             weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
@@ -887,7 +887,7 @@ external_input: "const_cuda_1"
         self.assertEqual(op.input[1], "fc_w")
         self.assertEqual(op.input[2], "fc_b")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         """
 For reference, net.Proto() should be like:
 name: ""
@@ -900,7 +900,7 @@ op {
   type: "FC"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 external_input: "data"
@@ -912,7 +912,7 @@ external_input: "fc_b"
         net = core.Net("test")
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
 
         with core.DeviceScope(device_option):
             net.Relu("data", "relu1")
@@ -920,10 +920,10 @@ external_input: "fc_b"
         with core.DeviceScope(device_option):
             net.Relu("data", "relu3")
         net.Relu("data", "relu4")
-        device_option.device_id = 0
+        device_option.cuda_gpu_id = 0
         with core.DeviceScope(device_option):
             net.Relu("data", "relu5")
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
         with core.DeviceScope(device_option):
             net.Relu("data", "relu6")
 
@@ -931,12 +931,12 @@ external_input: "fc_b"
         op = new_net._net.op[0]
         self.assertEqual(op.type, "CopyCPUToGPU")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.output[0], "data_cuda_1")
         op = new_net._net.op[1]
         self.assertEqual(op.type, "Relu")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.output[0], "relu1")
         op = new_net._net.op[2]
         self.assertEqual(op.type, "Relu")
@@ -945,7 +945,7 @@ external_input: "fc_b"
         op = new_net._net.op[3]
         self.assertEqual(op.type, "Relu")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.input[0], "data_cuda_1")
         self.assertEqual(op.output[0], "relu3")
         op = new_net._net.op[4]
@@ -955,18 +955,18 @@ external_input: "fc_b"
         op = new_net._net.op[5]
         self.assertEqual(op.type, "CopyCPUToGPU")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 0)
+        self.assertEqual(op.device_option.cuda_gpu_id, 0)
         self.assertEqual(op.output[0], "data_cuda_0")
         op = new_net._net.op[6]
         self.assertEqual(op.type, "Relu")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 0)
+        self.assertEqual(op.device_option.cuda_gpu_id, 0)
         self.assertEqual(op.input[0], "data_cuda_0")
         self.assertEqual(op.output[0], "relu5")
         op = new_net._net.op[7]
         self.assertEqual(op.type, "Relu")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 1)
+        self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.input[0], "data_cuda_1")
         self.assertEqual(op.output[0], "relu6")
         """
@@ -979,7 +979,7 @@ op {
   type: "CopyCPUToGPU"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 op {
@@ -989,7 +989,7 @@ op {
   type: "Relu"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 op {
@@ -1005,7 +1005,7 @@ op {
   type: "Relu"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 op {
@@ -1021,7 +1021,7 @@ op {
   type: "CopyCPUToGPU"
   device_option {
     device_type: 1
-    device_id: 0
+    cuda_gpu_id: 0
   }
 }
 op {
@@ -1031,7 +1031,7 @@ op {
   type: "Relu"
   device_option {
     device_type: 1
-    device_id: 0
+    cuda_gpu_id: 0
   }
 }
 op {
@@ -1041,7 +1041,7 @@ op {
   type: "Relu"
   device_option {
     device_type: 1
-    device_id: 1
+    cuda_gpu_id: 1
   }
 }
 external_input: "data"
@@ -1060,7 +1060,7 @@ external_input: "data"
             cpu_device[i].node_name = 'node:' + str(i)
             gpu_device.append(caffe2_pb2.DeviceOption())
             gpu_device[i].device_type = caffe2_pb2.CUDA
-            gpu_device[i].device_id = 0
+            gpu_device[i].cuda_gpu_id = 0
             gpu_device[i].node_name = 'node:' + str(i)
         send_node = 'node:0'
         recv_node = 'node:1'
@@ -1100,12 +1100,12 @@ external_input: "data"
         op = init_net._net.op[2]
         self.assertEqual(op.type, "CopyGPUToCPU")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 0)
+        self.assertEqual(op.device_option.cuda_gpu_id, 0)
         self.assertEqual(op.output[0], "fc_w_cpu")
         op = init_net._net.op[3]
         self.assertEqual(op.type, "CopyGPUToCPU")
         self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.device_id, 0)
+        self.assertEqual(op.device_option.cuda_gpu_id, 0)
         self.assertEqual(op.output[0], "fc_b_cpu")
         op = init_net._net.op[4]
         self.assertEqual(op.type, placeholder_send)
@@ -1128,7 +1128,7 @@ external_input: "data"
         net = core.Net("test")
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.device_id = 1
+        device_option.cuda_gpu_id = 1
 
         net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment'])
         with core.DeviceScope(device_option):
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
index 749c8b12c93..89770dc6ea7 100644
--- a/caffe2/python/data_parallel_model.py
+++ b/caffe2/python/data_parallel_model.py
@@ -813,7 +813,7 @@ def ConvertNetForDevice(net, device=None):
 
     device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu"
 
-    namescope = "{}_{}/".format(device_prefix, device.device_id)
+    namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id)
     for op in mnet.Proto().op:
         if "RecurrentNetwork" in op.type:
             raise("RecurrentNetwork conversion not yet supported")
@@ -1540,7 +1540,7 @@ def _AnalyzeOperators(model):
             continue
 
         op_dev = op.device_option
-        op_gpu = op_dev.device_id
+        op_gpu = op_dev.cuda_gpu_id
 
         # This avoids failing on operators that are only for CPU
         if op_dev.device_type != caffe2_pb2.CUDA:
@@ -1904,7 +1904,7 @@ def _InterleaveOps(model):
     new_ops = []
     ops = {d: [] for d in range(num_devices)}
     for op in orig_ops:
-        ops[op.device_option.device_id].append(op)
+        ops[op.device_option.cuda_gpu_id].append(op)
 
     for j in range(num_ops_per_dev):
         tp = None
diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py
index 8470df15887..5cc18f99bd9 100644
--- a/caffe2/python/hypothesis_test_util.py
+++ b/caffe2/python/hypothesis_test_util.py
@@ -259,7 +259,7 @@ device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support
 
 # Include device option for each GPU
 expanded_device_options = [cpu_do] + (
-    [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, device_id=i)
+    [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i)
      for i in range(workspace.NumCudaDevices())]
     if workspace.has_gpu_support else [])
 
diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py
index 1e881d27f49..f8e3f32bb2c 100644
--- a/caffe2/python/model_helper.py
+++ b/caffe2/python/model_helper.py
@@ -596,7 +596,7 @@ def ExtractPredictorNet(
                             rename_list(step_op.output)
                             if device is not None:
                                 step_op.device_option.device_type = device.device_type
-                                step_op.device_option.device_id = device.device_id
+                                step_op.device_option.cuda_gpu_id = device.cuda_gpu_id
 
                         rename_list(arg.n.external_input)
                         rename_list(arg.n.external_output)
@@ -610,7 +610,7 @@ def ExtractPredictorNet(
 
             if device is not None:
                 op.device_option.device_type = device.device_type
-                op.device_option.device_id = device.device_id
+                op.device_option.cuda_gpu_id = device.cuda_gpu_id
             validate_op(op)
             predict_proto.op.extend([op])
             known_blobs.update(op.output)
diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py
index 2f2b5aced66..b407f96d239 100644
--- a/caffe2/python/muji.py
+++ b/caffe2/python/muji.py
@@ -26,7 +26,7 @@ def OnGPU(gpu_id):
   """
     device_option = caffe2_pb2.DeviceOption()
     device_option.device_type = caffe2_pb2.CUDA
-    device_option.device_id = gpu_id
+    device_option.cuda_gpu_id = gpu_id
     return device_option
 
 
diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py
index 7583f863b1f..4b5cddb61d2 100644
--- a/caffe2/python/net_printer.py
+++ b/caffe2/python/net_printer.py
@@ -268,11 +268,11 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False):
 
 def format_device_option(dev_opt):
     if not dev_opt or not (
-            dev_opt.device_type or dev_opt.device_id or dev_opt.node_name):
+            dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name):
         return None
     return call(
         'DeviceOption',
-        [dev_opt.device_type, dev_opt.device_id, "'%s'" % dev_opt.node_name])
+        [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name])
 
 
 @Printer.register(OperatorDef)
diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py
index 3178345cf46..8d3a362dcdf 100644
--- a/caffe2/python/numa_test.py
+++ b/caffe2/python/numa_test.py
@@ -27,7 +27,7 @@ def build_test_net(net_name):
 
     gpu_device_option = caffe2_pb2.DeviceOption()
     gpu_device_option.device_type = caffe2_pb2.CUDA
-    gpu_device_option.device_id = 0
+    gpu_device_option.cuda_gpu_id = 0
 
     net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu",
                         device_option=gpu_device_option)
diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py
index 5802e49de52..8cc3f9e2fa9 100644
--- a/caffe2/python/onnx/backend_rep.py
+++ b/caffe2/python/onnx/backend_rep.py
@@ -24,7 +24,7 @@ class Caffe2Rep(BackendRep):
     @property
     def _name_scope(self):
         if self.predict_net.device_option.device_type == caffe2_pb2.CUDA:
-            return 'gpu_{}'.format(self.predict_net.device_option.device_id)
+            return 'gpu_{}'.format(self.predict_net.device_option.cuda_gpu_id)
         return ''
 
     def run(self, inputs, **kwargs):
diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py
index 8e3817034d4..2d53027a0a0 100644
--- a/caffe2/python/operator_test/load_save_test.py
+++ b/caffe2/python/operator_test/load_save_test.py
@@ -91,7 +91,7 @@ class TestLoadSaveBase(test_util.TestCase):
                     self.assertEqual(proto.tensor.device_detail.device_type,
                                      device_type)
                     if device_type == caffe2_pb2.CUDA:
-                        self.assertEqual(proto.tensor.device_detail.device_id,
+                        self.assertEqual(proto.tensor.device_detail.cuda_gpu_id,
                                          gpu_id)
 
             blobs = [str(i) for i in range(len(arrays))]
diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py
index 66ac07dbdca..9d9bb38e178 100644
--- a/caffe2/python/operator_test/rnn_cell_test.py
+++ b/caffe2/python/operator_test/rnn_cell_test.py
@@ -1216,7 +1216,7 @@ class RNNCellTest(hu.HypothesisTestCase):
                     if arg.name == "step_net":
                         for step_op in arg.n.op:
                             self.assertEqual(0, step_op.device_option.device_type)
-                            self.assertEqual(1, step_op.device_option.device_id)
+                            self.assertEqual(1, step_op.device_option.cuda_gpu_id)
                     elif arg.name == 'backward_step_net':
                         self.assertEqual(caffe2_pb2.NetDef(), arg.n)
 
diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index ddd5871f7d4..0c5b18b0b6a 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -83,7 +83,7 @@ class Optimizer(object):
 
         if current_scope.device_type == caffe2_pb2.CUDA:
             return self.get_gpu_blob_name(
-                base_str, current_scope.device_id, current_scope.node_name
+                base_str, current_scope.cuda_gpu_id, current_scope.node_name
             )
         else:
             return self.get_cpu_blob_name(base_str, current_scope.node_name)
@@ -279,7 +279,7 @@ class SgdOptimizer(Optimizer):
         # to include device information.
         ONE = param_init_net.ConstantFill(
             [],
-            "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name),
+            "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name),
             shape=[1],
             value=1.0
         )
@@ -488,12 +488,12 @@ class WeightDecayBuilder(Optimizer):
 
         ONE = param_init_net.ConstantFill(
             [],
-            "ONE_{}_{}".format(dev.device_type, dev.device_id),
+            "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
             shape=[1],
             value=1.0
         )
         WD = param_init_net.ConstantFill(
-            [], "wd_{}_{}".format(dev.device_type, dev.device_id),
+            [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
             shape=[1], value=self.weight_decay
         )
 
@@ -1160,7 +1160,7 @@ class RmsPropOptimizer(Optimizer):
 
         ONE = param_init_net.ConstantFill(
             [],
-            "ONE_{}_{}".format(dev.device_type, dev.device_id),
+            "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
             shape=[1],
             value=1.0
         )
diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py
index ef11246bdfc..b4c71535deb 100644
--- a/caffe2/python/predictor/predictor_exporter_test.py
+++ b/caffe2/python/predictor/predictor_exporter_test.py
@@ -193,7 +193,7 @@ class PredictorExporterTest(unittest.TestCase):
 
         # check device options
         for op in list(init_net.Proto().op) + list(predict_init_net.Proto().op):
-            self.assertEqual(1, op.device_option.device_id)
+            self.assertEqual(1, op.device_option.cuda_gpu_id)
             self.assertEqual(caffe2_pb2.CPU, op.device_option.device_type)
 
     def test_db_fails_without_params(self):
diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h
index 6db4ae42b84..679152c7881 100644
--- a/caffe2/python/pybind_state_dlpack.h
+++ b/caffe2/python/pybind_state_dlpack.h
@@ -34,7 +34,7 @@ class DLPackWrapper {
         "Unsupported device type: ",
         device_option.device_type());
     tensor_context.device_type = *device_type_ptr;
-    tensor_context.device_id = device_option.device_id();
+    tensor_context.device_id = device_option.cuda_gpu_id();
 
     if (tensor->size() <= 0) {
       tensor->Resize(0);
@@ -87,7 +87,7 @@ class DLPackWrapper {
     int dlpack_device_id = dlTensor->ctx.device_id;
     CAFFE_ENFORCE_EQ(
         dlpack_device_id,
-        device_option.device_id(),
+        device_option.cuda_gpu_id(),
         "Expected same device id for DLPack and C2 tensors");
 
     std::vector<int64_t> dims;
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
index dd80282238a..dc8e088eba9 100644
--- a/caffe2/utils/proto_utils.cc
+++ b/caffe2/utils/proto_utils.cc
@@ -30,7 +30,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) {
     case PROTO_CPU:
       return option.numa_node_id();
     case PROTO_CUDA:
-      return option.device_id();
+      return option.cuda_gpu_id();
     case PROTO_MKLDNN:
       return option.numa_node_id();
     case PROTO_HIP:
@@ -43,7 +43,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) {
 C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) {
   return (
       lhs.device_type() == rhs.device_type() &&
-      lhs.device_id() == rhs.device_id() &&
+      lhs.cuda_gpu_id() == rhs.cuda_gpu_id() &&
       lhs.hip_gpu_id() == rhs.hip_gpu_id() &&
       lhs.node_name() == rhs.node_name() &&
       lhs.numa_node_id() == rhs.numa_node_id());
diff --git a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc
index 5d8fb86b34e..c9f37f4c98c 100644
--- a/caffe2/utils/proto_utils_test.cc
+++ b/caffe2/utils/proto_utils_test.cc
@@ -11,9 +11,9 @@ TEST(ProtoUtilsTest, IsSameDevice) {
   EXPECT_FALSE(IsSameDevice(a, b));
   b.set_node_name("my_node");
   EXPECT_TRUE(IsSameDevice(a, b));
-  b.set_device_id(2);
+  b.set_cuda_gpu_id(2);
   EXPECT_FALSE(IsSameDevice(a, b));
-  a.set_device_id(2);
+  a.set_cuda_gpu_id(2);
   EXPECT_TRUE(IsSameDevice(a, b));
   a.set_device_type(DeviceTypeProto::PROTO_CUDA);
   b.set_device_type(DeviceTypeProto::PROTO_CPU);
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 3a98a4cb7d9..113403fd87b 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -2216,7 +2216,7 @@ CAFFE2_SPECIFIC_MAPPINGS = {
     "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2),
     "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2),
     "curand_generator" : ("hiprand_generator", API_CAFFE2),
-    "device_id" : ("hip_gpu_id", API_CAFFE2),
+    "cuda_gpu_id" : ("hip_gpu_id", API_CAFFE2),
     "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2),
 }