diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 1625ce40834..72a2ea2cc10 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -228,6 +228,15 @@ test_libtorch() {
   fi
 }
 
+test_distributed() {
+  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+    echo "Testing distributed C++ tests"
+    mkdir -p test/test-reports/cpp-distributed
+    build/bin/ProcessGroupGlooTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupGlooTest.xml
+    build/bin/ProcessGroupNCCLErrorsTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupNCCLErrorsTest.xml
+  fi
+}
+
 test_custom_backend() {
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
     echo "Testing custom backends"
@@ -371,4 +380,5 @@ else
   test_custom_script_ops
   test_custom_backend
   test_torch_function_benchmark
+  test_distributed
 fi
diff --git a/torch/lib/c10d/test/CUDATest.cu b/torch/lib/c10d/test/CUDATest.cu
index 870c54cd665..c47b29ea536 100644
--- a/torch/lib/c10d/test/CUDATest.cu
+++ b/torch/lib/c10d/test/CUDATest.cu
@@ -21,7 +21,7 @@ void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) {
 
 int cudaNumDevices() {
   int n = 0;
-  AT_CUDA_CHECK(cudaGetDeviceCount(&n));
+  C10_CUDA_CHECK_WARN(cudaGetDeviceCount(&n));
   return n;
 }
 
diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp
index 1f2ee1c2e30..bdca745cbd8 100644
--- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp
+++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp
@@ -224,7 +224,7 @@ void testBroadcast(const std::string& path, const at::DeviceType b) {
 
   std::vector<std::vector<at::Tensor>> inputs(size);
 
-  // Try every permutation of root rank and root tensoro
+  // Try every permutation of root rank and root tensor
   for (auto i = 0; i < size; i++) {
     for (auto j = 0; j < stride; j++) {
       // Initialize inputs
@@ -548,7 +548,7 @@ TEST(ProcessGroupGlooTest, testAllReduceCUDA) {
 
 TEST(ProcessGroupGlooTest, testBroadcastCUDA) {
   {
-    if (torch::cuda::is_available()) {
+    if (torch::cuda::device_count() > 1) {
       TemporaryFile file;
       testBroadcast(file.path, at::DeviceType::CUDA);
     }