diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
index 1ed6c4417df..974011fb4f3 100644
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -165,22 +165,26 @@ void CUDAGraph::replay() {
   TORCH_CHECK(has_graph_exec_,
              "Called CUDAGraph::replay without a preceding successful capture.");
 
+  c10::OptionalDeviceGuard device_guard{capture_stream_.device()};
+
+  // Just like any RNG consumer kernel!
+  auto* gen = get_generator_or_default<CUDAGeneratorImpl>(
+      c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
+  PhiloxCudaState rng_engine_inputs;
   {
-    c10::OptionalDeviceGuard device_guard{capture_stream_.device()};
-
-    // Just like any RNG consumer kernel!
-    auto* gen = get_generator_or_default<CUDAGeneratorImpl>(
-        c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
-    PhiloxCudaState rng_engine_inputs;
-    {
-      std::lock_guard<std::mutex> lock(gen->mutex_);
-      rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_);
-    }
-    offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val));
-
-    // graph_exec_ may be replayed in any stream.
-    AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream()));
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_);
   }
+  offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val));
+
+  // graph_exec_ may be replayed in any stream.
+  AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream()));
+
+  // Temporary workaround for bug in libcuda.so that causes replayed graphs
+  // with certain topologies to be corrupted (kernels elided, internal syncs
+  // ignored) when replayed back to back without a sync in between.
+  // I hate to use a hard sync, but it's the only surefire workaround at the moment.
+  cudaDeviceSynchronize();
 #else
   TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0");
 #endif
diff --git a/test/test_cuda.py b/test/test_cuda.py
index d72fa3ac701..ae93b6bd50b 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3187,6 +3187,8 @@ torch.cuda.synchronize()
         torch.cuda.synchronize()
         torch.cuda.empty_cache()
 
+    @unittest.skip("Temporarily disabled due to a graphs bug in libcuda.so, " +
+                   "see https://github.com/pytorch/pytorch/pull/57556")
     @unittest.skipIf((not TEST_CUDA) or
                      TEST_WITH_ROCM or
                      int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
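For context, here is a minimal, hypothetical repro sketch (not part of the patch) of the back-to-back replay pattern the hard sync guards against. It assumes only the `at::cuda::CUDAGraph` capture API from the file touched above (`capture_begin`/`capture_end`/`replay`); the standalone `main()` harness, tensor names, and sizes are illustrative:

```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

int main() {
  // Allocate outside capture so each replay mutates stable memory in place.
  at::Tensor counter = at::zeros({1000}, at::device(at::kCUDA));

  // Graph capture must run on a non-default stream.
  c10::cuda::CUDAStream side_stream = c10::cuda::getStreamFromPool();
  at::cuda::CUDAGraph graph;
  {
    c10::cuda::CUDAStreamGuard guard(side_stream);
    graph.capture_begin();
    counter.add_(1);  // recorded into the graph, not executed here
    graph.capture_end();
  }

  // Back-to-back replays with no user-side sync in between: the pattern
  // that can corrupt replayed graphs on affected libcuda.so versions.
  // With this patch, replay() itself ends in a device-wide sync, so the
  // second replay launches against a quiesced device.
  graph.replay();
  graph.replay();

  AT_CUDA_CHECK(cudaDeviceSynchronize());
  TORCH_CHECK(counter.sum().item<double>() == 2000.0);
  return 0;
}
```

The cost of the workaround is visible in this pattern: the device-wide sync serializes every replay against all outstanding GPU work, which is exactly why the comment in the patch calls it temporary.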