diff --git a/benchmarks/tensorexpr/benchmark.py b/benchmarks/tensorexpr/benchmark.py
index c560ff57a34..7a5b255da90 100644
--- a/benchmarks/tensorexpr/benchmark.py
+++ b/benchmarks/tensorexpr/benchmark.py
@@ -228,14 +228,15 @@ def cuda_pointwise_context(loop_levels, block_count, block_size):
         old_block_size = torch._C._jit_get_te_cuda_pointwise_block_size()
         torch._C._jit_set_te_cuda_pointwise_block_size(block_size)
 
-    yield
-
-    if loop_levels:
-        torch._C._jit_set_te_cuda_pointwise_loop_levels(old_loop_levels)
-    if block_count:
-        torch._C._jit_set_te_cuda_pointwise_block_count(old_block_count)
-    if block_size:
-        torch._C._jit_set_te_cuda_pointwise_block_size(old_block_size)
+    try:
+        yield
+    finally:
+        if loop_levels:
+            torch._C._jit_set_te_cuda_pointwise_loop_levels(old_loop_levels)
+        if block_count:
+            torch._C._jit_set_te_cuda_pointwise_block_count(old_block_count)
+        if block_size:
+            torch._C._jit_set_te_cuda_pointwise_block_size(old_block_size)
 
 # Auxiliary class to facilitate dynamic input shape
 class DynamicShape:
diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py
index 6b0af67853a..b7b3bba4542 100644
--- a/caffe2/python/net_printer.py
+++ b/caffe2/python/net_printer.py
@@ -65,8 +65,10 @@ class Analyzer(Visitor):
         if do_copy:
             ws = copy(ws)
         self.workspace_ctx.append(ws)
-        yield ws
-        del self.workspace_ctx[-1]
+        try:
+            yield ws
+        finally:
+            del self.workspace_ctx[-1]
 
     def define_blob(self, blob):
         self.workspace[blob] += 1
@@ -166,12 +168,14 @@ class Text:
             self.add('with %s:' % text)
             self._indent += 4
         self._lines_in_context.append(0)
-        yield
-        if text is not None:
-            if self._lines_in_context[-1] == 0:
-                self.add('pass')
-            self._indent -= 4
-        del self._lines_in_context[-1]
+        try:
+            yield
+        finally:
+            if text is not None:
+                if self._lines_in_context[-1] == 0:
+                    self.add('pass')
+                self._indent -= 4
+            del self._lines_in_context[-1]
 
     def add(self, text):
         self._lines_in_context[-1] += 1
diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py
index 3c48b834b53..047828226f7 100644
--- a/test/test_spectral_ops.py
+++ b/test/test_spectral_ops.py
@@ -811,8 +811,10 @@ class TestFFT(TestCase):
             plan_cache = torch.backends.cuda.cufft_plan_cache[device]
             original = plan_cache.max_size
             plan_cache.max_size = n
-            yield
-            plan_cache.max_size = original
+            try:
+                yield
+            finally:
+                plan_cache.max_size = original
 
         with plan_cache_max_size(devices[0], max(1, torch.backends.cuda.cufft_plan_cache.size - 10)):
             self._test_fft_ifft_rfft_irfft(devices[0], dtype)
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 4d305caa7f5..b9739e32383 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -1012,7 +1012,6 @@ def disable_cache_limit():
     try:
         yield
     finally:
-        pass
         config.cache_size_limit = prior
 
 
diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py
index cd81e59ab47..214600c39ec 100644
--- a/torch/_functorch/aot_autograd.py
+++ b/torch/_functorch/aot_autograd.py
@@ -1228,10 +1228,12 @@ def track_graph_compiling(aot_config, graph_name):
     global graph_being_compiled
     # TODO: Don't shove the aot_id in here; set it in the context
     graph_being_compiled = [f"{aot_config.aot_id}_{graph_name}"]
-    yield
-    global nth_graph
-    nth_graph += 1
-    graph_being_compiled = []
+    try:
+        yield
+    finally:
+        global nth_graph
+        nth_graph += 1
+        graph_being_compiled = []
 
 
 def make_boxed_func(f):
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index bd58397de1e..deabae5f970 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -560,8 +560,10 @@ class Kernel(CodeGen):
     def set_current_node(self, node):
         prior = self.current_node
         self.current_node = node
-        yield
-        self.current_node = prior
+        try:
+            yield
+        finally:
+            self.current_node = prior
 
     @contextlib.contextmanager
     def swap_buffers(self, lb, cb=None, sb=None):
@@ -575,11 +577,13 @@ class Kernel(CodeGen):
         self.compute = cb
         self.stores = sb
         self.cse = cse.clone()
-        yield
-        self.loads = loads
-        self.compute = compute
-        self.stores = stores
-        self.cse = cse
+        try:
+            yield
+        finally:
+            self.loads = loads
+            self.compute = compute
+            self.stores = stores
+            self.cse = cse
 
     def load(self, name: str, index: sympy.Expr):
         raise NotImplementedError()
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index eefdd4f7a46..abaf523f9cf 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -696,11 +696,13 @@ class TritonKernel(Kernel):
             # and write out a reduction loop
             self.codegen_body()
             self.inside_reduction = False
-            yield
-            if not self.persistent_reduction:
-                # flush out any code before opening the next loop
-                self.codegen_body()
-            self.inside_reduction = True
+            try:
+                yield
+                if not self.persistent_reduction:
+                    # flush out any code before opening the next loop
+                    self.codegen_body()
+            finally:
+                self.inside_reduction = True
 
         return ctx()
 
@@ -957,10 +959,12 @@
             mask = self.cse.generate(self.compute, f"{mask} & {prior}")
 
         self._load_mask = mask
-        with self.swap_buffers(self.compute, self.compute):
-            # TODO(jansel): do we need a reshape here?
-            yield mask
-        self._load_mask = prior
+        try:
+            with self.swap_buffers(self.compute, self.compute):
+                # TODO(jansel): do we need a reshape here?
+                yield mask
+        finally:
+            self._load_mask = prior
 
     def load(self, name: str, index: sympy.Expr):
         var = self.args.input(name)
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 61945858232..558a4240d8f 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -228,7 +228,7 @@ def end_graph():
     cur_file = inspect.stack()[1].filename
     print(f"SUMMARY ({cur_file})")
     print(
-        f"{overall_time:.2f}ms\t {overall_gb:.2f} GB\t {overall_gb/(overall_time/1e3):.2f}GB/s"
+        f"{overall_time:.2f}ms \t {overall_gb:.2f} GB\t {overall_gb/(overall_time/1e3):.2f}GB/s"
     )
     print()
 
@@ -250,10 +250,12 @@ class DebugAutotuner(CachingAutotuner):
 
             num_gb = get_num_bytes(*args) / 1e9
             gb_per_s = num_gb / (ms / 1e3)
-            collected_calls.append((kernel_name, ms, num_gb, gb_per_s))
+            collected_calls.append((ms, num_gb, gb_per_s, kernel_name)),
             import colorama
 
-            info_str = f"{kernel_name}\t {ms:.3f}ms\t{num_gb:.3f} GB \t {gb_per_s:.2f}GB/s"
+            info_str = (
+                f"{ms:.3f}ms \t{num_gb:.3f} GB \t {gb_per_s:.2f}GB/s \t {kernel_name}"
+            )
             if ms > 0.012 and gb_per_s < 650:
                 print(colorama.Fore.RED + info_str + colorama.Fore.RESET)
             else:
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index e4d28fbc99e..51692d5da89 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -444,8 +444,10 @@ class IndentedBuffer:
        @contextlib.contextmanager
        def ctx():
            self._indent += offset
-            yield
-            self._indent -= offset
+            try:
+                yield
+            finally:
+                self._indent -= offset
 
         return ctx()
 
diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py
index 868f3d9fc6b..31427f9671e 100644
--- a/torch/_jit_internal.py
+++ b/torch/_jit_internal.py
@@ -1243,8 +1243,10 @@ def _create_named_tuple(
 def _disable_emit_hooks():
     hooks = torch._C._jit_get_emit_hooks()
     torch._C._jit_set_emit_hooks(None, None)
-    yield
-    torch._C._jit_set_emit_hooks(hooks[0], hooks[1])
+    try:
+        yield
+    finally:
+        torch._C._jit_set_emit_hooks(hooks[0], hooks[1])
 
 
 def _disable_emit_hooks_decorator(_DecoratorContextManager) -> None:  # noqa: F811
diff --git a/torch/cuda/nvtx.py b/torch/cuda/nvtx.py
index 16fa078dff8..1ec9e261089 100644
--- a/torch/cuda/nvtx.py
+++ b/torch/cuda/nvtx.py
@@ -84,5 +84,7 @@ def range(msg, *args, **kwargs):
         msg (str): message to associate with the range
     """
     range_push(msg.format(*args, **kwargs))
-    yield
-    range_pop()
+    try:
+        yield
+    finally:
+        range_pop()
diff --git a/torch/distributed/elastic/multiprocessing/redirects.py b/torch/distributed/elastic/multiprocessing/redirects.py
index 08ac09e91ca..bad5ec3556a 100644
--- a/torch/distributed/elastic/multiprocessing/redirects.py
+++ b/torch/distributed/elastic/multiprocessing/redirects.py
@@ -93,8 +93,10 @@ def redirect(std: str, to_file: str):
 
     with os.fdopen(os.dup(std_fd)) as orig_std, open(to_file, mode="w+b") as dst:
         _redirect(dst)
-        yield
-        _redirect(orig_std)
+        try:
+            yield
+        finally:
+            _redirect(orig_std)
 
 
 redirect_stdout = partial(redirect, "stdout")
diff --git a/torch/profiler/itt.py b/torch/profiler/itt.py
index 22f4dcf828c..7f4de54597f 100644
--- a/torch/profiler/itt.py
+++ b/torch/profiler/itt.py
@@ -69,5 +69,7 @@ def range(msg, *args, **kwargs):
         msg (str): message to associate with the range
     """
     range_push(msg.format(*args, **kwargs))
-    yield
-    range_pop()
+    try:
+        yield
+    finally:
+        range_pop()
diff --git a/torch/serialization.py b/torch/serialization.py
index 83f6fa275bb..179f93a697a 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -56,8 +56,10 @@ class SourceChangeWarning(Warning):
 @contextmanager
 def mkdtemp():
     path = tempfile.mkdtemp()
-    yield path
-    shutil.rmtree(path)
+    try:
+        yield path
+    finally:
+        shutil.rmtree(path)
 
 
 _package_registry = []
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index f1dcc1e9a62..448a5b9f80b 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -1189,11 +1189,13 @@ def _dynamo_dist_per_rank_init(rank, world_size, init_pg=True):
         c10d.init_process_group("nccl", rank=rank, world_size=world_size)
     torch._dynamo.reset()
     torch._dynamo.utils.counters.clear()
-    yield
-    torch._dynamo.reset()
-    torch._dynamo.utils.counters.clear()
-    if init_pg:
-        c10d.destroy_process_group()
+    try:
+        yield
+    finally:
+        torch._dynamo.reset()
+        torch._dynamo.utils.counters.clear()
+        if init_pg:
+            c10d.destroy_process_group()
 
 
 class DynamoDistributedSingleProcTestCase(torch._dynamo.test_case.TestCase):
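
All of the hunks above apply one pattern: a generator-based `@contextlib.contextmanager` only runs the code after its `yield` when the `with` body completes normally. If the body raises, `contextlib` throws the exception into the generator at the `yield`, the statements after it never execute, and the temporary state (JIT flags, redirected file descriptors, cuFFT plan-cache sizes, codegen buffers) leaks past the `with` block. Moving the restore code into a `finally` clause makes it run on both the normal and the exceptional path. The one non-mechanical case is the first `triton.py` hunk, which keeps the conditional `codegen_body()` flush inside `try` (so it is skipped on error) and restores only `inside_reduction` in `finally`; the `autotune.py` hunks are unrelated tweaks to the debug bandwidth output.

A minimal self-contained sketch of the failure mode and the fix (the `flag` global and `set_flag` manager are invented for illustration, not part of the patch):

    import contextlib

    flag = False

    @contextlib.contextmanager
    def set_flag():
        global flag
        prior = flag
        flag = True  # temporary state change, as in the patched managers
        try:
            yield
        finally:
            flag = prior  # restored even when the with-body raises

    try:
        with set_flag():
            raise RuntimeError("boom")
    except RuntimeError:
        pass

    assert flag is False  # with a bare `yield` + restore, flag would leak True

With the pre-patch shape (a bare `yield` followed by the restore), the assertion fails because the RuntimeError skips the restore line; the `try`/`finally` form passes.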