diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 0eb9243630c..0a78fd8f8d1 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-39e9d3084686b291546cbfdbfc3e34f53659783d
+2ec22641e390cda25ec7c61fcbce07507727d584
diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py
index 0d4459356e3..7c517a9ef12 100644
--- a/test/dynamo/test_higher_order_ops.py
+++ b/test/dynamo/test_higher_order_ops.py
@@ -360,8 +360,8 @@ class GraphModule(torch.nn.Module):
         actual_graph = self._test_wrap_simple(
             f,
             default_args_generator((x, y)),
-            ifdynstaticdefault(3, 4),
-            expected_opcount=3,
+            ifdynstaticdefault(2, 3),
+            expected_opcount=2,
             return_graph=True,
         )
         if torch._dynamo.config.assume_static_by_default:
@@ -369,20 +369,18 @@ class GraphModule(torch.nn.Module):
                 actual_graph,
                 """\
 class GraphModule(torch.nn.Module):
-    def forward(self, L_x_: "f32[3, 1]", L_y_: "f64[]"):
+    def forward(self, L_x_: "f32[3, 1]"):
         l_x_ = L_x_
-        l_y_ = L_y_
-        item: "Sym(zf0)" = l_y_.item(); l_y_ = None
         wrap_body_0 = self.wrap_body_0
-        wrap = torch.ops.higher_order.wrap(wrap_body_0, l_x_, item); wrap_body_0 = l_x_ = item = None
+        wrap = torch.ops.higher_order.wrap(wrap_body_0, l_x_); wrap_body_0 = l_x_ = None
         getitem: "f32[3]" = wrap[0]; wrap = None
         return (getitem,)

     class wrap_body_0(torch.nn.Module):
-        def forward(self, l_x_: "f32[3, 1]", item: "Sym(zf0)"):
+        def forward(self, l_x_: "f32[3, 1]"):
             view: "f32[3]" = l_x_.view(3); l_x_ = None
-            add: "f32[3]" = view + item; view = item = None
+            add: "f32[3]" = view + 0.5; view = None
             return (add,)
 """,
             )
@@ -391,20 +389,18 @@ class GraphModule(torch.nn.Module):
                 actual_graph,
                 """\
 class GraphModule(torch.nn.Module):
-    def forward(self, s0: "Sym(s0)", L_x_: "f32[s0, 1]", L_y_: "f64[]"):
+    def forward(self, s0: "Sym(s0)", L_x_: "f32[s0, 1]"):
         l_x_ = L_x_
-        l_y_ = L_y_
-        item: "Sym(zf1)" = l_y_.item(); l_y_ = None
         wrap_body_0 = self.wrap_body_0
-        wrap = torch.ops.higher_order.wrap(wrap_body_0, s0, l_x_, item); wrap_body_0 = s0 = l_x_ = item = None
+        wrap = torch.ops.higher_order.wrap(wrap_body_0, s0, l_x_); wrap_body_0 = s0 = l_x_ = None
         getitem: "f32[s0]" = wrap[0]; wrap = None
         return (getitem,)

     class wrap_body_0(torch.nn.Module):
-        def forward(self, s0: "Sym(s0)", l_x_: "f32[s0, 1]", item: "Sym(zf1)"):
+        def forward(self, s0: "Sym(s0)", l_x_: "f32[s0, 1]"):
             view: "f32[s0]" = l_x_.view(s0); l_x_ = s0 = None
-            add: "f32[s0]" = view + item; view = item = None
+            add: "f32[s0]" = view + 0.5; view = None
             return (add,)
 """,
             )
@@ -2409,7 +2405,7 @@ class GraphModule(torch.nn.Module):
         x = torch.zeros([])
         # Numbers don't get lifted, so args is still 2.
-        self._test_wrap_simple(f, default_args_generator((x,)), 3)
+        self._test_wrap_simple(f, default_args_generator((x,)), 2)

     def test_capture_global_num_adds_guard(self):
         @torch.compile(backend="eager", fullgraph=True)
@@ -2432,7 +2428,7 @@ class GraphModule(torch.nn.Module):
         x = torch.zeros([])
         y = 3.14
         # Numbers don't get lifted, so args is still 2.
-        self._test_wrap_simple(f, default_args_generator((x, y)), 3, expected_opcount=3)
+        self._test_wrap_simple(f, default_args_generator((x, y)), 2)

     def test_side_effect_in_body(self):
         counters.clear()
diff --git a/test/dynamo/test_logging.py b/test/dynamo/test_logging.py
index 2c4afd05972..35c1f916d2b 100644
--- a/test/dynamo/test_logging.py
+++ b/test/dynamo/test_logging.py
@@ -628,11 +628,11 @@ TRACE FX call mul from test_logging.py:N in fn (LoggingTests.test_trace_call_pre
         def inner(x, ys, zs):
             for y, z in zip(ys, zs):
-                x += y * (3.0 if z else 3.2)
+                x += y * z
             return x

         ys = [1.0, 2.0]
-        zs = [True]
+        zs = [3.0]
         x = torch.tensor([1.0])
         fn_opt = torch._dynamo.optimize("eager")(fn)
@@ -641,9 +641,8 @@
         record_str = "\n".join(r.getMessage() for r in records)

-        # TODO: this is a very sensitive test
         self.assertIn(
-            f"___check_obj_id(L['zs'][0], {id(True)})",
+            """L['zs'][0] == 3.0""",
             record_str,
         )
         self.assertIn(
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index b08415ccbfe..4b45ee28aa1 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1396,7 +1396,7 @@ utils_device.CURRENT_DEVICE == None""".split(
         cfg2.val = 2.0
         v = opt_fn(v, cfg2)  # 7
         self.assertEqual(v[0], 7)
-        self.assertEqual(cnts.op_count, 6)
+        self.assertEqual(cnts.op_count, 8)

     def test_config_getattr_default(self):
         class Cfg:
@@ -1491,7 +1491,7 @@ utils_device.CURRENT_DEVICE == None""".split(
         self.assertEqual(opt_fn_ret(1.5)[0], -459)
         self.assertEqual(out[0], 2100)
         self.assertEqual(cnts.frame_count, 2)
-        self.assertEqual(cnts.op_count, 9)
+        self.assertEqual(cnts.op_count, 7)

     def test_tensor_dict1(self):
         def fn(inputs):
@@ -3717,7 +3717,7 @@ utils_device.CURRENT_DEVICE == None""".split(
         self.assertAlmostEqual(cell1 + 1, result1)
         self.assertTrue(torch.allclose(cell2 + 3, result2))
         self.assertEqual(cnts.frame_count, 1)
-        self.assertEqual(cnts.op_count, 4)
+        self.assertEqual(cnts.op_count, 1)

     def test_closure_out_of_scope_cell_with_mutation(self):
         cell1 = torch.rand(1).item()
@@ -3745,12 +3745,8 @@ utils_device.CURRENT_DEVICE == None""".split(
             result1, result2, _ = opt_fn()
             self.assertAlmostEqual(orig1 + 1 * i, result1)
             self.assertTrue(torch.allclose(orig2 + 10 * i, result2))
-            if i == 1:
-                self.assertEqual(cnts.frame_count, 1)
-                self.assertEqual(cnts.op_count, 6)
-            else:
-                self.assertEqual(cnts.frame_count, 0)
-                self.assertEqual(cnts.op_count, 0)
+            self.assertEqual(cnts.frame_count, 1)
+            self.assertEqual(cnts.op_count, 3)
             cnts.clear()

     def test_closure_with_mutation_and_graph_break(self):
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 141ad7be761..64a9671d9d0 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1242,13 +1242,13 @@ class ReproTests(torch._dynamo.test_case.TestCase):
         with torch.no_grad():
             cnt = self._reformer(nopython=True)
         self.assertEqual(cnt.frame_count, 1)
-        self.assertEqual(cnt.op_count, 13)
+        self.assertEqual(cnt.op_count, 11)

     def test_reformer_train(self):
         with torch.enable_grad():
             cnt = self._reformer(nopython=False)
         expected_op_count = (
-            """13""" if torch._dynamo.config.inline_inbuilt_nn_modules else """5"""
+            """11""" if torch._dynamo.config.inline_inbuilt_nn_modules else """5"""
         )
         self.assertExpectedInline(cnt.frame_count, """1""")
@@ -1725,7 +1725,7 @@ class ReproTests(torch._dynamo.test_case.TestCase):
         opt_model(inp)
         opt_model(inp)
         self.assertEqual(cnt.frame_count, 1)
-        self.assertEqual(18, cnt.op_count)
+        self.assertEqual(12, cnt.op_count)

     def test_exec_import(self):
         def fn1():
diff --git a/test/dynamo/test_structured_trace.py b/test/dynamo/test_structured_trace.py
index f0788f8599f..e3da411034a 100644
--- a/test/dynamo/test_structured_trace.py
+++ b/test/dynamo/test_structured_trace.py
@@ -674,13 +674,7 @@ class StructuredTraceTest(TestCase):
 {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
 {"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
 {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"describe_storage": {"id": 1, "describer_id": "ID", "size": 8}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"describe_tensor": {"id": 1, "ndim": 0, "dtype": "torch.float64", "device": "device(type='cpu')", "size": [], "is_leaf": true, "stride": [], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"describe_source": {"describer_id": "ID", "id": 1, "source": "___as_tensor(L['ys'][0])"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"describe_storage": {"id": 2, "describer_id": "ID", "size": 8}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"describe_tensor": {"id": 2, "ndim": 0, "dtype": "torch.float64", "device": "device(type='cpu')", "size": [], "is_leaf": true, "stride": [], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"describe_source": {"describer_id": "ID", "id": 2, "source": "___as_tensor(L['zs'][0])"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
-{"dynamo_output_graph": {"sizes": {"l_x_": [1], "l_ys_0_": [], "l_zs_0_": [], "x": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
+{"dynamo_output_graph": {"sizes": {"l_x_": [1], "x": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
 {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
 {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
 {"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
@@ -688,13 +682,7 @@ class StructuredTraceTest(TestCase):
 {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
 {"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
 {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"describe_storage": {"id": 1, "describer_id": "ID", "size": 8}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"describe_tensor": {"id": 1, "ndim": 0, "dtype": "torch.float64", "device": "device(type='cpu')", "size": [], "is_leaf": true, "stride": [], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"describe_source": {"describer_id": "ID", "id": 1, "source": "___as_tensor(L['ys'][0])"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"describe_storage": {"id": 2, "describer_id": "ID", "size": 8}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"describe_tensor": {"id": 2, "ndim": 0, "dtype": "torch.float64", "device": "device(type='cpu')", "size": [], "is_leaf": true, "stride": [], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"describe_source": {"describer_id": "ID", "id": 2, "source": "___as_tensor(L['zs'][0])"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
-{"dynamo_output_graph": {"sizes": {"l_x_": [1], "l_ys_0_": [], "l_zs_0_": [], "x": [1]}}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
+{"dynamo_output_graph": {"sizes": {"l_x_": [1], "x": [1]}}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
 {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
 {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
 """,  # noqa: B950
diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py
index 578d276ec2c..e4e95bb9c69 100644
--- a/test/dynamo/test_utils.py
+++ b/test/dynamo/test_utils.py
@@ -270,7 +270,7 @@ class TestDynamoTimed(TestCase):
          'runtime_cudagraphify_time_us': None,
          'runtime_triton_autotune_time_us': None,
          'shape_env_guard_count': 0,
-         'specialize_float': False,
+         'specialize_float': True,
          'start_time': 0.0001,
          'start_time_us': 100,
          'structured_logging_overhead_s': 0.0,
diff --git a/test/inductor/test_compiled_optimizers.py b/test/inductor/test_compiled_optimizers.py
index 729934b718a..010be06a13f 100644
--- a/test/inductor/test_compiled_optimizers.py
+++ b/test/inductor/test_compiled_optimizers.py
@@ -114,115 +114,67 @@ class KernelCounts(NamedTuple):
 # This maps the test name to the
 # expected kernel count
 KERNEL_COUNT_OVERRIDES = {
-    "test_adadelta_cpu": 6,
-    "test_adadelta_foreach_rho_weight_decay_cpu": 12,
-    "test_adadelta_foreach_weight_decay_cpu": 12,
-    "test_adadelta_foreach_weight_decay_maximize_cpu": 12,
-    "test_adadelta_maximize_cpu": 6,
-    "test_adadelta_rho_weight_decay_cpu": 6,
-    "test_adadelta_tensor_lr_capturable_cuda": 6,
-    "test_adadelta_tensor_lr_capturable_xpu": 6,
-    "test_adadelta_weight_decay_cpu": 6,
-    "test_adadelta_weight_decay_maximize_cpu": 6,
-    "test_adagrad_cpu": 6,
-    "test_adagrad_cuda": 6,
-    "test_adagrad_initial_accumulator_value_weight_decay_cpu": 6,
-    "test_adagrad_initial_accumulator_value_weight_decay_cuda": 6,
-    "test_adagrad_initial_accumulator_value_weight_decay_foreach_xpu": 2,
-    "test_adagrad_lr_decay_weight_decay_cpu": 6,
-    "test_adagrad_lr_decay_weight_decay_cuda": 6,
-    "test_adagrad_lr_decay_weight_decay_foreach_xpu": 2,
-    "test_adagrad_tensor_lr_cpu": 6,
-    "test_adagrad_tensor_lr_cuda": 6,
-    "test_adagrad_tensor_lr_xpu": 6,
-    "test_adagrad_weight_decay_cpu": 6,
-    "test_adagrad_weight_decay_cuda": 6,
-    "test_adagrad_weight_decay_foreach_xpu": 2,
-    "test_adagrad_weight_decay_maximize_cpu": 6,
-    "test_adagrad_weight_decay_maximize_cuda": 6,
-    "test_adagrad_weight_decay_maximize_foreach_xpu": 2,
-    "test_adam_amsgrad_capturable_cuda": 6,
-    "test_adam_amsgrad_capturable_xpu": 6,
-    "test_adam_cpu": 6,
+    "test_rmsprop_foreach_weight_decay_cpu": 12,
+    "test_nadam_foreach_weight_decay_momentum_decay_cpu": 20,
+    "test_adamw_amsgrad_capturable_foreach_cuda": 3,
+    "test_adamw_amsgrad_capturable_foreach_xpu": 3,
+    "test_adamw_amsgrad_capturable_cuda": 6,
+    "test_adamw_amsgrad_capturable_xpu": 6,
+    "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": 6,
+    "test_adamw_tensor_lr_tensor_betas_capturable_cuda": 6,
+    "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": 6,
+    "test_adamw_tensor_lr_amsgrad_capturable_cuda": 6,
+    "test_adamw_tensor_lr_amsgrad_capturable_xpu": 6,
     "test_adam_tensor_lr_amsgrad_capturable_cuda": 6,
     "test_adam_tensor_lr_amsgrad_capturable_xpu": 6,
     "test_adam_tensor_lr_tensor_betas_amsgrad_capturable_cuda": 6,
     "test_adam_tensor_lr_tensor_betas_capturable_cuda": 6,
-    "test_adam_weight_decay_cpu": 6,
-    "test_adam_weight_decay_maximize_cpu": 6,
-    "test_adamax_cpu": 6,
-    "test_adamax_maximize_cpu": 6,
-    "test_adamax_tensor_lr_weight_decay_capturable_cuda": 6,
-    "test_adamax_tensor_lr_weight_decay_capturable_xpu": 6,
-    "test_adamax_weight_decay_cpu": 6,
-    "test_adamax_weight_decay_maximize_cpu": 6,
-    "test_adamw_amsgrad_capturable_cuda": 6,
-    "test_adamw_amsgrad_capturable_foreach_cuda": 3,
-    "test_adamw_amsgrad_capturable_foreach_xpu": 3,
-    "test_adamw_amsgrad_capturable_xpu": 6,
-    "test_adamw_cpu": 6,
-    "test_adamw_tensor_lr_amsgrad_capturable_cuda": 6,
-    "test_adamw_tensor_lr_amsgrad_capturable_xpu": 6,
-    "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": 6,
-    "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": 6,
-    "test_adamw_tensor_lr_tensor_betas_capturable_cuda": 6,
-    "test_adamw_weight_decay_cpu": 6,
-    "test_adamw_weight_decay_maximize_cpu": 6,
-    "test_asgd_cpu": 3,
-    "test_asgd_lambd_cpu": 3,
-    "test_asgd_maximize_cpu": 3,
-    "test_asgd_recompile_single": 16,
-    "test_asgd_t0_cpu": 3,
-    "test_asgd_tensor_lr_weight_decay_maximize_capturable_cuda": 5,
-    "test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": 8,
-    "test_asgd_weight_decay_cpu": 3,
-    "test_asgd_weight_decay_maximize_cpu": 3,
-    "test_nadam_cpu": 3,
-    "test_nadam_foreach_weight_decay_momentum_decay_cpu": 20,
-    "test_nadam_momentum_decay_cpu": 3,
-    "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": 6,
-    "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": 9,
-    "test_nadam_weight_decay_cpu": 5,
-    "test_nadam_weight_decay_maximize_cpu": 5,
-    "test_nadam_weight_decay_momentum_decay_cpu": 5,
-    "test_nadam_weight_decay_momentum_decay_decoupled_weight_decay_cpu": 5,
-    "test_radam_cpu": 7,
-    "test_radam_eps_cpu": 7,
-    "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": 6,
-    "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": 6,
-    "test_radam_weight_decay_cpu": 7,
-    "test_radam_weight_decay_decoupled_weight_decay_cpu": 7,
-    "test_radam_weight_decay_maximize_cpu": 7,
-    "test_rmsprop_cpu": 6,
-    "test_rmsprop_foreach_weight_decay_cpu": 12,
-    "test_rmsprop_maximize_cpu": 6,
-    "test_rmsprop_maximize_weight_decay_cpu": 6,
+    "test_adam_amsgrad_capturable_cuda": 6,
+    "test_adam_amsgrad_capturable_xpu": 6,
+    "test_adadelta_tensor_lr_capturable_cuda": 6,
+    "test_adadelta_tensor_lr_capturable_xpu": 6,
     "test_rmsprop_tensor_lr_capturable_cuda": 6,
     "test_rmsprop_tensor_lr_capturable_xpu": 6,
-    "test_rmsprop_weight_decay_centered_cpu": 6,
-    "test_rmsprop_weight_decay_cpu": 6,
-    "test_sgd_cpu": 4,
-    "test_sgd_cuda": 4,
-    "test_sgd_foreach_momentum_nesterov_weight_decay_cpu": 16,
+    "test_adadelta_foreach_weight_decay_maximize_cpu": 12,
+    "test_adadelta_foreach_rho_weight_decay_cpu": 12,
+    "test_adadelta_foreach_weight_decay_cpu": 12,
     "test_sgd_foreach_momentum_weight_decay_cpu": 16,
+    "test_sgd_foreach_momentum_nesterov_weight_decay_cpu": 16,
"test_sgd_momentum_dampening_foreach_cuda": 5, "test_sgd_momentum_dampening_foreach_xpu": 5, "test_sgd_momentum_foreach_cuda": 5, "test_sgd_momentum_foreach_xpu": 5, - "test_sgd_momentum_nesterov_weight_decay_foreach_cuda": 2, - "test_sgd_momentum_nesterov_weight_decay_foreach_xpu": 2, + "test_sgd_weight_decay_maximize_cuda": 4, + "test_sgd_weight_decay_maximize_xpu": 4, + "test_sgd_weight_decay_maximize_cpu": 4, + "test_sgd_weight_decay_cpu": 4, + "test_sgd_weight_decay_cuda": 4, + "test_sgd_weight_decay_xpu": 4, "test_sgd_momentum_weight_decay_foreach_cuda": 2, "test_sgd_momentum_weight_decay_foreach_xpu": 2, + "test_sgd_momentum_nesterov_weight_decay_foreach_cuda": 2, + "test_sgd_momentum_nesterov_weight_decay_foreach_xpu": 2, + "test_sgd_cuda": 4, + "test_sgd_cpu": 4, + "test_sgd_xpu": 4, + "test_adagrad_initial_accumulator_value_weight_decay_foreach_xpu": 2, + "test_adagrad_lr_decay_weight_decay_foreach_xpu": 2, + "test_adagrad_weight_decay_foreach_xpu": 2, + "test_adagrad_weight_decay_maximize_foreach_xpu": 2, + "test_adagrad_tensor_lr_cpu": 6, + "test_adagrad_tensor_lr_cuda": 6, + "test_adagrad_tensor_lr_xpu": 6, + "test_adamax_tensor_lr_weight_decay_capturable_cuda": 6, + "test_adamax_tensor_lr_weight_decay_capturable_xpu": 6, + "test_asgd_tensor_lr_weight_decay_maximize_capturable_cuda": 5, + "test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": 8, + "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": 6, + "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": 9, + "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": 6, + "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": 6, "test_sgd_tensor_lr_cpu": 2, "test_sgd_tensor_lr_cuda": 2, "test_sgd_tensor_lr_xpu": 2, - "test_sgd_weight_decay_cpu": 4, - "test_sgd_weight_decay_cuda": 4, - "test_sgd_weight_decay_maximize_cpu": 4, - "test_sgd_weight_decay_maximize_cuda": 4, - "test_sgd_weight_decay_maximize_xpu": 4, - "test_sgd_weight_decay_xpu": 4, - "test_sgd_xpu": 4, } # also tracks currently supported optimizers @@ -679,7 +631,7 @@ class CompiledOptimizerTests(TestCase): test_adagrad_recompile = make_recompile_test(Adagrad, lr=0.01) test_asgd_recompile_default = make_recompile_test(ASGD, lr=0.01) test_asgd_recompile_single = make_recompile_test( - ASGD, kernel_count=3, lr=0.01, foreach=False + ASGD, kernel_count=8, lr=0.01, foreach=False ) test_asgd_recompile_foreach = make_recompile_test(ASGD, lr=0.01, foreach=True) test_sgd_recompile_single = make_recompile_test( diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py index ec75cc4bca4..5e23f1ba448 100644 --- a/test/inductor/test_cpu_select_algorithm.py +++ b/test/inductor/test_cpu_select_algorithm.py @@ -296,7 +296,6 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): ) and epilogue != "mul" and epilogue != "div" - and epilogue != "leaky_relu" or ( dtype in (torch.float16, torch.bfloat16) and epilogue == "add" diff --git a/test/inductor/test_efficient_conv_bn_eval.py b/test/inductor/test_efficient_conv_bn_eval.py index 7161e5ee370..9307345e6d5 100644 --- a/test/inductor/test_efficient_conv_bn_eval.py +++ b/test/inductor/test_efficient_conv_bn_eval.py @@ -13,7 +13,6 @@ from torch import nn pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch._dynamo import config as dynamo_config from torch._dynamo.utils import counters from torch._inductor 
import config as inductor_config from torch._inductor.test_case import TestCase @@ -95,9 +94,6 @@ class MultiUserConvOp(nn.Module): class EfficientConvBNEvalTemplate(TestCase): - # With specialize_float = False, momentum becomes an input and so - # the number of bytes accessed wobbles - @dynamo_config.patch(specialize_float=True) @inductor_config.patch({"efficient_conv_bn_eval_fx_passes": True}) def test_basic(self): def test_conv_bn_eval( diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py index 89658ee2d06..07ea40afc5b 100644 --- a/test/inductor/test_mkldnn_pattern_matcher.py +++ b/test/inductor/test_mkldnn_pattern_matcher.py @@ -114,9 +114,6 @@ def cal_conv_generated_kernel_number(mod, input, dtype): return input_kernel + output_kernel -# The pattern match for this is kind of broken. I'll cc the -# person who wrote this test/match on the diff to see if they can help me fix it. -@torch._dynamo.config.patch(specialize_float=True) @config.patch({"freezing": True}) class TestPatternMatcherBase(TestCase): def _check_unary_is_decomposed(self, unary_fn): diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index 59b2a5219ac..7d9ec01e7a3 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -5,7 +5,6 @@ from unittest.mock import patch import functorch import torch -import torch._dynamo.config as dynamo_config import torch._inductor.config as config import torch.autograd from torch._inductor import metrics @@ -481,9 +480,6 @@ class FusionTests(TestCase): inp = (T(10, 10), T(10, 10), T(10, 10)) self.assertExpectedInline(count_numel(f, *inp), """500""") - # With specialize_float = False, epsilon becomes an input and so - # the number of bytes accessed wobbles - @dynamo_config.patch(specialize_float=True) def test_reduction_pointwise_multi_level_reduction(self): hidden_size = 4096 layer_norm = torch.nn.LayerNorm(hidden_size).cuda().float() diff --git a/test/test_transformers.py b/test/test_transformers.py index c0c886ad989..e08643d1bb3 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -3939,7 +3939,7 @@ class TestAttnBias(NNTestCase): SDPBackend.MATH, SDPBackend.CUDNN_ATTENTION]): self.run_test(device, make_q_tensor, make_kv_tensor, attn_bias, forw_tol, grad_tol, backend=cnts) - self.assertEqual(cnts.frame_count, 2, "Compiled graph should have 2 frames!") + self.assertEqual(cnts.frame_count, 1, "Compiled graph should have 1 frame!") @skipIfRocm @parametrize("shape", [(16, 16, 128, 128, 16), (16, 16, 128, 256, 32), (16, 16, 256, 128, 32), (1, 1, 23, 56, 15)]) diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index 139c3ec0b6d..5c36654ae5d 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -65,7 +65,7 @@ specialize_int = False # Whether or not to specialize on float inputs. Dynamo will always promote # float inputs into Tensor inputs, but at the moment, backends inconsistently # support codegen on float (this is to be fixed). -specialize_float = True if is_fbcode() else False +specialize_float = True # legacy config, does nothing now! dynamic_shapes = True
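
Note (editor's illustration, not part of the patch): this diff flips `torch._dynamo.config.specialize_float` back to `True`, so Dynamo again burns scalar Python floats into the graph as constants and guards on their exact value (the `L['zs'][0] == 3.0` guard asserted in test_logging.py above), instead of tensorifying them into 0-dim `f64[]` inputs (the removed `___as_tensor(L['ys'][0])` sources and `Sym(zf0)` graph arguments). A minimal sketch of the observable difference, assuming a torch build that exposes this config; the function `f` and its `scale` parameter are hypothetical:

    import torch
    import torch._dynamo.config as dynamo_config  # same alias the removed test imports used

    dynamo_config.specialize_float = True  # the default this diff restores

    @torch.compile(backend="eager")
    def f(x, scale):
        # `scale` is a plain Python float, not a tensor
        return x * scale

    x = torch.ones(4)
    f(x, 0.5)   # compiles with 0.5 baked into the graph, guarded on the value
    f(x, 0.25)  # guard miss on the float value -> a fresh compile
    # With specialize_float = False, `scale` would instead become a symbolic
    # 0-dim tensor input and the second call would reuse the first graph.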