diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 69c75a96f5d..c31b1ea164f 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -8,10 +8,13 @@ import re
 import tempfile
 import textwrap
 import unittest
+from unittest.mock import patch
 from dataclasses import dataclass, field
 from typing import List, Optional
 
 import expecttest
+import subprocess
+import sys
 import torch
 import torch.nn as nn
 import torch.optim
@@ -1325,6 +1328,78 @@ class TestProfiler(TestCase):
         self.assertTrue(len(e.input_shapes) > 0)
         self.assertTrue(len(e.input_shapes[0]) > 0)
 
+    @patch.dict(os.environ, {"KINETO_USE_DAEMON": "1"})
+    def test_kineto_profiler_with_environment_variable(self):
+        script = """
+import torch
+import torch.nn as nn
+from torch.profiler import supported_activities, profile
+from torch.autograd.profiler import KinetoStepTracker
+
+class SimpleNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = nn.Linear(10, 5)
+        self.fc2 = nn.Linear(5, 2)
+
+    def forward(self, x):
+        return self.fc2(self.fc1(x))
+
+
+def payload(use_cuda=False):
+    x = torch.randn(10, 10)
+    if use_cuda:
+        x = x.cuda()
+    y = torch.randn(10, 10)
+    if use_cuda:
+        y = y.cuda()
+    z = torch.mm(x, y)
+    z = z + y
+    if use_cuda:
+        z = z.cpu()
+
+niters = 8
+use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
+net = SimpleNet()
+opt = torch.optim.SGD(net.parameters(), lr=0.01)
+opt.zero_grad()
+inputs = torch.rand(10)
+
+with profile(activities=supported_activities()):
+    payload(use_cuda=use_cuda)
+
+initial_step = KinetoStepTracker.current_step()
+
+def run_batch():
+    out = net(inputs)
+    loss = torch.nn.functional.cross_entropy(out, torch.rand(2))
+    loss.backward()
+    opt.step()
+
+for _ in range(niters):
+    run_batch()
+
+with profile(
+    activities=supported_activities(),
+    schedule=torch.profiler.schedule(
+        wait=1,
+        warmup=1,
+        active=2),
+) as p:
+    for _ in range(niters):
+        run_batch()
+        p.step()
+assert KinetoStepTracker.current_step() == initial_step + 2 * niters
+"""
+        try:
+            subprocess.check_output(
+                [sys.executable, '-W', 'all', '-c', script],
+                cwd=os.path.dirname(os.path.realpath(__file__))
+            )
+        except subprocess.CalledProcessError as e:
+            if e.returncode != 0:
+                self.assertTrue(False, "Kineto is not working properly with the Dynolog environment variable")
+
 
 def find_node_with_name(nodes, name):
     for node in _utils.traverse_dfs(nodes):
diff --git a/torch/optim/optimizer.pyi b/torch/optim/optimizer.pyi
index c55503968f6..47fdcce7e3d 100644
--- a/torch/optim/optimizer.pyi
+++ b/torch/optim/optimizer.pyi
@@ -4,6 +4,9 @@ from torch.utils.hooks import RemovableHandle
 
 _params_t = Union[Iterable[Tensor], Iterable[Dict[str, Any]]]
 
+def register_optimizer_step_pre_hook(hook: Callable[..., None]) -> RemovableHandle: ...
+
+def register_optimizer_step_post_hook(hook: Callable[..., None]) -> RemovableHandle: ...
 
 class Optimizer:
     defaults: Dict[str, Any]
diff --git a/torch/profiler/__init__.py b/torch/profiler/__init__.py
index a0185a9799b..90bce0924df 100644
--- a/torch/profiler/__init__.py
+++ b/torch/profiler/__init__.py
@@ -7,9 +7,12 @@ examine their input shapes and stack traces, study device kernel activity and vi
 
 An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
 """
+import os
+
 from torch._C._autograd import _supported_activities, DeviceType, kineto_available
 from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
-from torch.autograd.profiler import record_function
+from torch.autograd.profiler import record_function, KinetoStepTracker
+from torch.optim.optimizer import register_optimizer_step_post_hook
 
 from .profiler import (
     _KinetoProfile,
@@ -35,3 +38,9 @@ __all__ = [
 ]
 
 from . import itt
+
+def _optimizer_post_hook(optimizer, args, kwargs):
+    KinetoStepTracker.increment_step("Optimizer")
+
+if os.environ.get("KINETO_USE_DAEMON", None):
+    _ = register_optimizer_step_post_hook(_optimizer_post_hook)