diff --git a/test/test_profiler.py b/test/test_profiler.py
index e093a50178a..2820608a9e3 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -1131,7 +1131,7 @@ class TestProfiler(TestCase):
 
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
     def test_utils_compute_queue_depth(self):
-        x = torch.ones((4096, 4096), device="cuda")
+        x = torch.ones((8096, 8096), device="cuda")
         with profile() as prof:
             # First half we want it to be compute bound
             for _ in range(5):
@@ -1143,8 +1143,23 @@ class TestProfiler(TestCase):
                 y[0] += 1
                 time.sleep(0.1)
         basic_evaluation = _utils.BasicEvaluation(prof.profiler)
-        for entry in basic_evaluation.compute_queue_depth():
-            self.assertTrue(entry.queue_depth >= 0)
+        # We can assume these golden values because mm is compute intensive,
+        # so the kernels will queue up.
+        # The later tensor indexing is overhead bound, and the sleep
+        # makes sure each kernel finishes before the next dispatch.
+        golden_queue_depth_list = [1, 2, 3, 4, 5, 1, 1, 1]
+        for entry, golden in zip(basic_evaluation.compute_queue_depth(),
+                                 golden_queue_depth_list):
+            self.assertTrue(entry.queue_depth == golden)
+
+    def test_utils_compute_queue_depth_when_no_cuda_events(self):
+        # A trace holding only CPU events should give an empty queue depth list.
+        cpu_tensor = torch.ones((1024, 1024))
+        with profile() as cpu_prof:
+            for _ in range(5):
+                cpu_tensor = cpu_tensor @ cpu_tensor
+        evaluation = _utils.BasicEvaluation(cpu_prof.profiler)
+        self.assertFalse(evaluation.compute_queue_depth())
 
 
     def test_extra_fields(self):