Run inference in an Executor (#115286)

Experiment: run model predictions in the backend in a ThreadPoolExecutor so that each prediction does not block reading requests from the queue.

The baseline was reset in the PR above, which fixes a number of bugs in the metrics calculations, but I kept the metrics here anyway.
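A minimal sketch of the pattern (not the actual benchmark script; `handle_requests`, `model_predict`, and the queue arguments are hypothetical stand-ins):

```python
import asyncio
import queue
from concurrent.futures import ThreadPoolExecutor

import torch


def model_predict(model, data, request_time, response_queue):
    # Runs on the executor thread, so the forward pass and the
    # response-queue put no longer block the polling loop.
    with torch.no_grad():
        out = model(data)
    response_queue.put((out, request_time))


async def handle_requests(model, request_queue, response_queue):
    pool = ThreadPoolExecutor(max_workers=1)
    loop = asyncio.get_running_loop()
    while True:
        try:
            data, request_time = request_queue.get(timeout=10)
        except queue.Empty:
            break
        # Hand the prediction to the worker thread and go straight back
        # to polling the request queue for the next item.
        loop.run_in_executor(
            pool, model_predict, model, data, request_time, response_queue
        )
    # Let any in-flight prediction finish before returning.
    pool.shutdown(wait=True)
```

A single worker keeps predictions serialized while the loop keeps draining the request queue; the coroutine would be driven with `asyncio.run(handle_requests(...))`.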

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115286
Approved by: https://github.com/albanD
Mikayla Gawarecki 2023-12-20 06:33:43 -08:00 committed by PyTorch MergeBot
parent b72127cd4b
commit 31f21e033e
12 changed files with 33 additions and 9 deletions

@ -0,0 +1,5 @@
### [#115286](https://github.com/pytorch/pytorch/pull/115286)
* Prior to this PR, the backend worker was a separate process that read from the request queue, ran the model's forward pass, and put the output into the response queue. In this PR, we create a `ThreadPoolExecutor` with one worker and run the model forward pass and the response step asynchronously in the executor, so that they do not block polling the queue for more requests.
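For contrast, a simplified version of the pre-PR blocking loop (based on the lines removed in the diff below; `run_worker` and its arguments are illustrative, not the script's actual API):

```python
import queue

import torch


def run_worker(model, device, request_queue, response_queue):
    # Pre-PR shape of the backend loop: the forward pass runs inline,
    # so the worker cannot poll for the next request until the current
    # prediction has finished and its result has been enqueued.
    while True:
        try:
            data, request_time = request_queue.get(timeout=10)
        except queue.Empty:
            break
        with torch.no_grad():
            data = data.to(device, non_blocking=True)
            out = model(data)
        response_queue.put((out, request_time))
```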
##### Results
* Warmup latency improved (likely because the backend is no longer spawned as a new process), but all other metrics were worse.

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.895 +/- 1.863 | 0.221 +/- 0.005 | 579.469 +/- 13.452 | 22.797 +/- 0.948 |
| 5.810 +/- 0.469 | 0.258 +/- 0.005 | 496.122 +/- 10.009 | 26.776 +/- 1.156 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 19.006 +/- 2.697 | 0.221 +/- 0.003 | 579.121 +/- 7.831 | 19.304 +/- 2.208 |
| 15.255 +/- 0.806 | 0.255 +/- 0.007 | 502.735 +/- 13.911 | 21.654 +/- 1.404 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 7.317 +/- 0.602 | 0.268 +/- 0.057 | 3.864 +/- 0.739 | 2.560 +/- 0.245 |
| 5.616 +/- 0.429 | 0.184 +/- 0.044 | 5.894 +/- 2.218 | 3.683 +/- 0.558 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 18.353 +/- 1.888 | 0.172 +/- 0.081 | 7.080 +/- 3.199 | 1.414 +/- 0.154 |
| 15.015 +/- 1.063 | 0.152 +/- 0.028 | 5.978 +/- 2.694 | 2.388 +/- 1.674 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.555 +/- 1.259 | 0.447 +/- 0.007 | 573.260 +/- 8.471 | 31.546 +/- 0.530 |
| 6.831 +/- 1.526 | 0.527 +/- 0.009 | 486.141 +/- 8.158 | 30.400 +/- 1.956 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 17.986 +/- 1.737 | 0.454 +/- 0.008 | 564.066 +/- 9.441 | 26.642 +/- 1.730 |
| 16.171 +/- 1.212 | 0.515 +/- 0.010 | 497.133 +/- 9.349 | 27.004 +/- 0.990 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.215 +/- 1.670 | 0.096 +/- 0.020 | 343.878 +/- 63.660 | 11.233 +/- 1.837 |
| 6.570 +/- 1.624 | 0.100 +/- 0.014 | 325.184 +/- 43.357 | 14.427 +/- 1.447 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 16.586 +/- 2.838 | 2.092 +/- 6.316 | 314.174 +/- 123.214 | 44.014 +/- 113.006 |
| 15.280 +/- 1.284 | 0.104 +/- 0.023 | 319.020 +/- 63.883 | 9.260 +/- 1.839 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.704 +/- 1.577 | 0.152 +/- 0.014 | 425.039 +/- 38.873 | 16.389 +/- 0.836 |
| 6.361 +/- 1.443 | 0.170 +/- 0.013 | 378.304 +/- 29.090 | 19.316 +/- 0.899 |

@ -3,3 +3,4 @@
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 17.767 +/- 1.145 | 0.142 +/- 0.014 | 455.207 +/- 44.402 | 13.422 +/- 0.912 |
| 15.184 +/- 1.179 | 0.184 +/- 0.013 | 349.463 +/- 23.731 | 12.900 +/- 0.811 |

@ -1,7 +1,10 @@
import argparse
+import asyncio
import os.path
import subprocess
import time
+from concurrent.futures import ThreadPoolExecutor
+from queue import Empty

import numpy as np
@ -59,7 +62,7 @@ class FrontendWorker(mp.Process):
    def _run_gpu_utilization(self):
        """
-        This function will poll nvidi-smi for GPU utilization every 100ms to
+        This function will poll nvidia-smi for GPU utilization every 100ms to
        record the average GPU utilization.
        """
@ -123,7 +126,7 @@ class FrontendWorker(mp.Process):
        gpu_utilization_thread.join()


-class BackendWorker(mp.Process):
+class BackendWorker:
    """
    This worker will take tensors from the request queue, do some computation,
    and then return the result back in the response queue.
@ -174,7 +177,15 @@ class BackendWorker(mp.Process):
            self.metrics_dict["m_compile_time"] = end_compile_time - start_compile_time
        return m

-    def run(self):
+    def model_predict(self, model, data, request_time):
+        with torch.no_grad():
+            data = data.to(self.device, non_blocking=True)
+            out = model(data)
+            self.response_queue.put((out, request_time))
+
+    async def run(self):
+        pool = ThreadPoolExecutor(max_workers=1)
        while True:
            try:
                data, request_time = self.request_queue.get(timeout=10)
@ -185,10 +196,9 @@ class BackendWorker(mp.Process):
                model = self._setup()
                self._setup_complete = True
-            with torch.no_grad():
-                data = data.to(self.device, non_blocking=True)
-                out = model(data)
-                self.response_queue.put((out, request_time))
+            asyncio.get_running_loop().run_in_executor(
+                pool, self.model_predict, model, data, request_time
+            )


if __name__ == "__main__":
@ -237,10 +247,9 @@ if __name__ == "__main__":
    )
    frontend.start()
-    backend.start()
+    asyncio.run(backend.run())

    frontend.join()
-    backend.join()

    metrics_dict = {k: [v] for k, v in metrics_dict._getvalue().items()}
    output = pd.DataFrame.from_dict(metrics_dict, orient="columns")