mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Run inference in an Executor (#115286)
Experiment: run model predictions in the backend in a ThreadPoolExecutor so that each model prediction does not block reading requests from the queue. The baseline was reset in the PR above, which fixes bugs in many of the metrics calculations, but I kept the metrics here anyway. Pull Request resolved: https://github.com/pytorch/pytorch/pull/115286 Approved by: https://github.com/albanD
This commit is contained in:
parent
b72127cd4b
commit
31f21e033e
12 changed files with 33 additions and 9 deletions
5
benchmarks/inference/CHANGELOG.md
Normal file
5
benchmarks/inference/CHANGELOG.md
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
### [#115286](https://github.com/pytorch/pytorch/pull/115286)
|
||||
* Prior to this PR, the backend worker was a process that read from the request queue, ran the model's forward and put the output in the response queue. In this PR, create a `ThreadPoolExecutor` with 1 worker and asynchronously run the model forward and response step in the executor so that it doesn't block polling the queue for more requests.
|
||||
|
||||
##### Results
|
||||
* Warmup latency improved (likely due to the backend no longer being a new process) but all other metrics were worse.
|
||||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 8.895 +/- 1.863 | 0.221 +/- 0.005 | 579.469 +/- 13.452 | 22.797 +/- 0.948 |
|
||||
| 5.810 +/- 0.469 | 0.258 +/- 0.005 | 496.122 +/- 10.009 | 26.776 +/- 1.156 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 19.006 +/- 2.697 | 0.221 +/- 0.003 | 579.121 +/- 7.831 | 19.304 +/- 2.208 |
|
||||
| 15.255 +/- 0.806 | 0.255 +/- 0.007 | 502.735 +/- 13.911 | 21.654 +/- 1.404 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 7.317 +/- 0.602 | 0.268 +/- 0.057 | 3.864 +/- 0.739 | 2.560 +/- 0.245 |
|
||||
| 5.616 +/- 0.429 | 0.184 +/- 0.044 | 5.894 +/- 2.218 | 3.683 +/- 0.558 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 18.353 +/- 1.888 | 0.172 +/- 0.081 | 7.080 +/- 3.199 | 1.414 +/- 0.154 |
|
||||
| 15.015 +/- 1.063 | 0.152 +/- 0.028 | 5.978 +/- 2.694 | 2.388 +/- 1.674 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 8.555 +/- 1.259 | 0.447 +/- 0.007 | 573.260 +/- 8.471 | 31.546 +/- 0.530 |
|
||||
| 6.831 +/- 1.526 | 0.527 +/- 0.009 | 486.141 +/- 8.158 | 30.400 +/- 1.956 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 17.986 +/- 1.737 | 0.454 +/- 0.008 | 564.066 +/- 9.441 | 26.642 +/- 1.730 |
|
||||
| 16.171 +/- 1.212 | 0.515 +/- 0.010 | 497.133 +/- 9.349 | 27.004 +/- 0.990 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 8.215 +/- 1.670 | 0.096 +/- 0.020 | 343.878 +/- 63.660 | 11.233 +/- 1.837 |
|
||||
| 6.570 +/- 1.624 | 0.100 +/- 0.014 | 325.184 +/- 43.357 | 14.427 +/- 1.447 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 16.586 +/- 2.838 | 2.092 +/- 6.316 | 314.174 +/- 123.214 | 44.014 +/- 113.006 |
|
||||
| 15.280 +/- 1.284 | 0.104 +/- 0.023 | 319.020 +/- 63.883 | 9.260 +/- 1.839 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 8.704 +/- 1.577 | 0.152 +/- 0.014 | 425.039 +/- 38.873 | 16.389 +/- 0.836 |
|
||||
| 6.361 +/- 1.443 | 0.170 +/- 0.013 | 378.304 +/- 29.090 | 19.316 +/- 0.899 |
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
|
||||
| ------------------ | ------------------- | ------------------------ | ------------------- |
|
||||
| 17.767 +/- 1.145 | 0.142 +/- 0.014 | 455.207 +/- 44.402 | 13.422 +/- 0.912 |
|
||||
| 15.184 +/- 1.179 | 0.184 +/- 0.013 | 349.463 +/- 23.731 | 12.900 +/- 0.811 |
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
import argparse
|
||||
|
||||
import asyncio
|
||||
import os.path
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from queue import Empty
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -59,7 +62,7 @@ class FrontendWorker(mp.Process):
|
|||
|
||||
def _run_gpu_utilization(self):
|
||||
"""
|
||||
This function will poll nvidi-smi for GPU utilization every 100ms to
|
||||
This function will poll nvidia-smi for GPU utilization every 100ms to
|
||||
record the average GPU utilization.
|
||||
"""
|
||||
|
||||
|
|
@ -123,7 +126,7 @@ class FrontendWorker(mp.Process):
|
|||
gpu_utilization_thread.join()
|
||||
|
||||
|
||||
class BackendWorker(mp.Process):
|
||||
class BackendWorker:
|
||||
"""
|
||||
This worker will take tensors from the request queue, do some computation,
|
||||
and then return the result back in the response queue.
|
||||
|
|
@ -174,7 +177,15 @@ class BackendWorker(mp.Process):
|
|||
self.metrics_dict["m_compile_time"] = end_compile_time - start_compile_time
|
||||
return m
|
||||
|
||||
def run(self):
|
||||
def model_predict(self, model, data, request_time):
    """Run a single forward pass and enqueue the result.

    Moves *data* onto ``self.device``, evaluates *model* on it with
    autograd disabled, and puts ``(output, request_time)`` on
    ``self.response_queue`` so the frontend can compute latency.
    Intended to run inside the backend's ThreadPoolExecutor so the
    request-polling loop is never blocked by inference.
    """
    # Inference only — skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        device_batch = data.to(self.device, non_blocking=True)
        prediction = model(device_batch)
        self.response_queue.put((prediction, request_time))
|
||||
|
||||
async def run(self):
|
||||
pool = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
while True:
|
||||
try:
|
||||
data, request_time = self.request_queue.get(timeout=10)
|
||||
|
|
@ -185,10 +196,9 @@ class BackendWorker(mp.Process):
|
|||
model = self._setup()
|
||||
self._setup_complete = True
|
||||
|
||||
with torch.no_grad():
|
||||
data = data.to(self.device, non_blocking=True)
|
||||
out = model(data)
|
||||
self.response_queue.put((out, request_time))
|
||||
asyncio.get_running_loop().run_in_executor(
|
||||
pool, self.model_predict, model, data, request_time
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -237,10 +247,9 @@ if __name__ == "__main__":
|
|||
)
|
||||
|
||||
frontend.start()
|
||||
backend.start()
|
||||
asyncio.run(backend.run())
|
||||
|
||||
frontend.join()
|
||||
backend.join()
|
||||
|
||||
metrics_dict = {k: [v] for k, v in metrics_dict._getvalue().items()}
|
||||
output = pd.DataFrame.from_dict(metrics_dict, orient="columns")
|
||||
|
|
|
|||
Loading…
Reference in a new issue