mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary:
This diff adds dependency-aware concurrent/parallel execution of operators in stepnets. For CPU, we use multi-threaded execution. For CUDA, we use multiple streams and cuda events for parallelism and dependency tracking.
Much of the diff is about computing the dependency graph, which was quite tricky because we also need to avoid write races between multiple operators running in multiple timesteps in parallel. Also, recurrent blobs "change name" when passing over a timestep ("_prev"), so that needs to be handled as well.
This diff also restores the link-ops that I unlanded earlier.
The performance gain of this diff is very good for CPU (same perf as with static_dag, even better on forward-only). On CUDA, the gains are modest, at least with the sizes I was testing with.
Reviewed By: salexspb
Differential Revision: D5001637
fbshipit-source-id: 3d0a71593d73a9ff22f4c1a5c9abf2a4a0c633c8
325 lines
9.8 KiB
Python
325 lines
9.8 KiB
Python
## @package lstm_benchmark
|
|
# Module caffe2.python.lstm_benchmark
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from caffe2.proto import caffe2_pb2
|
|
from caffe2.python import workspace, core, utils, rnn_cell, model_helper
|
|
|
|
import argparse
|
|
import numpy as np
|
|
import time
|
|
|
|
import logging
|
|
|
|
# Configure the root logger with the default handler so that records
# emitted by this script are actually printed.
logging.basicConfig()

# Script-wide logger; DEBUG level so every progress message is shown.
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)
|
|
|
|
|
|
def generate_data(T, shape, num_labels, fixed_shape):
    '''
    Fill an input queue and a label queue with T randomly generated batches.

    Args:
        T: number of batches to generate; also used as the queue capacity.
        shape: list whose first entry is the sequence length and whose
            second entry is the batch size; used as the shape of the
            random float32 input tensor.
        num_labels: labels are drawn from [0, num_labels * batch_size).
        fixed_shape: when false, every batch after the first gets a random
            sequence length drawn with np.random.randint(1, shape[0]).

    Returns:
        (queue, label_queue, entry_counts), where entry_counts[i] is
        sequence length * batch size of the i-th generated batch.
    '''
    log.info("Generating T={} sequence batches".format(T))

    # One-off net that creates the two blob queues.
    init_net = core.Net('generate_input_init')
    queue = init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(init_net)

    # Net that pushes the scratch blobs into their queues; it is re-run
    # once per generated batch.
    enqueue_net = core.Net('generate_input')
    enqueue_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    enqueue_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])

    # Fixed seed so the generated data is the same on every run.
    np.random.seed(2603)

    report_every = max(10, T // 10)
    entry_counts = []
    for step in range(T):
        if step % report_every == 0:
            print("Generating data {}/{}".format(step, T))

        # The first batch always uses the full shape; later ones get a
        # randomized sequence length unless a fixed shape was requested.
        if step == 0 or fixed_shape:
            batch_shape = shape
        else:
            batch_shape = [np.random.randint(1, shape[0])] + shape[1:]
        seq_len = batch_shape[0]
        batch_size = batch_shape[1]

        X = np.random.rand(*batch_shape).astype(np.float32)
        label_range = num_labels * batch_size
        labels = (np.random.rand(seq_len) * label_range).astype(np.int32)

        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(enqueue_net.Proto())
        entry_counts.append(seq_len * batch_size)

    log.info("Finished data generation")

    return queue, label_queue, entry_counts
|
|
|
|
|
|
def create_model(args, queue, label_queue, input_shape):
    '''
    Build the benchmark model: dequeue input/labels, run an LSTM over the
    input, and attach a SoftmaxWithLoss (plus gradients unless
    args.forward_only is set).

    Args:
        args: parsed command-line arguments (see GetArgumentParser).
        queue: blob queue the input batches are dequeued from.
        label_queue: blob queue the labels are dequeued from.
        input_shape: shape used for the placeholder input fill in the
            "cudnn" implementation.

    Returns:
        (model, output): the ModelHelper and the LSTM output blob.

    Raises:
        AssertionError: for an unknown args.implementation, or when a
            static implementation is requested without args.fixed_shape.
    '''
    model = model_helper.ModelHelper(name="LSTM_bench")
    seq_lengths, _target = model.net.AddExternalInputs(
        'seq_lengths',
        'target',
    )

    input_blob = model.net.DequeueBlobs(queue, "input_data")
    labels = model.net.DequeueBlobs(label_queue, "label")

    implementation = args.implementation
    init_blobs = []
    if implementation in ["own", "static", "static_dag"]:
        # None is handed to rnn_cell.LSTM unless a static variant is
        # selected, in which case the unroll size is the sequence length.
        unroll_size = None
        if "static" in implementation:
            assert args.fixed_shape, \
                "Random input length is not static RNN compatible"
            unroll_size = args.seq_length
            print("Using static RNN of size {}".format(unroll_size))

        # One (hidden, cell) pair of initial-state inputs per layer.
        for layer in range(args.num_layers):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(layer),
                "cell_init_{}".format(layer)
            )
            init_blobs += [hidden_init, cell_init]

        layer_dims = [args.hidden_dim] * args.num_layers
        output, last_hidden, _, _last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=layer_dims,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
            return_last_layer_only=True,
            static_rnn_unroll_size=unroll_size,
        )

        if "dag" in implementation:
            print("Using DAG net type")
            net_proto = model.net.Proto()
            net_proto.type = 'dag'
            net_proto.num_workers = 4
    elif implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=args.num_layers,
        )
    else:
        assert False, "Unknown implementation"

    weights = model.net.UniformFill(labels, "weights")
    flat_output = model.Flatten(output)
    _softmax, loss = model.net.SoftmaxWithLoss(
        [flat_output, labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry states over: each initial-state blob gets a Copy op from
    # last_hidden and is fed zeros of shape [1, batch_size, width].
    for init_blob in init_blobs:
        model.net.Copy(last_hidden, init_blob)

        if implementation == "cudnn":
            state_width = args.hidden_dim * args.num_layers
        else:
            state_width = args.hidden_dim
        zero_state = np.zeros(
            [1, args.batch_size, state_width], dtype=np.float32
        )
        workspace.FeedBlob(init_blob, zero_state)
    return model, output
|
|
|
|
|
|
def _log_gpu_memory_stats(warn_on_deallocation=False):
    '''
    Log the 'max_total' GPU memory figure from
    utils.GetGPUMemoryUsageStats(), in MB.

    Args:
        warn_on_deallocation: when true, also emit warnings if the
            'max_total' and 'total' figures differ.
    '''
    log.info("Memory stats:")
    stats = utils.GetGPUMemoryUsageStats()
    log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
    if warn_on_deallocation and stats['max_total'] != stats['total']:
        log.warning(
            "Max usage differs from current total usage: {} > {}".
            format(stats['max_total'], stats['total'])
        )
        log.warning("This means that costly deallocations occured.")


def Caffe2LSTM(args):
    '''
    Generate the data, build the model and run the timed benchmark.

    The number of batches is data_size // batch_size // seq_length. One
    warm-up iteration is run first and is excluded from the timing.

    Args:
        args: parsed command-line arguments (see GetArgumentParser).

    Returns:
        Seconds elapsed since the timed benchmark started, measured at
        return time.
    '''
    T = args.data_size // args.batch_size
    num_iters = T // args.seq_length

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    # args.hidden_dim is passed as the num_labels argument.
    queue, label_queue, entry_counts = generate_data(
        num_iters,
        input_blob_shape,
        args.hidden_dim,
        args.fixed_shape,
    )

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    net_name = model.net.Proto().name

    # Run the Benchmark
    log.info("------ Warming up ------")
    workspace.RunNet(net_name)

    if args.gpu:
        _log_gpu_memory_stats()

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = start_time
    # Iteration 0 was consumed by the warm-up run, so start counting at 1.
    for iteration in range(1, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(net_name, iters_once)

        new_time = time.time()
        chunk_entries = np.sum(
            entry_counts[iteration:iteration + iters_once]
        )
        # "// 100 / 10" reports thousands of entries with one decimal.
        log.info(
            "Iter: {} / {}. Entries Per Second: {}k.".format(
                iteration,
                num_iters,
                chunk_entries / (new_time - last_time) // 100 / 10,
            )
        )
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if args.gpu:
        _log_gpu_memory_stats(warn_on_deallocation=True)

    return time.time() - start_time
|
|
|
|
|
|
@utils.debug
def Benchmark(args):
    '''
    Entry point for the benchmark: run Caffe2LSTM with the parsed
    arguments and hand back its result (the elapsed benchmark time).

    The function is wrapped by utils.debug, whose behavior is defined
    outside this file.
    '''
    elapsed = Caffe2LSTM(args)
    return elapsed
|
|
|
|
|
|
def GetArgumentParser():
    '''
    Build the command-line parser for the LSTM benchmark.

    Returns:
        An argparse.ArgumentParser exposing the model dimensions, data
        size, reporting interval, implementation choice and the boolean
        switches used by the benchmark.
    '''
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=800,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="The batch size."
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length"
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=1000000,
        help="Number of data points to generate"
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=20,
        help="Number of iteration to report progress"
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn', 'own', 'static' or 'static_dag'",
    )
    # Setting this flag turns OFF the per-batch sequence-length
    # randomization, so the help text describes it that way.
    parser.add_argument(
        "--fixed_shape",
        action="store_true",
        help=("Use a fixed shape for all input batches instead of "
              "randomizing the sequence length. "
              "Static RNN requires fixed shape"),
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only forward pass"
    )
    # The two literals are concatenated, so the first must end with a
    # space to keep the words apart in the rendered help.
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="Number of LSTM layers. All output dimensions are going to be "
             "of hidden_dim size",
    )
    parser.add_argument(
        "--rnn_executor",
        action="store_true",
        help="Whether to use RNN executor"
    )

    return parser
|
|
|
|
|
|
if __name__ == '__main__':
    # Options the parser does not recognize are forwarded to GlobalInit.
    args, extra_args = GetArgumentParser().parse_known_args()

    # The executor switch is passed to Caffe2 as 1 or 0.
    rnn_executor_opt = int(bool(args.rnn_executor))

    global_init_flags = [
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
        '--caffe2_gpu_memory_tracking=1',
    ]
    workspace.GlobalInit(global_init_flags + extra_args)

    # Run everything on device 0 of the selected device type.
    if args.gpu:
        device_type = caffe2_pb2.CUDA
    else:
        device_type = caffe2_pb2.CPU
    device = core.DeviceOption(device_type, 0)

    with core.DeviceScope(device):
        Benchmark(args)
|