pytorch/caffe2/python/lstm_benchmark.py
Aapo Kyrola 453c60ce28 Threaded dependency-aware RNNExecutor (frontier/diagonal execution).
Summary:
This diff adds dependency-aware concurrent/parallel execution of operators in stepnets. For CPU, we use multi-threaded execution. For CUDA, we use multiple streams and cuda events for parallelism and dependency tracking.

Much of the diff is about computing dependency graph, which was quite tricky because we need to also avoid write-races of multiple operators running in multiple timesteps in parallel. Also, recurrent blobs "change name" when passing over timestep ("_prev"), so that needs to be handled as well.

This diff also restores the link-ops that I unlanded earlier.

The performance gain of this diff is very good for CPU (same perf as with static_dag, even better on forward-only). On CUDA, the gains are modest, at least with the sizes i was testing with.

Reviewed By: salexspb

Differential Revision: D5001637

fbshipit-source-id: 3d0a71593d73a9ff22f4c1a5c9abf2a4a0c633c8
2017-08-15 23:55:15 -07:00

325 lines
9.8 KiB
Python

## @package lstm_benchmark
# Module caffe2.python.lstm_benchmark
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.proto import caffe2_pb2
from caffe2.python import workspace, core, utils, rnn_cell, model_helper
import argparse
import numpy as np
import time
import logging
logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)
def generate_data(T, shape, num_labels, fixed_shape):
'''
Fill a queue with input data
'''
log.info("Generating T={} sequence batches".format(T))
generate_input_init_net = core.Net('generate_input_init')
queue = generate_input_init_net.CreateBlobsQueue(
[], "inputqueue", num_blobs=1, capacity=T,
)
label_queue = generate_input_init_net.CreateBlobsQueue(
[], "labelqueue", num_blobs=1, capacity=T,
)
workspace.RunNetOnce(generate_input_init_net)
generate_input_net = core.Net('generate_input')
generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
np.random.seed(2603)
entry_counts = []
for t in range(T):
if (t % (max(10, T // 10)) == 0):
print("Generating data {}/{}".format(t, T))
# Randomize the seqlength
random_shape = (
[np.random.randint(1, shape[0])] + shape[1:]
if t > 0 and not fixed_shape else shape
)
X = np.random.rand(*random_shape).astype(np.float32)
batch_size = random_shape[1]
L = num_labels * batch_size
labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
workspace.FeedBlob("scratch", X)
workspace.FeedBlob("label_scr", labels)
workspace.RunNetOnce(generate_input_net.Proto())
entry_counts.append(random_shape[0] * random_shape[1])
log.info("Finished data generation")
return queue, label_queue, entry_counts
def create_model(args, queue, label_queue, input_shape):
model = model_helper.ModelHelper(name="LSTM_bench")
seq_lengths, target = \
model.net.AddExternalInputs(
'seq_lengths',
'target',
)
input_blob = model.net.DequeueBlobs(queue, "input_data")
labels = model.net.DequeueBlobs(label_queue, "label")
init_blobs = []
if args.implementation in ["own", "static", "static_dag"]:
T = None
if "static" in args.implementation:
assert args.fixed_shape, \
"Random input length is not static RNN compatible"
T = args.seq_length
print("Using static RNN of size {}".format(T))
for i in range(args.num_layers):
hidden_init, cell_init = model.net.AddExternalInputs(
"hidden_init_{}".format(i),
"cell_init_{}".format(i)
)
init_blobs.extend([hidden_init, cell_init])
output, last_hidden, _, last_state = rnn_cell.LSTM(
model=model,
input_blob=input_blob,
seq_lengths=seq_lengths,
initial_states=init_blobs,
dim_in=args.input_dim,
dim_out=[args.hidden_dim] * args.num_layers,
scope="lstm1",
memory_optimization=args.memory_optimization,
forward_only=args.forward_only,
drop_states=True,
return_last_layer_only=True,
static_rnn_unroll_size=T,
)
if "dag" in args.implementation:
print("Using DAG net type")
model.net.Proto().type = 'dag'
model.net.Proto().num_workers = 4
elif args.implementation == "cudnn":
# We need to feed a placeholder input so that RecurrentInitOp
# can infer the dimensions.
init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
output, last_hidden, _ = rnn_cell.cudnn_LSTM(
model=model,
input_blob=input_blob,
initial_states=init_blobs,
dim_in=args.input_dim,
dim_out=args.hidden_dim,
scope="cudnnlstm",
num_layers=args.num_layers,
)
else:
assert False, "Unknown implementation"
weights = model.net.UniformFill(labels, "weights")
softmax, loss = model.net.SoftmaxWithLoss(
[model.Flatten(output), labels, weights],
['softmax', 'loss'],
)
if not args.forward_only:
model.AddGradientOperators([loss])
# carry states over
for init_blob in init_blobs:
model.net.Copy(last_hidden, init_blob)
sz = args.hidden_dim
if args.implementation == "cudnn":
sz *= args.num_layers
workspace.FeedBlob(init_blob, np.zeros(
[1, args.batch_size, sz], dtype=np.float32
))
return model, output
def Caffe2LSTM(args):
T = args.data_size // args.batch_size
input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
queue, label_queue, entry_counts = generate_data(T // args.seq_length,
input_blob_shape,
args.hidden_dim,
args.fixed_shape)
workspace.FeedBlob(
"seq_lengths",
np.array([args.seq_length] * args.batch_size, dtype=np.int32)
)
model, output = create_model(args, queue, label_queue, input_blob_shape)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
start_time = time.time()
num_iters = T // args.seq_length
total_iters = 0
# Run the Benchmark
log.info("------ Warming up ------")
workspace.RunNet(model.net.Proto().name)
if (args.gpu):
log.info("Memory stats:")
stats = utils.GetGPUMemoryUsageStats()
log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
log.info("------ Starting benchmark ------")
start_time = time.time()
last_time = time.time()
for iteration in range(1, num_iters, args.iters_to_report):
iters_once = min(args.iters_to_report, num_iters - iteration)
total_iters += iters_once
workspace.RunNet(model.net.Proto().name, iters_once)
new_time = time.time()
log.info(
"Iter: {} / {}. Entries Per Second: {}k.".format(
iteration,
num_iters,
np.sum(entry_counts[iteration:iteration + iters_once]) /
(new_time - last_time) // 100 / 10,
)
)
last_time = new_time
log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
" (with RNN executor)" if args.rnn_executor else "",
))
if (args.gpu):
log.info("Memory stats:")
stats = utils.GetGPUMemoryUsageStats()
log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
if (stats['max_total'] != stats['total']):
log.warning(
"Max usage differs from current total usage: {} > {}".
format(stats['max_total'], stats['total'])
)
log.warning("This means that costly deallocations occured.")
return time.time() - start_time
@utils.debug
def Benchmark(args):
return Caffe2LSTM(args)
def GetArgumentParser():
parser = argparse.ArgumentParser(description="LSTM benchmark.")
parser.add_argument(
"--hidden_dim",
type=int,
default=800,
help="Hidden dimension",
)
parser.add_argument(
"--input_dim",
type=int,
default=40,
help="Input dimension",
)
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="The batch size."
)
parser.add_argument(
"--seq_length",
type=int,
default=20,
help="Max sequence length"
)
parser.add_argument(
"--data_size",
type=int,
default=1000000,
help="Number of data points to generate"
)
parser.add_argument(
"--iters_to_report",
type=int,
default=20,
help="Number of iteration to report progress"
)
parser.add_argument(
"--gpu",
action="store_true",
help="Run all on GPU",
)
parser.add_argument(
"--implementation",
type=str,
default="own",
help="'cudnn', 'own', 'static' or 'static_dag'",
)
parser.add_argument(
"--fixed_shape",
action="store_true",
help=("Whether to randomize shape of input batches. "
"Static RNN requires fixed shape"),
)
parser.add_argument(
"--memory_optimization",
action="store_true",
help="Whether to use memory optimized LSTM or not",
)
parser.add_argument(
"--forward_only",
action="store_true",
help="Whether to run only forward pass"
)
parser.add_argument(
"--num_layers",
type=int,
default=1,
help="Number of LSTM layers. All output dimensions are going to be"
"of hidden_dim size",
)
parser.add_argument(
"--rnn_executor",
action="store_true",
help="Whether to use RNN executor"
)
return parser
if __name__ == '__main__':
args, extra_args = GetArgumentParser().parse_known_args()
rnn_executor_opt = 1 if args.rnn_executor else 0
workspace.GlobalInit([
'caffe2',
'--caffe2_log_level=0',
'--caffe2_print_blob_sizes_at_exit=0',
'--caffe2_rnn_executor={}'.format(rnn_executor_opt),
'--caffe2_gpu_memory_tracking=1'] + extra_args)
device = core.DeviceOption(
caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)
with core.DeviceScope(device):
Benchmark(args)