2017-03-29 13:44:02 +00:00
|
|
|
## @package lstm_benchmark
|
|
|
|
|
# Module caffe2.python.lstm_benchmark
|
2017-03-01 07:14:11 +00:00
|
|
|
from __future__ import absolute_import
|
|
|
|
|
from __future__ import division
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
from caffe2.proto import caffe2_pb2
|
2017-04-18 07:33:06 +00:00
|
|
|
from caffe2.python import cnn, workspace, core, utils, rnn_cell
|
2017-03-01 07:14:11 +00:00
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import numpy as np
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
logging.basicConfig()
|
|
|
|
|
log = logging.getLogger("lstm_bench")
|
|
|
|
|
log.setLevel(logging.DEBUG)
|
|
|
|
|
|
|
|
|
|
|
2017-03-30 22:46:44 +00:00
|
|
|
def generate_data(T, shape, num_labels):
    '''
    Fill a queue with input data.

    Builds two caffe2 BlobsQueues ("inputqueue" and "labelqueue", each with
    capacity T), then enqueues T batches of random data into them by
    repeatedly feeding scratch blobs and running a small enqueue net.

    T: number of sequence batches to generate (also the queue capacity).
    shape: [max_seq_length, batch_size, input_dim]; the sequence length of
        each generated batch is randomized in [1, max_seq_length).
    num_labels: label-space size per batch element; labels are drawn
        uniformly from [0, num_labels * batch_size).

    Returns (queue, label_queue) — the two filled BlobsQueue blobs.
    '''
    log.info("Generating T={} sequence batches".format(T))

    # Init net: creates the two queues once, outside the per-batch loop.
    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )

    workspace.RunNetOnce(generate_input_init_net)

    # Enqueue net: copies whatever is currently in the "scratch" /
    # "label_scr" blobs into the queues; rerun once per generated batch.
    generate_input_net = core.Net('generate_input')

    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
    # Fixed seed so benchmark runs are reproducible.
    np.random.seed(2603)

    for t in range(T):
        if (t % 50 == 0):
            print("Generating data {}/{}".format(t, T))
        # Randomize the seqlength
        random_shape = [np.random.randint(1, shape[0])] + shape[1:]
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        # NOTE(review): labels has shape (seq_length,), one label per time
        # step rather than per (time step, batch element) — confirm this
        # matches what SoftmaxWithLoss expects downstream.
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())

    log.info("Finished data generation")

    return queue, label_queue
|
2017-03-01 07:14:11 +00:00
|
|
|
|
|
|
|
|
|
2017-04-05 21:05:12 +00:00
|
|
|
def create_model(args, queue, label_queue, input_shape):
    '''
    Build the LSTM benchmark model.

    Constructs a CNNModelHelper that dequeues input/label batches from the
    given queues, runs either the caffe2 ("own") or cuDNN LSTM
    implementation, attaches a SoftmaxWithLoss head plus gradient
    operators, and carries the recurrent state across net runs.

    args: parsed CLI namespace (implementation, input_dim, hidden_dim,
        batch_size, memory_optimization).
    queue, label_queue: BlobsQueues produced by generate_data().
    input_shape: [seq_length, batch_size, input_dim]; used only by the
        cudnn branch to give RecurrentInitOp a placeholder input.

    Returns (model, output) where output is the LSTM output blob.
    '''
    model = cnn.CNNModelHelper(name="LSTM_bench")
    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    if args.implementation == "own":
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
        )
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=1,
        )
    else:
        assert False, "Unknown implementation"

    # Per-example weights for the loss, drawn uniformly with labels' shape.
    weights = model.UniformFill(labels, "weights")
    softmax, loss = model.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    model.AddGradientOperators([loss])

    # carry states over
    model.net.Copy(last_hidden, hidden_init)
    # NOTE(review): this copies last_hidden into cell_init as well — it
    # looks like it should be last_state, but last_state is only unpacked
    # in the "own" branch, so a straight rename would break the cudnn
    # path. Confirm intended behavior before changing.
    model.net.Copy(last_hidden, cell_init)

    # Zero-initialize the recurrent state blobs so the first net run has
    # valid inputs; subsequent runs reuse the Copy-ed states above.
    workspace.FeedBlob(hidden_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    workspace.FeedBlob(cell_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    return model, output
|
2017-03-01 07:14:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def Caffe2LSTM(args):
    """Run the LSTM benchmark end to end.

    Generates input data, builds the model, then times repeated net runs
    and logs throughput as entries (time steps * batch elements) per
    second. When --gpu is set, also reports GPU memory usage stats.
    """
    # Total number of time steps available given the requested data size.
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    # NOTE(review): args.hidden_dim is passed as num_labels here — the
    # label space is tied to the hidden dimension; confirm intended.
    queue, label_queue = generate_data(T // args.seq_length,
                                       input_blob_shape,
                                       args.hidden_dim)

    # All sequences are treated as full length during benchmarking.
    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    last_time = time.time()
    start_time = last_time
    num_iters = T // args.seq_length
    # Entries processed per iteration: one per (time step, batch element).
    entries_per_iter = args.seq_length * args.batch_size

    # Run the Benchmark
    log.info("------ Starting benchmark ------")
    for iteration in range(0, num_iters, args.iters_to_report):
        # Run a chunk of iterations, clamped at the end of the schedule.
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k.". format(
            iteration,
            num_iters,
            entries_per_iter * iters_once / (new_time - last_time) // 1000,
        ))
        last_time = new_time

    log.info("Done. Total EPS: {}k".format(
        entries_per_iter * num_iters / (time.time() - start_time) // 1000,
    ))

    if (args.gpu):
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if (stats['max_total'] != stats['total']):
            # max_total > total means memory was freed during the run,
            # which usually indicates costly dealloc/realloc churn.
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occured.")
|
|
|
|
|
|
2017-03-01 07:14:11 +00:00
|
|
|
|
|
|
|
|
@utils.debug
def Benchmark(args):
    """Benchmark entry point: run the Caffe2 LSTM benchmark with the
    parsed command-line arguments (wrapped for debug diagnostics)."""
    return Caffe2LSTM(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def GetArgumentParser():
    """Build and return the command-line parser for the LSTM benchmark.

    Returns:
        argparse.ArgumentParser configured with all benchmark options
        (dimensions, batch/sequence sizes, data volume, reporting cadence,
        GPU switch, LSTM implementation, and memory optimization).
    """
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    # All integer-valued options, declared as one table: (flag, default,
    # help text). Declaration order matches --help display order.
    for flag, default, help_text in (
        ("--hidden_dim", 40, "Hidden dimension"),
        ("--input_dim", 40, "Input dimension"),
        ("--batch_size", 256, "The batch size."),
        ("--seq_length", 20, "Max sequence length"),
        ("--data_size", 10000000, "Number of data points to generate"),
        ("--iters_to_report", 100, "Number of iteration to report progress"),
    ):
        parser.add_argument(flag, type=int, default=default, help=help_text)

    # Boolean switch: run the whole benchmark on GPU.
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )

    # Which LSTM implementation to benchmark.
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn' or 'own'",
    )

    # Boolean switch: trade compute for memory in the backward pass.
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )

    return parser
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    args = GetArgumentParser().parse_args()

    # Initialize the caffe2 runtime; enable GPU memory tracking so the
    # --gpu path can report usage stats after the run.
    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_gpu_memory_tracking=1'])

    # Select device 0 on either CUDA or CPU depending on the --gpu flag.
    device = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)

    # Run the whole benchmark under the chosen device scope.
    with core.DeviceScope(device):
        Benchmark(args)
|