2017-03-29 13:44:02 +00:00
|
|
|
## @package recurrent
|
|
|
|
|
# Module caffe2.python.recurrent
|
2016-12-21 04:43:37 +00:00
|
|
|
from __future__ import absolute_import
|
|
|
|
|
from __future__ import division
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
2017-04-18 07:33:06 +00:00
|
|
|
from caffe2.python import core
|
2017-03-14 22:43:42 +00:00
|
|
|
from caffe2.python.scope import CurrentNameScope
|
|
|
|
|
2017-04-24 21:04:51 +00:00
|
|
|
|
2016-12-21 04:43:37 +00:00
|
|
|
def recurrent_net(
        net, cell_net, inputs, initial_cell_inputs,
        links, timestep=None, scope=None, outputs_with_grads=(0,),
        recompute_blobs_on_backward=None, forward_only=False,
):
    '''
    Adds a RecurrentNetwork operator to `net` that runs `cell_net` once per
    timestep, optionally building the corresponding backward step net.

    net: the main net operator should be added to

    cell_net: cell_net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one
    input is supported. It has to be in a format T x N x (D1...Dk) where T is
    length of the sequence. N is a batch size and (D1...Dk) are the rest of
    dimensions

    initial_cell_inputs: inputs of the cell_net for the 0 timestamp.
        Format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary from cell_net input names in moment t+1 and
    output names of moment t. Currently we assume that each output becomes
    an input for the next timestep.

    timestep: name of the timestep blob to be used. If not provided "timestep"
    is used.

    scope: Internal blobs are going to be scoped in a format
    <scope_name>/<blob_name>
    If not provided we generate a scope name automatically

    outputs_with_grads : position indices of output blobs which will receive
    error gradient (from outside recurrent network) during backpropagation

    recompute_blobs_on_backward: specify a list of blobs that will be
                 recomputed for backward pass, and thus need not to be
                 stored for each forward timestep.

    forward_only: if True, only forward steps are executed
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    # Validate scoping: every external input of the step net must live under
    # the current name scope, otherwise the links/aliases built below would
    # not resolve inside the step workspaces.
    for einp in cell_net.Proto().external_input:
        assert einp.startswith(CurrentNameScope()), \
            '''
            Cell net external inputs are not properly scoped, use
            AddScopedExternalInputs() when creating them
            '''

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references
    # it is those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        core.BlobReference(b) for b in cell_net.Proto().external_input
        if b not in known_inputs]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    if not forward_only:
        backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
            cell_net.Proto().op, inner_outputs_map)
        backward_mapping = {str(k): v for k, v in backward_mapping.items()}

        backward_cell_net = core.Net("RecurrentBackwardStep")
        del backward_cell_net.Proto().op[:]

        if recompute_blobs_on_backward is not None:
            # Insert operators to re-compute the specified blobs.
            # They are added in the same order as for the forward pass, thus
            # the order is correct.
            recompute_blobs_on_backward = {str(b) for b in
                                           recompute_blobs_on_backward}

            for op in cell_net.Proto().op:
                if not recompute_blobs_on_backward.isdisjoint(set(op.output)):
                    backward_cell_net.Proto().op.extend([op])
                    # This fires if other outputs than the declared
                    # are computed by the ops that are recomputed
                    assert set(op.output).issubset(recompute_blobs_on_backward)

        backward_cell_net.Proto().op.extend(backward_ops)
        # compute blobs used but not defined in the backward pass
        backward_ssa, backward_blob_versions = core.get_ssa(
            backward_cell_net.Proto())
        undefined = core.get_undefined_blobs(backward_ssa)

        # also add to the output list the intermediate outputs of fwd_step that
        # are used by backward.
        ssa, blob_versions = core.get_ssa(cell_net.Proto())
        scratches = [
            blob for (blob, ver) in blob_versions.items()
            if ver > 0 and
            blob in undefined and
            blob not in cell_net.Proto().external_output]
        backward_cell_net.Proto().external_input.extend(scratches)
        backward_cell_net.Proto().type = 'simple'
    else:
        backward_cell_net = None

    all_inputs = [i[1] for i in inputs] + [
        x[1] for x in initial_cell_inputs] + references
    all_outputs = []

    cell_net.Proto().type = 'rnn'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format blob_name, recurrent_states, offset.
    # In the moment t we know that corresponding data block is at
    # t + offset position in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to external world
    # Format (internal_blob, external_blob, offset)
    # Negative offset stands for going from the end,
    # positive - from the beginning
    aliases = []

    # States held inputs to the cell net
    recurrent_states = []

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))

        aliases.append((state, cell_output + "_all", 1))
        aliases.append((state, cell_output + "_last", -1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

        if backward_cell_net is not None:
            backward_links.append((cell_output + "_grad", states_grad, 1))
            backward_cell_net.Proto().external_input.append(
                str(cell_output) + "_grad")

            recurrent_input_grad = cell_input + "_grad"
            if not backward_blob_versions.get(recurrent_input_grad, 0):
                # If nobody writes to this recurrent input gradient, we need
                # to make sure it gets to the states grad blob after all.
                # We do this by using backward_links which triggers an alias
                # This logic is being used for example in a SumOp case
                backward_links.append(
                    (backward_mapping[cell_input], states_grad, 0))
            else:
                backward_links.append((cell_input + "_grad", states_grad, 0))

    for input_t, input_blob in inputs:
        forward_links.append((str(input_t), str(input_blob), 0))

    if backward_cell_net is not None:
        for input_t, input_blob in inputs:
            backward_links.append((
                backward_mapping[str(input_t)], str(input_blob) + "_grad", 0
            ))
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_input)
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Splitting to separate lists so we can pass them to c++
    # where we assemble them back
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)

    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    # Make sure that recurrent gradients accumulate with internal gradients
    # (if a blob in the backward_cell_net receives gradient from both an
    # external connection as well as from within the backward_cell_net,
    # those gradients need to be added together, rather than one overwriting
    # the other)
    if backward_cell_net is not None:
        proto = backward_cell_net.Proto()
        operators = []
        while len(proto.op) > 0:
            op = proto.op[-1]
            proto.op.remove(op)
            operators.append(op)
        for op in operators[::-1]:
            proto.op.extend([op])
            for j, output_blob in enumerate(op.output):
                if output_blob in proto.external_input:
                    # blob cannot be internal by virtue of in-place operation.
                    if output_blob in op.input:
                        continue
                    accum_blob = '{}_accum'.format(output_blob)
                    proto.op[-1].output[j] = accum_blob
                    backward_cell_net.Sum(
                        [output_blob, accum_blob],
                        [output_blob],
                    )

    backward_args = {}
    if backward_cell_net is not None:
        backward_link_internal, backward_link_external, backward_link_offset = \
            unpack_triple(backward_links)
        params = [x for x in references if x in backward_mapping.keys()]
        param_grads = [
            str(backward_mapping[x])
            for x in references
            if x in backward_mapping.keys()
        ]
        if recompute_blobs_on_backward is None:
            recompute_blobs_on_backward = set()
        # NOTE: list comprehensions (not map()) so these are real lists under
        # Python 3; map() returns a one-shot iterator there, which breaks
        # serialization of the operator arguments.
        backward_args = {
            'param': [all_inputs.index(p) for p in params],
            'backward_link_internal': [str(l) for l in backward_link_internal],
            'backward_link_external': [str(l) for l in backward_link_external],
            'backward_link_offset': backward_link_offset,
            'backward_step_net': str(backward_cell_net.Proto()),
            'outputs_with_grads': outputs_with_grads,
            'recompute_blobs_on_backward': [
                str(b) for b in recompute_blobs_on_backward
            ],
            'param_grads': param_grads,
        }

    results = net.RecurrentNetwork(
        all_inputs,
        all_outputs + [s("step_workspaces")],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(i) for i in recurrent_inputs
        ],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        step_net=str(cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        **backward_args
    )

    # Restore net type since 'rnn' is not recognized outside RNNs
    cell_net.Proto().type = 'simple'

    # The last output is a list of step workspaces,
    # which is only needed internally for gradient propagation
    return results[:-1]
|