## @package attention
# Module caffe2.python.attention

from caffe2.python import brew


class AttentionType:
    Regular, Recurrent, Dot, SoftCoverage = tuple(range(4))


def s(scope, name):
    # We have to manually scope due to our internal/external blob
    # relationships.
    return "{}/{}".format(str(scope), str(name))
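

# For illustration, with hypothetical scope and blob names:
#
#   s('gru_decoder/attention', 'attention_logits')
#   -> 'gru_decoder/attention/attention_logits'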


# c_i = \sum_j w_{ij}\textbf{s}_j
def _calc_weighted_context(
    model,
    encoder_outputs_transposed,
    encoder_output_dim,
    attention_weights_3d,
    scope,
):
    # [batch_size, encoder_output_dim, 1]
    attention_weighted_encoder_context = brew.batch_mat_mul(
        model,
        [encoder_outputs_transposed, attention_weights_3d],
        s(scope, 'attention_weighted_encoder_context'),
    )
    # [1, batch_size, encoder_output_dim]
    attention_weighted_encoder_context, _ = model.net.Reshape(
        attention_weighted_encoder_context,
        [
            attention_weighted_encoder_context,
            s(scope, 'attention_weighted_encoder_context_old_shape'),
        ],
        shape=[1, -1, encoder_output_dim],
    )
    return attention_weighted_encoder_context
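

# A minimal NumPy sketch of the helper above, for reference only; the _np_*
# name is hypothetical and nothing below is part of the Caffe2 graph. It
# assumes encoder_outputs_transposed of shape
# [batch_size, encoder_output_dim, encoder_length] and attention_weights_3d
# of shape [batch_size, encoder_length, 1].
def _np_weighted_context(encoder_outputs_transposed, attention_weights_3d):
    import numpy as np  # local import: keeps the module's imports unchanged
    # Batched matmul: [b, dim, len] x [b, len, 1] -> [b, dim, 1]
    context = np.matmul(encoder_outputs_transposed, attention_weights_3d)
    # Match the Reshape above: [1, batch_size, encoder_output_dim]
    return context.reshape(1, context.shape[0], context.shape[1])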


# Calculate a softmax over the passed in attention energy logits
def _calc_attention_weights(
    model,
    attention_logits_transposed,
    scope,
    encoder_lengths=None,
):
    if encoder_lengths is not None:
        attention_logits_transposed = model.net.SequenceMask(
            [attention_logits_transposed, encoder_lengths],
            ['masked_attention_logits'],
            mode='sequence',
        )

    # [batch_size, encoder_length, 1]
    attention_weights_3d = brew.softmax(
        model,
        attention_logits_transposed,
        s(scope, 'attention_weights_3d'),
        engine='CUDNN',
        axis=1,
    )
    return attention_weights_3d
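

# A NumPy sketch of the masked softmax above (reference only; _np_* names are
# hypothetical). logits has shape [batch_size, encoder_length, 1]; positions
# at or beyond each sequence length are masked to -inf before the softmax
# over the encoder_length axis, mirroring SequenceMask + Softmax(axis=1).
def _np_attention_weights(logits, encoder_lengths=None):
    import numpy as np
    if encoder_lengths is not None:
        positions = np.arange(logits.shape[1])[None, :, None]
        logits = np.where(
            positions < encoder_lengths[:, None, None], logits, -np.inf
        )
    exps = np.exp(logits - logits.max(axis=1, keepdims=True))
    return exps / exps.sum(axis=1, keepdims=True)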


# e_{ij} = \textbf{v}^T tanh \alpha(\textbf{h}_{i-1}, \textbf{s}_j)
def _calc_attention_logits_from_sum_match(
    model,
    decoder_hidden_encoder_outputs_sum,
    encoder_output_dim,
    scope,
):
    # [encoder_length, batch_size, encoder_output_dim]
    decoder_hidden_encoder_outputs_sum = model.net.Tanh(
        decoder_hidden_encoder_outputs_sum,
        decoder_hidden_encoder_outputs_sum,
    )

    # [encoder_length, batch_size, 1]
    attention_logits = brew.fc(
        model,
        decoder_hidden_encoder_outputs_sum,
        s(scope, 'attention_logits'),
        dim_in=encoder_output_dim,
        dim_out=1,
        axis=2,
        freeze_bias=True,
    )

    # [batch_size, encoder_length, 1]
    attention_logits_transposed = brew.transpose(
        model,
        attention_logits,
        s(scope, 'attention_logits_transposed'),
        axes=[1, 0, 2],
    )
    return attention_logits_transposed
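

# A NumPy sketch of e_{ij} = v^T tanh(.) as computed above (reference only;
# _np_* names are hypothetical). sum_match has shape
# [encoder_length, batch_size, encoder_output_dim]; v has shape
# [encoder_output_dim] and stands in for the FC weight (dim_out=1, with the
# frozen bias omitted). The result matches attention_logits_transposed,
# [batch_size, encoder_length, 1].
def _np_attention_logits(sum_match, v):
    import numpy as np
    logits = np.tanh(sum_match) @ v  # [encoder_length, batch_size]
    return logits.T[:, :, None]      # [batch_size, encoder_length, 1]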


# \textbf{W}^\alpha used in the context of \alpha_{sum}(a,b)
def _apply_fc_weight_for_sum_match(
    model,
    input,
    dim_in,
    dim_out,
    scope,
    name,
):
    output = brew.fc(
        model,
        input,
        s(scope, name),
        dim_in=dim_in,
        dim_out=dim_out,
        axis=2,
    )
    output = model.net.Squeeze(
        output,
        output,
        dims=[0],
    )
    return output
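

# A NumPy sketch of the FC-then-squeeze above (reference only; _np_* names
# are hypothetical). inp has shape [1, batch_size, dim_in]; W is
# [dim_in, dim_out] and b is [dim_out] (brew.fc stores the weight transposed,
# but the math is the same).
def _np_fc_weight_for_sum_match(inp, W, b):
    import numpy as np
    out = inp @ W + b          # FC along axis=2: [1, batch_size, dim_out]
    return np.squeeze(out, 0)  # drop the singleton axis: [batch_size, dim_out]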


# Implement RecAtt due to section 4.1 in http://arxiv.org/abs/1601.03317
def apply_recurrent_attention(
    model,
    encoder_output_dim,
    encoder_outputs_transposed,
    weighted_encoder_outputs,
    decoder_hidden_state_t,
    decoder_hidden_state_dim,
    attention_weighted_encoder_context_t_prev,
    scope,
    encoder_lengths=None,
):
    weighted_prev_attention_context = _apply_fc_weight_for_sum_match(
        model=model,
        input=attention_weighted_encoder_context_t_prev,
        dim_in=encoder_output_dim,
        dim_out=encoder_output_dim,
        scope=scope,
        name='weighted_prev_attention_context',
    )

    weighted_decoder_hidden_state = _apply_fc_weight_for_sum_match(
        model=model,
        input=decoder_hidden_state_t,
        dim_in=decoder_hidden_state_dim,
        dim_out=encoder_output_dim,
        scope=scope,
        name='weighted_decoder_hidden_state',
    )

    # [batch_size, encoder_output_dim]
    decoder_hidden_encoder_outputs_sum_tmp = model.net.Add(
        [
            weighted_prev_attention_context,
            weighted_decoder_hidden_state,
        ],
        s(scope, 'decoder_hidden_encoder_outputs_sum_tmp'),
    )

    # [encoder_length, batch_size, encoder_output_dim]
    decoder_hidden_encoder_outputs_sum = model.net.Add(
        [
            weighted_encoder_outputs,
            decoder_hidden_encoder_outputs_sum_tmp,
        ],
        s(scope, 'decoder_hidden_encoder_outputs_sum'),
        broadcast=1,
    )

    attention_logits_transposed = _calc_attention_logits_from_sum_match(
        model=model,
        decoder_hidden_encoder_outputs_sum=decoder_hidden_encoder_outputs_sum,
        encoder_output_dim=encoder_output_dim,
        scope=scope,
    )

    # [batch_size, encoder_length, 1]
    attention_weights_3d = _calc_attention_weights(
        model=model,
        attention_logits_transposed=attention_logits_transposed,
        scope=scope,
        encoder_lengths=encoder_lengths,
    )

    # [batch_size, encoder_output_dim, 1]
    attention_weighted_encoder_context = _calc_weighted_context(
        model=model,
        encoder_outputs_transposed=encoder_outputs_transposed,
        encoder_output_dim=encoder_output_dim,
        attention_weights_3d=attention_weights_3d,
        scope=scope,
    )

    # The trailing list names blobs that the recurrent network may recompute
    # on the backward pass instead of storing in every step workspace,
    # trading a little compute for large memory savings.
    return attention_weighted_encoder_context, attention_weights_3d, [
        decoder_hidden_encoder_outputs_sum,
    ]
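

# One recurrent-attention step in the NumPy terms sketched above (reference
# only; W_c/b_c, W_d/b_d and v stand in for the FC parameters created by the
# graph, and the _np_* helpers are the hypothetical sketches defined earlier).
def _np_recurrent_attention(encoder_outputs_transposed,
                            weighted_encoder_outputs,
                            decoder_hidden_state_t,
                            attention_context_t_prev,
                            W_c, b_c, W_d, b_d, v,
                            encoder_lengths=None):
    prev_ctx = _np_fc_weight_for_sum_match(attention_context_t_prev, W_c, b_c)
    dec_state = _np_fc_weight_for_sum_match(decoder_hidden_state_t, W_d, b_d)
    # Broadcast add over the encoder_length axis.
    sum_match = weighted_encoder_outputs + (prev_ctx + dec_state)
    weights = _np_attention_weights(_np_attention_logits(sum_match, v),
                                    encoder_lengths)
    return _np_weighted_context(encoder_outputs_transposed, weights), weights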


def apply_regular_attention(
    model,
    encoder_output_dim,
    encoder_outputs_transposed,
    weighted_encoder_outputs,
    decoder_hidden_state_t,
    decoder_hidden_state_dim,
    scope,
    encoder_lengths=None,
):
    weighted_decoder_hidden_state = _apply_fc_weight_for_sum_match(
        model=model,
        input=decoder_hidden_state_t,
        dim_in=decoder_hidden_state_dim,
        dim_out=encoder_output_dim,
        scope=scope,
        name='weighted_decoder_hidden_state',
    )

    # [encoder_length, batch_size, encoder_output_dim]
    decoder_hidden_encoder_outputs_sum = model.net.Add(
        [weighted_encoder_outputs, weighted_decoder_hidden_state],
        s(scope, 'decoder_hidden_encoder_outputs_sum'),
        broadcast=1,
        use_grad_hack=1,
    )

    attention_logits_transposed = _calc_attention_logits_from_sum_match(
        model=model,
        decoder_hidden_encoder_outputs_sum=decoder_hidden_encoder_outputs_sum,
        encoder_output_dim=encoder_output_dim,
        scope=scope,
    )

    # [batch_size, encoder_length, 1]
    attention_weights_3d = _calc_attention_weights(
        model=model,
        attention_logits_transposed=attention_logits_transposed,
        scope=scope,
        encoder_lengths=encoder_lengths,
    )

    # [batch_size, encoder_output_dim, 1]
    attention_weighted_encoder_context = _calc_weighted_context(
        model=model,
        encoder_outputs_transposed=encoder_outputs_transposed,
        encoder_output_dim=encoder_output_dim,
        attention_weights_3d=attention_weights_3d,
        scope=scope,
    )
    return attention_weighted_encoder_context, attention_weights_3d, [
        decoder_hidden_encoder_outputs_sum,
    ]
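

# apply_regular_attention is the same pipeline minus the previous-context
# term: in the NumPy terms above, sum_match = weighted_encoder_outputs +
# _np_fc_weight_for_sum_match(decoder_hidden_state_t, W_d, b_d), followed by
# the same logits / softmax / weighted-context steps.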


def apply_dot_attention(
    model,
    encoder_output_dim,
    # [batch_size, encoder_output_dim, encoder_length]
    encoder_outputs_transposed,
    # [1, batch_size, decoder_state_dim]
    decoder_hidden_state_t,
    decoder_hidden_state_dim,
    scope,
    encoder_lengths=None,
):
    if decoder_hidden_state_dim != encoder_output_dim:
        weighted_decoder_hidden_state = brew.fc(
            model,
            decoder_hidden_state_t,
            s(scope, 'weighted_decoder_hidden_state'),
            dim_in=decoder_hidden_state_dim,
            dim_out=encoder_output_dim,
            axis=2,
        )
    else:
        weighted_decoder_hidden_state = decoder_hidden_state_t

    # [batch_size, decoder_state_dim]
    squeezed_weighted_decoder_hidden_state = model.net.Squeeze(
        weighted_decoder_hidden_state,
        s(scope, 'squeezed_weighted_decoder_hidden_state'),
        dims=[0],
    )

    # [batch_size, decoder_state_dim, 1]
    expanddims_squeezed_weighted_decoder_hidden_state = model.net.ExpandDims(
        squeezed_weighted_decoder_hidden_state,
        squeezed_weighted_decoder_hidden_state,
        dims=[2],
    )

    # [batch_size, encoder_length, 1]
    attention_logits_transposed = model.net.BatchMatMul(
        [
            encoder_outputs_transposed,
            expanddims_squeezed_weighted_decoder_hidden_state,
        ],
        s(scope, 'attention_logits'),
        trans_a=1,
    )

    # [batch_size, encoder_length, 1]
    attention_weights_3d = _calc_attention_weights(
        model=model,
        attention_logits_transposed=attention_logits_transposed,
        scope=scope,
        encoder_lengths=encoder_lengths,
    )

    # [batch_size, encoder_output_dim, 1]
    attention_weighted_encoder_context = _calc_weighted_context(
        model=model,
        encoder_outputs_transposed=encoder_outputs_transposed,
        encoder_output_dim=encoder_output_dim,
        attention_weights_3d=attention_weights_3d,
        scope=scope,
    )
    return attention_weighted_encoder_context, attention_weights_3d, []
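

# A NumPy sketch of the dot-product logits above (reference only; _np_* names
# are hypothetical). encoder_outputs_transposed has shape
# [batch_size, encoder_output_dim, encoder_length]; decoder_state has shape
# [batch_size, encoder_output_dim] (after the optional FC projection).
def _np_dot_attention_logits(encoder_outputs_transposed, decoder_state):
    import numpy as np
    # trans_a=1 transposes each per-batch encoder matrix:
    # [b, len, dim] x [b, dim, 1] -> [b, len, 1]
    return np.matmul(
        encoder_outputs_transposed.transpose(0, 2, 1),
        decoder_state[:, :, None],
    )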


def apply_soft_coverage_attention(
    model,
    encoder_output_dim,
    encoder_outputs_transposed,
    weighted_encoder_outputs,
    decoder_hidden_state_t,
    decoder_hidden_state_dim,
    scope,
    encoder_lengths,
    coverage_t_prev,
    coverage_weights,
):
    """Soft-coverage attention.

    A variant of the approaches described in
    https://arxiv.org/pdf/1601.04811.pdf: the sum of all previous attention
    weights for the encoder words is carried as an extra recurrent state
    (coverage_t), whose strictly increasing values model the degree to which
    each source word has been translated. A linear transform of
    encoder_outputs produces coverage_weights, with the same dimensionality
    as encoder_outputs, implicitly modeling the fertility of source-side
    words (and putting that extra information strain on the encoder network).
    The encoder output, decoder state, and coverage weights thus share one
    dimensionality per source word, and attention logits are computed as
    v * tanh(coverage * coverage_weights + encoder_output + decoder_state).

    Note: the full coverage state for each translation instance has shape
    (encoder_length, coverage_units), but states for the RecurrentNetwork
    operator must be flat in the data dimension. The state is therefore
    initialized with shape (encoder_length * coverage_units) [initialization
    not shown in the open-source library] and reshaped appropriately within
    this function.
    """
    weighted_decoder_hidden_state = _apply_fc_weight_for_sum_match(
        model=model,
        input=decoder_hidden_state_t,
        dim_in=decoder_hidden_state_dim,
        dim_out=encoder_output_dim,
        scope=scope,
        name='weighted_decoder_hidden_state',
    )

    # [encoder_length, batch_size, encoder_output_dim]
    decoder_hidden_encoder_outputs_sum_tmp = model.net.Add(
        [weighted_encoder_outputs, weighted_decoder_hidden_state],
        s(scope, 'decoder_hidden_encoder_outputs_sum_tmp'),
        broadcast=1,
    )

    # [batch_size, encoder_length]
    coverage_t_prev_2d = model.net.Squeeze(
        coverage_t_prev,
        s(scope, 'coverage_t_prev_2d'),
        dims=[0],
    )
    # [encoder_length, batch_size]
    coverage_t_prev_transposed = brew.transpose(
        model,
        coverage_t_prev_2d,
        s(scope, 'coverage_t_prev_transposed'),
    )

    # [encoder_length, batch_size, encoder_output_dim]
    scaled_coverage_weights = model.net.Mul(
        [coverage_weights, coverage_t_prev_transposed],
        s(scope, 'scaled_coverage_weights'),
        broadcast=1,
        axis=0,
    )

    # [encoder_length, batch_size, encoder_output_dim]
    decoder_hidden_encoder_outputs_sum = model.net.Add(
        [decoder_hidden_encoder_outputs_sum_tmp, scaled_coverage_weights],
        s(scope, 'decoder_hidden_encoder_outputs_sum'),
    )

    # [batch_size, encoder_length, 1]
    attention_logits_transposed = _calc_attention_logits_from_sum_match(
        model=model,
        decoder_hidden_encoder_outputs_sum=decoder_hidden_encoder_outputs_sum,
        encoder_output_dim=encoder_output_dim,
        scope=scope,
    )

    # [batch_size, encoder_length, 1]
    attention_weights_3d = _calc_attention_weights(
        model=model,
        attention_logits_transposed=attention_logits_transposed,
        scope=scope,
        encoder_lengths=encoder_lengths,
    )

    # [batch_size, encoder_output_dim, 1]
    attention_weighted_encoder_context = _calc_weighted_context(
        model=model,
        encoder_outputs_transposed=encoder_outputs_transposed,
        encoder_output_dim=encoder_output_dim,
        attention_weights_3d=attention_weights_3d,
        scope=scope,
    )

    # [batch_size, encoder_length]
    attention_weights_2d = model.net.Squeeze(
        attention_weights_3d,
        s(scope, 'attention_weights_2d'),
        dims=[2],
    )

    # Accumulate the new attention weights into the coverage state.
    coverage_t = model.net.Add(
        [coverage_t_prev, attention_weights_2d],
        s(scope, 'coverage_t'),
        broadcast=1,
    )

    return (
        attention_weighted_encoder_context,
        attention_weights_3d,
        [decoder_hidden_encoder_outputs_sum],
        coverage_t,
    )
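

# A NumPy sketch of the coverage-specific steps above (reference only; the
# _np_* name is hypothetical). coverage_t_prev has shape
# [1, batch_size, encoder_length]; coverage_weights has shape
# [encoder_length, batch_size, encoder_output_dim]; attention_weights_2d has
# shape [batch_size, encoder_length].
def _np_coverage_step(coverage_t_prev, coverage_weights, attention_weights_2d):
    import numpy as np
    # Scale each coverage-weight vector by its accumulated coverage scalar.
    cov = np.squeeze(coverage_t_prev, 0).T[:, :, None]   # [len, batch, 1]
    scaled_coverage_weights = coverage_weights * cov     # [len, batch, dim]
    # Accumulate the new attention weights into the coverage state.
    coverage_t = coverage_t_prev + attention_weights_2d  # [1, batch, len]
    return scaled_coverage_weights, coverage_t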