diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py index 5f534ab28..37720c730 100644 --- a/src/transformers/models/led/configuration_led.py +++ b/src/transformers/models/led/configuration_led.py @@ -86,18 +86,17 @@ class LEDConfig(PretrainedConfig): Example: ```python + >>> from transformers import LEDModel, LEDConfig - ``` + >>> # Initializing a LED allenai/led-base-16384 style configuration + >>> configuration = LEDConfig() - >>> from transformers import LEDModel, LEDConfig + >>> # Initializing a model from the allenai/led-base-16384 style configuration + >>> model = LEDModel(configuration) - >>> # Initializing a LED allenai/led-base-16384 style configuration >>> configuration = LEDConfig() - - >>> # Initializing a model from the allenai/led-base-16384 style configuration >>> model = - LEDModel(configuration) - - >>> # Accessing the model configuration >>> configuration = model.config - """ + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" model_type = "led" attribute_map = { "num_attention_heads": "encoder_attention_heads", diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 3e852cf2a..162c1066f 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -1007,7 +1007,7 @@ class LEDDecoderLayer(nn.Module): """ residual = hidden_states - # Self Attention + # Self-Attention # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None # add present self-attn cache to positions 1,2 of present_key_value tuple @@ -1437,13 +1437,11 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput): LED_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) + This model inherits from [`PreTrainedModel`]. See the superclass documentation for the generic methods the library + implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior. Parameters: config ([`LEDConfig`]): @@ -1595,7 +1593,7 @@ LED_INPUTS_DOCSTRING = r""" class LEDEncoder(LEDPreTrainedModel): """ - Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a [`LEDEncoderLayer`]. Args: @@ -1643,7 +1641,7 @@ class LEDEncoder(LEDPreTrainedModel): self.post_init() def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor): - # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) + # longformer self-attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) # (global_attention_mask + 1) => 1 for local attention, 2 for global attention # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention if attention_mask is not None: diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index a882e32ec..d44a35e44 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -1238,7 +1238,7 @@ class TFLEDDecoderLayer(tf.keras.layers.Layer): """ residual = hidden_states - # Self Attention + # Self-Attention # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None # add present self-attn cache to positions 1,2 of present_key_value tuple @@ -1612,7 +1612,7 @@ LED_INPUTS_DOCSTRING = r""" class TFLEDEncoder(tf.keras.layers.Layer): config_class = LEDConfig """ - Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a [`TFLEDEncoderLayer`]. Args: