From ef03ae874f91f502ce3ff997822e1be8a1ec47ed Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 28 May 2020 02:11:05 -0700 Subject: [PATCH] [Longformer] more models + model cards (#4628) * adding freeze roberta models * model cards * lint --- .../README.md | 20 ++++++++++++++++ .../allenai/longformer-base-4096/README.md | 24 +++++++++++++++++++ src/transformers/configuration_longformer.py | 8 ++++--- src/transformers/modeling_longformer.py | 11 +++++---- src/transformers/tokenization_longformer.py | 16 +++++++++---- 5 files changed, 67 insertions(+), 12 deletions(-) create mode 100644 model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md create mode 100644 model_cards/allenai/longformer-base-4096/README.md diff --git a/model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md b/model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md new file mode 100644 index 000000000..881870e55 --- /dev/null +++ b/model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md @@ -0,0 +1,20 @@ + +# longformer-base-4096-extra.pos.embd.only + +This model is similar to `longformer-base-4096` but it was pretrained to preserve RoBERTa weights by freezing all RoBERTa weights and training only the additional position embeddings. + + +### Citing + +If you use `Longformer` in your research, please cite [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150). +``` +@article{Beltagy2020Longformer, + title={Longformer: The Long-Document Transformer}, + author={Iz Beltagy and Matthew E. Peters and Arman Cohan}, + journal={arXiv:2004.05150}, + year={2020}, +} +``` + +`Longformer` is an open-source project developed by [the Allen Institute for Artificial Intelligence (AI2)](http://www.allenai.org). +AI2 is a non-profit institute with the mission to contribute to humanity through high-impact AI research and engineering. 
diff --git a/model_cards/allenai/longformer-base-4096/README.md b/model_cards/allenai/longformer-base-4096/README.md new file mode 100644 index 000000000..44132b1b2 --- /dev/null +++ b/model_cards/allenai/longformer-base-4096/README.md @@ -0,0 +1,24 @@ + +# longformer-base-4096 +[Longformer](https://arxiv.org/abs/2004.05150) is a transformer model for long documents. + +`longformer-base-4096` is a BERT-like model started from the RoBERTa checkpoint and pretrained for MLM on long documents. It supports sequences of length up to 4,096. + +Longformer uses a combination of sliding window (local) attention and global attention. Global attention is user-configured based on the task to allow the model to learn task-specific representations. +Please refer to the examples in `modeling_longformer.py` and the paper for more details on how to set global attention. + + +### Citing + +If you use `Longformer` in your research, please cite [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150). +``` +@article{Beltagy2020Longformer, + title={Longformer: The Long-Document Transformer}, + author={Iz Beltagy and Matthew E. Peters and Arman Cohan}, + journal={arXiv:2004.05150}, + year={2020}, +} +``` + +`Longformer` is an open-source project developed by [the Allen Institute for Artificial Intelligence (AI2)](http://www.allenai.org). +AI2 is a non-profit institute with the mission to contribute to humanity through high-impact AI research and engineering. 
diff --git a/src/transformers/configuration_longformer.py b/src/transformers/configuration_longformer.py index 559cc4a3f..bccdc6c6e 100644 --- a/src/transformers/configuration_longformer.py +++ b/src/transformers/configuration_longformer.py @@ -23,9 +23,11 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json", - "longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json", - "longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json", + "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json", + "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/config.json", + "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/config.json", } diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index 533b383e9..4524e7b21 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -31,9 +31,11 @@ from .modeling_roberta import RobertaLMHead, RobertaModel logger = logging.getLogger(__name__) LONGFORMER_PRETRAINED_MODEL_ARCHIVE_MAP = { - "longformer-base-4096": 
"https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/pytorch_model.bin", - "longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/pytorch_model.bin", - "longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/pytorch_model.bin", + "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/pytorch_model.bin", + "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/pytorch_model.bin", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/pytorch_model.bin", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/pytorch_model.bin", + "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/pytorch_model.bin", } @@ -851,8 +853,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device) attention_mask = attention_mask.expand_as(input_ids) < question_end_index - attention_mask = attention_mask.int() + 1 # True => global attention; False => local attention - return attention_mask.long() + return attention_mask.long() + 1 # True => global attention; False => local attention def _get_question_end_index(self, input_ids): sep_token_indices = (input_ids == self.config.sep_token_id).nonzero() diff --git a/src/transformers/tokenization_longformer.py b/src/transformers/tokenization_longformer.py index c6986220f..92f36dc57 100644 --- a/src/transformers/tokenization_longformer.py +++ 
b/src/transformers/tokenization_longformer.py @@ -24,13 +24,21 @@ logger = logging.getLogger(__name__) # vocab and merges same as roberta vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" -_all_longformer_models = ["longformer-base-4096", "longformer-large-4096", "longformer-large-4096-finetuned-triviaqa"] +_all_longformer_models = [ + "allenai/longformer-base-4096", + "allenai/longformer-large-4096", + "allenai/longformer-large-4096-finetuned-triviaqa", + "allenai/longformer-base-4096-extra.pos.embd.only", + "allenai/longformer-large-4096-extra.pos.embd.only", +] PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "longformer-base-4096": 4096, - "longformer-large-4096": 4096, - "longformer-large-4096-finetuned-triviaqa": 4096, + "allenai/longformer-base-4096": 4096, + "allenai/longformer-large-4096": 4096, + "allenai/longformer-large-4096-finetuned-triviaqa": 4096, + "allenai/longformer-base-4096-extra.pos.embd.only": 4096, + "allenai/longformer-large-4096-extra.pos.embd.only": 4096, }