From ef03ae874f91f502ce3ff997822e1be8a1ec47ed Mon Sep 17 00:00:00 2001
From: Iz Beltagy <beltagy@allenai.org>
Date: Thu, 28 May 2020 02:11:05 -0700
Subject: [PATCH] [Longformer] more models + model cards (#4628)

* adding freeze roberta models

* model cards

* lint
---
 .../README.md                                 | 20 ++++++++++++++++
 .../allenai/longformer-base-4096/README.md    | 24 +++++++++++++++++++
 src/transformers/configuration_longformer.py  |  8 ++++---
 src/transformers/modeling_longformer.py       | 11 +++++----
 src/transformers/tokenization_longformer.py   | 16 +++++++++----
 5 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md
 create mode 100644 model_cards/allenai/longformer-base-4096/README.md

diff --git a/model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md b/model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md
new file mode 100644
index 000000000..881870e55
--- /dev/null
+++ b/model_cards/allenai/longformer-base-4096-extra.pos.embd.only/README.md
@@ -0,0 +1,20 @@
+
+# longformer-base-4096-extra.pos.embd.only
+
+This model is similar to `longformer-base-4096`, but it was pretrained to preserve the RoBERTa weights: all RoBERTa weights were frozen and only the additional position embeddings were trained.
+
+
+### Citing
+
+If you use `Longformer` in your research, please cite [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150).
+```
+@article{Beltagy2020Longformer,
+  title={Longformer: The Long-Document Transformer},
+  author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
+  journal={arXiv:2004.05150},
+  year={2020},
+}
+```
+
+`Longformer` is an open-source project developed by [the Allen Institute for Artificial Intelligence (AI2)](http://www.allenai.org).
+AI2 is a non-profit institute with the mission to contribute to humanity through high-impact AI research and engineering.
diff --git a/model_cards/allenai/longformer-base-4096/README.md b/model_cards/allenai/longformer-base-4096/README.md
new file mode 100644
index 000000000..44132b1b2
--- /dev/null
+++ b/model_cards/allenai/longformer-base-4096/README.md
@@ -0,0 +1,24 @@
+
+# longformer-base-4096
+[Longformer](https://arxiv.org/abs/2004.05150) is a transformer model for long documents. 
+
+`longformer-base-4096` is a BERT-like model initialized from the RoBERTa checkpoint and pretrained for MLM on long documents. It supports sequences of length up to 4,096.
+ 
+Longformer uses a combination of sliding-window (local) attention and global attention. Global attention is user-configured based on the task to allow the model to learn task-specific representations.
+Please refer to the examples in `modeling_longformer.py` and the paper for more details on how to set global attention.
+
+
+### Citing
+
+If you use `Longformer` in your research, please cite [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150).
+```
+@article{Beltagy2020Longformer,
+  title={Longformer: The Long-Document Transformer},
+  author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
+  journal={arXiv:2004.05150},
+  year={2020},
+}
+```
+
+`Longformer` is an open-source project developed by [the Allen Institute for Artificial Intelligence (AI2)](http://www.allenai.org).
+AI2 is a non-profit institute with the mission to contribute to humanity through high-impact AI research and engineering.
diff --git a/src/transformers/configuration_longformer.py b/src/transformers/configuration_longformer.py
index 559cc4a3f..bccdc6c6e 100644
--- a/src/transformers/configuration_longformer.py
+++ b/src/transformers/configuration_longformer.py
@@ -23,9 +23,11 @@ from .configuration_roberta import RobertaConfig
 logger = logging.getLogger(__name__)
 
 LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json",
-    "longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json",
-    "longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json",
+    "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json",
+    "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json",
+    "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json",
+    "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/config.json",
+    "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/config.json",
 }
 
 
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index 533b383e9..4524e7b21 100644
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -31,9 +31,11 @@ from .modeling_roberta import RobertaLMHead, RobertaModel
 logger = logging.getLogger(__name__)
 
 LONGFORMER_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/pytorch_model.bin",
-    "longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/pytorch_model.bin",
-    "longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/pytorch_model.bin",
+    "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/pytorch_model.bin",
+    "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/pytorch_model.bin",
+    "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/pytorch_model.bin",
+    "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/pytorch_model.bin",
+    "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/pytorch_model.bin",
 }
 
 
@@ -851,8 +853,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
         attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)
         attention_mask = attention_mask.expand_as(input_ids) < question_end_index
 
-        attention_mask = attention_mask.int() + 1  # True => global attention; False => local attention
-        return attention_mask.long()
+        return attention_mask.long() + 1  # True => global attention; False => local attention
 
     def _get_question_end_index(self, input_ids):
         sep_token_indices = (input_ids == self.config.sep_token_id).nonzero()
diff --git a/src/transformers/tokenization_longformer.py b/src/transformers/tokenization_longformer.py
index c6986220f..92f36dc57 100644
--- a/src/transformers/tokenization_longformer.py
+++ b/src/transformers/tokenization_longformer.py
@@ -24,13 +24,21 @@ logger = logging.getLogger(__name__)
 # vocab and merges same as roberta
 vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
 merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
-_all_longformer_models = ["longformer-base-4096", "longformer-large-4096", "longformer-large-4096-finetuned-triviaqa"]
+_all_longformer_models = [
+    "allenai/longformer-base-4096",
+    "allenai/longformer-large-4096",
+    "allenai/longformer-large-4096-finetuned-triviaqa",
+    "allenai/longformer-base-4096-extra.pos.embd.only",
+    "allenai/longformer-large-4096-extra.pos.embd.only",
+]
 
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "longformer-base-4096": 4096,
-    "longformer-large-4096": 4096,
-    "longformer-large-4096-finetuned-triviaqa": 4096,
+    "allenai/longformer-base-4096": 4096,
+    "allenai/longformer-large-4096": 4096,
+    "allenai/longformer-large-4096-finetuned-triviaqa": 4096,
+    "allenai/longformer-base-4096-extra.pos.embd.only": 4096,
+    "allenai/longformer-large-4096-extra.pos.embd.only": 4096,
 }