diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 7e1366b53..c6b990f21 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -217,25 +217,20 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                   |
 |                   |                                                            | | (see `details `__)                                                                                                                    |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| T5                | ``t5-small``                                               | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                         |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                     |
-|                   |                                                            |   (see `details `__)                                                                                                                    |
+| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                             |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-base``                                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                         |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                   |
-|                   |                                                            |   (see `details `__)                                                                                                                    |
+|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                          |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-large``                                               | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                         |
-|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                                 |
-|                   |                                                            |   (see `details `__)                                                                                                                    |
+|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                         |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-3b``                                                  | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                         |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                   |
-|                   |                                                            |   (see `details `__)                                                                                                                    |
+|                   | ``t5-3b``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                        |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-11b``                                                 | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                         |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                   |
-|                   |                                                            |   (see `details `__)                                                                                                                    |
+|                   | ``t5-11b``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                        |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                       |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 2ccdebc2b..6391cb418 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -30,8 +30,8 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
 }
 
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index c9310179a..263dc33b7 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -44,8 +44,8 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
 }
 
 ####################################################
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index 0ae7fff41..1336a1c30 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -34,8 +34,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
 }
 
 ####################################################
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 62e9c069e..9fd37b67c 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -44,8 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
         't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
         't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
     }
 }
@@ -56,8 +56,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     't5-small': 512,
     't5-base': 512,
     't5-large': 512,
-    't5-3B': 512,
-    't5-11B': 512,
+    't5-3b': 512,
+    't5-11b': 512,
 }
 
 class T5Tokenizer(PreTrainedTokenizer):