Mirror of https://github.com/saymrwulf/transformers.git
Synced 2026-05-14 20:58:08 +00:00
update model doc - switch 3B/11B to 3b/11b
commit 5c00e344c1
parent 110394b2ba
5 changed files with 20 additions and 25 deletions
@@ -217,25 +217,20 @@ Here is the full list of the currently provided pretrained models together with
 |    |               | | ALBERT xxlarge model with no dropout, additional training data and longer training |
 |    |               | (see `details <https://github.com/google-research/ALBERT>`__) |
 +----+---------------+----------------------------------------------------------------------------------------------------+
-| T5 | ``t5-small``  | | 6-layer, 768-hidden, 12-heads, 66M parameters |
-|    |               | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
-|    |               | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+| T5 | ``t5-small``  | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads, |
+|    |               | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
 |    +---------------+----------------------------------------------------------------------------------------------------+
-|    | ``t5-base``   | | 6-layer, 768-hidden, 12-heads, 66M parameters |
-|    |               | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. |
-|    |               | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+|    | ``t5-base``   | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, |
+|    |               | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
 |    +---------------+----------------------------------------------------------------------------------------------------+
-|    | ``t5-large``  | | 6-layer, 768-hidden, 12-heads, 82M parameters |
-|    |               | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. |
-|    |               | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+|    | ``t5-large``  | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
+|    |               | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
 |    +---------------+----------------------------------------------------------------------------------------------------+
-|    | ``t5-3B``     | | 6-layer, 768-hidden, 12-heads, 82M parameters |
-|    |               | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
-|    |               | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+|    | ``t5-3b``     | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads, |
+|    |               | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
 |    +---------------+----------------------------------------------------------------------------------------------------+
-|    | ``t5-11B``    | | 6-layer, 768-hidden, 12-heads, 82M parameters |
-|    |               | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
-|    |               | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+|    | ``t5-11b``    | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, |
+|    |               | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
 +----+---------------+----------------------------------------------------------------------------------------------------+
 
 
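The new table rows can be sanity-checked against the shipped config files. A minimal sketch, assuming a transformers version of this era that already includes T5 and a T5Config exposing the d_model/d_ff/num_layers/num_heads fields of the original T5 release:

from transformers import T5Config

# Resolves the shortcut name and downloads t5-small-config.json.
config = T5Config.from_pretrained('t5-small')
print(config.num_layers)  # expected: 6
print(config.d_model)     # expected: 512
print(config.d_ff)        # expected: 2048
print(config.num_heads)   # expected: 8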
@@ -30,8 +30,8 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
 }
 
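These archive maps are ordinary Python dicts keyed by shortcut name, so resolution is case-sensitive and the old uppercase keys stop working after this rename. A toy sketch of the failure mode (ARCHIVE_MAP here is an illustrative stand-in, not the library's actual object):

# Illustrative stand-in for T5_PRETRAINED_CONFIG_ARCHIVE_MAP after the rename.
ARCHIVE_MAP = {
    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
}

print(ARCHIVE_MAP['t5-3b'])      # resolves to the new config URL
print(ARCHIVE_MAP.get('t5-3B'))  # None: the stale uppercase key no longer exists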
@@ -44,8 +44,8 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
 }
 
 ####################################################
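With the PyTorch weights map updated, the renamed shortcuts load directly. A minimal sketch, assuming a transformers build of this era (the t5-3b and t5-11b weights are many gigabytes, so t5-small is the practical smoke test):

from transformers import T5Model

# Resolves the shortcut via T5_PRETRAINED_MODEL_ARCHIVE_MAP and downloads the weights;
# after this change, 't5-3b' and 't5-11b' resolve the same way.
model = T5Model.from_pretrained('t5-small')

# Rough parameter count; ~60M expected for t5-small per the table above.
print(sum(p.numel() for p in model.parameters()))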
@@ -34,8 +34,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
 }
 
 ####################################################
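The TensorFlow checkpoints follow the same naming scheme through the TF model classes. A sketch, assuming TensorFlow 2.x and a transformers build with TF support:

from transformers import TFT5Model

# Fetches t5-small-tf_model.h5 via TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP.
tf_model = TFT5Model.from_pretrained('t5-small')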
@@ -44,8 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
         't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
         't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
     }
 }
 
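Note that every size points at the same t5-spiece.model file: all T5 checkpoints share one SentencePiece vocabulary, so any shortcut name yields identical tokenization. A minimal sketch:

from transformers import T5Tokenizer

# All sizes resolve to the same SentencePiece model, so 't5-small' stands in for any of them.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
print(tokenizer.encode("Hello world"))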
@@ -56,8 +56,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     't5-small': 512,
     't5-base': 512,
     't5-large': 512,
-    't5-3B': 512,
-    't5-11B': 512,
+    't5-3b': 512,
+    't5-11b': 512,
 }
 
 class T5Tokenizer(PreTrainedTokenizer):
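The 512 entries above are the advertised maximum input lengths, surfaced on the tokenizer once it is loaded. A sketch, assuming the max_len attribute that PreTrainedTokenizer exposed in transformers releases of this era:

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-3b')  # any T5 size maps to 512 here
print(tokenizer.max_len)  # expected: 512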