From bd90cda9a6bb4723515c17df1192e53abc8e36e3 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 2 Aug 2023 20:22:36 +0200 Subject: [PATCH] =?UTF-8?q?CI=20with=20`num=5Fhidden=5Flayers=3D2`=20?= =?UTF-8?q?=F0=9F=9A=80=F0=9F=9A=80=F0=9F=9A=80=20(#25266)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CI with layers=2 --------- Co-authored-by: ydshieh --- tests/models/albert/test_modeling_albert.py | 5 ++-- .../albert/test_modeling_flax_albert.py | 2 +- tests/models/align/test_modeling_align.py | 2 +- tests/models/altclip/test_modeling_altclip.py | 4 +-- ..._modeling_audio_spectrogram_transformer.py | 2 +- tests/models/bart/test_modeling_bart.py | 2 +- tests/models/beit/test_modeling_beit.py | 2 +- tests/models/beit/test_modeling_flax_beit.py | 2 +- tests/models/bert/test_modeling_bert.py | 2 +- tests/models/bert/test_modeling_flax_bert.py | 2 +- .../test_modeling_bert_generation.py | 2 +- .../test_modeling_bigbird_pegasus.py | 2 +- tests/models/biogpt/test_modeling_biogpt.py | 2 +- .../blenderbot/test_modeling_blenderbot.py | 2 +- .../test_modeling_blenderbot_small.py | 2 +- tests/models/blip/test_modeling_blip.py | 4 +-- tests/models/blip/test_modeling_blip_text.py | 2 +- tests/models/blip_2/test_modeling_blip_2.py | 8 +++--- tests/models/bloom/test_modeling_bloom.py | 2 +- tests/models/canine/test_modeling_canine.py | 2 +- .../test_modeling_chinese_clip.py | 4 +-- tests/models/clap/test_modeling_clap.py | 2 +- tests/models/clip/test_modeling_clip.py | 4 +-- tests/models/clip/test_modeling_flax_clip.py | 4 +-- tests/models/clipseg/test_modeling_clipseg.py | 17 ++++++++++--- tests/models/codegen/test_modeling_codegen.py | 2 +- .../models/convbert/test_modeling_convbert.py | 2 +- tests/models/cpmant/test_modeling_cpmant.py | 2 +- tests/models/ctrl/test_modeling_ctrl.py | 2 +- .../data2vec/test_modeling_data2vec_audio.py | 2 +- .../data2vec/test_modeling_data2vec_text.py | 2 +- .../data2vec/test_modeling_data2vec_vision.py | 2 +- tests/models/deberta/test_modeling_deberta.py | 2 +- .../deberta_v2/test_modeling_deberta_v2.py | 2 +- tests/models/deit/test_modeling_deit.py | 2 +- tests/models/dinov2/test_modeling_dinov2.py | 2 +- .../distilbert/test_modeling_distilbert.py | 2 +- .../test_modeling_flax_distilbert.py | 2 +- tests/models/dpr/test_modeling_dpr.py | 2 +- tests/models/dpt/test_modeling_dpt.py | 4 +-- tests/models/electra/test_modeling_electra.py | 2 +- .../electra/test_modeling_flax_electra.py | 2 +- tests/models/ernie/test_modeling_ernie.py | 2 +- tests/models/ernie_m/test_modeling_ernie_m.py | 2 +- tests/models/esm/test_modeling_esm.py | 2 +- tests/models/esm/test_modeling_esmfold.py | 2 +- tests/models/falcon/test_modeling_falcon.py | 2 +- .../models/flaubert/test_modeling_flaubert.py | 2 +- tests/models/flava/test_modeling_flava.py | 6 ++--- tests/models/fnet/test_modeling_fnet.py | 2 +- tests/models/git/test_modeling_git.py | 4 +-- tests/models/gpt2/test_modeling_flax_gpt2.py | 2 +- tests/models/gpt2/test_modeling_gpt2.py | 2 +- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 2 +- .../gpt_neo/test_modeling_flax_gpt_neo.py | 4 +-- tests/models/gpt_neo/test_modeling_gpt_neo.py | 4 +-- .../models/gpt_neox/test_modeling_gpt_neox.py | 2 +- .../test_modeling_gpt_neox_japanese.py | 2 +- tests/models/gptj/test_modeling_flax_gptj.py | 2 +- tests/models/gptj/test_modeling_gptj.py | 2 +- .../test_modeling_gptsan_japanese.py | 2 +- .../models/groupvit/test_modeling_groupvit.py | 6 ++++- tests/models/hubert/test_modeling_hubert.py | 2 +- tests/models/ibert/test_modeling_ibert.py | 2 +- .../models/imagegpt/test_modeling_imagegpt.py | 2 +- .../test_modeling_instructblip.py | 6 ++--- .../models/layoutlm/test_modeling_layoutlm.py | 2 +- .../layoutlmv2/test_modeling_layoutlmv2.py | 2 +- .../layoutlmv3/test_modeling_layoutlmv3.py | 2 +- tests/models/llama/test_modeling_llama.py | 2 +- .../longformer/test_modeling_longformer.py | 2 +- .../longt5/test_modeling_flax_longt5.py | 2 +- tests/models/longt5/test_modeling_longt5.py | 4 +-- tests/models/luke/test_modeling_luke.py | 2 +- tests/models/marian/test_modeling_marian.py | 2 +- .../models/markuplm/test_modeling_markuplm.py | 2 +- tests/models/mbart/test_modeling_mbart.py | 2 +- tests/models/mega/test_modeling_mega.py | 2 +- .../test_modeling_megatron_bert.py | 2 +- tests/models/mgp_str/test_modeling_mgp_str.py | 2 +- .../mobilebert/test_modeling_mobilebert.py | 2 +- tests/models/mpnet/test_modeling_mpnet.py | 2 +- tests/models/mpt/test_modeling_mpt.py | 2 +- tests/models/mra/test_modeling_mra.py | 2 +- tests/models/mvp/test_modeling_mvp.py | 2 +- tests/models/nezha/test_modeling_nezha.py | 2 +- .../models/nllb_moe/test_modeling_nllb_moe.py | 2 +- .../test_modeling_nystromformer.py | 2 +- tests/models/openai/test_modeling_openai.py | 2 +- tests/models/opt/test_modeling_opt.py | 2 +- tests/models/owlvit/test_modeling_owlvit.py | 2 +- .../pegasus/test_modeling_flax_pegasus.py | 2 +- tests/models/pegasus/test_modeling_pegasus.py | 2 +- .../pegasus_x/test_modeling_pegasus_x.py | 2 +- .../pix2struct/test_modeling_pix2struct.py | 4 +-- tests/models/plbart/test_modeling_plbart.py | 2 +- .../prophetnet/test_modeling_prophetnet.py | 16 ++++++------ tests/models/qdqbert/test_modeling_qdqbert.py | 2 +- tests/models/realm/test_modeling_realm.py | 2 +- tests/models/rembert/test_modeling_rembert.py | 2 +- .../roberta/test_modeling_flax_roberta.py | 2 +- tests/models/roberta/test_modeling_roberta.py | 2 +- ...test_modeling_flax_roberta_prelayernorm.py | 2 +- .../test_modeling_roberta_prelayernorm.py | 2 +- .../models/roc_bert/test_modeling_roc_bert.py | 2 +- .../roformer/test_modeling_flax_roformer.py | 2 +- .../models/roformer/test_modeling_roformer.py | 2 +- tests/models/rwkv/test_modeling_rwkv.py | 2 +- tests/models/sew/test_modeling_sew.py | 2 +- tests/models/sew_d/test_modeling_sew_d.py | 2 +- .../test_modeling_speech_to_text_2.py | 2 +- .../models/speecht5/test_modeling_speecht5.py | 8 +++--- .../models/splinter/test_modeling_splinter.py | 2 +- .../squeezebert/test_modeling_squeezebert.py | 2 +- .../test_modeling_switch_transformers.py | 4 +-- tests/models/t5/test_modeling_flax_t5.py | 4 +-- tests/models/t5/test_modeling_t5.py | 4 +-- tests/models/tapas/test_modeling_tapas.py | 2 +- .../timesformer/test_modeling_timesformer.py | 2 +- .../transfo_xl/test_modeling_transfo_xl.py | 2 +- tests/models/trocr/test_modeling_trocr.py | 2 +- tests/models/tvlt/test_modeling_tvlt.py | 2 +- tests/models/umt5/test_modeling_umt5.py | 2 +- .../unispeech/test_modeling_unispeech.py | 2 +- .../test_modeling_unispeech_sat.py | 2 +- .../models/videomae/test_modeling_videomae.py | 2 +- tests/models/vilt/test_modeling_vilt.py | 2 +- .../visual_bert/test_modeling_visual_bert.py | 2 +- tests/models/vit/test_modeling_flax_vit.py | 2 +- tests/models/vit/test_modeling_vit.py | 2 +- .../vit_hybrid/test_modeling_vit_hybrid.py | 2 +- tests/models/vit_mae/test_modeling_vit_mae.py | 2 +- tests/models/vit_msn/test_modeling_vit_msn.py | 2 +- .../wav2vec2/test_modeling_flax_wav2vec2.py | 2 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 2 +- .../test_modeling_wav2vec2_conformer.py | 2 +- tests/models/wavlm/test_modeling_wavlm.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 4 +-- tests/models/xglm/test_modeling_flax_xglm.py | 2 +- tests/models/xglm/test_modeling_xglm.py | 2 +- tests/models/xlm/test_modeling_xlm.py | 2 +- .../test_modeling_xlm_roberta_xl.py | 2 +- tests/models/xlnet/test_modeling_xlnet.py | 2 +- tests/models/xmod/test_modeling_xmod.py | 2 +- tests/models/yolos/test_modeling_yolos.py | 2 +- tests/models/yoso/test_modeling_yoso.py | 2 +- tests/test_modeling_common.py | 25 ++++++++----------- 147 files changed, 207 insertions(+), 196 deletions(-) diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 96fa5596a..75c84ad0d 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -54,8 +54,9 @@ class AlbertModelTester: vocab_size=99, embedding_size=16, hidden_size=36, - num_hidden_layers=6, - num_hidden_groups=6, + num_hidden_layers=2, + # this needs to be the same as `num_hidden_layers`! + num_hidden_groups=2, num_attention_heads=6, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py index 5292665f5..0bdc8065b 100644 --- a/tests/models/albert/test_modeling_flax_albert.py +++ b/tests/models/albert/test_modeling_flax_albert.py @@ -48,7 +48,7 @@ class FlaxAlbertModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 35b2aebf6..47918bcd8 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -242,7 +242,7 @@ class AlignTextModelTester: use_token_type_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index fdae7768d..244ef1ed3 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -60,7 +60,7 @@ class AltCLIPVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -212,7 +212,7 @@ class AltCLIPTextModelTester: hidden_size=32, projection_dim=32, project_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index c36e946f1..ce596d84e 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -55,7 +55,7 @@ class ASTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index 949e647e6..01189e562 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1289,7 +1289,7 @@ class BartStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 4c0e5e5a4..2a35cddf4 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -64,7 +64,7 @@ class BeitModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py index 0587174ba..78c24220c 100644 --- a/tests/models/beit/test_modeling_flax_beit.py +++ b/tests/models/beit/test_modeling_flax_beit.py @@ -48,7 +48,7 @@ class FlaxBeitModelTester(unittest.TestCase): is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 52c8035d8..9aec91367 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -57,7 +57,7 @@ class BertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py index 55ffb4401..822689917 100644 --- a/tests/models/bert/test_modeling_flax_bert.py +++ b/tests/models/bert/test_modeling_flax_bert.py @@ -47,7 +47,7 @@ class FlaxBertModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py index ced98e6f7..ecd7a459e 100644 --- a/tests/models/bert_generation/test_modeling_bert_generation.py +++ b/tests/models/bert_generation/test_modeling_bert_generation.py @@ -41,7 +41,7 @@ class BertGenerationEncoderTester: use_input_mask=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 5d345db3f..aedbbb463 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -605,7 +605,7 @@ class BigBirdPegasusStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index e50410930..e43fc1e41 100644 --- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -51,7 +51,7 @@ class BioGptModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py index 499c7aa52..ca1630b3c 100644 --- a/tests/models/blenderbot/test_modeling_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_blenderbot.py @@ -356,7 +356,7 @@ class BlenderbotStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 257aa1699..249a8a799 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -365,7 +365,7 @@ class BlenderbotSmallStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 9a6e3da06..cf8c48708 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -70,7 +70,7 @@ class BlipVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -221,7 +221,7 @@ class BlipTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 488512f6e..2301b776f 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -44,7 +44,7 @@ class BlipTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 71f652050..c5bdb7079 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -62,7 +62,7 @@ class Blip2VisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -215,7 +215,7 @@ class Blip2QFormerModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=6, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -289,7 +289,7 @@ class Blip2TextModelDecoderOnlyTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", @@ -503,7 +503,7 @@ class Blip2TextModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index 4e9b837c8..de7cb03e7 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -54,7 +54,7 @@ class BloomModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 057fc0913..303d465ca 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -53,7 +53,7 @@ class CanineModelTester: # NOTE: this is not a model parameter, just an input vocab_size=100000, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 894f1e127..137c3c288 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -69,7 +69,7 @@ class ChineseCLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", @@ -246,7 +246,7 @@ class ChineseCLIPVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 35c0f4e20..dc5718850 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -287,7 +287,7 @@ class ClapTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 4239ce5ed..996bea95b 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -86,7 +86,7 @@ class CLIPVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -261,7 +261,7 @@ class CLIPTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clip/test_modeling_flax_clip.py b/tests/models/clip/test_modeling_flax_clip.py index 7d63fa9ed..565c641ae 100644 --- a/tests/models/clip/test_modeling_flax_clip.py +++ b/tests/models/clip/test_modeling_flax_clip.py @@ -35,7 +35,7 @@ class FlaxCLIPVisionModelTester: num_channels=3, is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -252,7 +252,7 @@ class FlaxCLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index e931bdc8d..37a71d1b1 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -78,7 +78,7 @@ class CLIPSegVisionModelTester: num_channels=3, is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -228,7 +228,7 @@ class CLIPSegTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -346,7 +346,15 @@ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): class CLIPSegModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + def __init__( + self, + parent, + text_kwargs=None, + vision_kwargs=None, + is_training=True, + # This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester` + extract_layers=(1,), + ): if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -356,6 +364,7 @@ class CLIPSegModelTester: self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs) self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs) self.is_training = is_training + self.extract_layers = extract_layers def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() @@ -371,7 +380,7 @@ class CLIPSegModelTester: self.vision_model_tester.get_config(), projection_dim=64, reduce_dim=32, - extract_layers=[1, 2, 3], + extract_layers=self.extract_layers, ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py index 9072c2b5b..34a32caa7 100644 --- a/tests/models/codegen/test_modeling_codegen.py +++ b/tests/models/codegen/test_modeling_codegen.py @@ -47,7 +47,7 @@ class CodeGenModelTester: vocab_size=256, hidden_size=32, rotary_dim=4, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index dc1550acc..754967ce0 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -53,7 +53,7 @@ class ConvBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index f3ebeaad3..6ecfe15c2 100644 --- a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -49,7 +49,7 @@ class CpmAntModelTester: use_mc_token_ids=False, vocab_size=99, hidden_size=32, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, num_buckets=32, diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index dfcb2c913..ff4274441 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -49,7 +49,7 @@ class CTRLModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 67fe0cbe7..e9448621e 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -59,7 +59,7 @@ class Data2VecAudioModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, intermediate_size=20, diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py index a45c9b6a8..4b4b2835d 100644 --- a/tests/models/data2vec/test_modeling_data2vec_text.py +++ b/tests/models/data2vec/test_modeling_data2vec_text.py @@ -57,7 +57,7 @@ class Data2VecTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index 299ffad3e..69a763a4f 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -59,7 +59,7 @@ class Data2VecVisionModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py index 7daff3b52..52758e222 100644 --- a/tests/models/deberta/test_modeling_deberta.py +++ b/tests/models/deberta/test_modeling_deberta.py @@ -47,7 +47,7 @@ class DebertaModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py index 548c9617b..abfbe7402 100644 --- a/tests/models/deberta_v2/test_modeling_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py @@ -48,7 +48,7 @@ class DebertaV2ModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index 37bfe3fa7..2685900af 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -69,7 +69,7 @@ class DeiTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index ed69faa44..cf7ff95b5 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -57,7 +57,7 @@ class Dinov2ModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 9d17a1c44..ff56afd0a 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -50,7 +50,7 @@ class DistilBertModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py index f4481a6e4..1f5a402e8 100644 --- a/tests/models/distilbert/test_modeling_flax_distilbert.py +++ b/tests/models/distilbert/test_modeling_flax_distilbert.py @@ -47,7 +47,7 @@ class FlaxDistilBertModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/dpr/test_modeling_dpr.py b/tests/models/dpr/test_modeling_dpr.py index cd4f430de..b6a687a35 100644 --- a/tests/models/dpr/test_modeling_dpr.py +++ b/tests/models/dpr/test_modeling_dpr.py @@ -48,7 +48,7 @@ class DPRModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 62ac20df3..247791ed4 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -53,7 +53,7 @@ class DPTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, backbone_out_indices=[0, 1, 2, 3], num_attention_heads=4, intermediate_size=37, @@ -62,7 +62,7 @@ class DPTModelTester: attention_probs_dropout_prob=0.1, initializer_range=0.02, num_labels=3, - neck_hidden_sizes=[16, 16, 32, 32], + neck_hidden_sizes=[16, 32], is_hybrid=False, scope=None, ): diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index 550bc1448..a5d3fa585 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -54,7 +54,7 @@ class ElectraModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py index 0dda4e38f..19b35d894 100644 --- a/tests/models/electra/test_modeling_flax_electra.py +++ b/tests/models/electra/test_modeling_flax_electra.py @@ -34,7 +34,7 @@ class FlaxElectraModelTester(unittest.TestCase): vocab_size=99, embedding_size=24, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index e845bd1f8..f0bdec3ef 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -56,7 +56,7 @@ class ErnieModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/ernie_m/test_modeling_ernie_m.py b/tests/models/ernie_m/test_modeling_ernie_m.py index 5e0ac9523..1fafcd34b 100644 --- a/tests/models/ernie_m/test_modeling_ernie_m.py +++ b/tests/models/ernie_m/test_modeling_ernie_m.py @@ -50,7 +50,7 @@ class ErnieMModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 2e5d48082..f242e7796 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -49,7 +49,7 @@ class EsmModelTester: use_labels=True, vocab_size=33, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 39f274af5..1ec5ab8ac 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -43,7 +43,7 @@ class EsmFoldModelTester: use_labels=False, vocab_size=19, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 6530eeb1a..0efc76237 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -50,7 +50,7 @@ class FalconModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index 99dbf927e..61806182b 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -57,7 +57,7 @@ class FlaubertModelTester(object): vocab_size=99, n_special=0, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index f1221f106..022418163 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -79,7 +79,7 @@ class FlavaImageModelTester: parent, batch_size=12, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", @@ -342,7 +342,7 @@ class FlavaTextModelTester: max_position_embeddings=512, position_embedding_type="absolute", hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", @@ -487,7 +487,7 @@ class FlavaMultimodalModelTester: seq_length=44, use_input_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index e7e592d5b..01e9942de 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -70,7 +70,7 @@ class FNetModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index ed094db4a..0dde54a39 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -51,7 +51,7 @@ class GitVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -203,7 +203,7 @@ class GitModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py index e842bbc73..9bdc17fa1 100644 --- a/tests/models/gpt2/test_modeling_flax_gpt2.py +++ b/tests/models/gpt2/test_modeling_flax_gpt2.py @@ -52,7 +52,7 @@ class FlaxGPT2ModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index f820b5494..c94103988 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -56,7 +56,7 @@ class GPT2ModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 8beddc0ab..3d4dd27fa 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -55,7 +55,7 @@ class GPTBigCodeModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="relu", diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py index a32f35f6e..58574a8b1 100644 --- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py @@ -52,9 +52,9 @@ class FlaxGPTNeoModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, - attention_types=[[["global", "local"], 2]], + attention_types=[[["global", "local"], 1]], intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index a79cf5b25..075b9a266 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -54,8 +54,8 @@ class GPTNeoModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=4, - attention_types=[[["global", "local"], 2]], + num_hidden_layers=2, + attention_types=[[["global", "local"], 1]], num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 176970779..8777bd3ab 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -52,7 +52,7 @@ class GPTNeoXModelTester: use_labels=True, vocab_size=99, hidden_size=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py index 47bb22b62..fc78b8bdd 100644 --- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py @@ -44,7 +44,7 @@ class GPTNeoXJapaneseModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_multiple_size=4, hidden_act="gelu", diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py index d177e345e..48061f84d 100644 --- a/tests/models/gptj/test_modeling_flax_gptj.py +++ b/tests/models/gptj/test_modeling_flax_gptj.py @@ -53,7 +53,7 @@ class FlaxGPTJModelTester: vocab_size=99, hidden_size=32, rotary_dim=4, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 3636d357d..f0e027007 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -56,7 +56,7 @@ class GPTJModelTester: vocab_size=99, hidden_size=32, rotary_dim=4, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py index 54a98cf70..1a86e23fd 100644 --- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py +++ b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py @@ -45,7 +45,7 @@ class GPTSanJapaneseTester: is_training=True, hidden_size=32, ext_size=42, - num_hidden_layers=5, + num_hidden_layers=2, num_ext_layers=2, num_attention_heads=4, num_experts=2, diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 261841277..6d52b6b50 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -356,7 +356,7 @@ class GroupViTTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -553,6 +553,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def test_model_common_attributes(self): pass + # overwritten from parent as this equivalent test needs a specific `seed` and hard to get a good one! + def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-5, name="outputs", attributes=None): + super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes) + @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): import tensorflow as tf diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index bad1a561d..c5a6a1398 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -71,7 +71,7 @@ class HubertModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index 9f2f2c950..096a55169 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -62,7 +62,7 @@ class IBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 19fe688bf..b4e2cd5ab 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -65,7 +65,7 @@ class ImageGPTModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index d659c3891..49d780918 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -64,7 +64,7 @@ class InstructBlipVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -219,7 +219,7 @@ class InstructBlipQFormerModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=6, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -295,7 +295,7 @@ class InstructBlipTextModelDecoderOnlyTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index 0535fbf4e..aafa53969 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -48,7 +48,7 @@ class LayoutLMModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index c8457331c..cffa09d6d 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -55,7 +55,7 @@ class LayoutLMv2ModelTester: use_labels=True, vocab_size=99, hidden_size=36, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index 2c3aef9b9..bf9a0b831 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -63,7 +63,7 @@ class LayoutLMv3ModelTester: use_labels=True, vocab_size=99, hidden_size=36, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index c2efc3f5a..e8d5f9abe 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -46,7 +46,7 @@ class LlamaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py index 21853e442..b40e464e6 100644 --- a/tests/models/longformer/test_modeling_longformer.py +++ b/tests/models/longformer/test_modeling_longformer.py @@ -50,7 +50,7 @@ class LongformerModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py index 2c262bef3..9449cfa5e 100644 --- a/tests/models/longt5/test_modeling_flax_longt5.py +++ b/tests/models/longt5/test_modeling_flax_longt5.py @@ -71,7 +71,7 @@ class FlaxLongT5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index 0f7ae0a27..b2d17dc0e 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -59,7 +59,7 @@ class LongT5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -916,7 +916,7 @@ class LongT5EncoderOnlyModelTester: # For common tests use_attention_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index 35bdb6b6d..95cb4f5d0 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -61,7 +61,7 @@ class LukeModelTester: entity_vocab_size=10, entity_emb_size=6, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 6cbcd55d3..8fd5e04a5 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -661,7 +661,7 @@ class MarianStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/markuplm/test_modeling_markuplm.py b/tests/models/markuplm/test_modeling_markuplm.py index 09d2f1ad5..71757385e 100644 --- a/tests/models/markuplm/test_modeling_markuplm.py +++ b/tests/models/markuplm/test_modeling_markuplm.py @@ -53,7 +53,7 @@ class MarkupLMModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index ec3d36f33..db5b554e8 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -491,7 +491,7 @@ class MBartStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/mega/test_modeling_mega.py b/tests/models/mega/test_modeling_mega.py index dfb00d190..e10ecc548 100644 --- a/tests/models/mega/test_modeling_mega.py +++ b/tests/models/mega/test_modeling_mega.py @@ -51,7 +51,7 @@ class MegaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, intermediate_size=37, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index bc1d81c4e..818f65d80 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -58,7 +58,7 @@ class MegatronBertModelTester: vocab_size=99, hidden_size=64, embedding_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py index 1d972e22a..d8ba50a35 100644 --- a/tests/models/mgp_str/test_modeling_mgp_str.py +++ b/tests/models/mgp_str/test_modeling_mgp_str.py @@ -55,7 +55,7 @@ class MgpstrModelTester: num_bpe_labels=99, num_wordpiece_labels=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, mlp_ratio=4.0, patch_embeds_hidden_size=257, diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index 6e4f696b8..572490071 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -54,7 +54,7 @@ class MobileBertModelTester: vocab_size=99, hidden_size=64, embedding_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py index d3261e4bc..fc1676417 100644 --- a/tests/models/mpnet/test_modeling_mpnet.py +++ b/tests/models/mpnet/test_modeling_mpnet.py @@ -49,7 +49,7 @@ class MPNetModelTester: use_labels=True, vocab_size=99, hidden_size=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=64, hidden_act="gelu", diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py index 91cb35bb7..363c493b1 100644 --- a/tests/models/mpt/test_modeling_mpt.py +++ b/tests/models/mpt/test_modeling_mpt.py @@ -54,7 +54,7 @@ class MptModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index c6a081303..aac9ce5bc 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -51,7 +51,7 @@ class MraModelTester: use_labels=True, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=36, hidden_act="gelu", diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index cc3986a37..8e6143529 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -595,7 +595,7 @@ class MvpStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/nezha/test_modeling_nezha.py b/tests/models/nezha/test_modeling_nezha.py index 5b36ffbc9..a71823d8a 100644 --- a/tests/models/nezha/test_modeling_nezha.py +++ b/tests/models/nezha/test_modeling_nezha.py @@ -55,7 +55,7 @@ class NezhaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index 030b5f2a8..9311a0199 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -52,7 +52,7 @@ class NllbMoeModelTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="relu", diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index 390308631..ae0667010 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -51,7 +51,7 @@ class NystromformerModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py index 0e8ba6d9c..98d74ee5f 100644 --- a/tests/models/openai/test_modeling_openai.py +++ b/tests/models/openai/test_modeling_openai.py @@ -49,7 +49,7 @@ class OpenAIGPTModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 251282a91..69a063f27 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -70,7 +70,7 @@ class OPTModelTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 4dbd1fb0a..8360b9f2a 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -62,7 +62,7 @@ class OwlViTVisionModelTester: num_channels=3, is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/pegasus/test_modeling_flax_pegasus.py b/tests/models/pegasus/test_modeling_flax_pegasus.py index fbc49c781..62b9077f0 100644 --- a/tests/models/pegasus/test_modeling_flax_pegasus.py +++ b/tests/models/pegasus/test_modeling_flax_pegasus.py @@ -52,7 +52,7 @@ class FlaxPegasusModelTester: use_labels=False, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_dropout_prob=0.1, diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index bde7477f9..4011fe2c6 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -371,7 +371,7 @@ class PegasusStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py index 73c4ee62b..22d7b0c86 100644 --- a/tests/models/pegasus_x/test_modeling_pegasus_x.py +++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py @@ -670,7 +670,7 @@ class PegasusXStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index c49db3dcf..34ca767d6 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -71,7 +71,7 @@ class Pix2StructVisionModelTester: patch_embed_hidden_size=12, projection_dim=32, max_patches=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -230,7 +230,7 @@ class Pix2StructTextModelTester: vocab_size=99, hidden_size=12, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index 05dbac6a2..4cd8ecd14 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -473,7 +473,7 @@ class PLBartStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index fa717b274..eee03134d 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -55,10 +55,10 @@ class ProphetNetModelTester: use_labels=True, decoder_start_token_id=0, encoder_ffn_dim=32, - num_encoder_layers=4, + num_encoder_layers=2, num_encoder_attention_heads=4, decoder_ffn_dim=32, - num_decoder_layers=4, + num_decoder_layers=2, num_decoder_attention_heads=4, max_position_embeddings=30, is_encoder_decoder=True, @@ -437,10 +437,10 @@ class ProphetNetModelTester: decoder_attention_mask=decoder_attention_mask, labels=lm_labels, ) - self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5981, device=torch_device), atol=1e-3)) + self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5892, device=torch_device), atol=1e-3)) expected_logit_slice = torch.tensor( - [-0.0648, 0.0790, 0.0360, 0.0089, 0.0039, -0.0639, 0.0131], device=torch_device + [-0.0184, 0.0758, -0.0543, -0.0093, 0.0050, -0.0660, -0.1453], device=torch_device ) self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3)) @@ -551,10 +551,10 @@ class ProphetNetStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=0, encoder_ffn_dim=32, - num_encoder_layers=4, + num_encoder_layers=2, num_encoder_attention_heads=4, decoder_ffn_dim=32, - num_decoder_layers=4, + num_decoder_layers=2, num_decoder_attention_heads=4, max_position_embeddings=30, is_encoder_decoder=False, @@ -782,10 +782,10 @@ class ProphetNetStandaloneEncoderModelTester: use_labels=True, decoder_start_token_id=0, encoder_ffn_dim=32, - num_encoder_layers=4, + num_encoder_layers=2, num_encoder_attention_heads=4, decoder_ffn_dim=32, - num_decoder_layers=4, + num_decoder_layers=2, num_decoder_attention_heads=4, max_position_embeddings=30, is_encoder_decoder=False, diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py index cc05389ee..d10abb733 100644 --- a/tests/models/qdqbert/test_modeling_qdqbert.py +++ b/tests/models/qdqbert/test_modeling_qdqbert.py @@ -54,7 +54,7 @@ class QDQBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py index ddd6c2645..4d6d9fd0f 100644 --- a/tests/models/realm/test_modeling_realm.py +++ b/tests/models/realm/test_modeling_realm.py @@ -54,7 +54,7 @@ class RealmModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index 4e6754b2e..557a42243 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -55,7 +55,7 @@ class RemBertModelTester: hidden_size=32, input_embedding_size=18, output_embedding_size=43, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py index c325e295f..f82479aa7 100644 --- a/tests/models/roberta/test_modeling_flax_roberta.py +++ b/tests/models/roberta/test_modeling_flax_roberta.py @@ -46,7 +46,7 @@ class FlaxRobertaModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 7ca78e23b..40c85123c 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -58,7 +58,7 @@ class RobertaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py index 3f15ca9ff..8500dfcb6 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py @@ -49,7 +49,7 @@ class FlaxRobertaPreLayerNormModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index 4e4915147..c44e1613b 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -57,7 +57,7 @@ class RobertaPreLayerNormModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 2efd9b799..d1caca6b6 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -58,7 +58,7 @@ class RoCBertModelTester: pronunciation_embed_dim=32, shape_embed_dim=32, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py index 28d0ffba9..8364e121b 100644 --- a/tests/models/roformer/test_modeling_flax_roformer.py +++ b/tests/models/roformer/test_modeling_flax_roformer.py @@ -47,7 +47,7 @@ class FlaxRoFormerModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index 357e126a0..e54d31d15 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -56,7 +56,7 @@ class RoFormerModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 2b9cc4713..4ca5cfdf9 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -52,7 +52,7 @@ class RwkvModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 651600c43..876b232a1 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -65,7 +65,7 @@ class SEWModelTester: num_conv_pos_embeddings=31, num_conv_pos_embedding_groups=2, squeeze_factor=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout=0.1, intermediate_size=20, diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 9aa4b8eda..dc33e80ed 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -72,7 +72,7 @@ class SEWDModelTester: position_biased_input=False, pos_att_type=("p2c", "c2p"), norm_rel_ebd="layer_norm", - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout=0.1, intermediate_size=20, diff --git a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py index ccd5bfa18..cbb449c6e 100644 --- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py +++ b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py @@ -50,7 +50,7 @@ class Speech2Text2StandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, decoder_attention_heads=4, max_position_embeddings=30, pad_token_id=0, diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index c357259d7..9324996ff 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -105,7 +105,7 @@ class SpeechT5ModelTester: is_training=False, vocab_size=81, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, ): @@ -249,7 +249,7 @@ class SpeechT5ForSpeechToTextTester: decoder_seq_length=7, is_training=False, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, conv_dim=(32, 32, 32), @@ -786,7 +786,7 @@ class SpeechT5ForTextToSpeechTester: decoder_seq_length=1024, # speech is longer is_training=False, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, vocab_size=81, @@ -1031,7 +1031,7 @@ class SpeechT5ForSpeechToSpeechTester: decoder_seq_length=1024, is_training=False, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, conv_dim=(32, 32, 32), diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index 24a075315..90ee07c35 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -46,7 +46,7 @@ class SplinterModelTester: vocab_size=99, hidden_size=32, question_token_id=1, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/squeezebert/test_modeling_squeezebert.py b/tests/models/squeezebert/test_modeling_squeezebert.py index 5efb03031..bf86792f5 100644 --- a/tests/models/squeezebert/test_modeling_squeezebert.py +++ b/tests/models/squeezebert/test_modeling_squeezebert.py @@ -50,7 +50,7 @@ class SqueezeBertModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=64, hidden_act="gelu", diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index abe785eca..ae9966a1d 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -58,7 +58,7 @@ class SwitchTransformersModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -826,7 +826,7 @@ class SwitchTransformersEncoderOnlyModelTester: # For common tests use_attention_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py index a2a80ab25..d5d729dac 100644 --- a/tests/models/t5/test_modeling_flax_t5.py +++ b/tests/models/t5/test_modeling_flax_t5.py @@ -70,7 +70,7 @@ class FlaxT5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -477,7 +477,7 @@ class FlaxT5EncoderOnlyModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 41aade960..cae891ef8 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -71,7 +71,7 @@ class T5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -902,7 +902,7 @@ class T5EncoderOnlyModelTester: # For common tests use_attention_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 619a5d261..6a482d03b 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -79,7 +79,7 @@ class TapasModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index 2783a65ce..2b7a5e279 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -60,7 +60,7 @@ class TimesformerModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/transfo_xl/test_modeling_transfo_xl.py b/tests/models/transfo_xl/test_modeling_transfo_xl.py index 970f87bf1..63afd438d 100644 --- a/tests/models/transfo_xl/test_modeling_transfo_xl.py +++ b/tests/models/transfo_xl/test_modeling_transfo_xl.py @@ -52,7 +52,7 @@ class TransfoXLModelTester: d_head=8, d_inner=128, div_val=2, - num_hidden_layers=5, + num_hidden_layers=2, scope=None, seed=1, eos_token_id=0, diff --git a/tests/models/trocr/test_modeling_trocr.py b/tests/models/trocr/test_modeling_trocr.py index 0033f339a..da24c7dd4 100644 --- a/tests/models/trocr/test_modeling_trocr.py +++ b/tests/models/trocr/test_modeling_trocr.py @@ -47,7 +47,7 @@ class TrOCRStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, decoder_attention_heads=4, max_position_embeddings=30, pad_token_id=0, diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py index e437b2651..3ee7f7adc 100644 --- a/tests/models/tvlt/test_modeling_tvlt.py +++ b/tests/models/tvlt/test_modeling_tvlt.py @@ -68,7 +68,7 @@ class TvltModelTester: num_audio_channels=1, num_frames=2, hidden_size=32, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=128, hidden_act="gelu", diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 29f850217..d9fd852c8 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -64,7 +64,7 @@ class UMT5ModelTester: use_attention_mask=True, use_labels=False, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 6d0bd1bf1..ac770bdbb 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -65,7 +65,7 @@ class UniSpeechModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index a418a56da..9c8cffba9 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -67,7 +67,7 @@ class UniSpeechSatModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 85a0d2714..9fb9c9e7f 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -62,7 +62,7 @@ class VideoMAEModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index 772091d5b..4aa036ebb 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -65,7 +65,7 @@ class ViltModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index cf48fd7ff..9000be33a 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -54,7 +54,7 @@ class VisualBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py index ca3130493..af56f4717 100644 --- a/tests/models/vit/test_modeling_flax_vit.py +++ b/tests/models/vit/test_modeling_flax_vit.py @@ -41,7 +41,7 @@ class FlaxViTModelTester(unittest.TestCase): is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py index 67c6e4acb..82ba910ec 100644 --- a/tests/models/vit/test_modeling_vit.py +++ b/tests/models/vit/test_modeling_vit.py @@ -59,7 +59,7 @@ class ViTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index fc816750e..20747b2d5 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -50,7 +50,7 @@ class ViTHybridModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 3cedb0c17..89a3a0d80 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -56,7 +56,7 @@ class ViTMAEModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py index 173dca091..a53163775 100644 --- a/tests/models/vit_msn/test_modeling_vit_msn.py +++ b/tests/models/vit_msn/test_modeling_vit_msn.py @@ -52,7 +52,7 @@ class ViTMSNModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index b9b52dc12..4cff7dca4 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -123,7 +123,7 @@ class FlaxWav2Vec2ModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 630a5d8e8..fb639077b 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -153,7 +153,7 @@ class Wav2Vec2ModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 8c26268c6..a79e8ac1e 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -71,7 +71,7 @@ class Wav2Vec2ConformerModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, intermediate_size=20, diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index b04a96dd1..05385b68b 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -64,7 +64,7 @@ class WavLMModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index ef2d11ac6..5c602d3d3 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -61,7 +61,7 @@ class XCLIPVisionModelTester: num_frames=8, # important; the batch size * time must be divisible by the number of frames is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, mit_hidden_size=64, @@ -318,7 +318,7 @@ class XCLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py index 60436cb1f..8f1c9a5e2 100644 --- a/tests/models/xglm/test_modeling_flax_xglm.py +++ b/tests/models/xglm/test_modeling_flax_xglm.py @@ -53,7 +53,7 @@ class FlaxXGLMModelTester: use_labels=True, vocab_size=99, d_model=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, ffn_dim=37, activation_function="gelu", diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index bbb87abe6..e6c013cca 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -44,7 +44,7 @@ class XGLMModelTester: use_labels=True, vocab_size=99, d_model=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, ffn_dim=37, activation_function="gelu", diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py index d8a184411..b551e7e64 100644 --- a/tests/models/xlm/test_modeling_xlm.py +++ b/tests/models/xlm/test_modeling_xlm.py @@ -57,7 +57,7 @@ class XLMModelTester: vocab_size=99, n_special=0, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 7ec84c9b1..828d6a02a 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -55,7 +55,7 @@ class XLMRobertaXLModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py index 2b3f4752e..2b0c95cd6 100644 --- a/tests/models/xlnet/test_modeling_xlnet.py +++ b/tests/models/xlnet/test_modeling_xlnet.py @@ -56,7 +56,7 @@ class XLNetModelTester: hidden_size=32, num_attention_heads=4, d_inner=128, - num_hidden_layers=5, + num_hidden_layers=2, type_sequence_label_size=2, untie_r=True, bi_data=False, diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py index 5845dee74..fc1ce44e3 100644 --- a/tests/models/xmod/test_modeling_xmod.py +++ b/tests/models/xmod/test_modeling_xmod.py @@ -51,7 +51,7 @@ class XmodModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py index 7c49bb864..c1fb50e30 100644 --- a/tests/models/yolos/test_modeling_yolos.py +++ b/tests/models/yolos/test_modeling_yolos.py @@ -52,7 +52,7 @@ class YolosModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index e275e19e4..67d7b9edc 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -51,7 +51,7 @@ class YosoModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index d80624b71..0cdc94fc8 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1017,7 +1017,8 @@ class ModelTesterMixin: attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + # TODO: To have this check, we will need at least 3 layers. Do we really need it? + # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_pretrained(self): @@ -1053,7 +1054,8 @@ class ModelTesterMixin: outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + # TODO: To have this check, we will need at least 3 layers. Do we really need it? + # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_config_init(self): @@ -1087,7 +1089,8 @@ class ModelTesterMixin: attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + # TODO: To have this check, we will need at least 3 layers. Do we really need it? + # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_integration(self): @@ -1106,7 +1109,7 @@ class ModelTesterMixin: inputs_dict["output_attentions"] = True config.output_hidden_states = False - heads_to_prune = {0: [0], 1: [1, 2]} + heads_to_prune = {1: [1, 2]} config.pruned_heads = heads_to_prune model = model_class(config=config) @@ -1117,10 +1120,8 @@ class ModelTesterMixin: outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) with tempfile.TemporaryDirectory() as temp_dir_name: model.save_pretrained(temp_dir_name) @@ -1131,12 +1132,10 @@ class ModelTesterMixin: outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - heads_to_prune = {0: [0], 2: [1, 2]} + heads_to_prune = {0: [0], 1: [1, 2]} model.prune_heads(heads_to_prune) with torch.no_grad(): @@ -1145,10 +1144,8 @@ class ModelTesterMixin: self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2]}) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class):