From d0b1d8d888b71724afb7623cc1e7a30bbf9edf4a Mon Sep 17 00:00:00 2001 From: AbdelKarim ELJANDOUBI <78537694+eljandoubi@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:06:07 +0100 Subject: [PATCH 01/12] Skip DeepSpeed ZeRO Stage 3 model initialization when bnb (#34395) * Skip DeepSpeed ZeRO Stage 3 model initialization when it is intended to be quantized. * Propagate the quantization state using a context manager * make fixup --- src/transformers/modeling_utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 2ef4c3615c9fa2..6c332a7a6ae8e7 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -136,6 +136,7 @@ _init_weights = True +_is_quantized = False def is_fsdp_enabled(): @@ -213,6 +214,16 @@ def _skip_init(*args, **kwargs): setattr(torch.nn.init, name, init_func) +@contextmanager +def set_quantized_state(): + global _is_quantized + _is_quantized = True + try: + yield + finally: + _is_quantized = False + + def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]): try: return next(parameter.parameters()).device @@ -1531,7 +1542,7 @@ def _from_config(cls, config, **kwargs): torch_dtype=torch_dtype, ) - if is_deepspeed_zero3_enabled(): + if is_deepspeed_zero3_enabled() and not _is_quantized: import deepspeed logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") @@ -4086,6 +4097,9 @@ def from_pretrained( ) init_contexts.append(init_empty_weights()) + if is_deepspeed_zero3_enabled() and is_quantized: + init_contexts.append(set_quantized_state()) + config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. if not getattr(config, "_attn_implementation_autoset", False): config = cls._autoset_attn_implementation( From 5e1fd4e204d81f2f66f8c164433e62ea5f4d0467 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 5 Nov 2024 10:26:13 +0100 Subject: [PATCH 02/12] FIX: Broken repr of TorchAoConfig (#34560) FIX Broken repr of TorchAoConfig The __repr__ method references a non-existent self.kwargs. This is now fixed. There does not appear to be a uniform way of defining __repr__ for quantization configs. 
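With this change, repr() prints the class name followed by the JSON-serialized config dict (indented, keys sorted). As a rough illustration only — the exact keys shown here depend on what to_dict() returns and are not guaranteed:

    >>> from transformers import TorchAoConfig
    >>> print(TorchAoConfig("int4_weight_only", group_size=128))
    TorchAoConfig {
      "quant_method": "torchao",
      "quant_type": "int4_weight_only",
      "quant_type_kwargs": {
        "group_size": 128
      }
    }
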
I copied the method as implemented for HQQ: https://github.com/huggingface/transformers/blob/e2ac16b28a0b8b900e136750309ca40c49d975c5/src/transformers/utils/quantization_config.py#L285-L287 --- src/transformers/utils/quantization_config.py | 3 ++- tests/quantization/torchao_integration/test_torchao.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 026a2066798574..2f04df97e863a5 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1309,7 +1309,8 @@ def get_apply_tensor_subclass(self): return _STR_TO_METHOD[self.quant_type](**self.quant_type_kwargs) def __repr__(self): - return f"{self.quant_type}({', '.join(str(k) + '=' + str(v) for k, v in self.kwargs.items())})" + config_dict = self.to_dict() + return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n" @dataclass diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py index c7c701e49aec14..c3ab06ee61ba59 100644 --- a/tests/quantization/torchao_integration/test_torchao.py +++ b/tests/quantization/torchao_integration/test_torchao.py @@ -74,6 +74,13 @@ def test_post_init_check(self): with self.assertRaisesRegex(ValueError, "Unexpected keyword arg"): _ = TorchAoConfig("int4_weight_only", group_size1=32) + def test_repr(self): + """ + Check that there is no error in the repr + """ + quantization_config = TorchAoConfig("int4_weight_only", modules_to_not_convert=["conv"], group_size=8) + repr(quantization_config) + @require_torch_gpu @require_torchao From 893ad04fad145904ccb71e4e858e4134c32226b6 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 5 Nov 2024 11:34:01 +0100 Subject: [PATCH 03/12] Load sub-configs from composite configs (#34410) * save/load sub-configs * nit forgot these * fix copies * move test to common * use dict for sub-configs * add load-save-laod test * clean up modeling check * oops this are correct keys * fix some tests, missed some composite configs * this model was missed --- src/transformers/configuration_utils.py | 21 ++++- src/transformers/modeling_utils.py | 17 ++-- .../models/align/configuration_align.py | 42 +-------- .../models/altclip/configuration_altclip.py | 23 +---- .../models/bark/configuration_bark.py | 47 +++------- .../models/blip/configuration_blip.py | 42 +-------- .../models/blip_2/configuration_blip_2.py | 44 ++-------- .../bridgetower/configuration_bridgetower.py | 36 +------- .../chameleon/configuration_chameleon.py | 2 + .../configuration_chinese_clip.py | 42 +-------- .../models/clap/configuration_clap.py | 42 +-------- .../models/clip/configuration_clip.py | 42 +-------- .../models/clipseg/configuration_clipseg.py | 42 +-------- .../models/clvp/configuration_clvp.py | 28 ++---- .../models/dbrx/configuration_dbrx.py | 43 ++-------- .../configuration_encoder_decoder.py | 4 +- .../configuration_fastspeech2_conformer.py | 4 +- .../models/flava/configuration_flava.py | 85 +++---------------- .../models/git/configuration_git.py | 22 +---- .../models/groupvit/configuration_groupvit.py | 42 +-------- .../models/idefics/configuration_idefics.py | 16 ++-- .../models/idefics2/configuration_idefics2.py | 34 ++------ .../models/idefics3/configuration_idefics3.py | 33 ++----- .../configuration_instructblip.py | 48 ++--------- .../configuration_instructblipvideo.py | 47 ++-------- .../modular_instructblipvideo.py | 7 +- 
.../models/kosmos2/configuration_kosmos2.py | 55 +++--------- .../models/llava/configuration_llava.py | 4 +- .../llava_next/configuration_llava_next.py | 4 +- .../configuration_llava_next_video.py | 4 +- .../modular_llava_next_video.py | 4 +- .../configuration_llava_onevision.py | 4 +- .../models/mllama/configuration_mllama.py | 47 ++-------- .../models/moshi/configuration_moshi.py | 2 +- .../models/mpt/configuration_mpt.py | 28 ++---- .../models/musicgen/configuration_musicgen.py | 6 ++ .../configuration_musicgen_melody.py | 6 ++ .../models/owlv2/configuration_owlv2.py | 56 +----------- .../models/owlvit/configuration_owlvit.py | 56 +----------- .../paligemma/configuration_paligemma.py | 4 +- .../qwen2_audio/configuration_qwen2_audio.py | 4 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 22 +---- .../models/siglip/configuration_siglip.py | 42 +-------- .../configuration_speech_encoder_decoder.py | 1 + .../video_llava/configuration_video_llava.py | 4 +- .../models/vipllava/configuration_vipllava.py | 4 +- .../configuration_vision_encoder_decoder.py | 1 + .../configuration_vision_text_dual_encoder.py | 1 + .../models/x_clip/configuration_x_clip.py | 42 +-------- tests/models/align/test_modeling_align.py | 9 ++ tests/models/altclip/test_modeling_altclip.py | 9 ++ tests/models/blip/test_modeling_blip.py | 7 ++ tests/models/blip_2/test_modeling_blip_2.py | 7 ++ tests/models/clap/test_modeling_clap.py | 7 ++ tests/models/clip/test_modeling_clip.py | 7 ++ tests/models/clipseg/test_modeling_clipseg.py | 7 ++ tests/models/clvp/test_modeling_clvp.py | 8 +- tests/models/flava/test_modeling_flava.py | 7 ++ .../models/groupvit/test_modeling_groupvit.py | 7 ++ .../models/idefics2/test_modeling_idefics2.py | 7 +- .../models/idefics3/test_modeling_idefics3.py | 7 +- .../test_modeling_instructblip.py | 9 ++ .../test_modeling_instructblipvideo.py | 7 ++ tests/models/kosmos2/test_modeling_kosmos2.py | 7 +- tests/models/llava/test_modeling_llava.py | 8 +- .../llava_next/test_modeling_llava_next.py | 8 +- .../test_modeling_llava_next_video.py | 8 +- .../test_modeling_llava_onevision.py | 8 +- tests/models/mllama/test_modeling_mllama.py | 7 +- tests/models/owlv2/test_modeling_owlv2.py | 7 ++ tests/models/owlvit/test_modeling_owlvit.py | 7 ++ .../models/qwen2_vl/test_modeling_qwen2_vl.py | 3 + tests/models/siglip/test_modeling_siglip.py | 5 +- .../video_llava/test_modeling_video_llava.py | 8 +- .../models/vipllava/test_modeling_vipllava.py | 8 +- tests/models/x_clip/test_modeling_x_clip.py | 7 ++ tests/test_configuration_common.py | 50 ++++++++++- tests/test_modeling_common.py | 14 ++- 78 files changed, 464 insertions(+), 1052 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 1d892c49a231fc..60f9f34cf861c9 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -190,6 +190,8 @@ class PretrainedConfig(PushToHubMixin): """ model_type: str = "" + base_config_key: str = "" + sub_configs: Dict[str, "PretrainedConfig"] = {} is_composition: bool = False attribute_map: Dict[str, str] = {} _auto_class: Optional[str] = None @@ -543,11 +545,22 @@ def from_pretrained( cls._set_token_in_kwargs(kwargs, token) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + if cls.base_config_key and cls.base_config_key in config_dict: + config_dict = config_dict[cls.base_config_key] + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: 
- logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + # sometimes the config has no `base_config_key` if the config is used in several composite models + # e.g. LlamaConfig. In that case we try to see if there is match in `model_type` before raising a warning + for k, v in config_dict.items(): + if isinstance(v, dict) and v.get("model_type") == cls.model_type: + config_dict = v + + # raise warning only if we still can't see a match in `model_type` + if config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) return cls.from_dict(config_dict, **kwargs) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 6c332a7a6ae8e7..0df59d1db8e05b 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1608,15 +1608,14 @@ def _autoset_attn_implementation( # Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict. # Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)` # If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238 - for key in config: - if isinstance(getattr(config, key), PretrainedConfig): - sub_config = getattr(config, key) - curr_attn_implementation = ( - requested_attn_implementation - if not isinstance(requested_attn_implementation, dict) - else requested_attn_implementation.get(key, None) - ) - sub_config._attn_implementation_internal = curr_attn_implementation + for key in config.sub_configs.keys(): + sub_config = getattr(config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + sub_config._attn_implementation_internal = curr_attn_implementation if use_flash_attention_2: logger.warning_once( diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 99fa81b4a9350d..a22ab1dc40f8d0 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -14,8 +14,7 @@ # limitations under the License. 
"""ALIGN model configuration""" -import os -from typing import TYPE_CHECKING, List, Union +from typing import TYPE_CHECKING, List if TYPE_CHECKING: @@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig): ```""" model_type = "align_text_model" + base_config_key = "text_config" def __init__( self, @@ -133,24 +133,6 @@ def __init__( self.use_cache = use_cache self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AlignVisionConfig(PretrainedConfig): r""" @@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig): ```""" model_type = "align_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -272,24 +255,6 @@ def __init__( self.drop_connect_rate = drop_connect_rate self.num_hidden_layers = sum(num_block_repeats) * 4 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AlignConfig(PretrainedConfig): r""" @@ -340,6 +305,7 @@ class AlignConfig(PretrainedConfig): ```""" model_type = "align" + sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig} def __init__( self, diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 7333fa63a35280..3c8e91bd473533 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""AltCLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig): ```""" model_type = "altclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -233,24 +231,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AltCLIPConfig - if config_dict.get("model_type") == "altclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AltCLIPConfig(PretrainedConfig): r""" @@ -298,6 +278,7 @@ class AltCLIPConfig(PretrainedConfig): ```""" model_type = "altclip" + sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index 6dd08b65e89e6c..a498d1dd19371d 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -14,12 +14,11 @@ # limitations under the License. """BARK model configuration""" -import os -from typing import Dict, Optional, Union +from typing import Dict from ...configuration_utils import PretrainedConfig from ...utils import add_start_docstrings, logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -64,7 +63,6 @@ class BarkSubModelConfig(PretrainedConfig): - model_type = "bark_module" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { @@ -101,38 +99,6 @@ def __init__( super().__init__(**kwargs) - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - local_files_only: bool = False, - token: Optional[Union[str, bool]] = None, - revision: str = "main", - **kwargs, - ) -> "PretrainedConfig": - kwargs["cache_dir"] = cache_dir - kwargs["force_download"] = force_download - kwargs["local_files_only"] = local_files_only - kwargs["revision"] = revision - - cls._set_token_in_kwargs(kwargs, token) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the config dict if we are loading from Bark - if config_dict.get("model_type") == "bark": - config_dict = config_dict[f"{cls.model_type}_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - @add_start_docstrings( BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"), @@ -154,6 +120,7 @@ def from_pretrained( ) class BarkSemanticConfig(BarkSubModelConfig): model_type = "semantic" + base_config_key = "semantic_config" @add_start_docstrings( @@ -176,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig): ) class BarkCoarseConfig(BarkSubModelConfig): model_type = "coarse_acoustics" + base_config_key = "coarse_acoustics_config" @add_start_docstrings( @@ -203,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig): ) class BarkFineConfig(BarkSubModelConfig): model_type = "fine_acoustics" + base_config_key = "fine_acoustics_config" def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs): self.n_codes_total = n_codes_total @@ -265,6 +234,12 @@ class BarkConfig(PretrainedConfig): """ model_type = "bark" + sub_configs = { + "semantic_config": BarkSemanticConfig, + "coarse_acoustics_config": BarkCoarseConfig, + "fine_acoustics_config": BarkFineConfig, + "codec_config": AutoConfig, + } def __init__( self, diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 4772738be10352..18db71eb14890b 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -14,9 +14,6 @@ # limitations under the License. """Blip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig): ```""" model_type = "blip_text_model" + base_config_key = "text_config" def __init__( self, @@ -146,24 +144,6 @@ def __init__( self.use_cache = use_cache self.label_smoothing = label_smoothing - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BlipVisionConfig(PretrainedConfig): r""" @@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig): ```""" model_type = "blip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -245,24 +226,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class BlipConfig(PretrainedConfig): r""" @@ -316,6 +279,7 @@ class BlipConfig(PretrainedConfig): ```""" model_type = "blip" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} def __init__( self, diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 16fa4aec38492b..d690d22338a687 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -14,13 +14,12 @@ # limitations under the License. """BLIP-2 model configuration""" -import os -from typing import Optional, Union +from typing import Optional from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig): ```""" model_type = "blip_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -106,24 +106,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2QFormerConfig(PretrainedConfig): r""" @@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig): ```""" model_type = "blip_2_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +212,6 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size self.use_qformer_text_input = use_qformer_text_input - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2Config(PretrainedConfig): r""" @@ -306,6 +271,7 @@ class Blip2Config(PretrainedConfig): ```""" model_type = "blip-2" + sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig} def __init__( self, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 4985b6ef89fec2..de49283493b63f 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -14,9 +14,6 @@ # limitations under the License. """BridgeTower model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig): ```""" model_type = "bridgetower_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -95,21 +93,6 @@ def __init__( self.share_layernorm = share_layernorm self.remove_last_layer = remove_last_layer - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerTextConfig(PretrainedConfig): r""" @@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig): ```""" model_type = "bridgetower_text_model" + base_config_key = "text_config" def __init__( self, @@ -217,21 +201,6 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerConfig(PretrainedConfig): r""" @@ -288,6 +257,7 @@ class BridgeTowerConfig(PretrainedConfig): ```""" model_type = "bridgetower" + sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} def __init__( self, diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 67de37f2d01b2c..9842127e7bb48f 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig): """ model_type = "chameleon_vqgan" + base_config_key = "vq_config" def __init__( self, @@ -187,6 +188,7 @@ class ChameleonConfig(PretrainedConfig): ```""" model_type = "chameleon" + sub_configs = {"vq_config": ChameleonVQVAEConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 5b37044fab500d..d50d6c842b313c 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""Chinese-CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig): ```""" model_type = "chinese_clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -141,24 +141,6 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPVisionConfig(PretrainedConfig): r""" @@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig): ```""" model_type = "chinese_clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -249,24 +232,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPConfig(PretrainedConfig): r""" @@ -316,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig): ```""" model_type = "chinese_clip" + sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 1425e2a86289cc..b2added7f0e073 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLAP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig): ```""" model_type = "clap_text_model" + base_config_key = "text_config" def __init__( self, @@ -137,24 +135,6 @@ def __init__( self.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClapAudioConfig(PretrainedConfig): r""" @@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig): ```""" model_type = "clap_audio_model" + base_config_key = "audio_config" def __init__( self, @@ -307,24 +288,6 @@ def __init__( self.initializer_factor = initializer_factor self.projection_hidden_act = projection_hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the audio config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["audio_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClapConfig(PretrainedConfig): r""" @@ -377,6 +340,7 @@ class ClapConfig(PretrainedConfig): ```""" model_type = "clap" + sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig} def __init__( self, diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 8e027f5c3f010f..2e1f2deede00c9 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig): ```""" model_type = "clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -130,24 +130,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPVisionConfig(PretrainedConfig): r""" @@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig): ```""" model_type = "clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +222,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPConfig(PretrainedConfig): r""" @@ -305,6 +270,7 @@ class CLIPConfig(PretrainedConfig): ```""" model_type = "clip" + sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 0ac8196fc7f546..5474840f357a34 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLIPSeg model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig): ```""" model_type = "clipseg_text_model" + base_config_key = "text_config" def __init__( self, @@ -117,24 +115,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegVisionConfig(PretrainedConfig): r""" @@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig): ```""" model_type = "clipseg_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -222,24 +203,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegConfig(PretrainedConfig): r""" @@ -306,6 +269,7 @@ class CLIPSegConfig(PretrainedConfig): ```""" model_type = "clipseg" + sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig} def __init__( self, diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index d17a04c861bf3b..8fd0e150801a66 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig): ```""" model_type = "clvp_encoder" + base_config_key = ["text_config", "speech_config"] def __init__( self, @@ -141,7 +142,7 @@ def from_pretrained( # make sure to have the config_type be either "text_config" or "speech_config" # this is to make sure that we can load only text or speech configs from the nested ClvpConfig. 
- if config_type not in ["text_config", "speech_config"]: + if config_type not in cls.base_config_key: raise ValueError( f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}" ) @@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig): ```""" model_type = "clvp_decoder" + base_config_key = "decoder_config" def __init__( self, @@ -314,24 +316,6 @@ def __init__( super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the speech config dict if we are loading from ClvpConfig - if config_dict.get("model_type") == "clvp": - config_dict = config_dict["decoder_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClvpConfig(PretrainedConfig): r""" @@ -386,7 +370,11 @@ class ClvpConfig(PretrainedConfig): ```""" model_type = "clvp" - is_composition = True + sub_configs = { + "text_config": ClvpEncoderConfig, + "speech_config": ClvpEncoderConfig, + "decoder_config": ClvpDecoderConfig, + } def __init__( self, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index dde5232ae5cc9b..302b5e6a55821d 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope. """ + base_config_key = "attn_config" + def __init__( self, attn_pdrop: float = 0.0, @@ -55,29 +57,12 @@ def __init__( self.kv_n_heads = kv_n_heads self.rope_theta = rope_theta - for k in ["model_type"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxFFNConfig(PretrainedConfig): """Configuration class for Dbrx FFN. @@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig): moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. 
""" + base_config_key = "ffn_config" + def __init__( self, ffn_act_fn: dict = None, @@ -122,29 +109,12 @@ def __init__( self.moe_loss_weight = moe_loss_weight self.moe_normalize_expert_weights = moe_normalize_expert_weights - for k in ["model_type"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["ffn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxConfig(PretrainedConfig): r""" @@ -202,6 +172,7 @@ class DbrxConfig(PretrainedConfig): """ model_type = "dbrx" + sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index ab5d49b32fea90..5190ed51ffd350 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto import AutoConfig logger = logging.get_logger(__name__) @@ -70,6 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig): ```""" model_type = "encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): @@ -84,8 +86,6 @@ def __init__(self, **kwargs): decoder_config = kwargs.pop("decoder") decoder_model_type = decoder_config.pop("model_type") - from ..auto.configuration_auto import AutoConfig - self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) self.is_encoder_decoder = True diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index ade5b8b2667537..59a1b029751646 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig): ```""" model_type = "fastspeech2_conformer" + base_config_key = "model_config" attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} def __init__( @@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): ```""" model_type = "hifigan" + base_config_key = "vocoder_config" def __init__( self, @@ -453,7 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): """ model_type = "fastspeech2_conformer_with_hifigan" - 
is_composition = True + sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} def __init__( self, diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index b6349361c0dda8..47cdb488a2eb5d 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -14,8 +14,7 @@ # limitations under the License. """FLAVA model configurations""" -import os -from typing import Any, Dict, Union +from typing import Any, Dict from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig): ```""" model_type = "flava_image_model" + base_config_key = "image_config" def __init__( self, @@ -124,24 +124,6 @@ def __init__( self.mask_token = mask_token self.vocab_size = vocab_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaTextConfig(PretrainedConfig): r""" @@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig): ```""" model_type = "flava_text_model" + base_config_key = "text_config" def __init__( self, @@ -254,24 +237,6 @@ def __init__( self.qkv_bias = qkv_bias self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaMultimodalConfig(PretrainedConfig): r""" @@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig): ```""" model_type = "flava_multimodal_model" + base_config_key = "multimodal_config" def __init__( self, @@ -357,27 +323,10 @@ def __init__( self.qkv_bias = qkv_bias self.use_cls_token = use_cls_token - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the multimodal config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["multimodal_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaImageCodebookConfig(PretrainedConfig): model_type = "flava_image_codebook" + base_config_key = "image_codebook_config" r""" [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It @@ -442,24 +391,6 @@ def __init__( self.freeze = freeze self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image codebook config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_codebook_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaConfig(PretrainedConfig): r""" @@ -532,6 +463,12 @@ class FlavaConfig(PretrainedConfig): """ model_type = "flava" + sub_configs = { + "text_config": FlavaTextConfig, + "image_config": FlavaImageConfig, + "multimodal_config": FlavaMultimodalConfig, + "image_codebook_config": FlavaImageCodebookConfig, + } def __init__( self, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index ecaea17ff946af..1be3e7067bdfcf 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig): ```""" model_type = "git_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -102,24 +101,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GITConfig - if config_dict.get("model_type") == "git": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GitConfig(PretrainedConfig): r""" @@ -186,6 +167,7 @@ class GitConfig(PretrainedConfig): ```""" model_type = "git" + sub_configs = {"vision_config": GitVisionConfig} def __init__( self, diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index e608fbcdbe9c0a..e85e4fc9184371 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -14,9 +14,8 @@ # limitations under the License. """GroupViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig @@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig): ```""" model_type = "groupvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -121,24 +121,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTVisionConfig(PretrainedConfig): r""" @@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig): ```""" model_type = "groupvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -246,24 +229,6 @@ def __init__( self.assign_eps = assign_eps self.assign_mlp_ratio = assign_mlp_ratio - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTConfig(PretrainedConfig): r""" @@ -292,6 +257,7 @@ class GroupViTConfig(PretrainedConfig): """ model_type = "groupvit" + sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 56b6025a8e89dd..e34a5764400196 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): + embed_dim (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`) image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. @@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - image_num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): Number of image channels. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. @@ -64,11 +64,9 @@ class IdeficsVisionConfig(PretrainedConfig): initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization testing). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
""" - model_type = "idefics" + model_type = "idefics_vision" attribute_map = { "hidden_size": "embed_dim", } @@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Args: use_resampler (`bool`, *optional*, defaults to `False`): Whether or not to use the resampler - resampler_n_latents (`int`, *optional*, defaults to ): + resampler_n_latents (`int`, *optional*, defaults to 64): Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). resampler_depth (`int`, *optional*, defaults to 6): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3). @@ -131,7 +129,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Whether or not to use qk layer norms in perceiver """ - model_type = "idefics" + model_type = "idefics_perciever" def __init__( self, @@ -235,7 +233,7 @@ class IdeficsConfig(PretrainedConfig): ```""" model_type = "idefics" - is_composition = False + sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 64743d1cd470e7..408d374c77f7eb 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -13,12 +13,9 @@ # limitations under the License. """Idefics2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics2" + model_type = "idefics2_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +105,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics2Config - if config_dict.get("model_type") == "idefics2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics2PerceiverConfig(PretrainedConfig): r""" @@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig): The dropout ratio for the attention probabilities. 
""" - model_type = "idefics2" + model_type = "idefics2_perceiver" def __init__( self, @@ -220,7 +200,11 @@ class Idefics2Config(PretrainedConfig): ```""" model_type = "idefics2" - is_composition = True + sub_configs = { + "text_config": AutoConfig, + "perceiver_config": Idefics2PerceiverConfig, + "vision_config": Idefics2VisionConfig, + } def __init__( self, diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index 45afe685f5209c..4b10d8d2d03a81 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -13,12 +13,9 @@ # limitations under the License. """Idefics3 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -57,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - intializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation for initializing all weight matrices in the model. + initializer_range (``, *optional*, defaults to 0.02): Example: @@ -76,7 +72,8 @@ class Idefics3VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics3" + model_type = "idefics3_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +104,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics3Config - if config_dict.get("model_type") == "idefics3": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics3Config(PretrainedConfig): r""" @@ -165,7 +144,7 @@ class Idefics3Config(PretrainedConfig): ```""" model_type = "idefics3" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig} def __init__( self, @@ -204,4 +183,4 @@ def __init__( self.text_config = text_config self.scale_factor = scale_factor - super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) + super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings) diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index a274212a945e04..6124dba3a08efe 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -14,13 +14,10 @@ # limitations under the License. 
"""InstructBLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig): ```""" model_type = "instructblip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -108,24 +106,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipQFormerConfig(PretrainedConfig): r""" @@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig): ```""" model_type = "instructblip_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +210,6 @@ def __init__( self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipConfig(PretrainedConfig): r""" @@ -305,6 +268,11 @@ class InstructBlipConfig(PretrainedConfig): ```""" model_type = "instructblip" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipQFormerConfig, + "vision_config": InstructBlipVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index e7c8eeccef98b4..14687a96e54f37 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -19,13 +19,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -113,24 +112,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoQFormerConfig(PretrainedConfig): r""" @@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -234,24 +216,6 @@ def __init__( self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
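For entries typed as `AutoConfig` in the new `sub_configs` mappings (for example `text_config` on `InstructBlipConfig` above), the nested dict carries its own `model_type` and the composite config dispatches on it via `CONFIG_MAPPING`, as it already did. A small usage sketch; the sizes and the `"opt"` backbone choice are arbitrary illustration values:

```python
# Usage sketch; toy sizes chosen only for illustration.
from transformers import InstructBlipConfig

config = InstructBlipConfig(
    text_config={"model_type": "opt", "hidden_size": 64, "num_hidden_layers": 2},
)
print(type(config.text_config).__name__)  # OPTConfig
```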
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoConfig(PretrainedConfig): r""" @@ -310,6 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 63c6c486854c57..b0dc8a215740f1 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -32,7 +32,7 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -103,6 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index e49074f8061b2c..921ec336c0be80 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -14,9 +14,6 @@ # limitations under the License. """KOSMOS-2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -61,7 +58,7 @@ class Kosmos2TextConfig(PretrainedConfig): layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -69,9 +66,16 @@ class Kosmos2TextConfig(PretrainedConfig): Scale embeddings by diving by sqrt(embed_dim). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). + pad_token_id (`int`, *optional*, defaults to 1): + Token id used for padding. + bos_token_id (`int`, *optional*, defaults to 0): + Token id used for beginning of string. + eos_token_id (`int`, *optional*, defaults to 2): + Token id used for end of string. 
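Beyond loading, a declarative `sub_configs` mapping like the one added to `InstructBlipVideoConfig` above gives shared code enough information to rebuild nested configs from plain dicts without per-model branches. An illustrative helper, not the actual `configuration_utils.py` logic:

```python
# Illustrative consumer of a `sub_configs` mapping; the real shared code differs.
from transformers import AutoConfig, InstructBlipVideoConfig


def rebuild_sub_configs(config_cls, config_dict):
    """Turn nested plain dicts back into config objects, as declared by `sub_configs`."""
    rebuilt = dict(config_dict)
    for key, sub_cls in config_cls.sub_configs.items():
        sub_dict = rebuilt.get(key)
        if not isinstance(sub_dict, dict):
            continue
        if sub_cls is AutoConfig:
            # AutoConfig entries dispatch on the nested dict's own model_type.
            rebuilt[key] = AutoConfig.for_model(**sub_dict)
        else:
            rebuilt[key] = sub_cls(**sub_dict)
    return rebuilt


config_dict = {
    "vision_config": {"hidden_size": 64, "num_hidden_layers": 2},
    "qformer_config": {"hidden_size": 64, "num_hidden_layers": 2},
    "text_config": {"model_type": "opt", "hidden_size": 64, "num_hidden_layers": 2},
}
rebuilt = rebuild_sub_configs(InstructBlipVideoConfig, config_dict)
print(type(rebuilt["vision_config"]).__name__)  # InstructBlipVideoVisionConfig
```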
```""" model_type = "kosmos_2_text_model" + base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { "num_attention_heads": "attention_heads", @@ -124,24 +128,6 @@ def __init__( self.scale_embedding = scale_embedding self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2VisionConfig(PretrainedConfig): r""" @@ -171,18 +157,19 @@ class Kosmos2VisionConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). ```""" model_type = "kosmos_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -215,24 +202,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
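User-facing behaviour of the deleted overrides is preserved: a sub-config class can still be pointed at a composite checkpoint and it extracts its own section, now through `base_config_key`. For example (the hub id is the public KOSMOS-2 checkpoint, used here purely as an illustration):

```python
# Illustration: load only the text section from a composite KOSMOS-2 checkpoint.
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig

text_config = Kosmos2TextConfig.from_pretrained("microsoft/kosmos-2-patch14-224")
print(type(text_config).__name__)  # Kosmos2TextConfig
```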
- ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2Config(PretrainedConfig): r""" @@ -267,7 +236,7 @@ class Kosmos2Config(PretrainedConfig): ```""" model_type = "kosmos-2" - is_composition = True + sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig} def __init__( self, diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 3a4cb09855f0ec..05034f5cfcf6f8 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index e8768dde85722b..54616edbf96dce 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,7 +78,7 @@ class LlavaNextConfig(PretrainedConfig): ```""" model_type = "llava_next" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 0e4e39b4b3ab53..2fe889da60336b 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -21,7 +21,7 @@ from ...configuration_utils import PretrainedConfig -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig class LlavaNextVideoConfig(PretrainedConfig): @@ -86,7 +86,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 8018afa7244dd2..002b450c2af232 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -31,7 +31,7 @@ from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -99,7 +99,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index eef86c6c8c019b..46b65b35b1a5cb 100644 --- 
a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -18,7 +18,7 @@ from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -81,7 +81,7 @@ class LlavaOnevisionConfig(PretrainedConfig): ```""" model_type = "llava_onevision" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 539fc61ba4edba..635ca503205f5f 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -13,8 +13,7 @@ # limitations under the License. """Mllama model configuration""" -import os -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation @@ -59,7 +58,7 @@ class MllamaVisionConfig(PretrainedConfig): The size (resolution) of each image *tile*. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. - norm_eps (`float`, *optional*, defaults to 1e-5): + norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. max_num_tiles (`int`, *optional*, defaults to 4): Maximum number of tiles for image splitting. @@ -88,6 +87,7 @@ class MllamaVisionConfig(PretrainedConfig): ```""" model_type = "mllama_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -137,23 +137,6 @@ def __init__( def max_aspect_ratio_id(self) -> int: return len(self.supported_aspect_ratios) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaTextConfig(PretrainedConfig): r""" @@ -178,12 +161,12 @@ class MllamaTextConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): + num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If not specified, will default to `num_attention_heads`. intermediate_size (`int`, *optional*, defaults to 14336): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - rope_theta (`float`, *optional*, defaults to 500000.0): + rope_theta (`float`, *optional*, defaults to `500000.0`): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type @@ -259,6 +242,7 @@ class MllamaTextConfig(PretrainedConfig): ```""" model_type = "mllama_text_model" + base_config_key = "text_config" def __init__( self, @@ -311,23 +295,6 @@ def __init__( **kwargs, ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaConfig(PretrainedConfig): r""" @@ -370,7 +337,7 @@ class MllamaConfig(PretrainedConfig): ```""" model_type = "mllama" - is_composition = True + sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig} def __init__( self, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 654e4e82a491b7..1b31141f020db5 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -235,8 +235,8 @@ class MoshiConfig(PretrainedConfig): ```""" model_type = "moshi" - is_composition = True keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = {"audio_encoder_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index ed822c813ba26e..8ee3f8c0c07428 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -41,22 +41,22 @@ class MptAttentionConfig(PretrainedConfig): Args: attn_type (`str`, *optional*, defaults to `"multihead_attention"`): type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`. - attn_pdrop (`float`, *optional*, defaults to 0.0): + attn_pdrop (`float`, *optional*, defaults to `0.0`): The dropout probability for the attention layers. attn_impl (`str`, *optional*, defaults to `"torch"`): The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`. clip_qkv (`float`, *optional*): If not `None`, clip the queries, keys, and values in the attention layer to this value. - softmax_scale (`float`, *optional*, defaults to `None`): + softmax_scale (`float`, *optional*): If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to `1/sqrt(hidden_size)`. - prefix_lm (`bool`, *optional*, defaults to `False`)): + prefix_lm (`bool`, *optional*, defaults to `False`): Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another bi-directionally. Tokens outside the prefix use causal attention. qk_ln (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization to the queries and keys in the attention layer. 
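Composite configs that name concrete sub-config classes, such as `MllamaConfig` above with `MllamaTextConfig` and `MllamaVisionConfig`, still accept nested dicts at construction time and instantiate the declared classes from them. A short sketch with arbitrary toy sizes:

```python
# Usage sketch; the sizes are small arbitrary values for illustration only.
from transformers import MllamaConfig

config = MllamaConfig(
    vision_config={"hidden_size": 64, "num_hidden_layers": 2},
    text_config={"hidden_size": 64, "num_hidden_layers": 2},
)
print(type(config.vision_config).__name__)  # MllamaVisionConfig
print(type(config.text_config).__name__)    # MllamaTextConfig
```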
- attn_uses_sequence_id (`bool`, *optional*, defaults to `False`)): + attn_uses_sequence_id (`bool`, *optional*, defaults to `False`): Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train` mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored. @@ -66,6 +66,8 @@ class MptAttentionConfig(PretrainedConfig): The maximum value of the alibi bias. """ + base_config_key = "attn_config" + def __init__( self, attn_type="multihead_attention", @@ -97,23 +99,6 @@ def __init__( f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}" ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mpt": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class MptConfig(PretrainedConfig): """ @@ -188,6 +173,7 @@ class MptConfig(PretrainedConfig): """ model_type = "mpt" + sub_configs = {"attn_config": MptAttentionConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 0d282355defa96..00c03072198092 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -76,6 +76,7 @@ class MusicgenDecoderConfig(PretrainedConfig): """ model_type = "musicgen_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -189,6 +190,11 @@ class MusicgenConfig(PretrainedConfig): ```""" model_type = "musicgen" + sub_configs = { + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenDecoderConfig, + } is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 8a77cea0252234..e65ad50021c3ab 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -78,6 +78,7 @@ class MusicgenMelodyDecoderConfig(PretrainedConfig): """ model_type = "musicgen_melody_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -195,6 +196,11 @@ class MusicgenMelodyConfig(PretrainedConfig): ```""" model_type = "musicgen_melody" + sub_configs = { + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenMelodyDecoderConfig, + } is_composition = True def __init__( diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 43019553c5c6dc..f9085eaf9c1546 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py 
+++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -14,8 +14,7 @@ # limitations under the License. """OWLv2 model configuration""" -import os -from typing import TYPE_CHECKING, Dict, Union +from typing import TYPE_CHECKING, Dict if TYPE_CHECKING: @@ -90,6 +89,7 @@ class Owlv2TextConfig(PretrainedConfig): ```""" model_type = "owlv2_text_model" + base_config_key = "text_config" def __init__( self, @@ -123,24 +123,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16 class Owlv2VisionConfig(PretrainedConfig): @@ -197,6 +179,7 @@ class Owlv2VisionConfig(PretrainedConfig): ```""" model_type = "owlv2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -229,24 +212,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2 class Owlv2Config(PretrainedConfig): @@ -276,6 +241,7 @@ class Owlv2Config(PretrainedConfig): """ model_type = "owlv2" + sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig} def __init__( self, @@ -304,20 +270,6 @@ def __init__( self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 877b348f32c121..8be707ce99a1c6 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -14,9 +14,8 @@ # limitations under the License. """OWL-ViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional if TYPE_CHECKING: @@ -92,6 +91,7 @@ class OwlViTTextConfig(PretrainedConfig): ```""" model_type = "owlvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -125,24 +125,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTVisionConfig(PretrainedConfig): r""" @@ -198,6 +180,7 @@ class OwlViTVisionConfig(PretrainedConfig): ```""" model_type = "owlvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -230,24 +213,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTConfig(PretrainedConfig): r""" @@ -276,6 +241,7 @@ class OwlViTConfig(PretrainedConfig): """ model_type = "owlvit" + sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig} def __init__( self, @@ -304,20 +270,6 @@ def __init__( self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
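Only the duplicated `from_pretrained` overrides go away here; the `from_text_vision_configs` convenience constructor kept just below still assembles an `OwlViTConfig` from plain sub-config dicts. A brief usage sketch with toy values:

```python
# Usage sketch for the retained helper; the small sizes are arbitrary example values.
from transformers import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig

text_config = OwlViTTextConfig(hidden_size=64, num_hidden_layers=2).to_dict()
vision_config = OwlViTVisionConfig(hidden_size=64, num_hidden_layers=2).to_dict()

config = OwlViTConfig.from_text_vision_configs(text_config=text_config, vision_config=vision_config)
print(type(config.text_config).__name__)  # OwlViTTextConfig
```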
- ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index 64598436dbbf1f..de60c501292b30 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -17,7 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class PaliGemmaConfig(PretrainedConfig): ```""" model_type = "paligemma" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index deb276f334723c..925aa60a8dc6de 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -157,7 +157,7 @@ class Qwen2AudioConfig(PretrainedConfig): ```""" model_type = "qwen2_audio" - is_composition = False + sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 1349006e768cd4..55042327de4ec3 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -14,9 +14,6 @@ # limitations under the License. """Qwen2VL model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation from ...utils import logging @@ -27,6 +24,7 @@ class Qwen2VLVisionConfig(PretrainedConfig): model_type = "qwen2_vl" + base_config_key = "vision_config" def __init__( self, @@ -55,23 +53,6 @@ def __init__( self.spatial_merge_size = spatial_merge_size self.temporal_patch_size = temporal_patch_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "qwen2_vl": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Qwen2VLConfig(PretrainedConfig): r""" @@ -180,6 +161,7 @@ class Qwen2VLConfig(PretrainedConfig): ```""" model_type = "qwen2_vl" + sub_configs = {"vision_config": Qwen2VLVisionConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 73622373cbab5d..cc8fae93cdb25b 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -14,9 +14,6 @@ # limitations under the License. """Siglip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class SiglipTextConfig(PretrainedConfig): ```""" model_type = "siglip_text_model" + base_config_key = "text_config" def __init__( self, @@ -110,24 +108,6 @@ def __init__( self.hidden_act = hidden_act self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipVisionConfig(PretrainedConfig): r""" @@ -178,6 +158,7 @@ class SiglipVisionConfig(PretrainedConfig): ```""" model_type = "siglip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -206,24 +187,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
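With `sub_configs` declared, composite configs such as the `SiglipConfig` updated just below round-trip their nested sections through the ordinary save/load path. A small usage sketch; the local directory name is a placeholder, not taken from the patch:

```python
# Usage sketch: nested configs survive an ordinary save/load round trip.
# "./tmp-siglip-config" is a placeholder path used only for this example.
from transformers import SiglipConfig

config = SiglipConfig()  # builds default text and vision sub-configs
config.save_pretrained("./tmp-siglip-config")

reloaded = SiglipConfig.from_pretrained("./tmp-siglip-config")
print(reloaded.vision_config.hidden_size == config.vision_config.hidden_size)  # True
print(reloaded.text_config.hidden_size == config.text_config.hidden_size)      # True
```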
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipConfig(PretrainedConfig): r""" @@ -268,6 +231,7 @@ class SiglipConfig(PretrainedConfig): ```""" model_type = "siglip" + sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig} def __init__(self, text_config=None, vision_config=None, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index 32a58ec5589eed..d7e0211610b657 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -71,6 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "speech-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 8738a02585e039..87d96ca24ffdb4 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,7 +78,7 @@ class VideoLlavaConfig(PretrainedConfig): ```""" model_type = "video_llava" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index f88be5adfba028..f26c2b2f50fb6a 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -72,7 +72,7 @@ class VipLlavaConfig(PretrainedConfig): ```""" model_type = "vipllava" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index a4aa663f98526f..59678f2573ff0e 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -78,6 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "vision-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 4cea34ca2313bc..0d79720e1aa8d2 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ 
b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,6 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = "vision-text-dual-encoder" + sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig} is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 827046b6c35380..3d3b92d2c8c02e 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -14,9 +14,6 @@ # limitations under the License. """X-CLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class XCLIPTextConfig(PretrainedConfig): ```""" model_type = "xclip_text_model" + base_config_key = "text_config" def __init__( self, @@ -112,24 +110,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPVisionConfig(PretrainedConfig): r""" @@ -195,6 +175,7 @@ class XCLIPVisionConfig(PretrainedConfig): ```""" model_type = "xclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +220,6 @@ def __init__( self.hidden_act = hidden_act self.drop_path_rate = drop_path_rate - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
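The test-side hunks below switch these composite models to the shared `ConfigTester` and pass a `common_properties` list. Roughly, that list names top-level attributes of the composite config that the common tests exercise; a condensed sketch of that kind of check, assuming the actual assertions in `tests/test_configuration_common.py` are more thorough and differ in detail:

```python
# Condensed sketch of a `common_properties`-style check; not the real ConfigTester code.
from transformers import CLIPConfig

common_properties = ["projection_dim", "logit_scale_init_value"]

config = CLIPConfig(projection_dim=128, logit_scale_init_value=1.0)
for prop in common_properties:
    assert hasattr(config, prop), f"missing common property: {prop}"
assert config.projection_dim == 128          # settable through __init__
assert "projection_dim" in config.to_dict()  # survives serialization
```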
- ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPConfig(PretrainedConfig): r""" @@ -295,6 +258,7 @@ class XCLIPConfig(PretrainedConfig): """ model_type = "xclip" + sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig} def __init__( self, diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index ddeb585a757d5d..3c7e679686f617 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -457,11 +457,20 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AlignModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AlignConfig, + has_text_modality=False, + common_properties=["projection_dim", "temperature_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Start to fail after using torch `cu118`.") def test_multi_gpu_data_parallel_forward(self): super().test_multi_gpu_data_parallel_forward() diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 0175e562eda618..658e2e38d9adb5 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -452,11 +452,20 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = AltCLIPModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AltCLIPConfig, + has_text_modality=False, + common_properties=["projection_dim", "logit_scale_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index d542757cbf879f..7e1dbbe6bb9cb0 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -449,11 +449,18 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipModelTester(self) + common_properties = ["logit_scale_init_value", "image_text_hidden_size", "projection_dim", "label_smoothing"] + self.config_tester = ConfigTester( + self, config_class=BlipConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 1ec9c2e1c07cdd..0943661b96666c 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -482,6 +482,13 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT def setUp(self): self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = 
["image_token_index", "num_query_tokens", "image_text_hidden_size"] + self.config_tester = ConfigTester( + self, config_class=Blip2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 9f8cc62d2e0fc3..60b77d0efa4b7b 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -515,11 +515,18 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ClapModelTester(self) + common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"] + self.config_tester = ConfigTester( + self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index a7c8c8ef8410e8..fa5de84e06205f 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -745,11 +745,18 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = CLIPModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 75ffa7ad23c2ff..b2b047bb502cce 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -472,11 +472,18 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = CLIPSegModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + def test_model_for_image_segmentation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 12e58500063a17..a212b4781d0a78 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -414,7 +414,13 @@ class 
ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = ClvpModelForConditionalGenerationTester(self) - self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.clvp_config_tester = ConfigTester( + self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32 + ) + + def test_config(self): + self.clvp_config_tester.run_common_tests() def tearDown(self): super().tearDown() diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index d8c8f385e9ce11..1c35fd705ccd87 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -931,11 +931,18 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = self.class_for_tester(self) + common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"] + self.config_tester = ConfigTester( + self, config_class=FlavaConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index ce31bc44a611d2..88b55ec56d8233 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -559,11 +559,18 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = GroupViTModelTester(self) + common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="hidden_states are tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 3dcd0bf5fbcdeb..ae8c91f29d4d46 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -185,7 +185,12 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 598f5882470e99..5bfd4c3f3c0e83 100644 --- 
a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -168,7 +168,12 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Idefics3VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index f06caeb03778ee..e77577dad7877b 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -486,6 +486,15 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene def setUp(self): self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 7e0bf4eaf0a20d..3be5f89325cf38 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -510,11 +510,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( def setUp(self): self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties + ) def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 43266a750b8d6c..7ede47a348d55b 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -304,7 +304,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = Kosmos2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=Kosmos2Config, has_text_modality=False, common_properties=["latent_query_num"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite from common to skip `image_to_text_projection.latent_query` def test_initialization(self): diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 
9810ff7c2a56d4..1359e16a3d7b03 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -194,7 +194,13 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM def setUp(self): self.model_tester = LlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 2146c94c18a4b4..7ce57dcba3eb43 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -223,7 +223,13 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes def setUp(self): self.model_tester = LlavaNextVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 89cdce65ece95d..3ebb5752bd8ded 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -240,7 +240,13 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 7a5781fa039b5b..a217eee2c70671 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -226,7 +226,13 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaOnevisionVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaOnevisionConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"] + self.config_tester = ConfigTester( + 
self, config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 91f2169a02f42d..9ed5d678225b18 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -272,7 +272,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester def setUp(self): self.model_tester = MllamaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 48070c7bb86c6b..df763aed48c749 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -447,6 +447,13 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Owlv2ModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=Owlv2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index a08fae0bc6d10e..e0599a50fb98b4 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -442,6 +442,13 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = OwlViTModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=OwlViTConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 6c04ba40df19d6..afd45dc0167a5b 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -232,6 +232,9 @@ def setUp(self): self.model_tester = Qwen2VLVisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) + def test_config(self): + self.config_tester.run_common_tests() + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 2fe06b1511a471..61ac78f102994a 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -667,9 +667,12 @@ class 
SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test test_disk_offload_bin = False _is_composite = True - # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip def setUp(self): self.model_tester = SiglipModelTester(self) + self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model def test_model(self): diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 0044ef02720c2b..4da6dc19addd4a 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -217,7 +217,13 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe def setUp(self): self.model_tester = VideoLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VideoLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index e2f9ae1ccfdea7..25670d782a987e 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -179,7 +179,13 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest def setUp(self): self.model_tester = VipLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layers", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VipLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 8b91019bae18cc..04dd2d9d29687a 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -547,6 +547,13 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = XCLIPModelTester(self) + common_properties = ["projection_dim", "prompt_layers", "prompt_num_attention_heads"] + self.config_tester = ConfigTester( + self, config_class=XCLIPConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 81c6a008b133ca..4dbbdedbbc2eb7 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -17,12 +17,17 @@ import 
json import os import tempfile +from pathlib import Path from transformers import is_torch_available +from transformers.utils import direct_transformers_import from .utils.test_configuration_utils import config_common_kwargs +transformers_module = direct_transformers_import(Path(__file__).parent) + + class ConfigTester: def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): self.parent = parent @@ -35,9 +40,10 @@ def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) common_properties = ( ["hidden_size", "num_attention_heads", "num_hidden_layers"] - if self.common_properties is None + if self.common_properties is None and not self.config_class.sub_configs else self.common_properties ) + common_properties = [] if common_properties is None else common_properties # Add common fields for text models if self.has_text_modality: @@ -110,6 +116,44 @@ def create_and_test_config_from_and_save_pretrained_subfolder(self): self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + def create_and_test_config_from_and_save_pretrained_composite(self): + """ + Tests that composite or nested configs can be loaded and saved correctly. In case the config + has a sub-config, we should be able to call `sub_config.from_pretrained('general_config_file')` + and get the same result as if we loaded the whole config and obtained `config.sub_config` from it. + """ + config = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + config.save_pretrained(tmpdirname) + general_config_loaded = self.config_class.from_pretrained(tmpdirname) + general_config_dict = config.to_dict() + + # Iterate over all sub_configs if there are any and load them with their own classes + sub_configs = self.config_class.sub_configs + for sub_config_key, sub_class in sub_configs.items(): + if sub_class.__name__ == "AutoConfig": + sub_class = sub_class.for_model(**general_config_dict[sub_config_key]).__class__ + sub_config_loaded = sub_class.from_pretrained(tmpdirname) + else: + sub_config_loaded = sub_class.from_pretrained(tmpdirname) + + # Pop `transformers_version`, it never exists when a config is part of a general composite config + # Verify that loading with the subconfig class results in the same dict as if we loaded with the general composite config class + sub_config_loaded_dict = sub_config_loaded.to_dict() + sub_config_loaded_dict.pop("transformers_version", None) + self.parent.assertEqual(sub_config_loaded_dict, general_config_dict[sub_config_key]) + + # Verify that the loaded config type is the same as in the general config + type_from_general_config = type(getattr(general_config_loaded, sub_config_key)) + self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config)) + + # Now save only the sub-config and load it back to make sure the whole load-save-load pipeline works + with tempfile.TemporaryDirectory() as tmpdirname2: + sub_config_loaded.save_pretrained(tmpdirname2) + sub_config_loaded_2 = sub_class.from_pretrained(tmpdirname2) + self.parent.assertEqual(sub_config_loaded.to_dict(), sub_config_loaded_2.to_dict()) + def create_and_test_config_with_num_labels(self): config = self.config_class(**self.inputs_dict, num_labels=5) self.parent.assertEqual(len(config.id2label), 5) @@ -128,6 +172,9 @@ def check_config_can_be_init_without_params(self): self.parent.assertIsNotNone(config) def check_config_arguments_init(self): + if self.config_class.sub_configs: + return # TODO: @raushan
composite models are not consistent in how they set general params + kwargs = copy.deepcopy(config_common_kwargs) config = self.config_class(**kwargs) wrong_values = [] @@ -153,6 +200,7 @@ def run_common_tests(self): self.create_and_test_config_to_json_file() self.create_and_test_config_from_and_save_pretrained() self.create_and_test_config_from_and_save_pretrained_subfolder() + self.create_and_test_config_from_and_save_pretrained_composite() self.create_and_test_config_with_num_labels() self.check_config_can_be_init_without_params() self.check_config_arguments_init() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 13c4d5155be445..c7a11ff0ac8a24 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3802,22 +3802,18 @@ def test_attn_implementation_composite_models(self): self.skipTest("Model is not a composite model.") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - sub_configs = { - key: getattr(config, key) for key in config if isinstance(getattr(config, key), PretrainedConfig) - } # set eager as it will be the one supported in all models # we just need to test if passing 'attn_implementation' as a dict fails or not attn_implementation_per_subconfig = {} - for key, sub_config in sub_configs.items(): + for key in config.sub_configs.keys(): attn_implementation_per_subconfig[key] = "eager" config._attn_implementation = attn_implementation_per_subconfig model = model_class(config) - for key in model.config: - if isinstance(getattr(model.config, key), PretrainedConfig): - sub_config = getattr(model.config, key) - self.assertTrue(sub_config._attn_implementation == "eager") + for key in config.sub_configs.keys(): + sub_config = getattr(model.config, key) + self.assertTrue(sub_config._attn_implementation == "eager") for name, submodule in model.named_modules(): class_name = submodule.__class__.__name__ @@ -3826,7 +3822,7 @@ def test_attn_implementation_composite_models(self): or "SdpaSelfAttention" in class_name or "FlashAttention" in class_name ): - raise ValueError("The eager model should not have SDPA/FA2 attention layers") + raise ValueError(f"The eager model should not have SDPA/FA2 attention layers but got {class_name}") @require_torch_sdpa def test_sdpa_can_dispatch_non_composite_models(self): From 663c8512398f864cb886879bbb64471777421413 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 5 Nov 2024 04:41:48 -0800 Subject: [PATCH 04/12] DistilBERT is ExecuTorch compatible (#34475) * DistillBERT is ExecuTorch compatible * [run_slow] distilbert * [run_slow] distilbert --------- Co-authored-by: Guang Yang --- .../distilbert/test_modeling_distilbert.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 3a74a1557cf9ba..d4c51cea125720 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -30,6 +30,7 @@ import torch from transformers import ( + AutoTokenizer, DistilBertForMaskedLM, DistilBertForMultipleChoice, DistilBertForQuestionAnswering, @@ -38,6 +39,7 @@ DistilBertModel, ) from transformers.models.distilbert.modeling_distilbert import _create_sinusoidal_embeddings + from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 class DistilBertModelTester: @@ -420,3 +422,45 @@ def test_inference_no_head_absolute_embedding(self): ) 
self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_export(self): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + distilbert_model = "distilbert-base-uncased" + device = "cpu" + attn_implementation = "sdpa" + max_length = 64 + + tokenizer = AutoTokenizer.from_pretrained(distilbert_model) + inputs = tokenizer( + f"Paris is the {tokenizer.mask_token} of France.", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = DistilBertForMaskedLM.from_pretrained( + distilbert_model, + device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eager_predicted_mask = tokenizer.decode(logits[0, 4].topk(5).indices) + self.assertEqual( + eager_predicted_mask.split(), + ["capital", "birthplace", "northernmost", "centre", "southernmost"], + ) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + exported_predicted_mask = tokenizer.decode(result.logits[0, 4].topk(5).indices) + self.assertEqual(eager_predicted_mask, exported_predicted_mask) From 45b0c7680c0a2d79bda345e04d8461f27602010d Mon Sep 17 00:00:00 2001 From: Eon Kim Date: Tue, 5 Nov 2024 23:01:25 +0900 Subject: [PATCH 05/12] Remove unused test_dataset (#34516) --- .../contrastive-image-text/run_clip.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 3bed494b75c608..8353333ef827ed 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -141,10 +141,6 @@ class DataTrainingArguments: default=None, metadata={"help": "An optional input evaluation data file (a jsonlines file)."}, ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input testing data file (a jsonlines file)."}, - ) max_seq_length: Optional[int] = field( default=128, metadata={ @@ -190,9 +186,6 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." dataset_name_mapping = { @@ -315,9 +308,6 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] dataset = load_dataset( extension, data_files=data_files, @@ -387,8 +377,6 @@ def _freeze_params(module): column_names = dataset["train"].column_names elif training_args.do_eval: column_names = dataset["validation"].column_names - elif training_args.do_predict: - column_names = dataset["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return @@ -490,29 +478,6 @@ def filter_corrupt_images(examples): # Transform images on the fly as doing it on the whole dataset takes too much time. 
eval_dataset.set_transform(transform_images) - if training_args.do_predict: - if "test" not in dataset: - raise ValueError("--do_predict requires a test dataset") - test_dataset = dataset["test"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(test_dataset), data_args.max_eval_samples) - test_dataset = test_dataset.select(range(max_eval_samples)) - - test_dataset = test_dataset.filter( - filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers - ) - test_dataset = test_dataset.map( - function=tokenize_captions, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[col for col in column_names if col != image_column], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on test dataset", - ) - - # Transform images on the fly as doing it on the whole dataset takes too much time. - test_dataset.set_transform(transform_images) - # 8. Initialize our trainer trainer = Trainer( model=model, From 74d3824cc0725829e7d92e1d43b97be1f18454f8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 5 Nov 2024 15:12:47 +0100 Subject: [PATCH 06/12] Revert "Fix Whisper CI" (#34605) Revert "Fix Whisper CI (#34541)" This reverts commit eb811449a2389e48930c45f84c88fd041735cf92. --- src/transformers/generation/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 53cd2df3a49c84..6e6d5b8bdce71d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1452,11 +1452,10 @@ def _prepare_generated_length( ): generation_config.max_length -= inputs_tensor.shape[1] elif has_default_max_length: # by default let's always generate 20 new tokens - if generation_config.max_length == GenerationConfig().max_length: - generation_config.max_length = generation_config.max_length + input_ids_length - max_position_embeddings = getattr(self.config, "max_position_embeddings", None) - if max_position_embeddings is not None: - generation_config.max_length = min(generation_config.max_length, max_position_embeddings) + generation_config.max_length = generation_config.max_length + input_ids_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: From 082e57e0d42a46a4a5244f8d005eeb88c8da37b6 Mon Sep 17 00:00:00 2001 From: Yoni Gottesman Date: Tue, 5 Nov 2024 17:10:15 +0200 Subject: [PATCH 07/12] Fix #34494 assistant tokens when truncated (#34531) * Fix assistant tokens when truncated * fix test * fix test * step --- src/transformers/tokenization_utils_base.py | 2 +- .../test_tokenization_layoutlmv2.py | 4 + .../test_tokenization_layoutlmv3.py | 4 + .../layoutxlm/test_tokenization_layoutxlm.py | 4 + .../markuplm/test_tokenization_markuplm.py | 4 + tests/models/tapas/test_tokenization_tapas.py | 4 + tests/models/udop/test_tokenization_udop.py | 4 + tests/test_tokenization_common.py | 104 ++++++++++++++++++ 8 files changed, 129 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 43e13abe563115..381f3ef497d9bd 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1722,7 +1722,7 @@ 
def apply_chat_template( if start_token is None: # start_token is out of bounds maybe due to truncation. break - for token_id in range(start_token, end_token + 1 if end_token else len(input_ids)): + for token_id in range(start_token, end_token + 1 if end_token else len(input_ids[i])): current_mask[token_id] = 1 assistant_masks.append(current_mask) out["assistant_masks"] = assistant_masks if is_batched else assistant_masks[0] diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index 9e39cd0279d608..7dcf5399703103 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -2497,3 +2497,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 4a218d3f211146..9af0861536f73d 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -2450,3 +2450,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 9f6d65ffc5f0a1..f387e52790fce3 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -1991,3 +1991,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 60c98776b2a407..eaf30131d34054 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -2330,3 +2330,7 @@ def test_added_tokens_serialization(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 49327a39cd80d3..0a911f7182b4a0 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -1290,3 +1290,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index 90d669064a0fd1..a6ac2ff3d38096 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -1161,6 +1161,10 @@ def test_chat_template(self): def 
test_chat_template_return_assistant_tokens_mask(self): pass + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index dd9eb10de40ecf..a3bbbf3c9e97b2 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1327,6 +1327,110 @@ def test_chat_template_return_assistant_tokens_mask(self): [0] * (assistant_start2 - assistant_end - 1), ) + @require_jinja + def test_chat_template_return_assistant_tokens_mask_truncated(self): + dummy_template = ( + "{% for message in messages %}" + "{% if (message['role'] != 'assistant') %}" + "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" + "{% elif (message['role'] == 'assistant')%}" + "{{'<|im_start|>' + message['role'] + '\n'}}" + "{% generation %}" + "{{message['content'] + '<|im_end|>'}}" + "{% endgeneration %}" + "{{'\n'}}" + "{% endif %}" + "{% endfor %}" + ) + conversations = [ + [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + { + "role": "assistant", + "content": ( + "start turn assistant. long string to be truncated, long string to be truncated, " + "long string to be truncated, long string to be truncated, long string to be truncated" + ), + }, + {"role": "user", "content": "another user message"}, + ], + [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + { + "role": "assistant", + "content": ( + "start turn assistant. long string to be truncated, long string to be truncated, " + "long string to be truncated, long string to be truncated, long string to be truncated" + ), + }, + {"role": "user", "content": "another user message"}, + ], + ] + + for tokenizer, pretrained_name, _ in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + if not self.test_rust_tokenizer: + self.skipTest(reason="No fast tokenizer defined") + + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name) + + # Find where to truncate, as the amount of tokens is different for different tokenizers and I want the + # truncation to happen in the middle of the assistant content. + full_encoding = tokenizer_r.apply_chat_template( + conversations[0], + chat_template=dummy_template, + tokenize=True, + return_dict=True, + ) + chat_string = tokenizer_r.apply_chat_template( + conversations[0], tokenize=False, chat_template=dummy_template + ) + truncation_position = full_encoding.char_to_token(chat_string.index(", long string to be truncated,")) + + # check batched + output = tokenizer_r.apply_chat_template( + conversations, + chat_template=dummy_template, + tokenize=True, + return_assistant_tokens_mask=True, + max_length=truncation_position, + truncation=True, + return_dict=True, + ) + for i, conv in enumerate(conversations): + chat_string = tokenizer_r.apply_chat_template(conv, tokenize=False, chat_template=dummy_template) + assistant_start = output.char_to_token(i, chat_string.index("start turn assistant")) + + # assert 1 from assistant_start to the end because the rest is truncated. 
+ self.assertEqual( + output["assistant_masks"][i][assistant_start:], + [1] * (len(output["assistant_masks"][i]) - assistant_start), + ) + + # check not batched + output = tokenizer_r.apply_chat_template( + conversations[0], + chat_template=dummy_template, + tokenize=True, + return_assistant_tokens_mask=True, + return_dict=True, + max_length=truncation_position, + truncation=True, + ) + + chat_string = tokenizer_r.apply_chat_template( + conversations[0], tokenize=False, chat_template=dummy_template + ) + assistant_start = output.char_to_token(0, chat_string.index("start turn assistant")) + + # assert 1 from assistant_start to the end because the rest is truncated. + self.assertEqual( + output["assistant_masks"][assistant_start:], + [1] * (len(output["assistant_masks"]) - assistant_start), + ) + @require_jinja def test_continue_final_message(self): dummy_template = """ From f2d5dfbab2af25d6fbfb4a315c78bcdfad9f62d7 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:10:42 +0100 Subject: [PATCH 08/12] Remove `@slow` for `test_eager_matches_sdpa_inference` (#34558) * update * update * update * update * update * update * update * update * update * update * update --------- Co-authored-by: ydshieh --- .../modeling_llava_next_video.py | 2 +- .../modular_llava_next_video.py | 2 +- .../models/unispeech/modeling_unispeech.py | 2 +- tests/generation/test_utils.py | 20 +- tests/models/albert/test_modeling_albert.py | 9 +- tests/models/glm/test_modeling_glm.py | 302 ------------------ tests/models/granite/test_modeling_granite.py | 10 - .../granitemoe/test_modeling_granitemoe.py | 10 - tests/models/idefics/test_modeling_idefics.py | 15 +- .../llava_next/test_modeling_llava_next.py | 8 +- .../test_modeling_llava_next_video.py | 10 +- tests/models/mimi/test_modeling_mimi.py | 84 ++--- tests/models/mllama/test_modeling_mllama.py | 9 - .../models/musicgen/test_modeling_musicgen.py | 188 ++++++----- .../test_modeling_musicgen_melody.py | 162 ++++------ .../models/nemotron/test_modeling_nemotron.py | 10 - .../paligemma/test_modeling_paligemma.py | 10 - .../models/qwen2_vl/test_modeling_qwen2_vl.py | 10 +- .../video_llava/test_modeling_video_llava.py | 13 +- .../models/videomae/test_modeling_videomae.py | 7 +- tests/test_modeling_common.py | 36 ++- 21 files changed, 282 insertions(+), 637 deletions(-) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 73118f4bfcd300..b0a20d6c5ccd93 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -95,7 +95,7 @@ def __init__(self, config): mode = config.spatial_pool_mode stride = config.spatial_pool_stride out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size) - self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2 + self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2 if mode == "average": self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride) diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 002b450c2af232..3d6431d7ea29ba 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ 
-191,7 +191,7 @@ def __init__(self, config): mode = config.spatial_pool_mode stride = config.spatial_pool_stride out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size) - self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2 + self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2 if mode == "average": self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 52ba08f5d4eda5..6ce5e77706d358 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1577,7 +1577,7 @@ def forward( quantized_features, codevector_perplexity = self.quantizer(extract_features) # project quantized features twice - quantized_features = self.project_q(quantized_features) + quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype)) quantized_features = self.project_hid(quantized_features) prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_( diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 3bd8ce4b59c9b1..cbe851e97e9aed 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -185,16 +185,16 @@ def _get_logits_processor_kwargs(self, do_sample=False, config=None): # This is a band-aid for VLM models, to ensure they don't generate image/video tokens which would cause them # to crash. On pretrained models this isn't a risk, as they are trained to not generate these tokens. if config is not None: - image_token_index = ( - config.image_token_index - if getattr(config, "image_token_index", None) is not None - else getattr(config, "image_token_id", None) - ) - video_token_index = config.video_token_index if hasattr(config, "video_token_index") else None - if image_token_index is not None and image_token_index < config.get_text_config().vocab_size: - logits_processor_kwargs["bad_words_ids"].append([image_token_index]) - if video_token_index is not None and video_token_index < config.get_text_config().vocab_size: - logits_processor_kwargs["bad_words_ids"].append([video_token_index]) + for key in [ + "image_token_index", + "image_token_id", + "video_token_index", + "video_token_id", + "vision_start_token_id", + ]: + token_index = getattr(config, key, None) + if token_index is not None and token_index < config.get_text_config().vocab_size: + logits_processor_kwargs["bad_words_ids"].append([token_index]) return logits_processor_kwargs diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 970f1dd8555e47..0a123c02ab778b 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -17,10 +17,11 @@ import unittest from packaging import version +from parameterized import parameterized from transformers import AlbertConfig, AutoTokenizer, is_torch_available from transformers.models.auto import get_values -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -288,6 +289,12 @@ def setUp(self): self.model_tester = 
AlbertModelTester(self) self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @unittest.skip("Albert requires `head_mask` which is currently not done in this test.") + def test_eager_matches_sdpa_inference(self): + pass + def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index b92c5db815b77a..ebac3b9167ce26 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -14,13 +14,9 @@ # limitations under the License. """Testing suite for the PyTorch Glm model.""" -import inspect -import tempfile import unittest -import numpy as np import pytest -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer, GlmConfig, is_torch_available from transformers.testing_utils import ( @@ -32,7 +28,6 @@ slow, torch_device, ) -from transformers.utils import is_torch_bf16_available_on_device, is_torch_fp16_available_on_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -421,303 +416,6 @@ def test_custom_4d_attention_mask(self): torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - """Overwrite to add flakyness: some cases can sometimes fail""" - if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): - self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") - - if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): - self.skipTest( - f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" - ) - - # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. 
- if torch_dtype == "float16": - torch_dtype = torch.float16 - elif torch_dtype == "bfloat16": - torch_dtype = torch.bfloat16 - elif torch_dtype == "float32": - torch_dtype = torch.float32 - - atols = { - ("cpu", False, torch.float32): 1e-6, - ("cpu", False, torch.bfloat16): 1e-2, - ("cpu", True, torch.float32): 1e-6, - ("cpu", True, torch.bfloat16): 1e-2, - ("cuda", False, torch.float32): 1e-6, - ("cuda", False, torch.bfloat16): 1e-2, - ("cuda", False, torch.float16): 5e-3, - ("cuda", True, torch.float32): 1e-6, - ("cuda", True, torch.bfloat16): 1e-2, - ("cuda", True, torch.float16): 5e-3, - } - rtols = { - ("cpu", False, torch.float32): 1e-4, - ("cpu", False, torch.bfloat16): 1e-2, - ("cpu", True, torch.float32): 1e-4, - ("cpu", True, torch.bfloat16): 1e-2, - ("cuda", False, torch.float32): 1e-4, - ("cuda", False, torch.bfloat16): 1e-2, - ("cuda", False, torch.float16): 5e-3, - ("cuda", True, torch.float32): 1e-4, - ("cuda", True, torch.bfloat16): 3e-2, - ("cuda", True, torch.float16): 5e-3, - } - - def get_mean_reldiff(failcase, x, ref, atol, rtol): - return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors. - # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask. - # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code. - # However masking there is not done at any layers that matters (i.e self-attention), therefore we can safely deactivate it. 
- deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters - - is_encoder_decoder = model.config.is_encoder_decoder - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) - model_sdpa = model_sdpa.eval().to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch_dtype, - attn_implementation="eager", - ) - model_eager = model_eager.eval().to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, - # but it would be nicer to have an efficient way to use parameterized.expand - fail_cases = [] - for padding_side in ["left", "right"]: - for use_mask in [False, True]: - for output_attentions in [True, False]: - can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters - if not (self.has_attentions and can_output_attn) and output_attentions: - continue - for batch_size in [1, 5]: - dummy_input = inputs_dict[model.main_input_name] - - if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: - dummy_input = dummy_input.to(torch_dtype) - - dummy_input = dummy_input[:batch_size] - if dummy_input.shape[0] != batch_size: - if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: - extension = torch.rand( - batch_size - dummy_input.shape[0], - *dummy_input.shape[1:], - dtype=torch_dtype, - device=torch_device, - ) - dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) - else: - extension = torch.randint( - high=5, - size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]), - dtype=dummy_input.dtype, - device=torch_device, - ) - dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) - - if not use_mask: - dummy_attention_mask = None - else: - dummy_attention_mask = inputs_dict.get("attention_mask", None) - if dummy_attention_mask is None: - if is_encoder_decoder: - seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] - else: - seqlen = dummy_input.shape[-1] - dummy_attention_mask = ( - torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) - ) - - dummy_attention_mask = dummy_attention_mask[:batch_size] - if dummy_attention_mask.shape[0] != batch_size: - extension = torch.ones( - batch_size - dummy_attention_mask.shape[0], - *dummy_attention_mask.shape[1:], - dtype=dummy_attention_mask.dtype, - device=torch_device, - ) - dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) - dummy_attention_mask = dummy_attention_mask.to(torch_device) - - dummy_attention_mask[:] = 1 - if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] 
= 0 - elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 - - for enable_kernels in [False, True]: - failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" - if is_encoder_decoder: - decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[ - :batch_size - ] - if decoder_input_ids.shape[0] != batch_size: - extension = torch.ones( - batch_size - decoder_input_ids.shape[0], - *decoder_input_ids.shape[1:], - dtype=decoder_input_ids.dtype, - device=torch_device, - ) - decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0) - decoder_input_ids = decoder_input_ids.to(torch_device) - - # TODO: never an `attention_mask` arg here? - processed_inputs = { - model.main_input_name: dummy_input, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": dummy_attention_mask, - "output_hidden_states": True, - } - else: - processed_inputs = { - model.main_input_name: dummy_input, - "output_hidden_states": True, - } - - # Otherwise fails for e.g. WhisperEncoderModel - if "attention_mask" in inspect.signature(model_eager.forward).parameters: - processed_inputs["attention_mask"] = dummy_attention_mask - - if ( - self.has_attentions - and "output_attentions" in inspect.signature(model_sdpa.forward).parameters - ): - processed_inputs["output_attentions"] = output_attentions - if not deactivate_mask and ( - "bool_masked_pos" in inspect.signature(model_eager.forward).parameters - ): - dummy_mask = torch.ones((self.model_tester.num_masks,)) - - # In case of additional token (like class) we define a custom `mask_length` - if hasattr(self.model_tester, "mask_length"): - mask_length = self.model_tester.mask_length - dummy_mask.size(0) - else: - mask_length = self.model_tester.seq_length - dummy_mask.size(0) - dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)]) - dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool() - processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device) - - if "noise" in inspect.signature(model_eager.forward).parameters: - np.random.seed(2) - num_patches = int( - (self.model_tester.image_size // self.model_tester.patch_size) ** 2 - ) - noise = np.random.uniform(size=(batch_size, num_patches)) - processed_inputs["noise"] = torch.from_numpy(noise) - - # TODO: test gradients as well (& for FA2 as well!) - with torch.no_grad(): - with torch.backends.cuda.sdp_kernel( - enable_flash=enable_kernels, - enable_math=True, - enable_mem_efficient=enable_kernels, - ): - prepared_inputs = self._prepare_for_class(processed_inputs, model_class) - outputs_eager = model_eager(**prepared_inputs) - outputs_sdpa = model_sdpa(**prepared_inputs) - - logits_eager = ( - outputs_eager.hidden_states[-1] - if not is_encoder_decoder - else outputs_eager.decoder_hidden_states[-1] - ) - logits_sdpa = ( - outputs_sdpa.hidden_states[-1] - if not is_encoder_decoder - else outputs_sdpa.decoder_hidden_states[-1] - ) - - if torch_device in ["cpu", "cuda"]: - atol = atols[torch_device, enable_kernels, torch_dtype] - rtol = rtols[torch_device, enable_kernels, torch_dtype] - else: - atol = 1e-7 - rtol = 1e-4 - - # Masked tokens output slightly deviates - we don't mind that. 
- if use_mask: - if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) - elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) - - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) - - self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @slow @require_torch_accelerator diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 97b59f5aa50621..60eb964927278a 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -25,7 +25,6 @@ require_read_token, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -445,15 +444,6 @@ def test_use_flash_attention_2_true(self): if not has_flash: raise ValueError("The flash model should have flash attention layers") - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - """ - skipping the test since mup is very flaky and gets consistently different outputs - """ - self.skipTest("skipping the test since mup is very flaky and gets consistently different outputs") - @require_torch_gpu class GraniteIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index f2f76b9fa75bf3..97af65667ed048 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -25,7 +25,6 @@ require_read_token, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -444,15 +443,6 @@ def test_use_flash_attention_2_true(self): if not has_flash: raise ValueError("The flash model should have flash attention layers") - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - """ - skipping the test since mup is very flaky and gets 
consistently different outputs - """ - self.skipTest("skipping the test since mup is very flaky and gets consistently different outputs") - @require_torch_gpu class GraniteMoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 7be87fd78390ab..12004cc3c8ad89 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -134,7 +134,7 @@ def __init__( num_attention_heads=self.vision_num_attention_heads, num_hidden_layers=self.vision_num_hidden_layers, intermediate_size=self.vision_intermediate_size, - ) + ).to_dict() self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver self.perceiver_resampler_depth = perceiver_resampler_depth @@ -316,7 +316,6 @@ def prepare_pixel_values(self): return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @require_torch_sdpa - @slow @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) def test_eager_matches_sdpa_inference(self, torch_dtype: str): self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") @@ -353,6 +352,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") + def test_eager_matches_sdpa_inference(self): + pass + def test_model_outputs_equivalence(self): try: orig = self.all_model_classes @@ -602,6 +607,12 @@ def setUp(self): ) self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") + def test_eager_matches_sdpa_inference(self, torch_dtype): + pass + @pytest.mark.generate def test_left_padding_compatibility(self): """Overwrite because IDEFICS needs image attention mask to be also padded""" diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 7ce57dcba3eb43..82508f57e0f1f5 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -90,7 +90,7 @@ def __init__( }, is_training=True, vision_config={ - "image_size": 16, + "image_size": 8, "patch_size": 4, "num_channels": 3, "is_training": True, @@ -123,10 +123,10 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - self.encoder_seq_length = 95 - self.image_grid_pinpoints = [[32, 32]] - self.num_image_tokens = 88 + self.image_grid_pinpoints = [[16, 16]] + self.num_image_tokens = 24 self.seq_length = seq_length + self.num_image_tokens + self.encoder_seq_length = self.seq_length def get_config(self): return LlavaNextConfig( diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 3ebb5752bd8ded..83caabe16bb68b 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -91,7 +91,7 @@ def __init__( }, is_training=True, vision_config={ - "image_size": 16, + "image_size": 8, "patch_size": 4, "num_channels": 3, "is_training": True, @@ -125,10 +125,10 @@ def __init__( 
self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - self.encoder_seq_length = 127 - self.image_grid_pinpoints = [[32, 32]] - self.num_image_tokens = 88 - self.num_video_tokens = 32 + + self.image_grid_pinpoints = [[16, 16]] + self.num_image_tokens = 24 + self.num_video_tokens = 8 self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens def get_config(self): diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index df0007d666a077..7ddc6b747447a3 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -409,10 +409,14 @@ def test_identity_shortcut(self): config.use_conv_shortcut = False self.model_tester.create_and_check_model_forward(config, inputs_dict) + # Overwrite to use `audio_values` as the tensors to compare. + # TODO: Try to do this in the parent class. @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if torch_dtype == "float16" and torch_device == "cpu": + self.skipTest("`replication_pad1d` not implemented for 'Half") + if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -513,7 +517,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters if not (self.has_attentions and can_output_attn) and output_attentions: continue - for batch_size in [1, 5]: + for batch_size in [7]: dummy_input = inputs_dict[model.main_input_name] if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: @@ -564,11 +568,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -655,52 +659,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) - elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + elif padding_side == "right": + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] + + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 9ed5d678225b18..8da927f815db81 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -30,12 +30,10 @@ from transformers.models.mllama.configuration_mllama import MllamaTextConfig from transformers.testing_utils import ( cleanup, - is_flaky, require_bitsandbytes, require_read_token, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -359,13 +357,6 @@ def _check_attentions_for_generate( self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference_1_bfloat16(self): - # A workaround to override parametrized test with flaky decorator - super().test_eager_matches_sdpa_inference_1_bfloat16() - @unittest.skip("For some unknown 
reasons the tests fails in CrossAttention layer when doing torch.sdpa(). ") def test_sdpa_can_compile_dynamic(self): pass diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 963cace28d6e41..37b5af3ae7e312 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -452,7 +452,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: @@ -479,8 +478,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -491,8 +492,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -528,7 +531,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: # Ignore copy batch_size_input_ids = self.model_tester.num_codebooks * batch_size dummy_input = inputs_dict[model.main_input_name] @@ -585,11 +588,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -632,52 +635,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) @@ -1496,8 +1479,6 @@ def test_sdpa_can_dispatch_composite_models(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -1523,8 +1504,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -1535,8 +1518,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, 
torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -1549,8 +1534,26 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): def get_mean_reldiff(failcase, x, ref, atol, rtol): return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + if hasattr(self.model_tester, "num_hidden_layers"): + self.model_tester.num_hidden_layers = 1 + for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.rms_norm_eps = 1.0 + config.layer_norm_eps = 1.0 + config.norm_eps = 1.0 + config.norm_epsilon = 1.0 + config.layer_norm_epsilon = 1.0 + + for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]: + if hasattr(config, attr): + getattr(config, attr).rms_norm_eps = 1.0 + getattr(config, attr).layer_norm_eps = 1.0 + getattr(config, attr).norm_eps = 1.0 + getattr(config, attr).norm_epsilon = 1.0 + getattr(config, attr).layer_norm_epsilon = 1.0 + model = model_class(config) is_encoder_decoder = model.config.is_encoder_decoder @@ -1567,12 +1570,19 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) + for x in model_eager.modules(): + if isinstance(x, (torch.nn.LayerNorm, torch.nn.GroupNorm)): + x.eps = 1.0 + for x in model_sdpa.modules(): + if isinstance(x, (torch.nn.LayerNorm, torch.nn.GroupNorm)): + x.eps = 1.0 + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: dummy_input = inputs_dict[model.main_input_name] if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: @@ -1622,11 +1632,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -1687,52 +1697,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 957db9f23b0f21..de7a2745ca073f 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -460,7 +460,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: @@ -487,8 +486,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, 
torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -499,8 +500,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -536,7 +539,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: # Ignore copy batch_size_input_ids = self.model_tester.num_codebooks * batch_size dummy_input = inputs_dict[model.main_input_name] @@ -593,11 +596,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -640,52 +643,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not 
torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) @@ -1486,7 +1469,6 @@ def test_sdpa_can_dispatch_composite_models(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.all_model_classes[0]._supports_sdpa: @@ -1510,8 +1492,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -1522,8 +1506,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -1559,7 +1545,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: dummy_input = inputs_dict[model.main_input_name] if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: @@ -1609,11 +1595,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -1674,52 +1660,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 37a581a33866ce..fd62c74d3d6e11 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -19,7 +19,6 @@ import unittest import pytest -from parameterized import parameterized from transformers import NemotronConfig, is_torch_available from transformers.testing_utils import ( @@ -99,15 +98,6 @@ def setUp(self): self.model_tester = NemotronModelTester(self) self.config_tester = ConfigTester(self, config_class=NemotronConfig, hidden_size=37) - @require_torch_sdpa - @slow - @unittest.skip( - reason="Due to custom causal mask, there is a slightly too big difference between eager and sdpa in bfloat16." 
- ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - pass - @unittest.skip("Eager and SDPA do not produce the same outputs, thus this test fails") def test_model_outputs_equivalence(self, **kwargs): pass diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 074e0083fd0202..ce44436a20ad2c 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -17,7 +17,6 @@ import unittest import requests -from parameterized import parameterized from transformers import ( PaliGemmaConfig, @@ -30,7 +29,6 @@ cleanup, require_read_token, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -301,14 +299,6 @@ def test_disk_offload_safetensors(self): def test_model_parallelism(self): pass - @require_torch_sdpa - @slow - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest( - "Due to custom causal mask, there is a slightly too big difference between eager and sdpa in bfloat16." - ) - @unittest.skip( reason="PaliGemmma's SigLip encoder uses the same initialization scheme as the Flax original implementation" ) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index afd45dc0167a5b..c3902c9e75bc66 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -66,12 +66,12 @@ def __init__( bos_token_id=0, eos_token_id=1, pad_token_id=2, - vision_start_token_id=151652, - image_token_id=151655, - video_token_id=151656, + vision_start_token_id=3, + image_token_id=4, + video_token_id=5, hidden_act="silu", hidden_size=32, - vocab_size=152064, + vocab_size=99, intermediate_size=37, max_position_embeddings=512, max_window_layers=3, @@ -166,6 +166,8 @@ def prepare_config_and_inputs_for_common(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + input_ids[:, -1] = self.pad_token_id + input_ids[input_ids == self.video_token_id] = self.pad_token_id input_ids[input_ids == self.image_token_id] = self.pad_token_id input_ids[:, self.num_image_tokens] = self.image_token_id labels = torch.zeros( diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 4da6dc19addd4a..090907b164e80d 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -57,8 +57,8 @@ def __init__( image_token_index=0, video_token_index=1, projector_hidden_act="gelu", - seq_length=13, - num_frames=8, + seq_length=3, + num_frames=2, vision_feature_select_strategy="default", vision_feature_layer=-1, text_config={ @@ -88,7 +88,7 @@ def __init__( vision_config={ "model_type": "clip_vision_model", "batch_size": 12, - "image_size": 30, + "image_size": 8, "patch_size": 6, "num_channels": 3, "is_training": True, @@ -123,10 +123,11 @@ def __init__( self.batch_size = 5 self.num_channels = 3 self.image_size = 224 - self.encoder_seq_length = 246 - self.num_image_tokens = 25 - self.num_video_tokens = 26 * self.num_frames + + self.num_image_tokens = (vision_config["image_size"] // vision_config["patch_size"]) ** 2 + self.num_video_tokens = (self.num_image_tokens + 1) * self.num_frames 
self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens + self.encoder_seq_length = self.seq_length def get_config(self): return VideoLlavaConfig( diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 801990331fea53..212eae1471222f 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -22,7 +22,7 @@ from transformers import VideoMAEConfig from transformers.models.auto import get_values -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_sdpa, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -213,6 +213,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict + @unittest.skip("`mse_cpu` not implemented for 'BFloat16'") + @require_torch_sdpa + def test_eager_matches_sdpa_inference_1_bfloat16(self): + pass + def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index c7a11ff0ac8a24..94b5e175bf88a2 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3928,7 +3928,6 @@ def test_sdpa_can_dispatch_composite_models(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -3954,8 +3953,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -3966,8 +3967,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -3983,12 +3986,31 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): if hasattr(self.model_tester, "num_hidden_layers"): self.model_tester.num_hidden_layers = 1 if hasattr(self.model_tester, "vision_config") and "num_hidden_layers" in self.model_tester.vision_config: + self.model_tester.vision_config = copy.deepcopy(self.model_tester.vision_config) self.model_tester.vision_config["num_hidden_layers"] = 1 if hasattr(self.model_tester, "text_config") and "num_hidden_layers" in self.model_tester.text_config: + self.model_tester.text_config = copy.deepcopy(self.model_tester.text_config) self.model_tester.text_config["num_hidden_layers"] = 1 for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.rms_norm_eps = 1.0 + config.layer_norm_eps = 1.0 + config.norm_eps = 1.0 + config.norm_epsilon = 1.0 + config.layer_norm_epsilon = 1.0 + + # norm layers (layer/group norm, etc.) 
could cause flaky tests when the tensors have very small variance. + # (We don't need the original epsilon values to check eager/sdpa matches) + for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]: + if hasattr(config, attr): + getattr(config, attr).rms_norm_eps = 1.0 + getattr(config, attr).layer_norm_eps = 1.0 + getattr(config, attr).norm_eps = 1.0 + getattr(config, attr).norm_epsilon = 1.0 + getattr(config, attr).layer_norm_epsilon = 1.0 + model = model_class(config) # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors. # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask. @@ -4000,14 +4022,22 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) - model_sdpa = model_sdpa.eval().to(torch_device) + model_sdpa = model_sdpa.eval().to(torch_device, dtype=torch_dtype) model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, attn_implementation="eager", ) - model_eager = model_eager.eval().to(torch_device) + model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype) + + # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.) + for x in model_eager.modules(): + if isinstance(x, (nn.LayerNorm, nn.GroupNorm)): + x.eps = 1.0 + for x in model_sdpa.modules(): + if isinstance(x, (nn.LayerNorm, nn.GroupNorm)): + x.eps = 1.0 # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, # but it would be nicer to have an efficient way to use parameterized.expand From d2bae7ee9d2bc8791e05763e56f46754ed3e71ae Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:11:02 +0100 Subject: [PATCH 09/12] Changing __repr__ in torchao to show quantized Linear (#34202) * Changing __repr__ in torchao * small update * make style * small update * add LinearActivationQuantizedTensor * remove some cases * update imports & handle return None * update --- .../quantizers/quantizer_torchao.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py index f6bf431aa028d4..9a03eb25f4de0d 100644 --- a/src/transformers/quantizers/quantizer_torchao.py +++ b/src/transformers/quantizers/quantizer_torchao.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import importlib +import types from typing import TYPE_CHECKING, Union from packaging import version @@ -30,9 +31,7 @@ if is_torch_available(): import torch - -if is_torchao_available(): - from torchao.quantization import quantize_ + import torch.nn as nn logger = logging.get_logger(__name__) @@ -46,6 +45,25 @@ def find_parent(model, name): return parent +def _quantization_type(weight): + from torchao.dtypes import AffineQuantizedTensor + from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor + + if isinstance(weight, AffineQuantizedTensor): + return f"{weight.__class__.__name__}({weight._quantization_type()})" + + if isinstance(weight, LinearActivationQuantizedTensor): + return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})" + + +def _linear_extra_repr(self): + weight = _quantization_type(self.weight) + if weight is None: + return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None" + else: + return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}" + + class TorchAoHfQuantizer(HfQuantizer): """ Quantizer for torchao: https://github.com/pytorch/ao/ @@ -152,9 +170,17 @@ def create_quantized_param( Each nn.Linear layer that needs to be quantized is processsed here. First, we set the value the weight tensor, then we move it to the target device. Finally, we quantize the module. """ + from torchao.quantization import quantize_ + module, tensor_name = get_module_from_name(model, param_name) - module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) - quantize_(module, self.quantization_config.get_apply_tensor_subclass()) + + if self.pre_quantized: + module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device)) + if isinstance(module, nn.Linear): + module.extra_repr = types.MethodType(_linear_extra_repr, module) + else: + module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) + quantize_(module, self.quantization_config.get_apply_tensor_subclass()) def _process_model_after_weight_loading(self, model): """No process required for torchao quantized model""" From 9f28d0c5d00a730947e6057fa34ffeb311347534 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:02:14 -0500 Subject: [PATCH 10/12] Fix torchvision interpolation CI (#34539) fix-torch-interpolation-ci --- .../models/detr/image_processing_detr_fast.py | 10 ++++++---- .../models/rt_detr/image_processing_rt_detr_fast.py | 11 ++++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index eadde59e55e475..b414b4224e683c 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -347,7 +347,7 @@ def __init__( format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, do_resize: bool = True, size: Dict[str, int] = None, - resample: [Union[PILImageResampling, F.InterpolationMode]] = PILImageResampling.BILINEAR, + resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -462,7 +462,7 @@ def resize( self, image: torch.Tensor, size: 
SizeDict, - interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + interpolation: "F.InterpolationMode" = None, **kwargs, ) -> torch.Tensor: """ @@ -485,6 +485,7 @@ def resize( interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): Resampling filter to use if resizing the image. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. @@ -517,7 +518,7 @@ def resize_annotation( orig_size: Tuple[int, int], target_size: Tuple[int, int], threshold: float = 0.5, - interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + interpolation: "F.InterpolationMode" = None, ): """ Resizes an annotation to a target size. @@ -534,6 +535,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): The resampling filter to use when resizing the masks. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} @@ -680,7 +682,7 @@ def preprocess( masks_path: Optional[Union[str, pathlib.Path]] = None, do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, - resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None, + resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[Union[int, float]] = None, do_normalize: Optional[bool] = None, diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py index 9f63b5b7ced467..0470352d38f456 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -43,7 +43,6 @@ get_image_type, infer_channel_dimension_format, make_list_of_images, - pil_torch_interpolation_mapping, validate_annotations, ) from ...utils import ( @@ -197,7 +196,7 @@ def __init__( format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, do_resize: bool = True, size: Dict[str, int] = None, - resample: Union[PILImageResampling, F.InterpolationMode] = PILImageResampling.BILINEAR, + resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = False, @@ -256,7 +255,7 @@ def resize( self, image: torch.Tensor, size: SizeDict, - interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + interpolation: "F.InterpolationMode" = None, **kwargs, ) -> torch.Tensor: """ @@ -279,6 +278,7 @@ def resize( interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): Resampling filter to use if resizing the image. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. 
@@ -312,7 +312,7 @@ def resize_annotation( orig_size: Tuple[int, int], target_size: Tuple[int, int], threshold: float = 0.5, - interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + interpolation: "F.InterpolationMode" = None, ): """ Resizes an annotation to a target size. @@ -329,6 +329,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): The resampling filter to use when resizing the masks. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} @@ -480,7 +481,7 @@ def preprocess( masks_path: Optional[Union[str, pathlib.Path]] = None, do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, - resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None, + resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[Union[int, float]] = None, do_normalize: Optional[bool] = None, From e83aaaa86be6de424eade37d4d48fddcbefb007e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 6 Nov 2024 01:26:20 +0800 Subject: [PATCH 11/12] Fix `use_parallel_residual` and `qkv_bias` for StableLM GGUF config extraction (#34450) * fix stablelm qkv_bias * fix stablelm qkv_bias and use_parallel_residual * remove original_model.config for stablelm gguf test --- src/transformers/modeling_gguf_pytorch_utils.py | 11 +++++++++++ tests/quantization/ggml/test_ggml.py | 8 -------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index c784ca0eb4ca2c..f58bf330ce7db3 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -106,6 +106,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): if "qwen2moe" in architecture: updated_architecture = "qwen2_moe" + # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors + # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors + # If `use_parallel_residual=False`, ffn_norm will be present in the tensors + if "stablelm" in architecture: + attn_bias_name = {"attn_q.bias", "attn_k.bias", "attn_v.bias"} + ffn_norm_name = "ffn_norm" + qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name) + use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors) + parsed_parameters["config"]["qkv_bias"] = qkv_bias + parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual + model_size = "" # extract the number of params from file name as architectures can differ ; # eg. 
for falcon : `...falcon-7b-...` diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index da1af9bff8df90..84278e7032537b 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -673,10 +673,6 @@ def test_stablelm_fp16(self): self.stablelm2_model_id, gguf_file=self.fp16_stablelm2_model_id, torch_dtype=torch.float16, - # for precise comparison it is required to use the original model config - # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias - # and it highly influences on the output results - config=original_model.config, ) tokenizer = AutoTokenizer.from_pretrained(self.stablelm2_model_id, gguf_file=self.fp16_stablelm2_model_id) @@ -703,10 +699,6 @@ def test_stablelm_weights_conversion_fp16(self): gguf_file=self.fp16_stablelm2_model_id, device_map="auto", torch_dtype=torch.float16, - # for precise comparison it is required to use the original model config - # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias - # and it highly influences on the output results - config=original_model.config, ) converted_state_dict = converted_model.state_dict()
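The block added to `load_gguf_checkpoint` above works around GGUF metadata that does not record `qkv_bias` or `use_parallel_residual` for StableLM: it infers both flags from which tensors exist in the checkpoint, since q/k/v bias tensors are only exported when `qkv_bias=True` and a dedicated `ffn_norm` tensor only exists when the attention and MLP residual branches are not computed in parallel. Here is a self-contained sketch of the same heuristic over a plain list of tensor names (no `gguf` reader required); `infer_stablelm_flags` and the sample names are illustrative, not part of the library.

```python
# Standalone sketch of the detection heuristic used in the patch: infer two
# StableLM config flags from which tensors are present in a GGUF checkpoint.
# The function name and the sample tensor names below are placeholders.
from typing import Dict, Iterable


def infer_stablelm_flags(tensor_names: Iterable[str]) -> Dict[str, bool]:
    names = list(tensor_names)
    attn_bias_names = {"attn_q.bias", "attn_k.bias", "attn_v.bias"}
    # Any q/k/v bias tensor present -> the model was exported with qkv_bias=True.
    qkv_bias = any(bias in name for name in names for bias in attn_bias_names)
    # A separate ffn_norm tensor only exists when the MLP has its own LayerNorm,
    # i.e. when the residual branches are NOT computed in parallel.
    has_ffn_norm = any("ffn_norm" in name for name in names)
    return {"qkv_bias": qkv_bias, "use_parallel_residual": not has_ffn_norm}


names = ["blk.0.attn_q.bias", "blk.0.attn_k.bias", "blk.0.attn_v.bias", "blk.0.ffn_norm.weight"]
print(infer_stablelm_flags(names))  # {'qkv_bias': True, 'use_parallel_residual': False}
```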
From 7bbc62474391aff64f63fcc064c975752d1fa4de Mon Sep 17 00:00:00 2001 From: Ahnjj_DEV Date: Wed, 6 Nov 2024 02:32:17 +0900 Subject: [PATCH 12/12] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated=20`?= =?UTF-8?q?convbert.md`=20to=20Korean=20(#34599)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: convbert.md * Update _toctree.yml * feat: nmt draft --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/model_doc/convbert.md | 135 +++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/model_doc/convbert.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 20519157eddc6d..eaacc998298ff4 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -358,8 +358,8 @@ title: (번역중) CodeGen - local: model_doc/cohere title: Cohere - - local: in_translation - title: (번역중) ConvBERT + - local: model_doc/convbert + title: ConvBERT - local: in_translation title: (번역중) CPM - local: in_translation diff --git a/docs/source/ko/model_doc/convbert.md b/docs/source/ko/model_doc/convbert.md new file mode 100644 index 00000000000000..ec64a369b56a3f --- /dev/null +++ b/docs/source/ko/model_doc/convbert.md @@ -0,0 +1,135 @@ + + +# ConvBERT [[convbert]] + +
+ +## 개요 [[overview]] + +ConvBERT 모델은 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan에 의해 제안되었으며, 제안 논문 제목은 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496)입니다. + +논문의 초록은 다음과 같습니다: + +*BERT와 그 변형 모델과 같은 사전 학습된 언어 모델들은 최근 다양한 자연어 이해 과제에서 놀라운 성과를 이루었습니다. 그러나 BERT는 글로벌 셀프 어텐션 블록에 크게 의존하기 때문에 메모리 사용량이 많고 계산 비용이 큽니다. 모든 어텐션 헤드가 글로벌 관점에서 어텐션 맵을 생성하기 위해 입력 시퀀스 전체를 탐색하지만, 일부 헤드는 로컬 종속성만 학습할 필요가 있다는 것을 발견했습니다. 이는 불필요한 계산이 포함되어 있음을 의미합니다. 따라서 우리는 이러한 self-attention 헤드들을 대체하여 로컬 종속성을 직접 모델링하기 위해 새로운 span 기반 동적 컨볼루션을 제안합니다. 새로운 컨볼루션 헤드와 나머지 self-attention 헤드들이 결합하여 글로벌 및 로컬 문맥 학습에 더 효율적인 혼합 어텐션 블록을 구성합니다. 우리는 BERT에 이 혼합 어텐션 설계를 적용하여 ConvBERT 모델을 구축했습니다. 실험 결과, ConvBERT는 다양한 다운스트림 과제에서 BERT 및 그 변형 모델보다 더 우수한 성능을 보였으며, 훈련 비용과 모델 파라미터 수가 더 적었습니다. 특히 ConvBERTbase 모델은 GLUE 스코어 86.4를 달성하여 ELECTRAbase보다 0.7 높은 성과를 보이며, 훈련 비용은 1/4 이하로 줄었습니다. 코드와 사전 학습된 모델은 공개될 예정입니다.* + +이 모델은 [abhishek](https://huggingface.co/abhishek)에 의해 기여되었으며, 원본 구현은 여기에서 찾을 수 있습니다 : https://github.com/yitu-opensource/ConvBert + + + +## 사용 팁 [[usage-tips]] +ConvBERT 훈련 팁은 BERT와 유사합니다. 사용 팁은 [BERT 문서](bert).를 참고하십시오. + + +## 리소스 [[resources]] + +- [텍스트 분류 작업 가이드 (Text classification task guide)](../tasks/sequence_classification) +- [토큰 분류 작업 가이드 (Token classification task guide)](../tasks/token_classification) +- [질의응답 작업 가이드 (Question answering task guide)](../tasks/question_answering) +- [마스킹된 언어 모델링 작업 가이드 (Masked language modeling task guide)](../tasks/masked_language_modeling) +- [다중 선택 작업 가이드 (Multiple choice task guide)](../tasks/multiple_choice) + +## ConvBertConfig [[transformers.ConvBertConfig]] + +[[autodoc]] ConvBertConfig + +## ConvBertTokenizer [[transformers.ConvBertTokenizer]] + +[[autodoc]] ConvBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## ConvBertTokenizerFast [[transformers.ConvBertTokenizerFast]] + +[[autodoc]] ConvBertTokenizerFast + + + + +## ConvBertModel [[transformers.ConvBertModel]] + +[[autodoc]] ConvBertModel + - forward + +## ConvBertForMaskedLM [[transformers.ConvBertForMaskedLM]] + +[[autodoc]] ConvBertForMaskedLM + - forward + +## ConvBertForSequenceClassification [[transformers.ConvBertForSequenceClassification]] + +[[autodoc]] ConvBertForSequenceClassification + - forward + +## ConvBertForMultipleChoice [[transformers.ConvBertForMultipleChoice]] + +[[autodoc]] ConvBertForMultipleChoice + - forward + +## ConvBertForTokenClassification [[transformers.ConvBertForTokenClassification]] + +[[autodoc]] ConvBertForTokenClassification + - forward + +## ConvBertForQuestionAnswering [[transformers.ConvBertForQuestionAnswering]] + +[[autodoc]] ConvBertForQuestionAnswering + - forward + + + + +## TFConvBertModel [[transformers.TFConvBertModel]] + +[[autodoc]] TFConvBertModel + - call + +## TFConvBertForMaskedLM [[transformers.TFConvBertForMaskedLM]] + +[[autodoc]] TFConvBertForMaskedLM + - call + +## TFConvBertForSequenceClassification [[transformers.TFConvBertForSequenceClassification]] + +[[autodoc]] TFConvBertForSequenceClassification + - call + +## TFConvBertForMultipleChoice [[transformers.TFConvBertForMultipleChoice]] + +[[autodoc]] TFConvBertForMultipleChoice + - call + +## TFConvBertForTokenClassification [[transformers.TFConvBertForTokenClassification]] + +[[autodoc]] TFConvBertForTokenClassification + - call + +## TFConvBertForQuestionAnswering [[transformers.TFConvBertForQuestionAnswering]] + +[[autodoc]] 
TFConvBertForQuestionAnswering + - call + + +
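The translated page, like the English original, documents the ConvBERT classes without a usage snippet and points to the task guides instead. Below is a minimal fill-mask sketch with the PyTorch classes listed above; the `YituTech/conv-bert-base` checkpoint id is an assumption (the checkpoint usually associated with the original release), so substitute any ConvBERT checkpoint you actually use.

```python
# Minimal fill-mask example with the ConvBERT classes documented above.
# "YituTech/conv-bert-base" is assumed to be a valid Hub checkpoint; swap in
# another ConvBERT checkpoint if it is not available in your environment.
import torch
from transformers import ConvBertForMaskedLM, ConvBertTokenizerFast

checkpoint = "YituTech/conv-bert-base"
tokenizer = ConvBertTokenizerFast.from_pretrained(checkpoint)
model = ConvBertForMaskedLM.from_pretrained(checkpoint)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Take the highest-scoring token at the [MASK] position.
mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_pos].argmax(dim=-1)
print(tokenizer.decode(predicted_id))
```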