Student0809 committed on
Commit 8c78b88 · verified · 1 Parent(s): 7feac49

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. docs/resources/grpo_countdown.png +3 -0
  3. docs/resources/grpo_geoqa.png +3 -0
  4. docs/resources/grpo_openr1_multimodal.png +3 -0
  5. docs/transformers/build/lib/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py +246 -0
  6. docs/transformers/build/lib/transformers/models/depth_anything/modeling_depth_anything.py +469 -0
  7. docs/transformers/build/lib/transformers/models/depth_pro/configuration_depth_pro.py +205 -0
  8. docs/transformers/build/lib/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +254 -0
  9. docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro.py +392 -0
  10. docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro_fast.py +189 -0
  11. docs/transformers/build/lib/transformers/models/depth_pro/modeling_depth_pro.py +1218 -0
  12. docs/transformers/build/lib/transformers/models/detr/__init__.py +31 -0
  13. docs/transformers/build/lib/transformers/models/detr/configuration_detr.py +289 -0
  14. docs/transformers/build/lib/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +277 -0
  15. docs/transformers/build/lib/transformers/models/detr/convert_detr_to_pytorch.py +385 -0
  16. docs/transformers/build/lib/transformers/models/detr/feature_extraction_detr.py +48 -0
  17. docs/transformers/build/lib/transformers/models/detr/image_processing_detr_fast.py +1312 -0
  18. docs/transformers/build/lib/transformers/models/detr/modeling_detr.py +1815 -0
  19. docs/transformers/build/lib/transformers/models/dialogpt/__init__.py +0 -0
  20. docs/transformers/build/lib/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py +46 -0
  21. docs/transformers/build/lib/transformers/models/diffllama/__init__.py +27 -0
  22. docs/transformers/build/lib/transformers/models/diffllama/configuration_diffllama.py +199 -0
  23. docs/transformers/build/lib/transformers/models/esm/openfold_utils/rigid_utils.py +1242 -0
  24. docs/transformers/build/lib/transformers/models/falcon/configuration_falcon.py +211 -0
  25. docs/transformers/build/lib/transformers/models/falcon/convert_custom_code_checkpoint.py +74 -0
  26. docs/transformers/build/lib/transformers/models/falcon/modeling_falcon.py +1566 -0
  27. docs/transformers/build/lib/transformers/models/falcon_mamba/__init__.py +27 -0
  28. docs/transformers/build/lib/transformers/models/falcon_mamba/configuration_falcon_mamba.py +162 -0
  29. docs/transformers/build/lib/transformers/models/falcon_mamba/modeling_falcon_mamba.py +873 -0
  30. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/__init__.py +28 -0
  31. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +480 -0
  32. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py +210 -0
  33. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_hifigan.py +134 -0
  34. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py +102 -0
  35. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +1697 -0
  36. docs/transformers/build/lib/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +188 -0
  37. docs/transformers/build/lib/transformers/models/flaubert/__init__.py +29 -0
  38. docs/transformers/build/lib/transformers/models/flaubert/configuration_flaubert.py +235 -0
  39. docs/transformers/build/lib/transformers/models/flaubert/modeling_flaubert.py +1739 -0
  40. docs/transformers/build/lib/transformers/models/flaubert/modeling_tf_flaubert.py +1344 -0
  41. docs/transformers/build/lib/transformers/models/flaubert/tokenization_flaubert.py +568 -0
  42. docs/transformers/build/lib/transformers/models/flava/__init__.py +31 -0
  43. docs/transformers/build/lib/transformers/models/flava/configuration_flava.py +701 -0
  44. docs/transformers/build/lib/transformers/models/flava/convert_dalle_to_flava_codebook.py +102 -0
  45. docs/transformers/build/lib/transformers/models/flava/convert_flava_original_pytorch_to_hf.py +99 -0
  46. docs/transformers/build/lib/transformers/models/flava/feature_extraction_flava.py +38 -0
  47. docs/transformers/build/lib/transformers/models/flava/image_processing_flava.py +705 -0
  48. docs/transformers/build/lib/transformers/models/flava/image_processing_flava_fast.py +549 -0
  49. docs/transformers/build/lib/transformers/models/flava/modeling_flava.py +2127 -0
  50. docs/transformers/build/lib/transformers/models/flava/processing_flava.py +168 -0
.gitattributes CHANGED
@@ -48,3 +48,6 @@ wandb/offline-run-20250624_115955-iye05c18/run-iye05c18.wandb filter=lfs diff=lf
  wandb/offline-run-20250721_000454-up3efnok/run-up3efnok.wandb filter=lfs diff=lfs merge=lfs -text
  wandb/offline-run-20250722_003110-femxkckf/run-femxkckf.wandb filter=lfs diff=lfs merge=lfs -text
  seamless_interaction/assets/banner.gif filter=lfs diff=lfs merge=lfs -text
+ docs/resources/grpo_countdown.png filter=lfs diff=lfs merge=lfs -text
+ docs/resources/grpo_geoqa.png filter=lfs diff=lfs merge=lfs -text
+ docs/resources/grpo_openr1_multimodal.png filter=lfs diff=lfs merge=lfs -text
docs/resources/grpo_countdown.png ADDED

Git LFS Details

  • SHA256: 1b55fe6864e0c92549940d6989d92b3ab22be38a035cff3694525252737fc91e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
docs/resources/grpo_geoqa.png ADDED

Git LFS Details

  • SHA256: 71246376b16f2ff288542dca2ff31532b16ef99f5e862797463d548e447e1f8d
  • Pointer size: 132 Bytes
  • Size of remote file: 2.24 MB
docs/resources/grpo_openr1_multimodal.png ADDED

Git LFS Details

  • SHA256: 050f56792468a4c9797a90314e322c16dd916bde3be24a7ce7c7b96381e70d9e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.3 MB
docs/transformers/build/lib/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py ADDED
@@ -0,0 +1,246 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert Distill Any Depth checkpoints from the original repository. URL:
16
+ https://github.com/Westlake-AGI-Lab/Distill-Any-Depth"""
17
+
18
+ import argparse
19
+ import re
20
+ from pathlib import Path
21
+
22
+ import requests
23
+ import torch
24
+ from huggingface_hub import hf_hub_download
25
+ from PIL import Image
26
+ from safetensors.torch import load_file
27
+
28
+ from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor
29
+ from transformers.utils import logging
30
+
31
+
32
+ logging.set_verbosity_info()
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
37
+ r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token",
38
+ r"(backbone|pretrained)\.mask_token": r"backbone.embeddings.mask_token",
39
+ r"(backbone|pretrained)\.pos_embed": r"backbone.embeddings.position_embeddings",
40
+ r"(backbone|pretrained)\.patch_embed\.proj\.(weight|bias)": r"backbone.embeddings.patch_embeddings.projection.\2",
41
+ r"(backbone|pretrained)\.norm\.(weight|bias)": r"backbone.layernorm.\2",
42
+ r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.proj\.(weight|bias)": r"backbone.encoder.layer.\4.attention.output.dense.\5",
43
+ r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.ls(1|2)\.gamma": r"backbone.encoder.layer.\4.layer_scale\5.lambda1",
44
+ r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.mlp\.fc(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.mlp.fc\5.\6",
45
+ r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6",
46
+ r"depth_head\.projects\.(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.projection.\2",
47
+ r"depth_head\.resize_layers\.(?!2)(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.resize.\2",
48
+ r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1]) - 1}.weight",
49
+ r"depth_head\.scratch\.output_conv(\d+)(?:\.(\d+))?\.(weight|bias)": lambda m: (
50
+ f"head.conv{int(m[1]) + (int(m[2]) // 2 if m[2] else 0)}.{m[3]}" if m[1] == "2" else f"head.conv{m[1]}.{m[3]}"
51
+ ),
52
+ r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.projection.{m[2]}",
53
+ r"depth_head\.scratch\.refinenet(\d+)\.resConfUnit(\d+)\.conv(\d+)\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.residual_layer{m[2]}.convolution{m[3]}.{m[4]}",
54
+ }
55
+
56
+
57
+ def get_dpt_config(model_name):
58
+ if "small" in model_name:
59
+ out_indices = [3, 6, 9, 12]
60
+ backbone_config = Dinov2Config.from_pretrained(
61
+ "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False
62
+ )
63
+ fusion_hidden_size = 64
64
+ neck_hidden_sizes = [48, 96, 192, 384]
65
+ elif "base" in model_name:
66
+ out_indices = [3, 6, 9, 12]
67
+ backbone_config = Dinov2Config.from_pretrained(
68
+ "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False
69
+ )
70
+ fusion_hidden_size = 128
71
+ neck_hidden_sizes = [96, 192, 384, 768]
72
+ elif "large" in model_name:
73
+ out_indices = [5, 12, 18, 24]
74
+ backbone_config = Dinov2Config.from_pretrained(
75
+ "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False
76
+ )
77
+ fusion_hidden_size = 256
78
+ neck_hidden_sizes = [256, 512, 1024, 1024]
79
+ else:
80
+ raise NotImplementedError(f"Model not supported: {model_name}")
81
+
82
+ depth_estimation_type = "relative"
83
+ max_depth = None
84
+
85
+ config = DepthAnythingConfig(
86
+ reassemble_hidden_size=backbone_config.hidden_size,
87
+ patch_size=backbone_config.patch_size,
88
+ backbone_config=backbone_config,
89
+ fusion_hidden_size=fusion_hidden_size,
90
+ neck_hidden_sizes=neck_hidden_sizes,
91
+ depth_estimation_type=depth_estimation_type,
92
+ max_depth=max_depth,
93
+ )
94
+
95
+ return config
96
+
97
+
98
+ def convert_key_pattern(key, mapping):
99
+ for pattern, replacement in mapping.items():
100
+ match = re.fullmatch(pattern, key)
101
+ if match:
102
+ if callable(replacement):
103
+ return replacement(match)
104
+ return re.sub(pattern, replacement, key)
105
+ return None
106
+
107
+
108
+ def convert_keys(state_dict, config):
109
+ new_state_dict = {}
110
+ qkv_pattern = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.qkv\.(weight|bias)"
111
+ qkv_keys = [k for k in list(state_dict.keys()) if re.match(qkv_pattern, k)]
112
+ for old_key in qkv_keys:
113
+ value = state_dict.pop(old_key)
114
+ match = re.match(qkv_pattern, old_key)
115
+ _, _, _, layer, attr = match.groups()
116
+ hidden_size = config.backbone_config.hidden_size
117
+ q = value[:hidden_size]
118
+ k = value[hidden_size : hidden_size * 2]
119
+ v = value[-hidden_size:]
120
+
121
+ for proj, tensor in zip(["query", "key", "value"], [q, k, v]):
122
+ new_key = f"backbone.encoder.layer.{layer}.attention.attention.{proj}.{attr}"
123
+ new_state_dict[new_key] = tensor
124
+
125
+ for old_key in list(state_dict.keys()):
126
+ value = state_dict.pop(old_key)
127
+ new_key = convert_key_pattern(old_key, ORIGINAL_TO_CONVERTED_KEY_MAPPING)
128
+
129
+ new_state_dict[new_key] = value
130
+
131
+ return new_state_dict
132
+
133
+
134
+ def prepare_img():
135
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
136
+ return Image.open(requests.get(url, stream=True).raw)
137
+
138
+
139
+ name_to_checkpoint = {
140
+ "distill-any-depth-small": "small/model.safetensors",
141
+ "distill-any-depth-base": "base/model.safetensors",
142
+ "distill-any-depth-large": "large/model.safetensors",
143
+ }
144
+
145
+
146
+ @torch.no_grad()
147
+ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
148
+ config = get_dpt_config(model_name)
149
+
150
+ repo_id = "xingyang1/Distill-Any-Depth"
151
+ filepath = hf_hub_download(repo_id=repo_id, filename=name_to_checkpoint[model_name])
152
+ state_dict = load_file(filepath)
153
+
154
+ converted_state_dict = convert_keys(state_dict, config)
155
+
156
+ model = DepthAnythingForDepthEstimation(config)
157
+ model.load_state_dict(converted_state_dict)
158
+ model.eval()
159
+
160
+ processor = DPTImageProcessor(
161
+ do_resize=True,
162
+ size={"height": 518, "width": 518},
163
+ ensure_multiple_of=14,
164
+ keep_aspect_ratio=True,
165
+ do_rescale=True,
166
+ do_normalize=True,
167
+ image_mean=[0.485, 0.456, 0.406],
168
+ image_std=[0.229, 0.224, 0.225],
169
+ )
170
+
171
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
172
+ image = Image.open(requests.get(url, stream=True).raw)
173
+
174
+ pixel_values = processor(image, return_tensors="pt").pixel_values
175
+
176
+ with torch.no_grad():
177
+ outputs = model(pixel_values)
178
+ predicted_depth = outputs.predicted_depth
179
+
180
+ print("Shape of predicted depth:", predicted_depth.shape)
181
+ print("First values:", predicted_depth[0, :3, :3])
182
+
183
+ if verify_logits:
184
+ print("Verifying logits...")
185
+ expected_shape = torch.Size([1, 518, 686])
186
+
187
+ if model_name == "distill-any-depth-small":
188
+ expected_slice = torch.tensor(
189
+ [[2.5653, 2.5249, 2.5570], [2.4897, 2.5235, 2.5355], [2.5255, 2.5261, 2.5422]]
190
+ )
191
+ elif model_name == "distill-any-depth-base":
192
+ expected_slice = torch.tensor(
193
+ [[4.8976, 4.9075, 4.9403], [4.8872, 4.8906, 4.9448], [4.8712, 4.8898, 4.8838]]
194
+ )
195
+ elif model_name == "distill-any-depth-large":
196
+ expected_slice = torch.tensor(
197
+ [[55.1067, 51.1828, 51.6803], [51.9098, 50.7529, 51.4494], [50.1745, 50.5491, 50.8818]]
198
+ )
199
+ else:
200
+ raise ValueError("Not supported")
201
+
202
+ assert predicted_depth.shape == torch.Size(expected_shape)
203
+ assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4)
204
+ print("Looks ok!")
205
+
206
+ if pytorch_dump_folder_path is not None:
207
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
208
+ print(f"Saving model and processor to {pytorch_dump_folder_path}")
209
+ model.save_pretrained(pytorch_dump_folder_path)
210
+ processor.save_pretrained(pytorch_dump_folder_path)
211
+
212
+ if push_to_hub:
213
+ print("Pushing model and processor to hub...")
214
+ model.push_to_hub(repo_id=f"{model_name.title()}-hf")
215
+ processor.push_to_hub(repo_id=f"{model_name.title()}-hf")
216
+
217
+
218
+ if __name__ == "__main__":
219
+ parser = argparse.ArgumentParser()
220
+ parser.add_argument(
221
+ "--model_name",
222
+ default="distill-any-depth-small",
223
+ type=str,
224
+ choices=name_to_checkpoint.keys(),
225
+ help="Name of the model you'd like to convert.",
226
+ )
227
+ parser.add_argument(
228
+ "--pytorch_dump_folder_path",
229
+ default=None,
230
+ type=str,
231
+ help="Path to the output PyTorch model directory.",
232
+ )
233
+ parser.add_argument(
234
+ "--push_to_hub",
235
+ action="store_true",
236
+ help="Whether to push the model to the hub after conversion.",
237
+ )
238
+ parser.add_argument(
239
+ "--verify_logits",
240
+ action="store_true",
241
+ required=False,
242
+ help="Whether to verify the logits after conversion.",
243
+ )
244
+
245
+ args = parser.parse_args()
246
+ convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits)
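The conversion above renames every original checkpoint key by full-matching it against the regex patterns in `ORIGINAL_TO_CONVERTED_KEY_MAPPING`, where a replacement is either a backreference string or a callable applied to the match. A minimal, self-contained sketch of that renaming step (the two patterns are copied from the mapping above; the helper name and the sample keys are illustrative only, not part of this file):

```python
import re

# Small subset of the mapping above; values may be backreference strings or callables.
MAPPING = {
    r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token",
    r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1]) - 1}.weight",
}

def convert_key(key, mapping):
    # Return the converted key, or None if no pattern matches (mirrors convert_key_pattern).
    for pattern, replacement in mapping.items():
        match = re.fullmatch(pattern, key)
        if match:
            return replacement(match) if callable(replacement) else re.sub(pattern, replacement, key)
    return None

print(convert_key("pretrained.cls_token", MAPPING))                 # backbone.embeddings.cls_token
print(convert_key("depth_head.scratch.layer3_rn.weight", MAPPING))  # neck.convs.2.weight
```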
docs/transformers/build/lib/transformers/models/depth_anything/modeling_depth_anything.py ADDED
@@ -0,0 +1,469 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 TikTok and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch Depth Anything model."""
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.utils.checkpoint
21
+ from torch import nn
22
+
23
+ from ...file_utils import (
24
+ add_start_docstrings,
25
+ add_start_docstrings_to_model_forward,
26
+ replace_return_docstrings,
27
+ )
28
+ from ...modeling_outputs import DepthEstimatorOutput
29
+ from ...modeling_utils import PreTrainedModel
30
+ from ...utils import logging
31
+ from ...utils.backbone_utils import load_backbone
32
+ from .configuration_depth_anything import DepthAnythingConfig
33
+
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+ # General docstring
38
+ _CONFIG_FOR_DOC = "DepthAnythingConfig"
39
+
40
+
41
+ DEPTH_ANYTHING_START_DOCSTRING = r"""
42
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
43
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
44
+ behavior.
45
+
46
+ Parameters:
47
+ config ([`DepthAnythingConfig`]): Model configuration class with all the parameters of the model.
48
+ Initializing with a config file does not load the weights associated with the model, only the
49
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
50
+ """
51
+
52
+ DEPTH_ANYTHING_INPUTS_DOCSTRING = r"""
53
+ Args:
54
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
55
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
56
+ for details.
57
+ output_attentions (`bool`, *optional*):
58
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
59
+ tensors for more detail.
60
+ output_hidden_states (`bool`, *optional*):
61
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
62
+ more detail.
63
+ return_dict (`bool`, *optional*):
64
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
65
+ """
66
+
67
+
68
+ class DepthAnythingReassembleLayer(nn.Module):
69
+ def __init__(self, config, channels, factor):
70
+ super().__init__()
71
+ self.projection = nn.Conv2d(in_channels=config.reassemble_hidden_size, out_channels=channels, kernel_size=1)
72
+
73
+ # up/down sampling depending on factor
74
+ if factor > 1:
75
+ self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
76
+ elif factor == 1:
77
+ self.resize = nn.Identity()
78
+ elif factor < 1:
79
+ # so should downsample
80
+ self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)
81
+
82
+ # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward
83
+ def forward(self, hidden_state):
84
+ hidden_state = self.projection(hidden_state)
85
+ hidden_state = self.resize(hidden_state)
86
+
87
+ return hidden_state
88
+
89
+
90
+ class DepthAnythingReassembleStage(nn.Module):
91
+ """
92
+ This class reassembles the hidden states of the backbone into image-like feature representations at various
93
+ resolutions.
94
+
95
+ This happens in 3 stages:
96
+ 1. Take the patch embeddings and reshape them to image-like feature representations.
97
+ 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
98
+ 3. Resizing the spatial dimensions (height, width).
99
+
100
+ Args:
101
+ config (`[DepthAnythingConfig]`):
102
+ Model configuration class defining the model architecture.
103
+ """
104
+
105
+ def __init__(self, config):
106
+ super().__init__()
107
+
108
+ self.config = config
109
+ self.layers = nn.ModuleList()
110
+ for channels, factor in zip(config.neck_hidden_sizes, config.reassemble_factors):
111
+ self.layers.append(DepthAnythingReassembleLayer(config, channels=channels, factor=factor))
112
+
113
+ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
114
+ """
115
+ Args:
116
+ hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
117
+ List of hidden states from the backbone.
118
+ """
119
+ out = []
120
+
121
+ for i, hidden_state in enumerate(hidden_states):
122
+ # reshape to (batch_size, num_channels, height, width)
123
+ hidden_state = hidden_state[:, 1:]
124
+ batch_size, _, num_channels = hidden_state.shape
125
+ hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
126
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
127
+ hidden_state = self.layers[i](hidden_state)
128
+ out.append(hidden_state)
129
+
130
+ return out
131
+
132
+
133
+ class DepthAnythingPreActResidualLayer(nn.Module):
134
+ """
135
+ ResidualConvUnit, pre-activate residual unit.
136
+
137
+ Args:
138
+ config (`[DepthAnythingConfig]`):
139
+ Model configuration class defining the model architecture.
140
+ """
141
+
142
+ def __init__(self, config):
143
+ super().__init__()
144
+
145
+ self.activation1 = nn.ReLU()
146
+ self.convolution1 = nn.Conv2d(
147
+ config.fusion_hidden_size,
148
+ config.fusion_hidden_size,
149
+ kernel_size=3,
150
+ stride=1,
151
+ padding=1,
152
+ bias=True,
153
+ )
154
+
155
+ self.activation2 = nn.ReLU()
156
+ self.convolution2 = nn.Conv2d(
157
+ config.fusion_hidden_size,
158
+ config.fusion_hidden_size,
159
+ kernel_size=3,
160
+ stride=1,
161
+ padding=1,
162
+ bias=True,
163
+ )
164
+
165
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
166
+ residual = hidden_state
167
+ hidden_state = self.activation1(hidden_state)
168
+ hidden_state = self.convolution1(hidden_state)
169
+ hidden_state = self.activation2(hidden_state)
170
+ hidden_state = self.convolution2(hidden_state)
171
+
172
+ return hidden_state + residual
173
+
174
+
175
+ class DepthAnythingFeatureFusionLayer(nn.Module):
176
+ """Feature fusion layer, merges feature maps from different stages.
177
+
178
+ Args:
179
+ config (`[DepthAnythingConfig]`):
180
+ Model configuration class defining the model architecture.
181
+ """
182
+
183
+ def __init__(self, config):
184
+ super().__init__()
185
+
186
+ self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
187
+
188
+ self.residual_layer1 = DepthAnythingPreActResidualLayer(config)
189
+ self.residual_layer2 = DepthAnythingPreActResidualLayer(config)
190
+
191
+ def forward(self, hidden_state, residual=None, size=None):
192
+ if residual is not None:
193
+ if hidden_state.shape != residual.shape:
194
+ residual = nn.functional.interpolate(
195
+ residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
196
+ )
197
+ hidden_state = hidden_state + self.residual_layer1(residual)
198
+
199
+ hidden_state = self.residual_layer2(hidden_state)
200
+
201
+ modifier = {"scale_factor": 2} if size is None else {"size": size}
202
+
203
+ hidden_state = nn.functional.interpolate(
204
+ hidden_state,
205
+ **modifier,
206
+ mode="bilinear",
207
+ align_corners=True,
208
+ )
209
+ hidden_state = self.projection(hidden_state)
210
+
211
+ return hidden_state
212
+
213
+
214
+ class DepthAnythingFeatureFusionStage(nn.Module):
215
+ # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage.__init__ with DPT->DepthAnything
216
+ def __init__(self, config):
217
+ super().__init__()
218
+ self.layers = nn.ModuleList()
219
+ for _ in range(len(config.neck_hidden_sizes)):
220
+ self.layers.append(DepthAnythingFeatureFusionLayer(config))
221
+
222
+ def forward(self, hidden_states, size=None):
223
+ # reversing the hidden_states, we start from the last
224
+ hidden_states = hidden_states[::-1]
225
+
226
+ fused_hidden_states = []
227
+ fused_hidden_state = None
228
+
229
+ for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)):
230
+ size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None
231
+
232
+ if fused_hidden_state is None:
233
+ # first layer only uses the last hidden_state
234
+ fused_hidden_state = layer(hidden_state, size=size)
235
+ else:
236
+ fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
237
+
238
+ fused_hidden_states.append(fused_hidden_state)
239
+
240
+ return fused_hidden_states
241
+
242
+
243
+ # Modified from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->DepthAnything,dpt->depth_anything
244
+ # avoiding sdpa and flash_attn_2 support, it's done in the backend
245
+ class DepthAnythingPreTrainedModel(PreTrainedModel):
246
+ """
247
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
248
+ models.
249
+ """
250
+
251
+ config_class = DepthAnythingConfig
252
+ base_model_prefix = "depth_anything"
253
+ main_input_name = "pixel_values"
254
+ supports_gradient_checkpointing = True
255
+
256
+ def _init_weights(self, module):
257
+ """Initialize the weights"""
258
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
259
+ # Slightly different from the TF version which uses truncated_normal for initialization
260
+ # cf https://github.com/pytorch/pytorch/pull/5617
261
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
262
+ if module.bias is not None:
263
+ module.bias.data.zero_()
264
+ elif isinstance(module, nn.LayerNorm):
265
+ module.bias.data.zero_()
266
+ module.weight.data.fill_(1.0)
267
+
268
+
269
+ class DepthAnythingNeck(nn.Module):
270
+ """
271
+ DepthAnythingNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
272
+ input and produces another list of tensors as output. For DepthAnything, it includes 2 stages:
273
+
274
+ * DepthAnythingReassembleStage
275
+ * DepthAnythingFeatureFusionStage.
276
+
277
+ Args:
278
+ config (dict): config dict.
279
+ """
280
+
281
+ def __init__(self, config):
282
+ super().__init__()
283
+ self.config = config
284
+
285
+ self.reassemble_stage = DepthAnythingReassembleStage(config)
286
+
287
+ self.convs = nn.ModuleList()
288
+ for channel in config.neck_hidden_sizes:
289
+ self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))
290
+
291
+ # fusion
292
+ self.fusion_stage = DepthAnythingFeatureFusionStage(config)
293
+
294
+ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
295
+ """
296
+ Args:
297
+ hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
298
+ List of hidden states from the backbone.
299
+ """
300
+ if not isinstance(hidden_states, (tuple, list)):
301
+ raise TypeError("hidden_states should be a tuple or list of tensors")
302
+
303
+ if len(hidden_states) != len(self.config.neck_hidden_sizes):
304
+ raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")
305
+
306
+ # postprocess hidden states
307
+ hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)
308
+
309
+ features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]
310
+
311
+ # fusion blocks
312
+ output = self.fusion_stage(features)
313
+
314
+ return output
315
+
316
+
317
+ class DepthAnythingDepthEstimationHead(nn.Module):
318
+ """
319
+ Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
320
+ the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's
321
+ supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation
322
+ type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining.
323
+ """
324
+
325
+ def __init__(self, config):
326
+ super().__init__()
327
+
328
+ self.head_in_index = config.head_in_index
329
+ self.patch_size = config.patch_size
330
+
331
+ features = config.fusion_hidden_size
332
+ self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1)
333
+ self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1)
334
+ self.activation1 = nn.ReLU()
335
+ self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0)
336
+ if config.depth_estimation_type == "relative":
337
+ self.activation2 = nn.ReLU()
338
+ elif config.depth_estimation_type == "metric":
339
+ self.activation2 = nn.Sigmoid()
340
+ else:
341
+ raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}")
342
+ self.max_depth = config.max_depth
343
+
344
+ def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor:
345
+ hidden_states = hidden_states[self.head_in_index]
346
+
347
+ predicted_depth = self.conv1(hidden_states)
348
+ predicted_depth = nn.functional.interpolate(
349
+ predicted_depth,
350
+ (int(patch_height * self.patch_size), int(patch_width * self.patch_size)),
351
+ mode="bilinear",
352
+ align_corners=True,
353
+ )
354
+ predicted_depth = self.conv2(predicted_depth)
355
+ predicted_depth = self.activation1(predicted_depth)
356
+ predicted_depth = self.conv3(predicted_depth)
357
+ predicted_depth = self.activation2(predicted_depth) * self.max_depth
358
+ predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width)
359
+
360
+ return predicted_depth
361
+
362
+
363
+ @add_start_docstrings(
364
+ """
365
+ Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
366
+ """,
367
+ DEPTH_ANYTHING_START_DOCSTRING,
368
+ )
369
+ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
370
+ _no_split_modules = ["DPTViTEmbeddings"]
371
+
372
+ def __init__(self, config):
373
+ super().__init__(config)
374
+
375
+ self.backbone = load_backbone(config)
376
+ self.neck = DepthAnythingNeck(config)
377
+ self.head = DepthAnythingDepthEstimationHead(config)
378
+
379
+ # Initialize weights and apply final processing
380
+ self.post_init()
381
+
382
+ @add_start_docstrings_to_model_forward(DEPTH_ANYTHING_INPUTS_DOCSTRING)
383
+ @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
384
+ def forward(
385
+ self,
386
+ pixel_values: torch.FloatTensor,
387
+ labels: Optional[torch.LongTensor] = None,
388
+ output_attentions: Optional[bool] = None,
389
+ output_hidden_states: Optional[bool] = None,
390
+ return_dict: Optional[bool] = None,
391
+ ) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]:
392
+ r"""
393
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
394
+ Ground truth depth estimation maps for computing the loss.
395
+
396
+ Returns:
397
+
398
+ Examples:
399
+ ```python
400
+ >>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation
401
+ >>> import torch
402
+ >>> import numpy as np
403
+ >>> from PIL import Image
404
+ >>> import requests
405
+
406
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
407
+ >>> image = Image.open(requests.get(url, stream=True).raw)
408
+
409
+ >>> image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
410
+ >>> model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")
411
+
412
+ >>> # prepare image for the model
413
+ >>> inputs = image_processor(images=image, return_tensors="pt")
414
+
415
+ >>> with torch.no_grad():
416
+ ... outputs = model(**inputs)
417
+
418
+ >>> # interpolate to original size
419
+ >>> post_processed_output = image_processor.post_process_depth_estimation(
420
+ ... outputs,
421
+ ... target_sizes=[(image.height, image.width)],
422
+ ... )
423
+
424
+ >>> # visualize the prediction
425
+ >>> predicted_depth = post_processed_output[0]["predicted_depth"]
426
+ >>> depth = predicted_depth * 255 / predicted_depth.max()
427
+ >>> depth = depth.detach().cpu().numpy()
428
+ >>> depth = Image.fromarray(depth.astype("uint8"))
429
+ ```"""
430
+ loss = None
431
+ if labels is not None:
432
+ raise NotImplementedError("Training is not implemented yet")
433
+
434
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
435
+ output_hidden_states = (
436
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
437
+ )
438
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
439
+
440
+ outputs = self.backbone.forward_with_filtered_kwargs(
441
+ pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
442
+ )
443
+ hidden_states = outputs.feature_maps
444
+
445
+ _, _, height, width = pixel_values.shape
446
+ patch_size = self.config.patch_size
447
+ patch_height = height // patch_size
448
+ patch_width = width // patch_size
449
+
450
+ hidden_states = self.neck(hidden_states, patch_height, patch_width)
451
+
452
+ predicted_depth = self.head(hidden_states, patch_height, patch_width)
453
+
454
+ if not return_dict:
455
+ if output_hidden_states:
456
+ output = (predicted_depth,) + outputs[1:]
457
+ else:
458
+ output = (predicted_depth,) + outputs[2:]
459
+ return ((loss,) + output) if loss is not None else output
460
+
461
+ return DepthEstimatorOutput(
462
+ loss=loss,
463
+ predicted_depth=predicted_depth,
464
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
465
+ attentions=outputs.attentions,
466
+ )
467
+
468
+
469
+ __all__ = ["DepthAnythingForDepthEstimation", "DepthAnythingPreTrainedModel"]
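`DepthAnythingReassembleStage.forward` turns the backbone's patch tokens back into image-like feature maps: it drops the CLS token, reshapes the sequence to the patch grid, and moves channels first. A toy sketch of that reshape (the sizes are made up for illustration; only the tensor manipulation mirrors the code above):

```python
import torch

batch_size, patch_height, patch_width, hidden_size = 2, 37, 37, 384  # toy values
tokens = torch.randn(batch_size, patch_height * patch_width + 1, hidden_size)  # +1 for the CLS token

tokens = tokens[:, 1:]  # drop the CLS token
feature_map = (
    tokens.reshape(batch_size, patch_height, patch_width, hidden_size)
    .permute(0, 3, 1, 2)  # (batch, channels, height, width)
    .contiguous()
)
print(feature_map.shape)  # torch.Size([2, 384, 37, 37])
```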
docs/transformers/build/lib/transformers/models/depth_pro/configuration_depth_pro.py ADDED
@@ -0,0 +1,205 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """DepthPro model configuration"""
16
+
17
+ from copy import deepcopy
18
+
19
+ from ...configuration_utils import PretrainedConfig
20
+ from ...utils import logging
21
+ from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class DepthProConfig(PretrainedConfig):
28
+ r"""
29
+ This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a
30
+ DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration
31
+ with the defaults will yield a similar configuration to that of the DepthPro
32
+ [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+ Args:
38
+ fusion_hidden_size (`int`, *optional*, defaults to 256):
39
+ The number of channels before fusion.
40
+ patch_size (`int`, *optional*, defaults to 384):
41
+ The size (resolution) of each patch. This is also the image_size for backbone model.
42
+ initializer_range (`float`, *optional*, defaults to 0.02):
43
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
44
+ intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`):
45
+ Indices of the intermediate hidden states from the patch encoder to use for fusion.
46
+ intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`):
47
+ Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`.
48
+ scaled_images_ratios (`List[float]`, *optional*, defaults to `[0.25, 0.5, 1]`):
49
+ Ratios of scaled images to be used by the patch encoder.
50
+ scaled_images_overlap_ratios (`List[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`):
51
+ Overlap ratios between patches for each scaled image in `scaled_images_ratios`.
52
+ scaled_images_feature_dims (`List[int]`, *optional*, defaults to `[1024, 1024, 512]`):
53
+ Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`.
54
+ merge_padding_value (`int`, *optional*, defaults to 3):
55
+ When merging smaller patches back to the image size, overlapping sections of this size are removed.
56
+ use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
57
+ Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
58
+ use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
59
+ Whether to use bias in the pre-activate residual units of the fusion blocks.
60
+ use_fov_model (`bool`, *optional*, defaults to `False`):
61
+ Whether to use `DepthProFovModel` to generate the field of view.
62
+ num_fov_head_layers (`int`, *optional*, defaults to 2):
63
+ Number of convolution layers in the head of `DepthProFovModel`.
64
+ image_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
65
+ The configuration of the image encoder model, which is loaded using the [`AutoModel`] API.
66
+ By default, Dinov2 model is used as backbone.
67
+ patch_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
68
+ The configuration of the patch encoder model, which is loaded using the [`AutoModel`] API.
69
+ By default, Dinov2 model is used as backbone.
70
+ fov_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
71
+ The configuration of the fov encoder model, which is loaded using the [`AutoModel`] API.
72
+ By default, Dinov2 model is used as backbone.
73
+
74
+ Example:
75
+
76
+ ```python
77
+ >>> from transformers import DepthProConfig, DepthProModel
78
+
79
+ >>> # Initializing a DepthPro apple/DepthPro style configuration
80
+ >>> configuration = DepthProConfig()
81
+
82
+ >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration
83
+ >>> model = DepthProModel(configuration)
84
+
85
+ >>> # Accessing the model configuration
86
+ >>> configuration = model.config
87
+ ```"""
88
+
89
+ model_type = "depth_pro"
90
+ sub_configs = {"image_model_config": AutoConfig, "patch_model_config": AutoConfig, "fov_model_config": AutoConfig}
91
+
92
+ def __init__(
93
+ self,
94
+ fusion_hidden_size=256,
95
+ patch_size=384,
96
+ initializer_range=0.02,
97
+ intermediate_hook_ids=[11, 5],
98
+ intermediate_feature_dims=[256, 256],
99
+ scaled_images_ratios=[0.25, 0.5, 1],
100
+ scaled_images_overlap_ratios=[0.0, 0.5, 0.25],
101
+ scaled_images_feature_dims=[1024, 1024, 512],
102
+ merge_padding_value=3,
103
+ use_batch_norm_in_fusion_residual=False,
104
+ use_bias_in_fusion_residual=True,
105
+ use_fov_model=False,
106
+ num_fov_head_layers=2,
107
+ image_model_config=None,
108
+ patch_model_config=None,
109
+ fov_model_config=None,
110
+ **kwargs,
111
+ ):
112
+ super().__init__(**kwargs)
113
+
114
+ # scaled_images_ratios is sorted
115
+ if scaled_images_ratios != sorted(scaled_images_ratios):
116
+ raise ValueError(
117
+ f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high"
118
+ )
119
+
120
+ # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent
121
+ if not (len(scaled_images_ratios) == len(scaled_images_overlap_ratios) == len(scaled_images_feature_dims)):
122
+ raise ValueError(
123
+ f"len(scaled_images_ratios)={len(scaled_images_ratios)} and "
124
+ f"len(scaled_images_overlap_ratios)={len(scaled_images_overlap_ratios)} and "
125
+ f"len(scaled_images_feature_dims)={len(scaled_images_feature_dims)}, "
126
+ f"should match in config."
127
+ )
128
+
129
+ # intermediate_hook_ids, intermediate_feature_dims should be consistent
130
+ if not (len(intermediate_hook_ids) == len(intermediate_feature_dims)):
131
+ raise ValueError(
132
+ f"len(intermediate_hook_ids)={len(intermediate_hook_ids)} and "
133
+ f"len(intermediate_feature_dims)={len(intermediate_feature_dims)}, "
134
+ f"should match in config."
135
+ )
136
+
137
+ # fusion_hidden_size should be consistent with num_fov_head_layers
138
+ if fusion_hidden_size // 2**num_fov_head_layers == 0:
139
+ raise ValueError(
140
+ f"fusion_hidden_size={fusion_hidden_size} should be consistent with num_fov_head_layers={num_fov_head_layers} "
141
+ "i.e fusion_hidden_size // 2**num_fov_head_layers > 0"
142
+ )
143
+
144
+ self.fusion_hidden_size = fusion_hidden_size
145
+ self.patch_size = patch_size
146
+ self.initializer_range = initializer_range
147
+ self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
148
+ self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
149
+ self.use_fov_model = use_fov_model
150
+ self.num_fov_head_layers = num_fov_head_layers
151
+ self.intermediate_hook_ids = intermediate_hook_ids
152
+ self.intermediate_feature_dims = intermediate_feature_dims
153
+ self.scaled_images_ratios = scaled_images_ratios
154
+ self.scaled_images_overlap_ratios = scaled_images_overlap_ratios
155
+ self.scaled_images_feature_dims = scaled_images_feature_dims
156
+ self.merge_padding_value = merge_padding_value
157
+ self.image_model_config = image_model_config
158
+ self.patch_model_config = patch_model_config
159
+ self.fov_model_config = fov_model_config
160
+
161
+ for sub_config_key in self.sub_configs.keys():
162
+ sub_config = getattr(self, sub_config_key)
163
+
164
+ if sub_config is None:
165
+ sub_config = CONFIG_MAPPING["dinov2"](image_size=patch_size)
166
+ logger.info(
167
+ f"`{sub_config_key}` is `None`. Initializing `{sub_config_key}` with the `Dinov2Config` "
168
+ f"with default values except `{sub_config_key}.image_size` is set to `config.patch_size`."
169
+ )
170
+ elif isinstance(sub_config, dict):
171
+ sub_config = deepcopy(sub_config)
172
+ if "model_type" not in sub_config:
173
+ raise KeyError(
174
+ f"The `model_type` key is missing in the `{sub_config_key}` dictionary. Please provide the model type."
175
+ )
176
+ elif sub_config["model_type"] not in CONFIG_MAPPING:
177
+ raise ValueError(
178
+ f"The model type `{sub_config['model_type']}` in `{sub_config_key}` is not supported. Please provide a valid model type."
179
+ )
180
+ image_size = sub_config.get("image_size")
181
+ if image_size != patch_size:
182
+ logger.info(
183
+ f"The `image_size` in `{sub_config_key}` is set to `{image_size}`, "
184
+ f"but it does not match the required `patch_size` of `{patch_size}`. "
185
+ f"Updating `image_size` to `{patch_size}` for consistency. "
186
+ f"Ensure that `image_size` aligns with `patch_size` in the configuration."
187
+ )
188
+ sub_config.update({"image_size": patch_size})
189
+ sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config)
190
+ elif isinstance(sub_config, PretrainedConfig):
191
+ sub_config = sub_config
192
+ image_size = getattr(sub_config, "image_size", None)
193
+ if image_size != patch_size:
194
+ raise ValueError(
195
+ f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={patch_size}`."
196
+ )
197
+ else:
198
+ raise TypeError(
199
+ f"Invalid type for `sub_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(sub_config)}."
200
+ )
201
+
202
+ setattr(self, sub_config_key, sub_config)
203
+
204
+
205
+ __all__ = ["DepthProConfig"]
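`DepthProConfig` accepts each sub-config as a `PretrainedConfig`, as a plain dict (which must include `model_type` and whose `image_size` is forced to match `patch_size`), or as `None` (which falls back to a default `Dinov2Config`). A short sketch of the dict path, assuming an installed `transformers` version that ships DepthPro:

```python
from transformers import DepthProConfig  # assumes a transformers release that includes DepthPro

# Plain dict sub-config: `model_type` is required, `image_size` should match `patch_size`.
backbone = {"model_type": "dinov2", "image_size": 384}
config = DepthProConfig(patch_size=384, image_model_config=backbone)

print(type(config.image_model_config).__name__)  # Dinov2Config
print(config.image_model_config.image_size)      # 384
```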
docs/transformers/build/lib/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py ADDED
@@ -0,0 +1,254 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import gc
17
+ import os
18
+
19
+ import regex as re
20
+ import torch
21
+ from huggingface_hub import hf_hub_download
22
+
23
+ from transformers import (
24
+ DepthProConfig,
25
+ DepthProForDepthEstimation,
26
+ DepthProImageProcessorFast,
27
+ )
28
+
29
+
30
+ # fmt: off
31
+ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
32
+
33
+ # encoder
34
+ r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token",
35
+ r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings",
36
+ r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2",
37
+ r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.norm\3.\4",
38
+ r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.attention.(query|key|value).\3",
39
+ r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.output.dense.\3",
40
+ r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.layer_scale\3.lambda1",
41
+ r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4",
42
+ r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2",
43
+ r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1",
44
+
45
+ # fov
46
+ r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token",
47
+ r"fov.encoder.0.pos_embed": r"fov_model.fov_encoder.model.embeddings.position_embeddings",
48
+ r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.fov_encoder.model.embeddings.patch_embeddings.projection.\1",
49
+ r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.norm\2.\3",
50
+ r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.attention.(query|key|value).\2",
51
+ r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.output.dense.\2",
52
+ r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1",
53
+ r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3",
54
+ r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1",
55
+ r"fov.downsample.0.(weight|bias)": r"fov_model.conv.\1",
56
+ r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1",
57
+ r"fov.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2",
58
+
59
+ # head
60
+ r"head.(\d+).(weight|bias)": r"head.layers.\1.\2",
61
+
62
+ # upsamples
63
+ r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1",
64
+ r"encoder.upsample_latent(\d+).(\d+).(weight|bias)": lambda match: (
65
+ f"depth_pro.neck.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}"
66
+ ),
67
+ r"encoder.upsample(\d+).(\d+).(weight|bias)": lambda match: (
68
+ f"depth_pro.neck.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}"
69
+ ),
70
+
71
+ # projections between encoder and fusion
72
+ r"decoder.convs.(\d+).weight": lambda match: (
73
+ f"depth_pro.neck.feature_projection.projections.{4-int(match.group(1))}.weight"
74
+ ),
75
+
76
+ # fusion stage
77
+ r"decoder.fusions.([1234]).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: (
78
+ f"fusion_stage.intermediate.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}"
79
+ ),
80
+ r"decoder.fusions.0.resnet(\d+).residual.(\d+).(weight|bias)": lambda match: (
81
+ f"fusion_stage.final.residual_layer{match.group(1)}.convolution{(int(match.group(2))+1)//2}.{match.group(3)}"
82
+ ),
83
+ r"decoder.fusions.([1234]).out_conv.(weight|bias)": lambda match: (
84
+ f"fusion_stage.intermediate.{4-int(match.group(1))}.projection.{match.group(2)}"
85
+ ),
86
+ r"decoder.fusions.0.out_conv.(weight|bias)": lambda match: (
87
+ f"fusion_stage.final.projection.{match.group(1)}"
88
+ ),
89
+ r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: (
90
+ f"fusion_stage.intermediate.{4-int(match.group(1))}.deconv.{match.group(2)}"
91
+ ),
92
+ }
93
+ # fmt: on
94
+
95
+
96
+ def convert_old_keys_to_new_keys(state_dict_keys: dict = None):
97
+ output_dict = {}
98
+ if state_dict_keys is not None:
99
+ old_text = "\n".join(state_dict_keys)
100
+ new_text = old_text
101
+ for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
102
+ if replacement is None:
103
+ new_text = re.sub(pattern, "", new_text) # an empty line
104
+ continue
105
+ new_text = re.sub(pattern, replacement, new_text)
106
+ output_dict = dict(zip(old_text.split("\n"), new_text.split("\n")))
107
+ return output_dict
108
+
109
+
110
+ def get_qkv_state_dict(key, parameter):
111
+ """
112
+ new key which looks like this
113
+ xxxx.(q|k|v).xxx (m, n)
114
+
115
+ is converted to
116
+ xxxx.q.xxxx (m//3, n)
117
+ xxxx.k.xxxx (m//3, n)
118
+ xxxx.v.xxxx (m//3, n)
119
+ """
120
+ qkv_state_dict = {}
121
+ placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)"
122
+ replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value']
123
+ replacements_vals = torch.split(
124
+ parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0
125
+ )
126
+ for replacement_key, replacement_val in zip(replacements_keys, replacements_vals):
127
+ qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val
128
+ return qkv_state_dict
129
+
130
+
131
+ def write_model(
132
+ hf_repo_id: str,
133
+ output_dir: str,
134
+ safe_serialization: bool = True,
135
+ ):
136
+ os.makedirs(output_dir, exist_ok=True)
137
+
138
+ # ------------------------------------------------------------
139
+ # Create and save config
140
+ # ------------------------------------------------------------
141
+
142
+ # create config
143
+ backbone_config = {
144
+ "model_type": "dinov2",
145
+ "num_hidden_layers": 24,
146
+ "patch_size": 16,
147
+ "hidden_size": 1024,
148
+ "num_attention_heads": 16,
149
+ "image_size": 384,
150
+ "use_mask_token": False,
151
+ }
152
+ config = DepthProConfig(
153
+ # original implementation uses same config for all 3 models
154
+ image_model_config=backbone_config,
155
+ patch_model_config=backbone_config,
156
+ fov_model_config=backbone_config,
157
+ use_fov_model=True,
158
+ )
159
+
160
+ # save config
161
+ config.save_pretrained(output_dir)
162
+ print("Model config saved successfully...")
163
+
164
+ # ------------------------------------------------------------
165
+ # Convert weights
166
+ # ------------------------------------------------------------
167
+
168
+ # download and load state_dict from hf repo
169
+ file_path = hf_hub_download(hf_repo_id, "depth_pro.pt")
170
+ loaded = torch.load(file_path, weights_only=True)
171
+
172
+ print("Converting model...")
173
+ all_keys = list(loaded.keys())
174
+ new_keys = convert_old_keys_to_new_keys(all_keys)
175
+
176
+ state_dict = {}
177
+ for key in all_keys:
178
+ new_key = new_keys[key]
179
+ current_parameter = loaded.pop(key)
180
+
181
+ if "qkv" in key:
182
+ qkv_state_dict = get_qkv_state_dict(new_key, current_parameter)
183
+ state_dict.update(qkv_state_dict)
184
+ else:
185
+ state_dict[new_key] = current_parameter
186
+
187
+ print("Loading the checkpoint in a DepthPro model.")
188
+ model = DepthProForDepthEstimation(config)
189
+ model.load_state_dict(state_dict, strict=True, assign=True)
190
+ print("Checkpoint loaded successfully.")
191
+
192
+ print("Saving the model.")
193
+ model.save_pretrained(output_dir, safe_serialization=safe_serialization)
194
+ del state_dict, model
195
+
196
+ # Safety check: reload the converted model
197
+ gc.collect()
198
+ print("Reloading the model to check if it's saved correctly.")
199
+ model = DepthProForDepthEstimation.from_pretrained(output_dir, device_map="auto")
200
+ print("Model reloaded successfully.")
201
+ return model
202
+
203
+
204
+ def write_image_processor(output_dir: str):
205
+ image_processor = DepthProImageProcessorFast()
206
+ image_processor.save_pretrained(output_dir)
207
+ return image_processor
208
+
209
+
210
+ def main():
211
+ parser = argparse.ArgumentParser()
212
+ parser.add_argument(
213
+ "--hf_repo_id",
214
+ default="apple/DepthPro",
215
+ help="Location of official weights from apple on HF",
216
+ )
217
+ parser.add_argument(
218
+ "--output_dir",
219
+ default="apple_DepthPro",
220
+ help="Location to write the converted model and processor",
221
+ )
222
+ parser.add_argument(
223
+ "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
224
+ )
225
+ parser.add_argument(
226
+ "--push_to_hub",
227
+ action=argparse.BooleanOptionalAction,
228
+ help="Whether or not to push the converted model to the huggingface hub.",
229
+ )
230
+ parser.add_argument(
231
+ "--hub_repo_id",
232
+ default="apple/DepthPro-hf",
233
+ help="Huggingface hub repo to write the converted model and processor",
234
+ )
235
+ args = parser.parse_args()
236
+
237
+ model = write_model(
238
+ hf_repo_id=args.hf_repo_id,
239
+ output_dir=args.output_dir,
240
+ safe_serialization=args.safe_serialization,
241
+ )
242
+
243
+ image_processor = write_image_processor(
244
+ output_dir=args.output_dir,
245
+ )
246
+
247
+ if args.push_to_hub:
248
+ print("Pushing to hub...")
249
+ model.push_to_hub(args.hub_repo_id)
250
+ image_processor.push_to_hub(args.hub_repo_id)
251
+
252
+
253
+ if __name__ == "__main__":
254
+ main()
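For reference, a minimal sketch of driving the conversion above programmatically; the output directory is a placeholder and the CLI line in the comment simply mirrors the argparse flags defined in `main()`:

```python
# Assumes the functions defined in this script are in scope.
model = write_model(
    hf_repo_id="apple/DepthPro",
    output_dir="converted_depth_pro",  # placeholder path
    safe_serialization=True,
)
image_processor = write_image_processor(output_dir="converted_depth_pro")

# Roughly equivalent CLI invocation:
#   python convert_depth_pro_weights_to_hf.py --output_dir converted_depth_pro --push_to_hub
```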
docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro.py ADDED
@@ -0,0 +1,392 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for DepthPro."""
16
+
17
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+
21
+ from ...utils.import_utils import requires
22
+
23
+
24
+ if TYPE_CHECKING:
25
+ from .modeling_depth_pro import DepthProDepthEstimatorOutput
26
+
27
+ from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
28
+ from ...image_transforms import to_channel_dimension_format
29
+ from ...image_utils import (
30
+ IMAGENET_STANDARD_MEAN,
31
+ IMAGENET_STANDARD_STD,
32
+ ChannelDimension,
33
+ ImageInput,
34
+ PILImageResampling,
35
+ infer_channel_dimension_format,
36
+ is_scaled_image,
37
+ is_torch_available,
38
+ make_list_of_images,
39
+ pil_torch_interpolation_mapping,
40
+ to_numpy_array,
41
+ valid_images,
42
+ )
43
+ from ...utils import (
44
+ TensorType,
45
+ filter_out_non_signature_kwargs,
46
+ logging,
47
+ requires_backends,
48
+ )
49
+
50
+
51
+ if is_torch_available():
52
+ import torch
53
+
54
+
55
+ logger = logging.get_logger(__name__)
56
+
57
+
58
+ @requires(backends=("torchvision", "torch"))
59
+ class DepthProImageProcessor(BaseImageProcessor):
60
+ r"""
61
+ Constructs a DepthPro image processor.
62
+
63
+ Args:
64
+ do_resize (`bool`, *optional*, defaults to `True`):
65
+ Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
66
+ size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
67
+ size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`):
68
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
69
+ method.
70
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
71
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
72
+ `preprocess` method.
73
+ do_rescale (`bool`, *optional*, defaults to `True`):
74
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
75
+ parameter in the `preprocess` method.
76
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
77
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
78
+ `preprocess` method.
79
+ do_normalize (`bool`, *optional*, defaults to `True`):
80
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
81
+ method.
82
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
83
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
84
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
85
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
86
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
87
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
88
+ """
89
+
90
+ model_input_names = ["pixel_values"]
91
+
92
+ def __init__(
93
+ self,
94
+ do_resize: bool = True,
95
+ size: Optional[Dict[str, int]] = None,
96
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
97
+ do_rescale: bool = True,
98
+ rescale_factor: Union[int, float] = 1 / 255,
99
+ do_normalize: bool = True,
100
+ image_mean: Optional[Union[float, List[float]]] = None,
101
+ image_std: Optional[Union[float, List[float]]] = None,
102
+ **kwargs,
103
+ ):
104
+ super().__init__(**kwargs)
105
+ size = size if size is not None else {"height": 1536, "width": 1536}
106
+ size = get_size_dict(size)
107
+ self.do_resize = do_resize
108
+ self.do_rescale = do_rescale
109
+ self.do_normalize = do_normalize
110
+ self.size = size
111
+ self.resample = resample
112
+ self.rescale_factor = rescale_factor
113
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
114
+ self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
115
+
116
+ def resize(
117
+ self,
118
+ image: np.ndarray,
119
+ size: Dict[str, int],
120
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
121
+ data_format: Optional[Union[str, ChannelDimension]] = None,
122
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
123
+ **kwargs,
124
+ ) -> np.ndarray:
125
+ """
126
+ Resize an image to `(size["height"], size["width"])`.
127
+
128
+ Args:
129
+ image (`np.ndarray`):
130
+ Image to resize.
131
+ size (`Dict[str, int]`):
132
+ Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
133
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
134
+ `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
135
+ data_format (`ChannelDimension` or `str`, *optional*):
136
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
137
+ image is used. Can be one of:
138
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
139
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
140
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
141
+ input_data_format (`ChannelDimension` or `str`, *optional*):
142
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
143
+ from the input image. Can be one of:
144
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
145
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
146
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
147
+
148
+ Returns:
149
+ `np.ndarray`: The resized images.
150
+ """
151
+ requires_backends(self, "torch")
152
+
153
+ size = get_size_dict(size)
154
+ if "height" not in size or "width" not in size:
155
+ raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
156
+ output_size = (size["height"], size["width"])
157
+
158
+ # we use torch interpolation instead of image.resize because DepthProImageProcessor
159
+ # rescales and normalizes the image (which can make some values negative) before resizing it.
160
+ # image.resize expects all values to be in range [0, 1] or [0, 255] and throws an exception otherwise,
161
+ # however pytorch interpolation works with negative values.
162
+ # relevant issue here: https://github.com/huggingface/transformers/issues/34920
163
+ # input should be (B, C, H, W)
164
+ image_tensor = torch.from_numpy(image).unsqueeze(0)
165
+ resized_image = torch.nn.functional.interpolate(
166
+ input=image_tensor,
167
+ size=output_size,
168
+ mode=pil_torch_interpolation_mapping[resample].value,
169
+ )
170
+ resized_image = resized_image.squeeze(0).numpy()
171
+ return resized_image
172
+
173
+ def _validate_input_arguments(
174
+ self,
175
+ do_resize: bool,
176
+ size: Dict[str, int],
177
+ resample: PILImageResampling,
178
+ do_rescale: bool,
179
+ rescale_factor: float,
180
+ do_normalize: bool,
181
+ image_mean: Union[float, List[float]],
182
+ image_std: Union[float, List[float]],
183
+ data_format: Union[str, ChannelDimension],
184
+ ):
185
+ if do_resize and None in (size, resample):
186
+ raise ValueError("Size and resample must be specified if do_resize is True.")
187
+
188
+ if do_rescale and rescale_factor is None:
189
+ raise ValueError("Rescale factor must be specified if do_rescale is True.")
190
+
191
+ if do_normalize and None in (image_mean, image_std):
192
+ raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
193
+
194
+ @filter_out_non_signature_kwargs()
195
+ def preprocess(
196
+ self,
197
+ images: ImageInput,
198
+ do_resize: Optional[bool] = None,
199
+ size: Optional[Dict[str, int]] = None,
200
+ resample: Optional[PILImageResampling] = None,
201
+ do_rescale: Optional[bool] = None,
202
+ rescale_factor: Optional[float] = None,
203
+ do_normalize: Optional[bool] = None,
204
+ image_mean: Optional[Union[float, List[float]]] = None,
205
+ image_std: Optional[Union[float, List[float]]] = None,
206
+ return_tensors: Optional[Union[str, TensorType]] = None,
207
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
208
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
209
+ ):
210
+ """
211
+ Preprocess an image or batch of images.
212
+
213
+ Args:
214
+ images (`ImageInput`):
215
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
216
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
217
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
218
+ Whether to resize the image.
219
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
220
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
221
+ resizing.
222
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
223
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
224
+ an effect if `do_resize` is set to `True`.
225
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
226
+ Whether to rescale the image values between [0 - 1].
227
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
228
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
229
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
230
+ Whether to normalize the image.
231
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
232
+ Image mean to use if `do_normalize` is set to `True`.
233
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
234
+ Image standard deviation to use if `do_normalize` is set to `True`.
235
+ return_tensors (`str` or `TensorType`, *optional*):
236
+ The type of tensors to return. Can be one of:
237
+ - Unset: Return a list of `np.ndarray`.
238
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
239
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
240
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
241
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
242
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
243
+ The channel dimension format for the output image. Can be one of:
244
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
245
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
246
+ - Unset: Use the channel dimension format of the input image.
247
+ input_data_format (`ChannelDimension` or `str`, *optional*):
248
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
249
+ from the input image. Can be one of:
250
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
251
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
252
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
253
+ """
254
+ do_resize = do_resize if do_resize is not None else self.do_resize
255
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
256
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
257
+ resample = resample if resample is not None else self.resample
258
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
259
+ image_mean = image_mean if image_mean is not None else self.image_mean
260
+ image_std = image_std if image_std is not None else self.image_std
261
+
262
+ size = size if size is not None else self.size
263
+
264
+ images = make_list_of_images(images)
265
+
266
+ if not valid_images(images):
267
+ raise ValueError(
268
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
269
+ "torch.Tensor, tf.Tensor or jax.ndarray."
270
+ )
271
+ self._validate_input_arguments(
272
+ do_resize=do_resize,
273
+ size=size,
274
+ resample=resample,
275
+ do_rescale=do_rescale,
276
+ rescale_factor=rescale_factor,
277
+ do_normalize=do_normalize,
278
+ image_mean=image_mean,
279
+ image_std=image_std,
280
+ data_format=data_format,
281
+ )
282
+
283
+ # All transformations expect numpy arrays.
284
+ images = [to_numpy_array(image) for image in images]
285
+
286
+ if is_scaled_image(images[0]) and do_rescale:
287
+ logger.warning_once(
288
+ "It looks like you are trying to rescale already rescaled images. If the input"
289
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
290
+ )
291
+
292
+ if input_data_format is None:
293
+ # We assume that all images have the same channel dimension format.
294
+ input_data_format = infer_channel_dimension_format(images[0])
295
+
296
+ all_images = []
297
+ for image in images:
298
+ if do_rescale:
299
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
300
+
301
+ if do_normalize:
302
+ image = self.normalize(
303
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
304
+ )
305
+
306
+ # depth-pro rescales and normalizes the image before resizing it
307
+ # uses torch interpolation which requires ChannelDimension.FIRST
308
+ if do_resize:
309
+ image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
310
+ image = self.resize(image=image, size=size, resample=resample)
311
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST)
312
+ else:
313
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
314
+
315
+ all_images.append(image)
316
+
317
+ data = {"pixel_values": all_images}
318
+ return BatchFeature(data=data, tensor_type=return_tensors)
319
+
320
+ def post_process_depth_estimation(
321
+ self,
322
+ outputs: "DepthProDepthEstimatorOutput",
323
+ target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None,
324
+ ) -> Dict[str, List[TensorType]]:
325
+ """
326
+ Post-processes the raw depth predictions from the model to generate
327
+ final depth predictions, which are calibrated using the field of view if provided
328
+ and resized to specified target sizes if provided.
329
+
330
+ Args:
331
+ outputs ([`DepthProDepthEstimatorOutput`]):
332
+ Raw outputs of the model.
333
+ target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`):
334
+ Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)`
335
+ or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing
336
+ is performed.
337
+
338
+ Returns:
339
+ `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
340
+ predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`.
341
+
342
+ Raises:
343
+ `ValueError`:
344
+ If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched.
345
+ """
346
+ requires_backends(self, "torch")
347
+
348
+ predicted_depth = outputs.predicted_depth
349
+ fov = outputs.field_of_view
350
+
351
+ batch_size = len(predicted_depth)
352
+
353
+ if target_sizes is not None and batch_size != len(target_sizes):
354
+ raise ValueError(
355
+ "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
356
+ )
357
+
358
+ results = []
359
+ fov = [None] * batch_size if fov is None else fov
360
+ target_sizes = [None] * batch_size if target_sizes is None else target_sizes
361
+ for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes):
362
+ focal_length = None
363
+ if target_size is not None:
364
+ # scale image w.r.t fov
365
+ if fov_value is not None:
366
+ width = target_size[1]
367
+ focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value))
368
+ depth = depth * width / focal_length
369
+
370
+ # interpolate
371
+ depth = torch.nn.functional.interpolate(
372
+ # input should be (B, C, H, W)
373
+ input=depth.unsqueeze(0).unsqueeze(1),
374
+ size=target_size,
375
+ mode=pil_torch_interpolation_mapping[self.resample].value,
376
+ ).squeeze()
377
+
378
+ # inverse the depth
379
+ depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4)
380
+
381
+ results.append(
382
+ {
383
+ "predicted_depth": depth,
384
+ "field_of_view": fov_value,
385
+ "focal_length": focal_length,
386
+ }
387
+ )
388
+
389
+ return results
390
+
391
+
392
+ __all__ = ["DepthProImageProcessor"]
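A hedged end-to-end sketch of the slow processor: preprocess an image, run depth estimation, then post-process back to the original resolution. The checkpoint name and image URL follow the examples used elsewhere for this model and are illustrative:

```python
import requests
import torch
from PIL import Image
from transformers import DepthProForDepthEstimation, DepthProImageProcessor

image = Image.open(
    requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw
)

image_processor = DepthProImageProcessor()
model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Resizes depth to (height, width) and, when a field of view is predicted,
# rescales it by width / focal_length before inverting, as in the method above.
results = image_processor.post_process_depth_estimation(
    outputs, target_sizes=[(image.height, image.width)]
)
depth = results[0]["predicted_depth"]  # (height, width) depth map
```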
docs/transformers/build/lib/transformers/models/depth_pro/image_processing_depth_pro_fast.py ADDED
@@ -0,0 +1,189 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Image processor class for DepthPro."""
16
+
17
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
18
+
19
+ from ...image_processing_base import BatchFeature
20
+ from ...image_processing_utils_fast import (
21
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
22
+ BaseImageProcessorFast,
23
+ group_images_by_shape,
24
+ reorder_images,
25
+ )
26
+ from ...image_utils import (
27
+ IMAGENET_STANDARD_MEAN,
28
+ IMAGENET_STANDARD_STD,
29
+ PILImageResampling,
30
+ SizeDict,
31
+ )
32
+ from ...utils import (
33
+ TensorType,
34
+ add_start_docstrings,
35
+ is_torch_available,
36
+ is_torchvision_available,
37
+ is_torchvision_v2_available,
38
+ logging,
39
+ requires_backends,
40
+ )
41
+ from ...utils.import_utils import requires
42
+
43
+
44
+ if TYPE_CHECKING:
45
+ from .modeling_depth_pro import DepthProDepthEstimatorOutput
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+
50
+ if is_torch_available():
51
+ import torch
52
+
53
+
54
+ if is_torchvision_available():
55
+ from ...image_utils import pil_torch_interpolation_mapping
56
+
57
+ if is_torchvision_v2_available():
58
+ from torchvision.transforms.v2 import functional as F
59
+ else:
60
+ from torchvision.transforms import functional as F
61
+
62
+
63
+ @add_start_docstrings(
64
+ "Constructs a fast DepthPro image processor.",
65
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
66
+ )
67
+ @requires(backends=("torchvision", "torch"))
68
+ class DepthProImageProcessorFast(BaseImageProcessorFast):
69
+ resample = PILImageResampling.BILINEAR
70
+ image_mean = IMAGENET_STANDARD_MEAN
71
+ image_std = IMAGENET_STANDARD_STD
72
+ size = {"height": 1536, "width": 1536}
73
+ do_resize = True
74
+ do_rescale = True
75
+ do_normalize = True
76
+
77
+ # DepthPro resizes image after rescaling and normalizing,
78
+ # which makes it different from BaseImageProcessorFast._preprocess
79
+ def _preprocess(
80
+ self,
81
+ images: List["torch.Tensor"],
82
+ do_resize: bool,
83
+ size: SizeDict,
84
+ interpolation: Optional["F.InterpolationMode"],
85
+ do_center_crop: bool,
86
+ crop_size: SizeDict,
87
+ do_rescale: bool,
88
+ rescale_factor: float,
89
+ do_normalize: bool,
90
+ image_mean: Optional[Union[float, List[float]]],
91
+ image_std: Optional[Union[float, List[float]]],
92
+ return_tensors: Optional[Union[str, TensorType]],
93
+ ) -> BatchFeature:
94
+ # Group images by size for batched scaling
95
+ grouped_images, grouped_images_index = group_images_by_shape(images)
96
+ processed_images_grouped = {}
97
+ for shape, stacked_images in grouped_images.items():
98
+ # Fused rescale and normalize
99
+ stacked_images = self.rescale_and_normalize(
100
+ stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
101
+ )
102
+ if do_resize:
103
+ stacked_images = self.resize(
104
+ image=stacked_images,
105
+ size=size,
106
+ interpolation=interpolation,
107
+ antialias=False,
108
+ )
109
+ processed_images_grouped[shape] = stacked_images
110
+
111
+ processed_images = reorder_images(processed_images_grouped, grouped_images_index)
112
+ processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
113
+
114
+ return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
115
+
116
+ # Copied from transformers.models.depth_pro.image_processing_depth_pro.DepthProImageProcessor.post_process_depth_estimation
117
+ def post_process_depth_estimation(
118
+ self,
119
+ outputs: "DepthProDepthEstimatorOutput",
120
+ target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None,
121
+ ) -> Dict[str, List[TensorType]]:
122
+ """
123
+ Post-processes the raw depth predictions from the model to generate
124
+ final depth predictions, which are calibrated using the field of view if provided
125
+ and resized to specified target sizes if provided.
126
+
127
+ Args:
128
+ outputs ([`DepthProDepthEstimatorOutput`]):
129
+ Raw outputs of the model.
130
+ target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`):
131
+ Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)`
132
+ or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing
133
+ is performed.
134
+
135
+ Returns:
136
+ `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
137
+ predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`.
138
+
139
+ Raises:
140
+ `ValueError`:
141
+ If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched.
142
+ """
143
+ requires_backends(self, "torch")
144
+
145
+ predicted_depth = outputs.predicted_depth
146
+ fov = outputs.field_of_view
147
+
148
+ batch_size = len(predicted_depth)
149
+
150
+ if target_sizes is not None and batch_size != len(target_sizes):
151
+ raise ValueError(
152
+ "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
153
+ )
154
+
155
+ results = []
156
+ fov = [None] * batch_size if fov is None else fov
157
+ target_sizes = [None] * batch_size if target_sizes is None else target_sizes
158
+ for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes):
159
+ focal_length = None
160
+ if target_size is not None:
161
+ # scale image w.r.t fov
162
+ if fov_value is not None:
163
+ width = target_size[1]
164
+ focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value))
165
+ depth = depth * width / focal_length
166
+
167
+ # interpolate
168
+ depth = torch.nn.functional.interpolate(
169
+ # input should be (B, C, H, W)
170
+ input=depth.unsqueeze(0).unsqueeze(1),
171
+ size=target_size,
172
+ mode=pil_torch_interpolation_mapping[self.resample].value,
173
+ ).squeeze()
174
+
175
+ # inverse the depth
176
+ depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4)
177
+
178
+ results.append(
179
+ {
180
+ "predicted_depth": depth,
181
+ "field_of_view": fov_value,
182
+ "focal_length": focal_length,
183
+ }
184
+ )
185
+
186
+ return results
187
+
188
+
189
+ __all__ = ["DepthProImageProcessorFast"]
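The fast variant exposes the same interface but operates on batched torch tensors and, as in `_preprocess` above, fuses rescale and normalize before resizing. A minimal usage sketch (image URL illustrative):

```python
import requests
from PIL import Image
from transformers import DepthProImageProcessorFast

image = Image.open(
    requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw
)

# With the default 1536x1536 size defined above, this yields a
# (1, 3, 1536, 1536) pixel_values tensor.
fast_processor = DepthProImageProcessorFast()
inputs = fast_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)
```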
docs/transformers/build/lib/transformers/models/depth_pro/modeling_depth_pro.py ADDED
@@ -0,0 +1,1218 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Apple Research Team Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch DepthPro model."""
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from torch import nn
24
+
25
+ from ...modeling_outputs import BaseModelOutput
26
+ from ...modeling_utils import PreTrainedModel
27
+ from ...utils import (
28
+ ModelOutput,
29
+ add_start_docstrings,
30
+ add_start_docstrings_to_model_forward,
31
+ logging,
32
+ replace_return_docstrings,
33
+ torch_int,
34
+ )
35
+ from ..auto import AutoModel
36
+ from .configuration_depth_pro import DepthProConfig
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ @dataclass
43
+ class DepthProOutput(ModelOutput):
44
+ """
45
+ Base class for DepthPro's outputs.
46
+
47
+ Args:
48
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`):
49
+ Sequence of hidden-states at the output of the last layer of the model.
50
+ features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*):
51
+ Features from encoders. Can be a single feature or a list of features.
52
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
53
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
54
+ one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`.
55
+
56
+ Hidden-states of the model at the output of each layer and the optional initial embedding outputs.
57
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
58
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length,
59
+ sequence_length)`.
60
+
61
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
62
+ heads.
63
+ """
64
+
65
+ last_hidden_state: Optional[torch.FloatTensor] = None
66
+ features: Union[torch.FloatTensor, List[torch.FloatTensor]] = None
67
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
68
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
69
+
70
+
71
+ @dataclass
72
+ class DepthProDepthEstimatorOutput(ModelOutput):
73
+ """
74
+ Base class for DepthProForDepthEstimation's output.
75
+
76
+ Args:
77
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
78
+ Classification (or regression if config.num_labels==1) loss.
79
+ predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
80
+ Predicted depth for each pixel.
81
+ field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided):
82
+ Field of View Scaler.
83
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
84
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
85
+ one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`.
86
+
87
+ Hidden-states of the model at the output of each layer and the optional initial embedding outputs.
88
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
89
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length,
90
+ sequence_length)`.
91
+
92
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
93
+ heads.
94
+ """
95
+
96
+ loss: Optional[torch.FloatTensor] = None
97
+ predicted_depth: Optional[torch.FloatTensor] = None
98
+ field_of_view: Optional[torch.FloatTensor] = None
99
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
100
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
101
+
102
+
103
+ def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor:
104
+ """Creates Patches from Batch."""
105
+ batch_size, num_channels, height, width = pixel_values.shape
106
+
107
+ if height == width == patch_size:
108
+ # create patches only if scaled image is not already equal to patch size
109
+ return pixel_values
110
+
111
+ stride = torch_int(patch_size * (1 - overlap_ratio))
112
+
113
+ patches = F.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride))
114
+ patches = patches.permute(2, 0, 1)
115
+ patches = patches.reshape(-1, num_channels, patch_size, patch_size)
116
+
117
+ return patches
118
+
119
+
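`split_to_patches` is a thin wrapper around `torch.nn.functional.unfold`: the stride is derived from the overlap ratio and the unfolded windows are reshaped into a patch-major batch. A rough numerical sketch; the sizes are illustrative, not necessarily the DepthPro defaults:

```python
import torch
import torch.nn.functional as F

# A 1536x1536 image cut into 384x384 patches with 25% overlap (illustrative numbers).
pixel_values = torch.randn(1, 3, 1536, 1536)
patch_size, overlap_ratio = 384, 0.25
stride = int(patch_size * (1 - overlap_ratio))  # 288

patches = F.unfold(pixel_values, kernel_size=patch_size, stride=stride)
patches = patches.permute(2, 0, 1).reshape(-1, 3, patch_size, patch_size)
print(patches.shape)  # torch.Size([25, 3, 384, 384]): (1536 - 384) // 288 + 1 = 5 windows per side
```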
120
+ def reshape_features(hidden_states: torch.Tensor) -> torch.Tensor:
121
+ """Discard class token and reshape 1D feature map to a 2D grid."""
122
+ n_samples, seq_len, hidden_size = hidden_states.shape
123
+ size = torch_int(seq_len**0.5)
124
+
125
+ hidden_states = hidden_states[:, -(size**2) :, :] # remove special tokens if there are any
126
+ hidden_states = hidden_states.reshape(n_samples, size, size, hidden_size)
127
+ hidden_states = hidden_states.permute(0, 3, 1, 2)
128
+
129
+ return hidden_states
130
+
131
+
132
+ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch.Tensor:
133
+ """Merges smaller patches into image-like feature map."""
134
+ n_patches, hidden_size, out_size, out_size = patches.shape
135
+ n_patches_per_batch = n_patches // batch_size
136
+ sqrt_n_patches_per_batch = torch_int(n_patches_per_batch**0.5)
137
+ new_out_size = sqrt_n_patches_per_batch * out_size
138
+
139
+ if n_patches == batch_size:
140
+ # merge only if the patches were created from scaled image
141
+ # patches are not created when scaled image size is equal to patch size
142
+ return patches
143
+
144
+ if n_patches_per_batch < 4:
145
+ # for each batch, at least 4 small patches are required to
146
+ # recreate a large square patch from merging them and later padding is applied
147
+ # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
148
+ # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)
149
+ # 5 x (8x8) patches becomes 1 x (16x16) patch (extra patch ignored, padding later)
150
+ # 9 x (8x8) patches becomes 1 x (24x24) patch (padding later)
151
+ # thus the following code only rearranges the patches and removes extra ones
152
+ padding = 0
153
+
154
+ # make sure padding is not large enough to remove more than half of the patch
155
+ padding = min(out_size // 4, padding)
156
+
157
+ if padding == 0:
158
+ # faster when no padding is required
159
+ merged = patches.reshape(n_patches_per_batch, batch_size, hidden_size, out_size, out_size)
160
+ merged = merged.permute(1, 2, 0, 3, 4)
161
+ merged = merged[:, :, : sqrt_n_patches_per_batch**2, :, :]
162
+ merged = merged.reshape(
163
+ batch_size, hidden_size, sqrt_n_patches_per_batch, sqrt_n_patches_per_batch, out_size, out_size
164
+ )
165
+ merged = merged.permute(0, 1, 2, 4, 3, 5)
166
+ merged = merged.reshape(batch_size, hidden_size, new_out_size, new_out_size)
167
+ else:
168
+ # padding example:
169
+ # let out_size = 8, new_out_size = 32, padding = 2
170
+ # each patch is separated by "|"
171
+ # and padding is applied to the merging edges of each patch
172
+ # 00 01 02 03 04 05 06 07 | 08 09 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31
173
+ # 00 01 02 03 04 05 -- -- | -- -- 10 11 12 13 -- -- | -- -- 18 19 20 21 -- -- | -- -- 26 27 28 29 30 31
174
+ i = 0
175
+ boxes = []
176
+ for h in range(sqrt_n_patches_per_batch):
177
+ boxes_in_row = []
178
+ for w in range(sqrt_n_patches_per_batch):
179
+ box = patches[batch_size * i : batch_size * (i + 1)]
180
+
181
+ # collect paddings
182
+ paddings = [0, 0, 0, 0]
183
+ if h != 0:
184
+ # remove pad from height if box is not at top border
185
+ paddings[0] = padding
186
+ if w != 0:
187
+ # remove pad from width if box is not at left border
188
+ paddings[2] = padding
189
+ if h != sqrt_n_patches_per_batch - 1:
190
+ # remove pad from height if box is not at bottom border
191
+ paddings[1] = padding
192
+ if w != sqrt_n_patches_per_batch - 1:
193
+ # remove pad from width if box is not at right border
194
+ paddings[3] = padding
195
+
196
+ # remove paddings
197
+ _, _, box_h, box_w = box.shape
198
+ pad_top, pad_bottom, pad_left, pad_right = paddings
199
+ box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right]
200
+
201
+ boxes_in_row.append(box)
202
+ i += 1
203
+ boxes_in_row = torch.cat(boxes_in_row, dim=-1)
204
+ boxes.append(boxes_in_row)
205
+ merged = torch.cat(boxes, dim=-2)
206
+
207
+ return merged
208
+
209
+
210
+ def reconstruct_feature_maps(
211
+ hidden_state: torch.Tensor, batch_size: int, padding: int, output_size: Tuple[float, float]
212
+ ) -> torch.Tensor:
213
+ """
214
+ Reconstructs feature maps from the hidden state produced by any of the encoder. Converts the hidden state of shape
215
+ `(n_patches_per_batch * batch_size, seq_len, hidden_size)` to feature maps of shape
216
+ `(batch_size, hidden_size, output_size[0], output_size[1])`.
217
+
218
+ Args:
219
+ hidden_state (torch.Tensor): Input tensor of shape `(n_patches_per_batch * batch_size, seq_len, hidden_size)`
220
+ representing the encoded patches.
221
+ batch_size (int): The number of samples in a batch.
222
+ padding (int): The amount of padding to be removed when merging patches.
223
+ output_size (Tuple[float, float]): The desired output size for the feature maps, specified as `(height, width)`.
224
+
225
+ Returns:
226
+ torch.Tensor: Reconstructed feature maps of shape `(batch_size, hidden_size, output_size[0], output_size[1])`.
227
+ """
228
+ # reshape back to image like
229
+ features = reshape_features(hidden_state)
230
+
231
+ # merge all patches in a batch to create one large patch per batch
232
+ features = merge_patches(
233
+ features,
234
+ batch_size=batch_size,
235
+ padding=padding,
236
+ )
237
+
238
+ # interpolate patches to base size
239
+ features = F.interpolate(
240
+ features,
241
+ size=output_size,
242
+ mode="bilinear",
243
+ align_corners=False,
244
+ )
245
+
246
+ return features
247
+
248
+
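Put together, `reconstruct_feature_maps` drops the class token, rebuilds each patch as a 2D grid, stitches the patches of every image back into one map, and bilinearly resizes it. A shape-only sketch with illustrative dimensions, assuming the helpers above are in scope:

```python
import torch

# 2 images, 4 patches each, a 24x24 token grid plus a class token, hidden size 32.
batch_size, n_patches_per_batch, grid, hidden = 2, 4, 24, 32
hidden_state = torch.randn(n_patches_per_batch * batch_size, grid * grid + 1, hidden)

features = reconstruct_feature_maps(
    hidden_state, batch_size=batch_size, padding=0, output_size=(96, 96)
)
print(features.shape)  # torch.Size([2, 32, 96, 96])
```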
249
+ class DepthProPatchEncoder(nn.Module):
250
+ def __init__(self, config: DepthProConfig):
251
+ super().__init__()
252
+ self.config = config
253
+
254
+ self.intermediate_hook_ids = config.intermediate_hook_ids
255
+ self.intermediate_feature_dims = config.intermediate_feature_dims
256
+ self.scaled_images_ratios = config.scaled_images_ratios
257
+ self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios
258
+ self.scaled_images_feature_dims = config.scaled_images_feature_dims
259
+ self.merge_padding_value = config.merge_padding_value
260
+
261
+ self.n_scaled_images = len(config.scaled_images_ratios)
262
+ self.n_intermediate_hooks = len(config.intermediate_hook_ids)
263
+ self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
264
+
265
+ self.model = AutoModel.from_config(config.patch_model_config)
266
+
267
+ def forward(
268
+ self,
269
+ pixel_values: torch.Tensor,
270
+ head_mask: Optional[torch.Tensor] = None,
271
+ ) -> List[torch.Tensor]:
272
+ batch_size, num_channels, height, width = pixel_values.shape
273
+
274
+ if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size:
275
+ raise ValueError(
276
+ f"Image size {height}x{width} is too small to be scaled "
277
+ f"with scaled_images_ratios={self.scaled_images_ratios} "
278
+ f"when patch_size={self.config.patch_size}."
279
+ )
280
+
281
+ # STEP 1: create 3-level image
282
+
283
+ scaled_images = []
284
+ for ratio in self.scaled_images_ratios:
285
+ scaled_images.append(
286
+ F.interpolate(
287
+ pixel_values,
288
+ scale_factor=ratio,
289
+ mode="bilinear",
290
+ align_corners=False,
291
+ )
292
+ )
293
+
294
+ # STEP 2: create patches
295
+
296
+ for i in range(self.n_scaled_images):
297
+ scaled_images[i] = split_to_patches(
298
+ scaled_images[i],
299
+ patch_size=self.config.patch_size,
300
+ overlap_ratio=self.scaled_images_overlap_ratios[i],
301
+ )
302
+ n_patches_per_scaled_image = [len(i) for i in scaled_images]
303
+ patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first
304
+
305
+ # STEP 3: apply patch encoder
306
+
307
+ encodings = self.model(
308
+ # each patch is processed as a separate batch
309
+ patches,
310
+ head_mask=head_mask,
311
+ # required for intermediate features
312
+ output_hidden_states=self.n_intermediate_hooks > 0,
313
+ )
314
+
315
+ scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1])
316
+ # -1 (reverse list) as patch encoder returns high res patches first, we need low res first
317
+ scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1]
318
+
319
+ # calculate base height and width
320
+ # base height and width are the dimensions of the lowest resolution features
321
+ exponent_value = torch_int(math.log2(width / self.out_size))
322
+ base_height = height // 2**exponent_value
323
+ base_width = width // 2**exponent_value
324
+
325
+ # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram
326
+
327
+ scaled_images_features = []
328
+ for i in range(self.n_scaled_images):
329
+ hidden_state = scaled_images_last_hidden_state[i]
330
+ batch_size = batch_size
331
+ padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i]))
332
+ output_height = base_height * 2**i
333
+ output_width = base_width * 2**i
334
+ features = reconstruct_feature_maps(
335
+ hidden_state,
336
+ batch_size=batch_size,
337
+ padding=padding,
338
+ output_size=(output_height, output_width),
339
+ )
340
+ scaled_images_features.append(features)
341
+
342
+ # STEP 5: get intermediate features - (1-2) in diagram
343
+
344
+ intermediate_features = []
345
+ for i in range(self.n_intermediate_hooks):
346
+ # +1 to correct index position as hidden_states contain embedding output as well
347
+ hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1]
348
+ padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1]))
349
+ output_height = base_height * 2 ** (self.n_scaled_images - 1)
350
+ output_width = base_width * 2 ** (self.n_scaled_images - 1)
351
+ features = reconstruct_feature_maps(
352
+ hidden_state,
353
+ batch_size=batch_size,
354
+ padding=padding,
355
+ output_size=(output_height, output_width),
356
+ )
357
+ intermediate_features.append(features)
358
+
359
+ # STEP 7: combine all features
360
+ features = [*scaled_images_features, *intermediate_features]
361
+
362
+ return features
363
+
364
+
365
+ class DepthProImageEncoder(nn.Module):
366
+ def __init__(self, config: DepthProConfig):
367
+ super().__init__()
368
+ self.config = config
369
+ self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
370
+
371
+ self.model = AutoModel.from_config(config.image_model_config)
372
+
373
+ def forward(
374
+ self,
375
+ pixel_values: torch.Tensor,
376
+ head_mask: Optional[torch.Tensor] = None,
377
+ output_attentions: bool = False,
378
+ output_hidden_states: bool = False,
379
+ return_dict: bool = True,
380
+ ) -> Union[tuple, DepthProOutput]:
381
+ batch_size, num_channels, height, width = pixel_values.shape
382
+
383
+ # scale the image for image_encoder
384
+ size = self.config.image_model_config.image_size
385
+ pixel_values = F.interpolate(
386
+ pixel_values,
387
+ size=(size, size),
388
+ mode="bilinear",
389
+ align_corners=False,
390
+ )
391
+ encodings = self.model(
392
+ pixel_values=pixel_values,
393
+ head_mask=head_mask,
394
+ output_attentions=output_attentions,
395
+ output_hidden_states=output_hidden_states,
396
+ )
397
+
398
+ # calculate base height and width
399
+ # base height and width are the dimensions of the lowest resolution features
400
+ exponent_value = torch_int(math.log2(width / self.out_size))
401
+ base_height = height // 2**exponent_value
402
+ base_width = width // 2**exponent_value
403
+
404
+ features = reconstruct_feature_maps(
405
+ encodings[0],
406
+ batch_size=batch_size,
407
+ padding=0,
408
+ output_size=(base_height, base_width),
409
+ )
410
+
411
+ if not return_dict:
412
+ return (encodings[0], features) + encodings[2:] # encodings[2:] skips last_hidden_state and pooler output
413
+
414
+ return DepthProOutput(
415
+ last_hidden_state=encodings.last_hidden_state,
416
+ features=features,
417
+ hidden_states=encodings.hidden_states,
418
+ attentions=encodings.attentions,
419
+ )
420
+
421
+
422
+ class DepthProEncoder(nn.Module):
423
+ def __init__(self, config: DepthProConfig):
424
+ super().__init__()
425
+ self.config = config
426
+ self.intermediate_hook_ids = config.intermediate_hook_ids
427
+ self.intermediate_feature_dims = config.intermediate_feature_dims
428
+ self.scaled_images_ratios = config.scaled_images_ratios
429
+ self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios
430
+ self.scaled_images_feature_dims = config.scaled_images_feature_dims
431
+ self.merge_padding_value = config.merge_padding_value
432
+
433
+ self.n_scaled_images = len(self.scaled_images_ratios)
434
+ self.n_intermediate_hooks = len(self.intermediate_hook_ids)
435
+
436
+ self.patch_encoder = DepthProPatchEncoder(config)
437
+ self.image_encoder = DepthProImageEncoder(config)
438
+
439
+ def forward(
440
+ self,
441
+ pixel_values: torch.Tensor,
442
+ head_mask: Optional[torch.Tensor] = None,
443
+ output_attentions: bool = False,
444
+ output_hidden_states: bool = False,
445
+ return_dict: bool = True,
446
+ ) -> Union[tuple, DepthProOutput]:
447
+ batch_size, num_channels, height, width = pixel_values.shape
448
+
449
+ patch_features = self.patch_encoder(
450
+ pixel_values,
451
+ head_mask=head_mask,
452
+ )
453
+ image_encodings = self.image_encoder(
454
+ pixel_values,
455
+ head_mask=head_mask,
456
+ output_attentions=output_attentions,
457
+ output_hidden_states=output_hidden_states,
458
+ return_dict=return_dict,
459
+ )
460
+ image_features = image_encodings[1] # index 1 contains features
461
+
462
+ features = [image_features, *patch_features]
463
+
464
+ if not return_dict:
465
+ return (image_encodings[0], features) + image_encodings[2:]
466
+
467
+ return DepthProOutput(
468
+ last_hidden_state=image_encodings.last_hidden_state,
469
+ features=features,
470
+ hidden_states=image_encodings.hidden_states,
471
+ attentions=image_encodings.attentions,
472
+ )
473
+
474
+
475
+ class DepthProFeatureUpsampleBlock(nn.Module):
476
+ def __init__(
477
+ self,
478
+ config: DepthProConfig,
479
+ input_dims: int,
480
+ intermediate_dims: int,
481
+ output_dims: int,
482
+ n_upsample_layers: int,
483
+ use_proj: bool = True,
484
+ bias: bool = False,
485
+ ):
486
+ super().__init__()
487
+ self.config = config
488
+ self.layers = nn.ModuleList()
489
+
490
+ # create first projection layer
491
+ if use_proj:
492
+ proj = nn.Conv2d(
493
+ in_channels=input_dims,
494
+ out_channels=intermediate_dims,
495
+ kernel_size=1,
496
+ stride=1,
497
+ padding=0,
498
+ bias=bias,
499
+ )
500
+ self.layers.append(proj)
501
+
502
+ # create following upsample layers
503
+ for i in range(n_upsample_layers):
504
+ in_channels = intermediate_dims if i == 0 else output_dims
505
+ layer = nn.ConvTranspose2d(
506
+ in_channels=in_channels,
507
+ out_channels=output_dims,
508
+ kernel_size=2,
509
+ stride=2,
510
+ padding=0,
511
+ bias=bias,
512
+ )
513
+ self.layers.append(layer)
514
+
515
+ def forward(self, features: torch.Tensor) -> torch.Tensor:
516
+ for layer in self.layers:
517
+ features = layer(features)
518
+ return features
519
+
520
+
521
+ class DepthProFeatureUpsample(nn.Module):
522
+ def __init__(self, config: DepthProConfig):
523
+ super().__init__()
524
+ self.config = config
525
+ self.n_scaled_images = len(self.config.scaled_images_ratios)
526
+ self.n_intermediate_hooks = len(self.config.intermediate_hook_ids)
527
+
528
+ # for image_features
529
+ self.image_block = DepthProFeatureUpsampleBlock(
530
+ config=config,
531
+ input_dims=config.image_model_config.hidden_size,
532
+ intermediate_dims=config.image_model_config.hidden_size,
533
+ output_dims=config.scaled_images_feature_dims[0],
534
+ n_upsample_layers=1,
535
+ use_proj=False,
536
+ bias=True,
537
+ )
538
+
539
+ # for scaled_images_features
540
+ self.scaled_images = nn.ModuleList()
541
+ for i, feature_dims in enumerate(config.scaled_images_feature_dims):
542
+ block = DepthProFeatureUpsampleBlock(
543
+ config=config,
544
+ input_dims=config.patch_model_config.hidden_size,
545
+ intermediate_dims=feature_dims,
546
+ output_dims=feature_dims,
547
+ n_upsample_layers=1,
548
+ )
549
+ self.scaled_images.append(block)
550
+
551
+ # for intermediate_features
552
+ self.intermediate = nn.ModuleList()
553
+ for i, feature_dims in enumerate(config.intermediate_feature_dims):
554
+ intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims
555
+ block = DepthProFeatureUpsampleBlock(
556
+ config=config,
557
+ input_dims=config.patch_model_config.hidden_size,
558
+ intermediate_dims=intermediate_dims,
559
+ output_dims=feature_dims,
560
+ n_upsample_layers=2 + i,
561
+ )
562
+ self.intermediate.append(block)
563
+
564
+ def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]:
565
+ features[0] = self.image_block(features[0])
566
+
567
+ for i in range(self.n_scaled_images):
568
+ features[i + 1] = self.scaled_images[i](features[i + 1])
569
+
570
+ for i in range(self.n_intermediate_hooks):
571
+ features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1])
572
+
573
+ return features
574
+
575
+
576
+ class DepthProFeatureProjection(nn.Module):
577
+ def __init__(self, config: DepthProConfig):
578
+ super().__init__()
579
+ self.config = config
580
+
581
+ combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims
582
+ self.projections = nn.ModuleList()
583
+ for i, in_channels in enumerate(combined_feature_dims):
584
+ if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size:
585
+ # projection for last layer can be ignored if input and output channels already match
586
+ self.projections.append(nn.Identity())
587
+ else:
588
+ self.projections.append(
589
+ nn.Conv2d(
590
+ in_channels=in_channels,
591
+ out_channels=config.fusion_hidden_size,
592
+ kernel_size=3,
593
+ stride=1,
594
+ padding=1,
595
+ bias=False,
596
+ )
597
+ )
598
+
599
+ def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]:
600
+ projected_features = []
601
+ for i, projection in enumerate(self.projections):
602
+ upsampled_feature = projection(features[i])
603
+ projected_features.append(upsampled_feature)
604
+ return projected_features
605
+
606
+
607
+ class DepthProNeck(nn.Module):
608
+ def __init__(self, config: DepthProConfig):
609
+ super().__init__()
610
+ self.config = config
611
+
612
+ self.feature_upsample = DepthProFeatureUpsample(config)
613
+ self.fuse_image_with_low_res = nn.Conv2d(
614
+ in_channels=config.scaled_images_feature_dims[0] * 2,
615
+ out_channels=config.scaled_images_feature_dims[0],
616
+ kernel_size=1,
617
+ stride=1,
618
+ padding=0,
619
+ bias=True,
620
+ )
621
+ self.feature_projection = DepthProFeatureProjection(config)
622
+
623
+ def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]:
624
+ features = self.feature_upsample(features)
625
+ # global features = low res features + image features
626
+ global_features = torch.cat((features[1], features[0]), dim=1)
627
+ global_features = self.fuse_image_with_low_res(global_features)
628
+ features = [global_features, *features[2:]]
629
+ features = self.feature_projection(features)
630
+ return features
631
+
632
+
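The neck's `fuse_image_with_low_res` step is a plain channel-wise fusion: global image features and the lowest-resolution patch features are concatenated along the channel axis and mixed by a 1x1 convolution. A standalone sketch with illustrative shapes:

```python
import torch
from torch import nn

image_feats = torch.randn(2, 256, 48, 48)    # global image features
low_res_feats = torch.randn(2, 256, 48, 48)  # lowest-resolution patch features
fuse = nn.Conv2d(in_channels=256 * 2, out_channels=256, kernel_size=1)

global_feats = fuse(torch.cat((low_res_feats, image_feats), dim=1))
print(global_feats.shape)  # torch.Size([2, 256, 48, 48])
```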
633
+ # General docstring
634
+ _CONFIG_FOR_DOC = "DepthProConfig"
635
+
636
+
637
+ DEPTH_PRO_START_DOCSTRING = r"""
638
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
639
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
640
+ behavior.
641
+
642
+ Parameters:
643
+ config ([`DepthProConfig`]): Model configuration class with all the parameters of the model.
644
+ Initializing with a config file does not load the weights associated with the model, only the
645
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
646
+ """
647
+
648
+ DEPTH_PRO_INPUTS_DOCSTRING = r"""
649
+ Args:
650
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
651
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
652
+ for details.
653
+
654
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
655
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
656
+
657
+ - 1 indicates the head is **not masked**,
658
+ - 0 indicates the head is **masked**.
659
+
660
+ output_attentions (`bool`, *optional*):
661
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
662
+ tensors for more detail.
663
+ output_hidden_states (`bool`, *optional*):
664
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
665
+ more detail.
666
+ return_dict (`bool`, *optional*):
667
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
668
+ """
669
+
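As a small illustration of the `head_mask` convention described above (1 keeps a head, 0 prunes it); the layer and head counts below are hypothetical, not read from a DepthPro checkpoint:

```python
import torch

# Hypothetical sizes, for illustration only.
num_layers, num_heads = 24, 16

# Keep every head by default, then mask (zero out) head 3 of layer 0.
head_mask = torch.ones(num_layers, num_heads)
head_mask[0, 3] = 0.0

# `head_mask` can then be passed alongside `pixel_values`,
# e.g. model(pixel_values, head_mask=head_mask)
```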
670
+ DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING = r"""
671
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
672
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
673
+ behavior.
674
+
675
+ Parameters:
676
+ config ([`DepthProConfig`]): Model configuration class with all the parameters of the model.
677
+ Initializing with a config file does not load the weights associated with the model, only the
678
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
679
+ use_fov_model (`bool`, *optional*, defaults to `True`):
680
+ Whether to use `DepthProFovModel` to generate the field of view.
681
+ """
682
+
683
+
684
+ class DepthProPreTrainedModel(PreTrainedModel):
685
+ """
686
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
687
+ models.
688
+ """
689
+
690
+ config_class = DepthProConfig
691
+ base_model_prefix = "depth_pro"
692
+ main_input_name = "pixel_values"
693
+ supports_gradient_checkpointing = True
694
+ _supports_sdpa = True
695
+ _no_split_modules = ["DepthProPreActResidualLayer"]
696
+ _keys_to_ignore_on_load_unexpected = ["fov_model.*"]
697
+
698
+ def _init_weights(self, module):
699
+ """Initialize the weights"""
700
+ if isinstance(module, nn.Linear):
701
+ # Slightly different from the TF version which uses truncated_normal for initialization
702
+ # cf https://github.com/pytorch/pytorch/pull/5617
703
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
704
+ if module.bias is not None:
705
+ module.bias.data.zero_()
706
+ elif isinstance(module, nn.LayerNorm):
707
+ module.bias.data.zero_()
708
+ module.weight.data.fill_(1.0)
709
+ elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
710
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
711
+ if module.bias is not None:
712
+ module.bias.data.zero_()
713
+
714
+
715
+ @add_start_docstrings(
716
+ "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.",
717
+ DEPTH_PRO_START_DOCSTRING,
718
+ )
719
+ class DepthProModel(DepthProPreTrainedModel):
720
+ def __init__(self, config):
721
+ super().__init__(config)
722
+ self.config = config
723
+ self.encoder = DepthProEncoder(config)
724
+ self.neck = DepthProNeck(config)
725
+ # Initialize weights and apply final processing
726
+ self.post_init()
727
+
728
+ def get_input_embeddings(self):
729
+ return self.encoder.image_encoder.model.get_input_embeddings()
730
+
731
+ @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING)
732
+ @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
733
+ def forward(
734
+ self,
735
+ pixel_values: torch.FloatTensor,
736
+ head_mask: Optional[torch.FloatTensor] = None,
737
+ output_attentions: Optional[bool] = None,
738
+ output_hidden_states: Optional[bool] = None,
739
+ return_dict: Optional[bool] = None,
740
+ ) -> Union[Tuple, DepthProOutput]:
741
+ r"""
742
+ Returns:
743
+
744
+ Examples:
745
+
746
+ ```python
747
+ >>> import torch
748
+ >>> from PIL import Image
749
+ >>> import requests
750
+ >>> from transformers import AutoProcessor, DepthProModel
751
+
752
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
753
+ >>> image = Image.open(requests.get(url, stream=True).raw)
754
+
755
+ >>> checkpoint = "apple/DepthPro-hf"
756
+ >>> processor = AutoProcessor.from_pretrained(checkpoint)
757
+ >>> model = DepthProModel.from_pretrained(checkpoint)
758
+
759
+ >>> # prepare image for the model
760
+ >>> inputs = processor(images=image, return_tensors="pt")
761
+
762
+ >>> with torch.no_grad():
763
+ ... output = model(**inputs)
764
+
765
+ >>> output.last_hidden_state.shape
766
+ torch.Size([1, 35, 577, 1024])
767
+ ```"""
768
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
769
+ output_hidden_states = (
770
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
771
+ )
772
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
773
+
774
+ encodings = self.encoder(
775
+ pixel_values,
776
+ head_mask=head_mask,
777
+ output_attentions=output_attentions,
778
+ output_hidden_states=output_hidden_states,
779
+ return_dict=return_dict,
780
+ )
781
+ features = encodings[1] # index 1 contains features
782
+ features = self.neck(features)
783
+
784
+ if not return_dict:
785
+ return (encodings[0], features) + encodings[2:]
786
+
787
+ return DepthProOutput(
788
+ last_hidden_state=encodings.last_hidden_state,
789
+ features=features,
790
+ hidden_states=encodings.hidden_states,
791
+ attentions=encodings.attentions,
792
+ )
793
+
794
+
795
+ # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro
796
+ class DepthProPreActResidualLayer(nn.Module):
797
+ """
798
+ ResidualConvUnit, pre-activate residual unit.
799
+
800
+ Args:
801
+ config (`[DepthProConfig]`):
802
+ Model configuration class defining the model architecture.
803
+ """
804
+
805
+ def __init__(self, config):
806
+ super().__init__()
807
+
808
+ self.use_batch_norm = config.use_batch_norm_in_fusion_residual
809
+ use_bias_in_fusion_residual = (
810
+ config.use_bias_in_fusion_residual
811
+ if config.use_bias_in_fusion_residual is not None
812
+ else not self.use_batch_norm
813
+ )
814
+
815
+ self.activation1 = nn.ReLU()
816
+ self.convolution1 = nn.Conv2d(
817
+ config.fusion_hidden_size,
818
+ config.fusion_hidden_size,
819
+ kernel_size=3,
820
+ stride=1,
821
+ padding=1,
822
+ bias=use_bias_in_fusion_residual,
823
+ )
824
+
825
+ self.activation2 = nn.ReLU()
826
+ self.convolution2 = nn.Conv2d(
827
+ config.fusion_hidden_size,
828
+ config.fusion_hidden_size,
829
+ kernel_size=3,
830
+ stride=1,
831
+ padding=1,
832
+ bias=use_bias_in_fusion_residual,
833
+ )
834
+
835
+ if self.use_batch_norm:
836
+ self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
837
+ self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)
838
+
839
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
840
+ residual = hidden_state
841
+ hidden_state = self.activation1(hidden_state)
842
+
843
+ hidden_state = self.convolution1(hidden_state)
844
+
845
+ if self.use_batch_norm:
846
+ hidden_state = self.batch_norm1(hidden_state)
847
+
848
+ hidden_state = self.activation2(hidden_state)
849
+ hidden_state = self.convolution2(hidden_state)
850
+
851
+ if self.use_batch_norm:
852
+ hidden_state = self.batch_norm2(hidden_state)
853
+
854
+ return hidden_state + residual
855
+
856
+
857
+ # Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer
858
+ # except it uses deconv and skip_add and needs no interpolation
859
+ class DepthProFeatureFusionLayer(nn.Module):
860
+ def __init__(self, config: DepthProConfig, use_deconv: bool = True):
861
+ super().__init__()
862
+ self.config = config
863
+ self.use_deconv = use_deconv
864
+
865
+ self.residual_layer1 = DepthProPreActResidualLayer(config)
866
+ self.residual_layer2 = DepthProPreActResidualLayer(config)
867
+
868
+ if self.use_deconv:
869
+ self.deconv = nn.ConvTranspose2d(
870
+ in_channels=config.fusion_hidden_size,
871
+ out_channels=config.fusion_hidden_size,
872
+ kernel_size=2,
873
+ stride=2,
874
+ padding=0,
875
+ bias=False,
876
+ )
877
+
878
+ self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
879
+
880
+ def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
881
+ if residual is not None:
882
+ residual = self.residual_layer1(residual)
883
+ hidden_state = hidden_state + residual
884
+
885
+ hidden_state = self.residual_layer2(hidden_state)
886
+ if self.use_deconv:
887
+ hidden_state = self.deconv(hidden_state)
888
+ hidden_state = self.projection(hidden_state)
889
+
890
+ return hidden_state
891
+
892
+
893
+ # Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro
894
+ # with deconv and reversed layers
895
+ class DepthProFeatureFusionStage(nn.Module):
896
+ def __init__(self, config):
897
+ super().__init__()
898
+ self.config = config
899
+
900
+ self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios)
901
+ self.intermediate = nn.ModuleList()
902
+ for _ in range(self.num_layers - 1):
903
+ self.intermediate.append(DepthProFeatureFusionLayer(config))
904
+
905
+ # final layer does not require deconvolution
906
+ self.final = DepthProFeatureFusionLayer(config, use_deconv=False)
907
+
908
+ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:
909
+ if self.num_layers != len(hidden_states):
910
+ raise ValueError(
911
+ f"num_layers={self.num_layers} in DepthProFeatureFusionStage"
912
+ f"doesnot match len(hidden_states)={len(hidden_states)}"
913
+ )
914
+
915
+ fused_hidden_states = []
916
+ fused_hidden_state = None
917
+ for hidden_state, layer in zip(hidden_states[:-1], self.intermediate):
918
+ if fused_hidden_state is None:
919
+ # first layer only uses the last hidden_state
920
+ fused_hidden_state = layer(hidden_state)
921
+ else:
922
+ fused_hidden_state = layer(fused_hidden_state, hidden_state)
923
+ fused_hidden_states.append(fused_hidden_state)
924
+
925
+ hidden_state = hidden_states[-1]
926
+ fused_hidden_state = self.final(fused_hidden_state, hidden_state)
927
+ fused_hidden_states.append(fused_hidden_state)
928
+
929
+ return fused_hidden_states
930
+
931
+
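A back-of-the-envelope sketch of the fusion order implemented above, using made-up spatial sizes: the running state starts from the first (lowest-resolution) map, every intermediate layer adds the next map and doubles the spatial size through its deconvolution, and the final layer only adds the last map without upsampling:

```python
# Hypothetical spatial sizes of the incoming feature maps, lowest resolution first;
# each map is assumed to be 2x the resolution of the previous one.
input_sizes = [24, 48, 96, 192, 384]

fused_size = input_sizes[0]
for i, size in enumerate(input_sizes):
    assert fused_size == size  # residual addition requires matching resolutions
    if i < len(input_sizes) - 1:
        fused_size *= 2  # intermediate layers upsample via ConvTranspose2d(stride=2)

print(fused_size)  # 384: the final layer fuses without upsampling
```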
932
+ class DepthProFovEncoder(nn.Module):
933
+ def __init__(self, config: DepthProConfig):
934
+ super().__init__()
935
+ self.config = config
936
+ self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
937
+
938
+ self.model = AutoModel.from_config(config.fov_model_config)
939
+ self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2)
940
+
941
+ def forward(
942
+ self,
943
+ pixel_values: torch.Tensor,
944
+ head_mask: Optional[torch.Tensor] = None,
945
+ ) -> torch.Tensor:
946
+ batch_size, num_channels, height, width = pixel_values.shape
947
+
948
+ # scale the image for fov_encoder
949
+ size = self.config.fov_model_config.image_size
950
+ pixel_values = F.interpolate(
951
+ pixel_values,
952
+ size=(size, size),
953
+ mode="bilinear",
954
+ align_corners=False,
955
+ )
956
+ encodings = self.model(
957
+ pixel_values=pixel_values,
958
+ head_mask=head_mask,
959
+ )
960
+ hidden_state = encodings[0]
961
+ hidden_state = self.neck(hidden_state)
962
+
963
+ # calculate base height and width
964
+ # base height and width are the dimensions of the lowest resolution features
965
+ exponent_value = torch_int(math.log2(width / self.out_size))
966
+ base_height = height // 2**exponent_value
967
+ base_width = width // 2**exponent_value
968
+
969
+ features = reconstruct_feature_maps(
970
+ hidden_state,
971
+ batch_size=batch_size,
972
+ padding=0,
973
+ output_size=(base_height, base_width),
974
+ )
975
+
976
+ return features
977
+
978
+
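The base-resolution arithmetic above, worked through with illustrative numbers (a 1536x1536 network input and a 24-patch-per-side image encoder); these values are assumptions, not read from a released config, and plain `int` stands in for `torch_int`:

```python
import math

height = width = 1536   # illustrative network input size
out_size = 24           # image_size // patch_size of the image encoder

exponent_value = int(math.log2(width / out_size))   # log2(64) = 6
base_height = height // 2**exponent_value
base_width = width // 2**exponent_value
print(base_height, base_width)  # 24 24 -> size of the lowest-resolution feature map
```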
979
+ class DepthProFovHead(nn.Module):
980
+ def __init__(self, config: DepthProConfig):
981
+ super().__init__()
982
+ self.config = config
983
+ self.fusion_hidden_size = config.fusion_hidden_size
984
+ self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
985
+
986
+ # create initial head layers
987
+ self.layers = nn.ModuleList()
988
+ for i in range(config.num_fov_head_layers):
989
+ self.layers.append(
990
+ nn.Conv2d(
991
+ math.ceil(self.fusion_hidden_size / 2 ** (i + 1)),
992
+ math.ceil(self.fusion_hidden_size / 2 ** (i + 2)),
993
+ kernel_size=3,
994
+ stride=2,
995
+ padding=1,
996
+ )
997
+ )
998
+ self.layers.append(nn.ReLU(True))
999
+ # calculate expected shapes to finally generate a scalar output from final head layer
1000
+ final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1))
1001
+ final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1)
1002
+ self.layers.append(
1003
+ nn.Conv2d(
1004
+ in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0
1005
+ )
1006
+ )
1007
+
1008
+ def forward(self, features: torch.Tensor) -> torch.Tensor:
1009
+ features = F.interpolate(
1010
+ features,
1011
+ size=(self.out_size, self.out_size),
1012
+ mode="bilinear",
1013
+ align_corners=False,
1014
+ )
1015
+ for layer in self.layers:
1016
+ features = layer(features)
1017
+ return features
1018
+
1019
+
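The `final_in_channels` / `final_kernel_size` arithmetic above can be checked by hand. The numbers below are illustrative defaults, not values read from a released checkpoint (and plain `int` stands in for `torch_int`):

```python
import math

# Illustrative values only (the real ones come from DepthProConfig).
fusion_hidden_size = 256
out_size = 24            # image_size // patch_size of the image encoder
num_fov_head_layers = 2

# Each stride-2 conv halves the channel count (starting from fusion_hidden_size // 2)
# and roughly halves the spatial size of the (out_size x out_size) feature map.
final_in_channels = math.ceil(fusion_hidden_size / 2 ** (num_fov_head_layers + 1))
final_kernel_size = int((out_size - 1) / 2**num_fov_head_layers + 1)

print(final_in_channels, final_kernel_size)  # 32 6 -> a 6x6 conv collapses the 6x6 map to a scalar
```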
1020
+ class DepthProFovModel(nn.Module):
1021
+ def __init__(self, config: DepthProConfig):
1022
+ super().__init__()
1023
+ self.config = config
1024
+ self.fusion_hidden_size = config.fusion_hidden_size
1025
+
1026
+ self.fov_encoder = DepthProFovEncoder(config)
1027
+ self.conv = nn.Conv2d(
1028
+ self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1
1029
+ )
1030
+ self.activation = nn.ReLU(inplace=True)
1031
+ self.head = DepthProFovHead(config)
1032
+
1033
+ def forward(
1034
+ self,
1035
+ pixel_values: torch.Tensor,
1036
+ global_features: torch.Tensor,
1037
+ head_mask: Optional[torch.Tensor] = None,
1038
+ ) -> torch.Tensor:
1039
+ fov_features = self.fov_encoder(pixel_values, head_mask)
1040
+
1041
+ global_features = self.conv(global_features)
1042
+ global_features = self.activation(global_features)
1043
+
1044
+ fov_features = fov_features + global_features
1045
+ fov_output = self.head(fov_features)
1046
+ fov_output = fov_output.flatten()
1047
+
1048
+ return fov_output
1049
+
1050
+
1051
+ class DepthProDepthEstimationHead(nn.Module):
1052
+ """
1053
+ The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks.
1054
+ This module comprises a sequence of convolutional and transposed convolutional layers
1055
+ that process the feature map from the fusion to produce a single-channel depth map.
1056
+ Key operations include dimensionality reduction and upsampling to match the input resolution.
1057
+ """
1058
+
1059
+ def __init__(self, config):
1060
+ super().__init__()
1061
+ self.config = config
1062
+
1063
+ features = config.fusion_hidden_size
1064
+ self.layers = nn.ModuleList(
1065
+ [
1066
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
1067
+ nn.ConvTranspose2d(
1068
+ in_channels=features // 2,
1069
+ out_channels=features // 2,
1070
+ kernel_size=2,
1071
+ stride=2,
1072
+ padding=0,
1073
+ bias=True,
1074
+ ),
1075
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
1076
+ nn.ReLU(True),
1077
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
1078
+ nn.ReLU(),
1079
+ ]
1080
+ )
1081
+
1082
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1083
+ for layer in self.layers:
1084
+ hidden_states = layer(hidden_states)
1085
+
1086
+ predicted_depth = hidden_states.squeeze(dim=1)
1087
+ return predicted_depth
1088
+
1089
+
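A minimal shape walkthrough of the head above, assuming an illustrative `fusion_hidden_size` of 256 and a 96x96 fused feature map (both values are assumptions, not taken from a released config):

```python
import torch
import torch.nn as nn

features = 256  # illustrative fusion_hidden_size
head = nn.Sequential(
    nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
    nn.ConvTranspose2d(features // 2, features // 2, kernel_size=2, stride=2),  # 2x upsample
    nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(True),
    nn.Conv2d(32, 1, kernel_size=1),
    nn.ReLU(),
)

fused = torch.randn(1, features, 96, 96)
depth = head(fused).squeeze(dim=1)  # drop the single channel dimension
print(depth.shape)  # torch.Size([1, 192, 192])
```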
1090
+ @add_start_docstrings(
1091
+ """
1092
+ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers).
1093
+ """,
1094
+ DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING,
1095
+ )
1096
+ class DepthProForDepthEstimation(DepthProPreTrainedModel):
1097
+ def __init__(self, config, use_fov_model=None):
1098
+ super().__init__(config)
1099
+ self.config = config
1100
+ self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model
1101
+
1102
+ # dinov2 (vit) like encoders
1103
+ self.depth_pro = DepthProModel(config)
1104
+
1105
+ # dpt (vit) like fusion stage
1106
+ self.fusion_stage = DepthProFeatureFusionStage(config)
1107
+
1108
+ # depth estimation head
1109
+ self.head = DepthProDepthEstimationHead(config)
1110
+
1111
+ # dinov2 (vit) like encoder
1112
+ self.fov_model = DepthProFovModel(config) if self.use_fov_model else None
1113
+
1114
+ # Initialize weights and apply final processing
1115
+ self.post_init()
1116
+
1117
+ @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING)
1118
+ @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
1119
+ def forward(
1120
+ self,
1121
+ pixel_values: torch.FloatTensor,
1122
+ head_mask: Optional[torch.FloatTensor] = None,
1123
+ labels: Optional[torch.LongTensor] = None,
1124
+ output_attentions: Optional[bool] = None,
1125
+ output_hidden_states: Optional[bool] = None,
1126
+ return_dict: Optional[bool] = None,
1127
+ ) -> Union[Tuple[torch.Tensor], DepthProDepthEstimatorOutput]:
1128
+ r"""
1129
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
1130
+ Ground truth depth estimation maps for computing the loss.
1131
+
1132
+ Returns:
1133
+
1134
+ Examples:
1135
+
1136
+ ```python
1137
+ >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation
1138
+ >>> import torch
1139
+ >>> from PIL import Image
1140
+ >>> import requests
1141
+
1142
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1143
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1144
+
1145
+ >>> checkpoint = "apple/DepthPro-hf"
1146
+ >>> processor = AutoImageProcessor.from_pretrained(checkpoint)
1147
+ >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint)
1148
+
1149
+ >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1150
+ >>> model.to(device)
1151
+
1152
+ >>> # prepare image for the model
1153
+ >>> inputs = processor(images=image, return_tensors="pt").to(device)
1154
+
1155
+ >>> with torch.no_grad():
1156
+ ... outputs = model(**inputs)
1157
+
1158
+ >>> # interpolate to original size
1159
+ >>> post_processed_output = processor.post_process_depth_estimation(
1160
+ ... outputs, target_sizes=[(image.height, image.width)],
1161
+ ... )
1162
+
1163
+ >>> # get the field of view (fov) predictions
1164
+ >>> field_of_view = post_processed_output[0]["field_of_view"]
1165
+ >>> focal_length = post_processed_output[0]["focal_length"]
1166
+
1167
+ >>> # visualize the prediction
1168
+ >>> predicted_depth = post_processed_output[0]["predicted_depth"]
1169
+ >>> depth = predicted_depth * 255 / predicted_depth.max()
1170
+ >>> depth = depth.detach().cpu().numpy()
1171
+ >>> depth = Image.fromarray(depth.astype("uint8"))
1172
+ ```"""
1173
+ loss = None
1174
+ if labels is not None:
1175
+ raise NotImplementedError("Training is not implemented yet")
1176
+
1177
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1178
+ output_hidden_states = (
1179
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1180
+ )
1181
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1182
+
1183
+ depth_pro_outputs = self.depth_pro(
1184
+ pixel_values=pixel_values,
1185
+ head_mask=head_mask,
1186
+ output_attentions=output_attentions,
1187
+ output_hidden_states=output_hidden_states,
1188
+ return_dict=True,
1189
+ )
1190
+ features = depth_pro_outputs.features
1191
+ fused_hidden_states = self.fusion_stage(features)
1192
+ predicted_depth = self.head(fused_hidden_states[-1])
1193
+
1194
+ if self.use_fov_model:
1195
+ # frozen features from encoder are used
1196
+ features_for_fov = features[0].detach()
1197
+ fov = self.fov_model(
1198
+ pixel_values=pixel_values,
1199
+ global_features=features_for_fov,
1200
+ head_mask=head_mask,
1201
+ )
1202
+ else:
1203
+ fov = None
1204
+
1205
+ if not return_dict:
1206
+ outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions]
1207
+ return tuple(v for v in outputs if v is not None)
1208
+
1209
+ return DepthProDepthEstimatorOutput(
1210
+ loss=loss,
1211
+ predicted_depth=predicted_depth,
1212
+ field_of_view=fov,
1213
+ hidden_states=depth_pro_outputs.hidden_states,
1214
+ attentions=depth_pro_outputs.attentions,
1215
+ )
1216
+
1217
+
1218
+ __all__ = ["DepthProPreTrainedModel", "DepthProModel", "DepthProForDepthEstimation"]
docs/transformers/build/lib/transformers/models/detr/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import _LazyModule
18
+ from ...utils.import_utils import define_import_structure
19
+
20
+
21
+ if TYPE_CHECKING:
22
+ from .configuration_detr import *
23
+ from .feature_extraction_detr import *
24
+ from .image_processing_detr import *
25
+ from .image_processing_detr_fast import *
26
+ from .modeling_detr import *
27
+ else:
28
+ import sys
29
+
30
+ _file = globals()["__file__"]
31
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/detr/configuration_detr.py ADDED
@@ -0,0 +1,289 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 Facebook AI Research and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """DETR model configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from packaging import version
21
+
22
+ from ...configuration_utils import PretrainedConfig
23
+ from ...onnx import OnnxConfig
24
+ from ...utils import logging
25
+ from ...utils.backbone_utils import verify_backbone_config_arguments
26
+ from ..auto import CONFIG_MAPPING
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
+ class DetrConfig(PretrainedConfig):
33
+ r"""
34
+ This is the configuration class to store the configuration of a [`DetrModel`]. It is used to instantiate a DETR
35
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
36
+ defaults will yield a similar configuration to that of the DETR
37
+ [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
38
+
39
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
40
+ documentation from [`PretrainedConfig`] for more information.
41
+
42
+ Args:
43
+ use_timm_backbone (`bool`, *optional*, defaults to `True`):
44
+ Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
45
+ API.
46
+ backbone_config (`PretrainedConfig` or `dict`, *optional*):
47
+ The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
48
+ case it will default to `ResNetConfig()`.
49
+ num_channels (`int`, *optional*, defaults to 3):
50
+ The number of input channels.
51
+ num_queries (`int`, *optional*, defaults to 100):
52
+ Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetrModel`] can
53
+ detect in a single image. For COCO, we recommend 100 queries.
54
+ d_model (`int`, *optional*, defaults to 256):
55
+ Dimension of the layers, i.e. the hidden size used by the Transformer encoder and decoder layers and by the projection layers.
56
+ encoder_layers (`int`, *optional*, defaults to 6):
57
+ Number of encoder layers.
58
+ decoder_layers (`int`, *optional*, defaults to 6):
59
+ Number of decoder layers.
60
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
61
+ Number of attention heads for each attention layer in the Transformer encoder.
62
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
63
+ Number of attention heads for each attention layer in the Transformer decoder.
64
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
65
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
66
+ encoder_ffn_dim (`int`, *optional*, defaults to 2048):
67
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
68
+ activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
69
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
70
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
71
+ dropout (`float`, *optional*, defaults to 0.1):
72
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
73
+ attention_dropout (`float`, *optional*, defaults to 0.0):
74
+ The dropout ratio for the attention probabilities.
75
+ activation_dropout (`float`, *optional*, defaults to 0.0):
76
+ The dropout ratio for activations inside the fully connected layer.
77
+ init_std (`float`, *optional*, defaults to 0.02):
78
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
79
+ init_xavier_std (`float`, *optional*, defaults to 1):
80
+ The scaling factor used for the Xavier initialization gain in the HM Attention map module.
81
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
82
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
83
+ for more details.
84
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
85
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
86
+ for more details.
87
+ auxiliary_loss (`bool`, *optional*, defaults to `False`):
88
+ Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
89
+ position_embedding_type (`str`, *optional*, defaults to `"sine"`):
90
+ Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
91
+ backbone (`str`, *optional*, defaults to `"resnet50"`):
92
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
93
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
94
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
95
+ use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
96
+ Whether to use pretrained weights for the backbone.
97
+ backbone_kwargs (`dict`, *optional*):
98
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
99
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
100
+ dilation (`bool`, *optional*, defaults to `False`):
101
+ Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
102
+ `use_timm_backbone` = `True`.
103
+ class_cost (`float`, *optional*, defaults to 1):
104
+ Relative weight of the classification error in the Hungarian matching cost.
105
+ bbox_cost (`float`, *optional*, defaults to 5):
106
+ Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
107
+ giou_cost (`float`, *optional*, defaults to 2):
108
+ Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
109
+ mask_loss_coefficient (`float`, *optional*, defaults to 1):
110
+ Relative weight of the Focal loss in the panoptic segmentation loss.
111
+ dice_loss_coefficient (`float`, *optional*, defaults to 1):
112
+ Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
113
+ bbox_loss_coefficient (`float`, *optional*, defaults to 5):
114
+ Relative weight of the L1 bounding box loss in the object detection loss.
115
+ giou_loss_coefficient (`float`, *optional*, defaults to 2):
116
+ Relative weight of the generalized IoU loss in the object detection loss.
117
+ eos_coefficient (`float`, *optional*, defaults to 0.1):
118
+ Relative classification weight of the 'no-object' class in the object detection loss.
119
+
120
+ Examples:
121
+
122
+ ```python
123
+ >>> from transformers import DetrConfig, DetrModel
124
+
125
+ >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
126
+ >>> configuration = DetrConfig()
127
+
128
+ >>> # Initializing a model (with random weights) from the facebook/detr-resnet-50 style configuration
129
+ >>> model = DetrModel(configuration)
130
+
131
+ >>> # Accessing the model configuration
132
+ >>> configuration = model.config
133
+ ```"""
134
+
135
+ model_type = "detr"
136
+ keys_to_ignore_at_inference = ["past_key_values"]
137
+ attribute_map = {
138
+ "hidden_size": "d_model",
139
+ "num_attention_heads": "encoder_attention_heads",
140
+ }
141
+
142
+ def __init__(
143
+ self,
144
+ use_timm_backbone=True,
145
+ backbone_config=None,
146
+ num_channels=3,
147
+ num_queries=100,
148
+ encoder_layers=6,
149
+ encoder_ffn_dim=2048,
150
+ encoder_attention_heads=8,
151
+ decoder_layers=6,
152
+ decoder_ffn_dim=2048,
153
+ decoder_attention_heads=8,
154
+ encoder_layerdrop=0.0,
155
+ decoder_layerdrop=0.0,
156
+ is_encoder_decoder=True,
157
+ activation_function="relu",
158
+ d_model=256,
159
+ dropout=0.1,
160
+ attention_dropout=0.0,
161
+ activation_dropout=0.0,
162
+ init_std=0.02,
163
+ init_xavier_std=1.0,
164
+ auxiliary_loss=False,
165
+ position_embedding_type="sine",
166
+ backbone="resnet50",
167
+ use_pretrained_backbone=True,
168
+ backbone_kwargs=None,
169
+ dilation=False,
170
+ class_cost=1,
171
+ bbox_cost=5,
172
+ giou_cost=2,
173
+ mask_loss_coefficient=1,
174
+ dice_loss_coefficient=1,
175
+ bbox_loss_coefficient=5,
176
+ giou_loss_coefficient=2,
177
+ eos_coefficient=0.1,
178
+ **kwargs,
179
+ ):
180
+ # We default to values which were previously hard-coded in the model. This enables configurability of the config
181
+ # while keeping the default behavior the same.
182
+ if use_timm_backbone and backbone_kwargs is None:
183
+ backbone_kwargs = {}
184
+ if dilation:
185
+ backbone_kwargs["output_stride"] = 16
186
+ backbone_kwargs["out_indices"] = [1, 2, 3, 4]
187
+ backbone_kwargs["in_chans"] = num_channels
188
+ # Backwards compatibility
189
+ elif not use_timm_backbone and backbone in (None, "resnet50"):
190
+ if backbone_config is None:
191
+ logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
192
+ backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
193
+ elif isinstance(backbone_config, dict):
194
+ backbone_model_type = backbone_config.get("model_type")
195
+ config_class = CONFIG_MAPPING[backbone_model_type]
196
+ backbone_config = config_class.from_dict(backbone_config)
197
+ backbone = None
198
+ # set timm attributes to None
199
+ dilation = None
200
+
201
+ verify_backbone_config_arguments(
202
+ use_timm_backbone=use_timm_backbone,
203
+ use_pretrained_backbone=use_pretrained_backbone,
204
+ backbone=backbone,
205
+ backbone_config=backbone_config,
206
+ backbone_kwargs=backbone_kwargs,
207
+ )
208
+
209
+ self.use_timm_backbone = use_timm_backbone
210
+ self.backbone_config = backbone_config
211
+ self.num_channels = num_channels
212
+ self.num_queries = num_queries
213
+ self.d_model = d_model
214
+ self.encoder_ffn_dim = encoder_ffn_dim
215
+ self.encoder_layers = encoder_layers
216
+ self.encoder_attention_heads = encoder_attention_heads
217
+ self.decoder_ffn_dim = decoder_ffn_dim
218
+ self.decoder_layers = decoder_layers
219
+ self.decoder_attention_heads = decoder_attention_heads
220
+ self.dropout = dropout
221
+ self.attention_dropout = attention_dropout
222
+ self.activation_dropout = activation_dropout
223
+ self.activation_function = activation_function
224
+ self.init_std = init_std
225
+ self.init_xavier_std = init_xavier_std
226
+ self.encoder_layerdrop = encoder_layerdrop
227
+ self.decoder_layerdrop = decoder_layerdrop
228
+ self.num_hidden_layers = encoder_layers
229
+ self.auxiliary_loss = auxiliary_loss
230
+ self.position_embedding_type = position_embedding_type
231
+ self.backbone = backbone
232
+ self.use_pretrained_backbone = use_pretrained_backbone
233
+ self.backbone_kwargs = backbone_kwargs
234
+ self.dilation = dilation
235
+ # Hungarian matcher
236
+ self.class_cost = class_cost
237
+ self.bbox_cost = bbox_cost
238
+ self.giou_cost = giou_cost
239
+ # Loss coefficients
240
+ self.mask_loss_coefficient = mask_loss_coefficient
241
+ self.dice_loss_coefficient = dice_loss_coefficient
242
+ self.bbox_loss_coefficient = bbox_loss_coefficient
243
+ self.giou_loss_coefficient = giou_loss_coefficient
244
+ self.eos_coefficient = eos_coefficient
245
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
246
+
247
+ @property
248
+ def num_attention_heads(self) -> int:
249
+ return self.encoder_attention_heads
250
+
251
+ @property
252
+ def hidden_size(self) -> int:
253
+ return self.d_model
254
+
255
+ @classmethod
256
+ def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
257
+ """Instantiate a [`DetrConfig`] (or a derived class) from a pre-trained backbone model configuration.
258
+
259
+ Args:
260
+ backbone_config ([`PretrainedConfig`]):
261
+ The backbone configuration.
262
+ Returns:
263
+ [`DetrConfig`]: An instance of a configuration object
264
+ """
265
+ return cls(backbone_config=backbone_config, **kwargs)
266
+
267
+
268
+ class DetrOnnxConfig(OnnxConfig):
269
+ torch_onnx_minimum_version = version.parse("1.11")
270
+
271
+ @property
272
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
273
+ return OrderedDict(
274
+ [
275
+ ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
276
+ ("pixel_mask", {0: "batch"}),
277
+ ]
278
+ )
279
+
280
+ @property
281
+ def atol_for_validation(self) -> float:
282
+ return 1e-5
283
+
284
+ @property
285
+ def default_onnx_opset(self) -> int:
286
+ return 12
287
+
288
+
289
+ __all__ = ["DetrConfig", "DetrOnnxConfig"]
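The ONNX config above only declares export metadata (dynamic axes, validation tolerance, opset). A minimal sketch of inspecting it, assuming the standard `OnnxConfig(config)` constructor:

```python
from transformers import DetrConfig
from transformers.models.detr.configuration_detr import DetrOnnxConfig

config = DetrConfig()
onnx_config = DetrOnnxConfig(config)

print(onnx_config.inputs)               # dynamic axes for pixel_values / pixel_mask
print(onnx_config.default_onnx_opset)   # 12
print(onnx_config.atol_for_validation)  # 1e-05
```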
docs/transformers/build/lib/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,277 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert DETR checkpoints with timm backbone."""
16
+
17
+ import argparse
18
+ import json
19
+ from collections import OrderedDict
20
+ from pathlib import Path
21
+
22
+ import requests
23
+ import torch
24
+ from huggingface_hub import hf_hub_download
25
+ from PIL import Image
26
+
27
+ from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor
28
+ from transformers.utils import logging
29
+
30
+
31
+ logging.set_verbosity_info()
32
+ logger = logging.get_logger(__name__)
33
+
34
+ # here we list all keys to be renamed (original name on the left, our name on the right)
35
+ rename_keys = []
36
+ for i in range(6):
37
+ # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
38
+ rename_keys.append(
39
+ (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
40
+ )
41
+ rename_keys.append(
42
+ (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
43
+ )
44
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
45
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
46
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
47
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
48
+ rename_keys.append(
49
+ (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
50
+ )
51
+ rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias"))
52
+ rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight"))
53
+ rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias"))
54
+ # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
55
+ rename_keys.append(
56
+ (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
57
+ )
58
+ rename_keys.append(
59
+ (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
60
+ )
61
+ rename_keys.append(
62
+ (
63
+ f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight",
64
+ f"decoder.layers.{i}.encoder_attn.out_proj.weight",
65
+ )
66
+ )
67
+ rename_keys.append(
68
+ (
69
+ f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias",
70
+ f"decoder.layers.{i}.encoder_attn.out_proj.bias",
71
+ )
72
+ )
73
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight"))
74
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias"))
75
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight"))
76
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias"))
77
+ rename_keys.append(
78
+ (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
79
+ )
80
+ rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias"))
81
+ rename_keys.append(
82
+ (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
83
+ )
84
+ rename_keys.append(
85
+ (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
86
+ )
87
+ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight"))
88
+ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias"))
89
+
90
+ # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
91
+ rename_keys.extend(
92
+ [
93
+ ("input_proj.weight", "input_projection.weight"),
94
+ ("input_proj.bias", "input_projection.bias"),
95
+ ("query_embed.weight", "query_position_embeddings.weight"),
96
+ ("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
97
+ ("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
98
+ ("class_embed.weight", "class_labels_classifier.weight"),
99
+ ("class_embed.bias", "class_labels_classifier.bias"),
100
+ ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
101
+ ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
102
+ ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
103
+ ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
104
+ ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
105
+ ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
106
+ ]
107
+ )
108
+
109
+
110
+ def rename_key(state_dict, old, new):
111
+ val = state_dict.pop(old)
112
+ state_dict[new] = val
113
+
114
+
115
+ def rename_backbone_keys(state_dict):
116
+ new_state_dict = OrderedDict()
117
+ for key, value in state_dict.items():
118
+ if "backbone.0.body" in key:
119
+ new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
120
+ new_state_dict[new_key] = value
121
+ else:
122
+ new_state_dict[key] = value
123
+
124
+ return new_state_dict
125
+
126
+
127
+ def read_in_q_k_v(state_dict, is_panoptic=False):
128
+ prefix = ""
129
+ if is_panoptic:
130
+ prefix = "detr."
131
+
132
+ # first: transformer encoder
133
+ for i in range(6):
134
+ # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
135
+ in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
136
+ in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
137
+ # next, add query, keys and values (in that order) to the state dict
138
+ state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
139
+ state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
140
+ state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
141
+ state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
142
+ state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
143
+ state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
144
+ # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
145
+ for i in range(6):
146
+ # read in weights + bias of input projection layer of self-attention
147
+ in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
148
+ in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
149
+ # next, add query, keys and values (in that order) to the state dict
150
+ state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
151
+ state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
152
+ state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
153
+ state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
154
+ state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
155
+ state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
156
+ # read in weights + bias of input projection layer of cross-attention
157
+ in_proj_weight_cross_attn = state_dict.pop(
158
+ f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
159
+ )
160
+ in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
161
+ # next, add query, keys and values (in that order) of cross-attention to the state dict
162
+ state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
163
+ state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
164
+ state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
165
+ state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
166
+ state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
167
+ state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
168
+
169
+
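The slicing above relies on PyTorch's `nn.MultiheadAttention` storing the query, key and value projections stacked in a single `(3 * d_model, d_model)` matrix; a minimal sketch with DETR's `d_model = 256`:

```python
import torch

d_model = 256  # DETR's hidden size, so in_proj_weight has shape (3 * 256, 256)
in_proj_weight = torch.randn(3 * d_model, d_model)
in_proj_bias = torch.randn(3 * d_model)

# query, key and value projections are stacked in that order
q_w, k_w, v_w = in_proj_weight[:256, :], in_proj_weight[256:512, :], in_proj_weight[-256:, :]
q_b, k_b, v_b = in_proj_bias[:256], in_proj_bias[256:512], in_proj_bias[-256:]

assert q_w.shape == k_w.shape == v_w.shape == (d_model, d_model)
```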
170
+ # We will verify our results on an image of cute cats
171
+ def prepare_img():
172
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
173
+ im = Image.open(requests.get(url, stream=True).raw)
174
+
175
+ return im
176
+
177
+
178
+ @torch.no_grad()
179
+ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
180
+ """
181
+ Copy/paste/tweak model's weights to our DETR structure.
182
+ """
183
+
184
+ # load default config
185
+ config = DetrConfig()
186
+ # set backbone and dilation attributes
187
+ if "resnet101" in model_name:
188
+ config.backbone = "resnet101"
189
+ if "dc5" in model_name:
190
+ config.dilation = True
191
+ is_panoptic = "panoptic" in model_name
192
+ if is_panoptic:
193
+ config.num_labels = 250
194
+ else:
195
+ config.num_labels = 91
196
+ repo_id = "huggingface/label-files"
197
+ filename = "coco-detection-id2label.json"
198
+ id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
199
+ id2label = {int(k): v for k, v in id2label.items()}
200
+ config.id2label = id2label
201
+ config.label2id = {v: k for k, v in id2label.items()}
202
+
203
+ # load image processor
204
+ format = "coco_panoptic" if is_panoptic else "coco_detection"
205
+ image_processor = DetrImageProcessor(format=format)
206
+
207
+ # prepare image
208
+ img = prepare_img()
209
+ encoding = image_processor(images=img, return_tensors="pt")
210
+ pixel_values = encoding["pixel_values"]
211
+
212
+ logger.info(f"Converting model {model_name}...")
213
+
214
+ # load original model from torch hub
215
+ detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval()
216
+ state_dict = detr.state_dict()
217
+ # rename keys
218
+ for src, dest in rename_keys:
219
+ if is_panoptic:
220
+ src = "detr." + src
221
+ rename_key(state_dict, src, dest)
222
+ state_dict = rename_backbone_keys(state_dict)
223
+ # query, key and value matrices need special treatment
224
+ read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
225
+ # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
226
+ prefix = "detr.model." if is_panoptic else "model."
227
+ for key in state_dict.copy().keys():
228
+ if is_panoptic:
229
+ if (
230
+ key.startswith("detr")
231
+ and not key.startswith("class_labels_classifier")
232
+ and not key.startswith("bbox_predictor")
233
+ ):
234
+ val = state_dict.pop(key)
235
+ state_dict["detr.model" + key[4:]] = val
236
+ elif "class_labels_classifier" in key or "bbox_predictor" in key:
237
+ val = state_dict.pop(key)
238
+ state_dict["detr." + key] = val
239
+ elif key.startswith("bbox_attention") or key.startswith("mask_head"):
240
+ continue
241
+ else:
242
+ val = state_dict.pop(key)
243
+ state_dict[prefix + key] = val
244
+ else:
245
+ if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
246
+ val = state_dict.pop(key)
247
+ state_dict[prefix + key] = val
248
+ # finally, create HuggingFace model and load state dict
249
+ model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
250
+ model.load_state_dict(state_dict)
251
+ model.eval()
252
+ # verify our conversion
253
+ original_outputs = detr(pixel_values)
254
+ outputs = model(pixel_values)
255
+ assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
256
+ assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
257
+ if is_panoptic:
258
+ assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
259
+
260
+ # Save model and image processor
261
+ logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
262
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
263
+ model.save_pretrained(pytorch_dump_folder_path)
264
+ image_processor.save_pretrained(pytorch_dump_folder_path)
265
+
266
+
267
+ if __name__ == "__main__":
268
+ parser = argparse.ArgumentParser()
269
+
270
+ parser.add_argument(
271
+ "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert."
272
+ )
273
+ parser.add_argument(
274
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
275
+ )
276
+ args = parser.parse_args()
277
+ convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)
docs/transformers/build/lib/transformers/models/detr/convert_detr_to_pytorch.py ADDED
@@ -0,0 +1,385 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert DETR checkpoints with native (Transformers) backbone."""
16
+
17
+ import argparse
18
+ import json
19
+ from pathlib import Path
20
+
21
+ import requests
22
+ import torch
23
+ from huggingface_hub import hf_hub_download
24
+ from PIL import Image
25
+
26
+ from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig
27
+ from transformers.utils import logging
28
+
29
+
30
+ logging.set_verbosity_info()
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ def get_detr_config(model_name):
35
+ # initialize config
36
+ if "resnet-50" in model_name:
37
+ backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50")
38
+ elif "resnet-101" in model_name:
39
+ backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101")
40
+ else:
41
+ raise ValueError("Model name should include either resnet50 or resnet101")
42
+
43
+ config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config)
44
+
45
+ # set label attributes
46
+ is_panoptic = "panoptic" in model_name
47
+ if is_panoptic:
48
+ config.num_labels = 250
49
+ else:
50
+ config.num_labels = 91
51
+ repo_id = "huggingface/label-files"
52
+ filename = "coco-detection-id2label.json"
53
+ id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
54
+ id2label = {int(k): v for k, v in id2label.items()}
55
+ config.id2label = id2label
56
+ config.label2id = {v: k for k, v in id2label.items()}
57
+
58
+ return config, is_panoptic
59
+
60
+
61
+ def create_rename_keys(config):
62
+ # here we list all keys to be renamed (original name on the left, our name on the right)
63
+ rename_keys = []
64
+
65
+ # stem
66
+ # fmt: off
67
+ rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight"))
68
+ rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight"))
69
+ rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias"))
70
+ rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean"))
71
+ rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var"))
72
+ # stages
73
+ for stage_idx in range(len(config.backbone_config.depths)):
74
+ for layer_idx in range(config.backbone_config.depths[stage_idx]):
75
+ # shortcut
76
+ if layer_idx == 0:
77
+ rename_keys.append(
78
+ (
79
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight",
80
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight",
81
+ )
82
+ )
83
+ rename_keys.append(
84
+ (
85
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight",
86
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight",
87
+ )
88
+ )
89
+ rename_keys.append(
90
+ (
91
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias",
92
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias",
93
+ )
94
+ )
95
+ rename_keys.append(
96
+ (
97
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean",
98
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean",
99
+ )
100
+ )
101
+ rename_keys.append(
102
+ (
103
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var",
104
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var",
105
+ )
106
+ )
107
+ # 3 convs
108
+ for i in range(3):
109
+ rename_keys.append(
110
+ (
111
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight",
112
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight",
113
+ )
114
+ )
115
+ rename_keys.append(
116
+ (
117
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight",
118
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight",
119
+ )
120
+ )
121
+ rename_keys.append(
122
+ (
123
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias",
124
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias",
125
+ )
126
+ )
127
+ rename_keys.append(
128
+ (
129
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean",
130
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean",
131
+ )
132
+ )
133
+ rename_keys.append(
134
+ (
135
+ f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var",
136
+ f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var",
137
+ )
138
+ )
139
+ # fmt: on
140
+
141
+ for i in range(config.encoder_layers):
142
+ # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
143
+ rename_keys.append(
144
+ (
145
+ f"transformer.encoder.layers.{i}.self_attn.out_proj.weight",
146
+ f"encoder.layers.{i}.self_attn.out_proj.weight",
147
+ )
148
+ )
149
+ rename_keys.append(
150
+ (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
151
+ )
152
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
153
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
154
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
155
+ rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
156
+ rename_keys.append(
157
+ (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
158
+ )
159
+ rename_keys.append(
160
+ (f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")
161
+ )
162
+ rename_keys.append(
163
+ (f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")
164
+ )
165
+ rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias"))
166
+ # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
167
+ rename_keys.append(
168
+ (
169
+ f"transformer.decoder.layers.{i}.self_attn.out_proj.weight",
170
+ f"decoder.layers.{i}.self_attn.out_proj.weight",
171
+ )
172
+ )
173
+ rename_keys.append(
174
+ (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
175
+ )
176
+ rename_keys.append(
177
+ (
178
+ f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight",
179
+ f"decoder.layers.{i}.encoder_attn.out_proj.weight",
180
+ )
181
+ )
182
+ rename_keys.append(
183
+ (
184
+ f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias",
185
+ f"decoder.layers.{i}.encoder_attn.out_proj.bias",
186
+ )
187
+ )
188
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight"))
189
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias"))
190
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight"))
191
+ rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias"))
192
+ rename_keys.append(
193
+ (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
194
+ )
195
+ rename_keys.append(
196
+ (f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")
197
+ )
198
+ rename_keys.append(
199
+ (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
200
+ )
201
+ rename_keys.append(
202
+ (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
203
+ )
204
+ rename_keys.append(
205
+ (f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")
206
+ )
207
+ rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias"))
208
+
209
+ # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
210
+ rename_keys.extend(
211
+ [
212
+ ("input_proj.weight", "input_projection.weight"),
213
+ ("input_proj.bias", "input_projection.bias"),
214
+ ("query_embed.weight", "query_position_embeddings.weight"),
215
+ ("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
216
+ ("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
217
+ ("class_embed.weight", "class_labels_classifier.weight"),
218
+ ("class_embed.bias", "class_labels_classifier.bias"),
219
+ ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
220
+ ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
221
+ ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
222
+ ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
223
+ ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
224
+ ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
225
+ ]
226
+ )
227
+
228
+ return rename_keys
229
+
230
+
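For orientation, a minimal sketch (toy dictionary, placeholder values instead of real tensors) of how the (src, dest) pairs built above are consumed by the rename_key helper defined next:
# Toy illustration of the renaming step: each (old, new) pair produced by
# create_rename_keys() is applied by popping the old key and re-inserting its
# value under the new key. Values are placeholders, not real tensors.
toy_state_dict = {"backbone.0.body.bn1.weight": 0, "input_proj.weight": 1}
toy_renames = [
    ("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight"),
    ("input_proj.weight", "input_projection.weight"),
]
for old, new in toy_renames:
    toy_state_dict[new] = toy_state_dict.pop(old)
assert "input_projection.weight" in toy_state_dict and "input_proj.weight" not in toy_state_dict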
231
+ def rename_key(state_dict, old, new):
232
+ val = state_dict.pop(old)
233
+ state_dict[new] = val
234
+
235
+
236
+ def read_in_q_k_v(state_dict, is_panoptic=False):
237
+ prefix = ""
238
+ if is_panoptic:
239
+ prefix = "detr."
240
+
241
+ # first: transformer encoder
242
+ for i in range(6):
243
+ # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
244
+ in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
245
+ in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
246
+ # next, add query, keys and values (in that order) to the state dict
247
+ state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
248
+ state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
249
+ state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
250
+ state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
251
+ state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
252
+ state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
253
+ # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
254
+ for i in range(6):
255
+ # read in weights + bias of input projection layer of self-attention
256
+ in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
257
+ in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
258
+ # next, add query, keys and values (in that order) to the state dict
259
+ state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
260
+ state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
261
+ state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
262
+ state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
263
+ state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
264
+ state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
265
+ # read in weights + bias of input projection layer of cross-attention
266
+ in_proj_weight_cross_attn = state_dict.pop(
267
+ f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
268
+ )
269
+ in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
270
+ # next, add query, keys and values (in that order) of cross-attention to the state dict
271
+ state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
272
+ state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
273
+ state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
274
+ state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
275
+ state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
276
+ state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
277
+
278
+
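As a sanity-check sketch of the splitting performed by read_in_q_k_v above: PyTorch's fused MultiheadAttention projection of shape (3 * hidden_size, hidden_size) is sliced into query, key and value blocks of 256 rows each (hidden_size is 256 for these checkpoints):
import torch

# Random stand-in for one layer's fused in_proj weight/bias (hidden_size = 256).
hidden_size = 256
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)
in_proj_bias = torch.randn(3 * hidden_size)

# Same slicing as read_in_q_k_v: rows [0:256] -> q, [256:512] -> k, [512:768] -> v.
q_w, k_w, v_w = in_proj_weight[:256, :], in_proj_weight[256:512, :], in_proj_weight[-256:, :]
q_b, k_b, v_b = in_proj_bias[:256], in_proj_bias[256:512], in_proj_bias[-256:]
assert torch.equal(torch.cat([q_w, k_w, v_w], dim=0), in_proj_weight)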
279
+ # We will verify our results on an image of cute cats
280
+ def prepare_img():
281
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
282
+ im = Image.open(requests.get(url, stream=True).raw)
283
+
284
+ return im
285
+
286
+
287
+ @torch.no_grad()
288
+ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
289
+ """
290
+ Copy/paste/tweak model's weights to our DETR structure.
291
+ """
292
+
293
+ # load default config
294
+ config, is_panoptic = get_detr_config(model_name)
295
+
296
+ # load original model from torch hub
297
+ model_name_to_original_name = {
298
+ "detr-resnet-50": "detr_resnet50",
299
+ "detr-resnet-101": "detr_resnet101",
300
+ }
301
+ logger.info(f"Converting model {model_name}...")
302
+ detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval()
303
+ state_dict = detr.state_dict()
304
+ # rename keys
305
+ for src, dest in create_rename_keys(config):
306
+ if is_panoptic:
307
+ src = "detr." + src
308
+ rename_key(state_dict, src, dest)
309
+ # query, key and value matrices need special treatment
310
+ read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
311
+ # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
312
+ prefix = "detr.model." if is_panoptic else "model."
313
+ for key in state_dict.copy().keys():
314
+ if is_panoptic:
315
+ if (
316
+ key.startswith("detr")
317
+ and not key.startswith("class_labels_classifier")
318
+ and not key.startswith("bbox_predictor")
319
+ ):
320
+ val = state_dict.pop(key)
321
+ state_dict["detr.model" + key[4:]] = val
322
+ elif "class_labels_classifier" in key or "bbox_predictor" in key:
323
+ val = state_dict.pop(key)
324
+ state_dict["detr." + key] = val
325
+ elif key.startswith("bbox_attention") or key.startswith("mask_head"):
326
+ continue
327
+ else:
328
+ val = state_dict.pop(key)
329
+ state_dict[prefix + key] = val
330
+ else:
331
+ if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
332
+ val = state_dict.pop(key)
333
+ state_dict[prefix + key] = val
334
+
335
+ # finally, create HuggingFace model and load state dict
336
+ model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
337
+ model.load_state_dict(state_dict)
338
+ model.eval()
339
+
340
+ # verify our conversion on an image
341
+ format = "coco_panoptic" if is_panoptic else "coco_detection"
342
+ processor = DetrImageProcessor(format=format)
343
+
344
+ encoding = processor(images=prepare_img(), return_tensors="pt")
345
+ pixel_values = encoding["pixel_values"]
346
+
347
+ original_outputs = detr(pixel_values)
348
+ outputs = model(pixel_values)
349
+
350
+ assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3)
351
+ assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3)
352
+ if is_panoptic:
353
+ assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
354
+ print("Looks ok!")
355
+
356
+ if pytorch_dump_folder_path is not None:
357
+ # Save model and image processor
358
+ logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
359
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
360
+ model.save_pretrained(pytorch_dump_folder_path)
361
+ processor.save_pretrained(pytorch_dump_folder_path)
362
+
363
+ if push_to_hub:
364
+ # Upload model and image processor to the hub
365
+ logger.info("Uploading PyTorch model and image processor to the hub...")
366
+ model.push_to_hub(f"nielsr/{model_name}")
367
+ processor.push_to_hub(f"nielsr/{model_name}")
368
+
369
+
370
+ if __name__ == "__main__":
371
+ parser = argparse.ArgumentParser()
372
+
373
+ parser.add_argument(
374
+ "--model_name",
375
+ default="detr-resnet-50",
376
+ type=str,
377
+ choices=["detr-resnet-50", "detr-resnet-101"],
378
+ help="Name of the DETR model you'd like to convert.",
379
+ )
380
+ parser.add_argument(
381
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
382
+ )
383
+ parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.")
384
+ args = parser.parse_args()
385
+ convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
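A hedged follow-up sketch: after running the script with --pytorch_dump_folder_path, the converted checkpoint can be reloaded from that folder (the folder name below is only illustrative):
# Illustrative only: "./detr-resnet-50-converted" stands for whatever folder
# was passed as --pytorch_dump_folder_path above.
from transformers import DetrForObjectDetection, DetrImageProcessor

model = DetrForObjectDetection.from_pretrained("./detr-resnet-50-converted")
processor = DetrImageProcessor.from_pretrained("./detr-resnet-50-converted")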
docs/transformers/build/lib/transformers/models/detr/feature_extraction_detr.py ADDED
@@ -0,0 +1,48 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Feature extractor class for DETR."""
16
+
17
+ import warnings
18
+
19
+ from ...image_transforms import rgb_to_id as _rgb_to_id
20
+ from ...utils import logging
21
+ from ...utils.import_utils import requires
22
+ from .image_processing_detr import DetrImageProcessor
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ def rgb_to_id(x):
29
+ warnings.warn(
30
+ "rgb_to_id has moved and will not be importable from this module from v5. "
31
+ "Please import from transformers.image_transforms instead.",
32
+ FutureWarning,
33
+ )
34
+ return _rgb_to_id(x)
35
+
36
+
37
+ @requires(backends=("vision",))
38
+ class DetrFeatureExtractor(DetrImageProcessor):
39
+ def __init__(self, *args, **kwargs) -> None:
40
+ warnings.warn(
41
+ "The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
42
+ " Please use DetrImageProcessor instead.",
43
+ FutureWarning,
44
+ )
45
+ super().__init__(*args, **kwargs)
46
+
47
+
48
+ __all__ = ["DetrFeatureExtractor"]
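A short usage sketch of the shim above: the deprecated class still works, but emits a FutureWarning and is functionally just DetrImageProcessor:
import warnings

from transformers import DetrFeatureExtractor, DetrImageProcessor

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    feature_extractor = DetrFeatureExtractor()

assert isinstance(feature_extractor, DetrImageProcessor)
assert any(issubclass(w.category, FutureWarning) for w in caught)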
docs/transformers/build/lib/transformers/models/detr/image_processing_detr_fast.py ADDED
@@ -0,0 +1,1312 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Image processor class for DETR."""
16
+
17
+ import io
18
+ import pathlib
19
+ from collections import defaultdict
20
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
21
+
22
+ from ...image_processing_utils import BatchFeature, get_size_dict
23
+ from ...image_processing_utils_fast import (
24
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
25
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
26
+ BaseImageProcessorFast,
27
+ DefaultFastImageProcessorKwargs,
28
+ SizeDict,
29
+ get_image_size_for_max_height_width,
30
+ get_max_height_width,
31
+ safe_squeeze,
32
+ )
33
+ from ...image_transforms import (
34
+ center_to_corners_format,
35
+ corners_to_center_format,
36
+ id_to_rgb,
37
+ )
38
+ from ...image_utils import (
39
+ IMAGENET_DEFAULT_MEAN,
40
+ IMAGENET_DEFAULT_STD,
41
+ AnnotationFormat,
42
+ AnnotationType,
43
+ ChannelDimension,
44
+ ImageInput,
45
+ PILImageResampling,
46
+ get_image_size,
47
+ validate_annotations,
48
+ )
49
+ from ...processing_utils import Unpack
50
+ from ...utils import (
51
+ TensorType,
52
+ add_start_docstrings,
53
+ is_torch_available,
54
+ is_torchvision_available,
55
+ is_torchvision_v2_available,
56
+ is_vision_available,
57
+ logging,
58
+ )
59
+ from ...utils.import_utils import requires
60
+ from .image_processing_detr import (
61
+ compute_segments,
62
+ convert_segmentation_to_rle,
63
+ get_size_with_aspect_ratio,
64
+ remove_low_and_no_objects,
65
+ )
66
+
67
+
68
+ if is_torch_available():
69
+ import torch
70
+ from torch import nn
71
+
72
+ if is_vision_available():
73
+ import PIL
74
+
75
+
76
+ if is_torchvision_v2_available():
77
+ from torchvision.io import read_image
78
+ from torchvision.transforms.v2 import functional as F
79
+ elif is_torchvision_available():
80
+ from torchvision.io import read_image
81
+ from torchvision.transforms import functional as F
82
+
83
+
84
+ logger = logging.get_logger(__name__)
85
+
86
+ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
87
+
88
+
89
+ # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
90
+ def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
91
+ """
92
+ Convert a COCO polygon annotation to a mask.
93
+
94
+ Args:
95
+ segmentations (`List[List[float]]`):
96
+ List of polygons, each polygon represented by a list of x-y coordinates.
97
+ height (`int`):
98
+ Height of the mask.
99
+ width (`int`):
100
+ Width of the mask.
101
+ """
102
+ try:
103
+ from pycocotools import mask as coco_mask
104
+ except ImportError:
105
+ raise ImportError("Pycocotools is not installed in your environment.")
106
+
107
+ masks = []
108
+ for polygons in segmentations:
109
+ rles = coco_mask.frPyObjects(polygons, height, width)
110
+ mask = coco_mask.decode(rles)
111
+ if len(mask.shape) < 3:
112
+ mask = mask[..., None]
113
+ mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
114
+ mask = torch.any(mask, axis=2)
115
+ masks.append(mask)
116
+ if masks:
117
+ masks = torch.stack(masks, axis=0)
118
+ else:
119
+ masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)
120
+
121
+ return masks
122
+
123
+
124
+ # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50
125
+ def prepare_coco_detection_annotation(
126
+ image,
127
+ target,
128
+ return_segmentation_masks: bool = False,
129
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
130
+ ):
131
+ """
132
+ Convert the target in COCO format into the format expected by DETR.
133
+ """
134
+ image_height, image_width = image.size()[-2:]
135
+
136
+ image_id = target["image_id"]
137
+ image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
138
+
139
+ # Get all COCO annotations for the given image.
140
+ annotations = target["annotations"]
141
+ classes = []
142
+ area = []
143
+ boxes = []
144
+ keypoints = []
145
+ for obj in annotations:
146
+ if "iscrowd" not in obj or obj["iscrowd"] == 0:
147
+ classes.append(obj["category_id"])
148
+ area.append(obj["area"])
149
+ boxes.append(obj["bbox"])
150
+ if "keypoints" in obj:
151
+ keypoints.append(obj["keypoints"])
152
+
153
+ classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
154
+ area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
155
+ iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
156
+ # guard against no boxes via resizing
157
+ boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
158
+ boxes[:, 2:] += boxes[:, :2]
159
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
160
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
161
+
162
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
163
+
164
+ new_target = {
165
+ "image_id": image_id,
166
+ "class_labels": classes[keep],
167
+ "boxes": boxes[keep],
168
+ "area": area[keep],
169
+ "iscrowd": iscrowd[keep],
170
+ "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
171
+ }
172
+
173
+ if keypoints:
174
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
175
+ # Apply the keep mask here to filter the relevant annotations
176
+ keypoints = keypoints[keep]
177
+ num_keypoints = keypoints.shape[0]
178
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
179
+ new_target["keypoints"] = keypoints
180
+
181
+ if return_segmentation_masks:
182
+ segmentation_masks = [obj["segmentation"] for obj in annotations]
183
+ masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
184
+ new_target["masks"] = masks[keep]
185
+
186
+ return new_target
187
+
188
+
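A self-contained sketch of the box handling in prepare_coco_detection_annotation above: COCO (x, y, w, h) boxes are converted to (x0, y0, x1, y1), clipped to the image, and boxes that collapse to zero width or height are dropped:
import torch

image_height, image_width = 4, 6
# Two COCO-style boxes in (x, y, w, h); the second starts at the image border
# and collapses once it is clipped to the image.
boxes = torch.tensor([[1.0, 1.0, 2.0, 2.0],
                      [6.0, 4.0, 3.0, 2.0]])
boxes[:, 2:] += boxes[:, :2]                                   # (x, y, w, h) -> (x0, y0, x1, y1)
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)   # clip x coordinates
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)  # clip y coordinates
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
print(boxes[keep])  # only the first box survives: tensor([[1., 1., 3., 3.]])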
189
+ def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
190
+ """
191
+ Compute the bounding boxes around the provided panoptic segmentation masks.
192
+
193
+ Args:
194
+ masks: masks in format `[number_masks, height, width]`, where `number_masks` is the number of masks
195
+
196
+ Returns:
197
+ boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
198
+ """
199
+ if masks.numel() == 0:
200
+ return torch.zeros((0, 4), device=masks.device)
201
+
202
+ h, w = masks.shape[-2:]
203
+ y = torch.arange(0, h, dtype=torch.float32, device=masks.device)
204
+ x = torch.arange(0, w, dtype=torch.float32, device=masks.device)
205
+ # see https://github.com/pytorch/pytorch/issues/50276
206
+ y, x = torch.meshgrid(y, x, indexing="ij")
207
+
208
+ x_mask = masks * torch.unsqueeze(x, 0)
209
+ x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0]
210
+ x_min = (
211
+ torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
212
+ )
213
+
214
+ y_mask = masks * torch.unsqueeze(y, 0)
215
+ y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0]
216
+ y_min = (
217
+ torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
218
+ )
219
+
220
+ return torch.stack([x_min, y_min, x_max, y_max], 1)
221
+
222
+
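A toy check of masks_to_boxes above (assuming the function is in scope, e.g. when run from this module): a single 2x2 blob produces the expected xyxy box:
import torch

mask = torch.zeros(1, 4, 4, dtype=torch.bool)
mask[0, 1:3, 2:4] = True                # a 2x2 blob covering rows 1-2, columns 2-3
print(masks_to_boxes(mask))             # tensor([[2., 1., 3., 2.]]) in (x_min, y_min, x_max, y_max)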
223
+ # 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
224
+ # Copyright (c) 2018, Alexander Kirillov
225
+ # All rights reserved.
226
+ def rgb_to_id(color):
227
+ """
228
+ Converts RGB color to unique ID.
229
+ """
230
+ if isinstance(color, torch.Tensor) and len(color.shape) == 3:
231
+ if color.dtype == torch.uint8:
232
+ color = color.to(torch.int32)
233
+ return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
234
+ return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
235
+
236
+
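For intuition about rgb_to_id above: the panoptic ID is the RGB triplet read as a base-256 number with the red channel as the least significant byte. A minimal check:
import torch

color = torch.tensor([[[1, 2, 3]]], dtype=torch.uint8)      # one pixel, shape (1, 1, 3)
ids = color.to(torch.int32)
ids = ids[:, :, 0] + 256 * ids[:, :, 1] + 256 * 256 * ids[:, :, 2]
assert ids.item() == 1 + 256 * 2 + 256 * 256 * 3 == 197121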
237
+ def prepare_coco_panoptic_annotation(
238
+ image: torch.Tensor,
239
+ target: Dict,
240
+ masks_path: Union[str, pathlib.Path],
241
+ return_masks: bool = True,
242
+ input_data_format: Union[ChannelDimension, str] = None,
243
+ ) -> Dict:
244
+ """
245
+ Prepare a coco panoptic annotation for DETR.
246
+ """
247
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
248
+ annotation_path = pathlib.Path(masks_path) / target["file_name"]
249
+
250
+ new_target = {}
251
+ new_target["image_id"] = torch.as_tensor(
252
+ [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device
253
+ )
254
+ new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
255
+ new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
256
+
257
+ if "segments_info" in target:
258
+ masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device)
259
+ masks = rgb_to_id(masks)
260
+
261
+ ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device)
262
+ masks = masks == ids[:, None, None]
263
+ masks = masks.to(torch.bool)
264
+ if return_masks:
265
+ new_target["masks"] = masks
266
+ new_target["boxes"] = masks_to_boxes(masks)
267
+ new_target["class_labels"] = torch.as_tensor(
268
+ [segment_info["category_id"] for segment_info in target["segments_info"]],
269
+ dtype=torch.int64,
270
+ device=image.device,
271
+ )
272
+ new_target["iscrowd"] = torch.as_tensor(
273
+ [segment_info["iscrowd"] for segment_info in target["segments_info"]],
274
+ dtype=torch.int64,
275
+ device=image.device,
276
+ )
277
+ new_target["area"] = torch.as_tensor(
278
+ [segment_info["area"] for segment_info in target["segments_info"]],
279
+ dtype=torch.float32,
280
+ device=image.device,
281
+ )
282
+
283
+ return new_target
284
+
285
+
286
+ class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
287
+ format: Optional[Union[str, AnnotationFormat]]
288
+ do_convert_annotations: Optional[bool]
289
+ do_pad: Optional[bool]
290
+ pad_size: Optional[Dict[str, int]]
291
+ return_segmentation_masks: Optional[bool]
292
+
293
+
294
+ @add_start_docstrings(
295
+ "Constructs a fast Detr image processor.",
296
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
297
+ """
298
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
299
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
300
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
301
+ Controls whether to convert the annotations to the format expected by the DETR model. Converts the
302
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
303
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
304
+ do_pad (`bool`, *optional*, defaults to `True`):
305
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
306
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
307
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
308
+ Otherwise, the image will be padded to the maximum height and width of the batch.
309
+ pad_size (`Dict[str, int]`, *optional*):
310
+ The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
311
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
312
+ height and width in the batch.
313
+ return_segmentation_masks (`bool`, *optional*, defaults to `False`):
314
+ Whether to return segmentation masks.
315
+ """,
316
+ )
317
+ @requires(backends=("torchvision", "torch"))
318
+ class DetrImageProcessorFast(BaseImageProcessorFast):
319
+ resample = PILImageResampling.BILINEAR
320
+ image_mean = IMAGENET_DEFAULT_MEAN
321
+ image_std = IMAGENET_DEFAULT_STD
322
+ format = AnnotationFormat.COCO_DETECTION
323
+ do_resize = True
324
+ do_rescale = True
325
+ do_normalize = True
326
+ do_pad = True
327
+ size = {"shortest_edge": 800, "longest_edge": 1333}
328
+ default_to_square = False
329
+ model_input_names = ["pixel_values", "pixel_mask"]
330
+ valid_kwargs = DetrFastImageProcessorKwargs
331
+
332
+ def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
333
+ if "pad_and_return_pixel_mask" in kwargs:
334
+ kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
335
+
336
+ size = kwargs.pop("size", None)
337
+ if "max_size" in kwargs:
338
+ logger.warning_once(
339
+ "The `max_size` parameter is deprecated and will be removed in v4.26. "
340
+ "Please specify in `size['longest_edge'] instead`.",
341
+ )
342
+ max_size = kwargs.pop("max_size")
343
+ else:
344
+ max_size = None if size is None else 1333
345
+
346
+ size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
347
+ self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
348
+
349
+ # Backwards compatibility
350
+ do_convert_annotations = kwargs.get("do_convert_annotations", None)
351
+ do_normalize = kwargs.get("do_normalize", None)
352
+ if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
353
+ self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
354
+
355
+ super().__init__(**kwargs)
356
+
357
+ @classmethod
358
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
359
+ """
360
+ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
361
+ created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600,
362
+ max_size=800)`
363
+ """
364
+ image_processor_dict = image_processor_dict.copy()
365
+ if "max_size" in kwargs:
366
+ image_processor_dict["max_size"] = kwargs.pop("max_size")
367
+ if "pad_and_return_pixel_mask" in kwargs:
368
+ image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
369
+ return super().from_dict(image_processor_dict, **kwargs)
370
+
371
+ def prepare_annotation(
372
+ self,
373
+ image: torch.Tensor,
374
+ target: Dict,
375
+ format: Optional[AnnotationFormat] = None,
376
+ return_segmentation_masks: Optional[bool] = None,
377
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
378
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
379
+ ) -> Dict:
380
+ """
381
+ Prepare an annotation for feeding into DETR model.
382
+ """
383
+ format = format if format is not None else self.format
384
+
385
+ if format == AnnotationFormat.COCO_DETECTION:
386
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
387
+ target = prepare_coco_detection_annotation(
388
+ image, target, return_segmentation_masks, input_data_format=input_data_format
389
+ )
390
+ elif format == AnnotationFormat.COCO_PANOPTIC:
391
+ return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
392
+ target = prepare_coco_panoptic_annotation(
393
+ image,
394
+ target,
395
+ masks_path=masks_path,
396
+ return_masks=return_segmentation_masks,
397
+ input_data_format=input_data_format,
398
+ )
399
+ else:
400
+ raise ValueError(f"Format {format} is not supported.")
401
+ return target
402
+
403
+ def resize(
404
+ self,
405
+ image: torch.Tensor,
406
+ size: SizeDict,
407
+ interpolation: "F.InterpolationMode" = None,
408
+ **kwargs,
409
+ ) -> torch.Tensor:
410
+ """
411
+ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
412
+ int, smaller edge of the image will be matched to this number.
413
+
414
+ Args:
415
+ image (`torch.Tensor`):
416
+ Image to resize.
417
+ size (`SizeDict`):
418
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
419
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
420
+ Do NOT keep the aspect ratio.
421
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
422
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
423
+ less or equal to `longest_edge`.
424
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
425
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
426
+ `max_width`.
427
+ interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
428
+ Resampling filter to use if resizing the image.
429
+ """
430
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
431
+ if size.shortest_edge and size.longest_edge:
432
+ # Resize the image so that the shortest edge or the longest edge is of the given size
433
+ # while maintaining the aspect ratio of the original image.
434
+ new_size = get_size_with_aspect_ratio(
435
+ image.size()[-2:],
436
+ size["shortest_edge"],
437
+ size["longest_edge"],
438
+ )
439
+ elif size.max_height and size.max_width:
440
+ new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
441
+ elif size.height and size.width:
442
+ new_size = (size["height"], size["width"])
443
+ else:
444
+ raise ValueError(
445
+ "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
446
+ f" {size.keys()}."
447
+ )
448
+
449
+ image = F.resize(
450
+ image,
451
+ size=new_size,
452
+ interpolation=interpolation,
453
+ **kwargs,
454
+ )
455
+ return image
456
+
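A rough sketch of the shortest/longest-edge policy handled by get_size_with_aspect_ratio in the branch above (the helper's exact rounding may differ by a pixel):
# Scale so the shortest edge reaches `shortest_edge`, unless that would push the
# longest edge past `longest_edge`, in which case scale to fit the longest edge instead.
height, width = 480, 640
shortest_edge, longest_edge = 800, 1333

scale = shortest_edge / min(height, width)
if scale * max(height, width) > longest_edge:
    scale = longest_edge / max(height, width)
new_height, new_width = round(height * scale), round(width * scale)
print(new_height, new_width)   # roughly (800, 1067) for this example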
457
+ def resize_annotation(
458
+ self,
459
+ annotation: Dict[str, Any],
460
+ orig_size: Tuple[int, int],
461
+ target_size: Tuple[int, int],
462
+ threshold: float = 0.5,
463
+ interpolation: "F.InterpolationMode" = None,
464
+ ):
465
+ """
466
+ Resizes an annotation to a target size.
467
+
468
+ Args:
469
+ annotation (`Dict[str, Any]`):
470
+ The annotation dictionary.
471
+ orig_size (`Tuple[int, int]`):
472
+ The original size of the input image.
473
+ target_size (`Tuple[int, int]`):
474
+ The target size of the image, as returned by the preprocessing `resize` step.
475
+ threshold (`float`, *optional*, defaults to 0.5):
476
+ The threshold used to binarize the segmentation masks.
477
+ interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.NEAREST`):
478
+ The resampling filter to use when resizing the masks.
479
+ """
480
+ interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
481
+ ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
482
+
483
+ new_annotation = {}
484
+ new_annotation["size"] = target_size
485
+
486
+ for key, value in annotation.items():
487
+ if key == "boxes":
488
+ boxes = value
489
+ scaled_boxes = boxes * torch.as_tensor(
490
+ [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
491
+ )
492
+ new_annotation["boxes"] = scaled_boxes
493
+ elif key == "area":
494
+ area = value
495
+ scaled_area = area * (ratio_width * ratio_height)
496
+ new_annotation["area"] = scaled_area
497
+ elif key == "masks":
498
+ masks = value[:, None]
499
+ masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
500
+ masks = torch.stack(masks).to(torch.float32)
501
+ masks = masks[:, 0] > threshold
502
+ new_annotation["masks"] = masks
503
+ elif key == "size":
504
+ new_annotation["size"] = target_size
505
+ else:
506
+ new_annotation[key] = value
507
+
508
+ return new_annotation
509
+
510
+ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
511
+ image_height, image_width = image_size
512
+ norm_annotation = {}
513
+ for key, value in annotation.items():
514
+ if key == "boxes":
515
+ boxes = value
516
+ boxes = corners_to_center_format(boxes)
517
+ boxes /= torch.as_tensor(
518
+ [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
519
+ )
520
+ norm_annotation[key] = boxes
521
+ else:
522
+ norm_annotation[key] = value
523
+ return norm_annotation
524
+
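A small numeric sketch of the box transform in normalize_annotation above: corner boxes become (center_x, center_y, width, height) and are divided by (image_width, image_height):
import torch

image_height, image_width = 100, 200
boxes_xyxy = torch.tensor([[20.0, 10.0, 60.0, 50.0]])

# Corner -> center format (mirroring what corners_to_center_format is used for here).
x0, y0, x1, y1 = boxes_xyxy.unbind(-1)
boxes_cxcywh = torch.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], dim=-1)

# Normalize to [0, 1] by the image width/height.
boxes_cxcywh /= torch.tensor([image_width, image_height, image_width, image_height])
print(boxes_cxcywh)   # tensor([[0.2000, 0.3000, 0.2000, 0.4000]])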
525
+ def _update_annotation_for_padded_image(
526
+ self,
527
+ annotation: Dict,
528
+ input_image_size: Tuple[int, int],
529
+ output_image_size: Tuple[int, int],
530
+ padding,
531
+ update_bboxes,
532
+ ) -> Dict:
533
+ """
534
+ Update the annotation for a padded image.
535
+ """
536
+ new_annotation = {}
537
+ new_annotation["size"] = output_image_size
538
+ ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
539
+
540
+ for key, value in annotation.items():
541
+ if key == "masks":
542
+ masks = value
543
+ masks = F.pad(
544
+ masks,
545
+ padding,
546
+ fill=0,
547
+ )
548
+ masks = safe_squeeze(masks, 1)
549
+ new_annotation["masks"] = masks
550
+ elif key == "boxes" and update_bboxes:
551
+ boxes = value
552
+ boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
553
+ new_annotation["boxes"] = boxes
554
+ elif key == "size":
555
+ new_annotation["size"] = output_image_size
556
+ else:
557
+ new_annotation[key] = value
558
+ return new_annotation
559
+
560
+ def pad(
561
+ self,
562
+ image: torch.Tensor,
563
+ padded_size: Tuple[int, int],
564
+ annotation: Optional[Dict[str, Any]] = None,
565
+ update_bboxes: bool = True,
566
+ fill: int = 0,
567
+ ):
568
+ original_size = image.size()[-2:]
569
+ padding_bottom = padded_size[0] - original_size[0]
570
+ padding_right = padded_size[1] - original_size[1]
571
+ if padding_bottom < 0 or padding_right < 0:
572
+ raise ValueError(
573
+ f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
574
+ f"original size. Got padded size: {padded_size}, original size: {original_size}."
575
+ )
576
+ if original_size != padded_size:
577
+ padding = [0, 0, padding_right, padding_bottom]
578
+ image = F.pad(image, padding, fill=fill)
579
+ if annotation is not None:
580
+ annotation = self._update_annotation_for_padded_image(
581
+ annotation, original_size, padded_size, padding, update_bboxes
582
+ )
583
+
584
+ # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
585
+ pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
586
+ pixel_mask[: original_size[0], : original_size[1]] = 1
587
+
588
+ return image, pixel_mask, annotation
589
+
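A toy illustration of the padding contract implemented by pad above, written with torch.nn.functional.pad (whose argument order differs from the torchvision F.pad used in the method): padding goes on the bottom/right and pixel_mask marks valid pixels with 1:
import torch
import torch.nn.functional as F

image = torch.ones(3, 2, 3)                        # (channels, height, width)
padded_size = (4, 5)                               # target (height, width)
padding_bottom = padded_size[0] - image.shape[-2]
padding_right = padded_size[1] - image.shape[-1]

# torch.nn.functional.pad takes (left, right, top, bottom) for the last two dims.
padded = F.pad(image, (0, padding_right, 0, padding_bottom), value=0)

pixel_mask = torch.zeros(padded_size, dtype=torch.int64)
pixel_mask[: image.shape[-2], : image.shape[-1]] = 1
print(padded.shape, int(pixel_mask.sum()))         # torch.Size([3, 4, 5]) 6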
590
+ @add_start_docstrings(
591
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
592
+ """
593
+ annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
594
+ List of annotations associated with the image or batch of images. If annotation is for object
595
+ detection, the annotations should be a dictionary with the following keys:
596
+ - "image_id" (`int`): The image id.
597
+ - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
598
+ dictionary. An image can have no annotations, in which case the list should be empty.
599
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
600
+ - "image_id" (`int`): The image id.
601
+ - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
602
+ An image can have no segments, in which case the list should be empty.
603
+ - "file_name" (`str`): The file name of the image.
604
+ format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
605
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
606
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
607
+ Controls whether to convert the annotations to the format expected by the DETR model. Converts the
608
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
609
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
610
+ do_pad (`bool`, *optional*, defaults to `True`):
611
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
612
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
613
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
614
+ Otherwise, the image will be padded to the maximum height and width of the batch.
615
+ pad_size (`Dict[str, int]`, *optional*):
616
+ The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
617
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
618
+ height and width in the batch.
619
+ return_segmentation_masks (`bool`, *optional*, defaults to `False`):
620
+ Whether to return segmentation masks.
621
+ masks_path (`str` or `pathlib.Path`, *optional*):
622
+ Path to the directory containing the segmentation masks.
623
+ """,
624
+ )
625
+ def preprocess(
626
+ self,
627
+ images: ImageInput,
628
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
629
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
630
+ **kwargs: Unpack[DetrFastImageProcessorKwargs],
631
+ ) -> BatchFeature:
632
+ if "pad_and_return_pixel_mask" in kwargs:
633
+ kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
634
+ logger.warning_once(
635
+ "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
636
+ "use `do_pad` instead."
637
+ )
638
+
639
+ if "max_size" in kwargs:
640
+ logger.warning_once(
641
+ "The `max_size` argument is deprecated and will be removed in a future version, use"
642
+ " `size['longest_edge']` instead."
643
+ )
644
+ kwargs["size"] = kwargs.pop("max_size")
645
+
646
+ return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
647
+
648
+ def _preprocess(
649
+ self,
650
+ images: List["torch.Tensor"],
651
+ annotations: Optional[Union[AnnotationType, List[AnnotationType]]],
652
+ return_segmentation_masks: bool,
653
+ masks_path: Optional[Union[str, pathlib.Path]],
654
+ do_resize: bool,
655
+ size: SizeDict,
656
+ interpolation: Optional["F.InterpolationMode"],
657
+ do_center_crop: bool,
658
+ crop_size: SizeDict,
659
+ do_rescale: bool,
660
+ rescale_factor: float,
661
+ do_normalize: bool,
662
+ do_convert_annotations: bool,
663
+ image_mean: Optional[Union[float, List[float]]],
664
+ image_std: Optional[Union[float, List[float]]],
665
+ do_pad: bool,
666
+ pad_size: Optional[Dict[str, int]],
667
+ format: Optional[Union[str, AnnotationFormat]],
668
+ return_tensors: Optional[Union[str, TensorType]],
669
+ ) -> BatchFeature:
670
+ """
671
+ Preprocess an image or a batch of images so that it can be used by the model.
672
+ """
673
+ if annotations is not None and isinstance(annotations, dict):
674
+ annotations = [annotations]
675
+
676
+ if annotations is not None and len(images) != len(annotations):
677
+ raise ValueError(
678
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
679
+ )
680
+
681
+ format = AnnotationFormat(format)
682
+ if annotations is not None:
683
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
684
+
685
+ if (
686
+ masks_path is not None
687
+ and format == AnnotationFormat.COCO_PANOPTIC
688
+ and not isinstance(masks_path, (pathlib.Path, str))
689
+ ):
690
+ raise ValueError(
691
+ "The path to the directory containing the mask PNG files should be provided as a"
692
+ f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
693
+ )
694
+
695
+ data = {}
696
+
697
+ processed_images = []
698
+ processed_annotations = []
699
+ pixel_masks = [] # Initialize pixel_masks here
700
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
701
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
702
+ if annotations is not None:
703
+ annotation = self.prepare_annotation(
704
+ image,
705
+ annotation,
706
+ format,
707
+ return_segmentation_masks=return_segmentation_masks,
708
+ masks_path=masks_path,
709
+ input_data_format=ChannelDimension.FIRST,
710
+ )
711
+
712
+ if do_resize:
713
+ resized_image = self.resize(image, size=size, interpolation=interpolation)
714
+ if annotations is not None:
715
+ annotation = self.resize_annotation(
716
+ annotation,
717
+ orig_size=image.size()[-2:],
718
+ target_size=resized_image.size()[-2:],
719
+ )
720
+ image = resized_image
721
+ # Fused rescale and normalize
722
+ image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
723
+ if do_convert_annotations and annotations is not None:
724
+ annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
725
+
726
+ processed_images.append(image)
727
+ processed_annotations.append(annotation)
728
+ images = processed_images
729
+ annotations = processed_annotations if annotations is not None else None
730
+
731
+ if do_pad:
732
+ # depends on all resized image shapes so we need another loop
733
+ if pad_size is not None:
734
+ padded_size = (pad_size["height"], pad_size["width"])
735
+ else:
736
+ padded_size = get_max_height_width(images)
737
+
738
+ padded_images = []
739
+ padded_annotations = []
740
+ for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
741
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
742
+ if padded_size == image.size()[-2:]:
743
+ padded_images.append(image)
744
+ pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
745
+ padded_annotations.append(annotation)
746
+ continue
747
+ image, pixel_mask, annotation = self.pad(
748
+ image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
749
+ )
750
+ padded_images.append(image)
751
+ padded_annotations.append(annotation)
752
+ pixel_masks.append(pixel_mask)
753
+ images = padded_images
754
+ annotations = padded_annotations if annotations is not None else None
755
+ data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
756
+
757
+ data.update({"pixel_values": torch.stack(images, dim=0)})
758
+ encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
759
+ if annotations is not None:
760
+ encoded_inputs["labels"] = [
761
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
762
+ ]
763
+ return encoded_inputs
764
+
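A minimal end-to-end usage sketch of the processor defined above, assuming a random image tensor and a single COCO-detection style annotation (all values are illustrative):
import torch
from transformers import DetrImageProcessorFast

processor = DetrImageProcessorFast()
image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
annotation = {
    "image_id": 0,
    "annotations": [
        {"category_id": 1, "bbox": [10.0, 10.0, 20.0, 20.0], "area": 400.0, "iscrowd": 0}
    ],
}

encoding = processor(images=image, annotations=annotation, return_tensors="pt")
print(encoding["pixel_values"].shape)   # (1, 3, H, W) after resize + padding
print(encoding["labels"][0]["boxes"])   # normalized (center_x, center_y, width, height)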
765
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process
766
+ def post_process(self, outputs, target_sizes):
767
+ """
768
+ Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
769
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
770
+
771
+ Args:
772
+ outputs ([`DetrObjectDetectionOutput`]):
773
+ Raw outputs of the model.
774
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
775
+ Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
776
+ original image size (before any data augmentation). For visualization, this should be the image size
777
+ after data augment, but before padding.
778
+ Returns:
779
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
780
+ in the batch as predicted by the model.
781
+ """
782
+ logger.warning_once(
783
+ "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
784
+ " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
785
+ )
786
+
787
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
788
+
789
+ if len(out_logits) != len(target_sizes):
790
+ raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
791
+ if target_sizes.shape[1] != 2:
792
+ raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
793
+
794
+ prob = nn.functional.softmax(out_logits, -1)
795
+ scores, labels = prob[..., :-1].max(-1)
796
+
797
+ # convert to [x0, y0, x1, y1] format
798
+ boxes = center_to_corners_format(out_bbox)
799
+ # and from relative [0, 1] to absolute [0, height] coordinates
800
+ img_h, img_w = target_sizes.unbind(1)
801
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
802
+ boxes = boxes * scale_fct[:, None, :]
803
+
804
+ results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
805
+ return results
806
+
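In isolation, the box rescaling at the heart of post_process above: relative (center_x, center_y, width, height) boxes are turned into absolute (x0, y0, x1, y1) corners and scaled by each image's (width, height):
import torch

target_sizes = torch.tensor([[480, 640]])             # one image, (height, width)
out_bbox = torch.tensor([[[0.5, 0.5, 0.2, 0.4]]])      # one query, relative (cx, cy, w, h)

# Center -> corner format (mirroring what center_to_corners_format is used for here).
cx, cy, w, h = out_bbox.unbind(-1)
boxes = torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)

img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.dtype)
boxes = boxes * scale_fct[:, None, :]
print(boxes)   # tensor([[[256., 144., 384., 336.]]])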
807
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_segmentation
808
+ def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
809
+ """
810
+ Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
811
+
812
+ Args:
813
+ outputs ([`DetrSegmentationOutput`]):
814
+ Raw outputs of the model.
815
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
816
+ Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
817
+ threshold (`float`, *optional*, defaults to 0.9):
818
+ Threshold to use to filter out queries.
819
+ mask_threshold (`float`, *optional*, defaults to 0.5):
820
+ Threshold to use when turning the predicted masks into binary values.
821
+ Returns:
822
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
823
+ in the batch as predicted by the model.
824
+ """
825
+ logger.warning_once(
826
+ "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
827
+ " `post_process_semantic_segmentation`.",
828
+ )
829
+ out_logits, raw_masks = outputs.logits, outputs.pred_masks
830
+ empty_label = out_logits.shape[-1] - 1
831
+ preds = []
832
+
833
+ def to_tuple(tup):
834
+ if isinstance(tup, tuple):
835
+ return tup
836
+ return tuple(tup.tolist())
837
+
838
+ for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
839
+ # we filter out empty queries and detections below the threshold
840
+ cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
841
+ keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
842
+ cur_scores = cur_scores[keep]
843
+ cur_labels = cur_labels[keep]
844
+ cur_masks = cur_masks[keep]
845
+ cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
846
+ cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
847
+
848
+ predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks}
849
+ preds.append(predictions)
850
+ return preds
851
+
852
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance
853
+ def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
854
+ """
855
+ Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports
856
+ PyTorch.
857
+
858
+ Args:
859
+ results (`List[Dict]`):
860
+ Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
861
+ outputs ([`DetrSegmentationOutput`]):
862
+ Raw outputs of the model.
863
+ orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
864
+ Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
865
+ image size (before any data augmentation).
866
+ max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
867
+ Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
868
+ original image size (before any data augmentation).
869
+ threshold (`float`, *optional*, defaults to 0.5):
870
+ Threshold to use when turning the predicted masks into binary values.
871
+ Returns:
872
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
873
+ image in the batch as predicted by the model.
874
+ """
875
+ logger.warning_once(
876
+ "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
877
+ " `post_process_instance_segmentation`.",
878
+ )
879
+
880
+ if len(orig_target_sizes) != len(max_target_sizes):
881
+ raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
882
+ max_h, max_w = max_target_sizes.max(0)[0].tolist()
883
+ outputs_masks = outputs.pred_masks.squeeze(2)
884
+ outputs_masks = nn.functional.interpolate(
885
+ outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
886
+ )
887
+ outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
888
+
889
+ for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
890
+ img_h, img_w = t[0], t[1]
891
+ results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
892
+ results[i]["masks"] = nn.functional.interpolate(
893
+ results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
894
+ ).byte()
895
+
896
+ return results
897
+
898
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic
899
+ def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
900
+ """
901
+ Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
902
+
903
+ Args:
904
+ outputs ([`DetrSegmentationOutput`]):
905
+ Raw outputs of the model.
906
+ processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
907
+ Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
908
+ augmentation but before batching.
909
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*):
910
+ Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction.
911
+ If left to None, it will default to the `processed_sizes`.
912
+ is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*):
913
+ Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
914
+ If not set, defaults to the `is_thing_map` of COCO panoptic.
915
+ threshold (`float`, *optional*, defaults to 0.85):
916
+ Threshold to use to filter out queries.
917
+ Returns:
918
+ `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
919
+ an image in the batch as predicted by the model.
920
+ """
921
+ logger.warning_once(
922
+ "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use"
923
+ " `post_process_panoptic_segmentation`.",
924
+ )
925
+ if target_sizes is None:
926
+ target_sizes = processed_sizes
927
+ if len(processed_sizes) != len(target_sizes):
928
+ raise ValueError("Make sure to pass in as many processed_sizes as target_sizes")
929
+
930
+ if is_thing_map is None:
931
+ # default to is_thing_map of COCO panoptic
932
+ is_thing_map = {i: i <= 90 for i in range(201)}
933
+
934
+ out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
935
+ if not len(out_logits) == len(raw_masks) == len(target_sizes):
936
+ raise ValueError(
937
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
938
+ )
939
+ empty_label = out_logits.shape[-1] - 1
940
+ preds = []
941
+
942
+ def to_tuple(tup):
943
+ if isinstance(tup, tuple):
944
+ return tup
945
+ return tuple(tup.tolist())
946
+
947
+ for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
948
+ out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
949
+ ):
950
+ # we filter empty queries and detection below threshold
951
+ cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
952
+ keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
953
+ cur_scores = cur_scores[keep]
954
+ cur_labels = cur_labels[keep]
955
+ cur_masks = cur_masks[keep]
956
+ cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
957
+ cur_boxes = center_to_corners_format(cur_boxes[keep])
958
+
959
+ h, w = cur_masks.shape[-2:]
960
+ if len(cur_boxes) != len(cur_labels):
961
+ raise ValueError("Not as many boxes as there are classes")
962
+
963
+ # It may be that we have several predicted masks for the same stuff class.
964
+ # In the following, we track the list of masks ids for each stuff class (they are merged later on)
965
+ cur_masks = cur_masks.flatten(1)
966
+ stuff_equiv_classes = defaultdict(lambda: [])
967
+ for k, label in enumerate(cur_labels):
968
+ if not is_thing_map[label.item()]:
969
+ stuff_equiv_classes[label.item()].append(k)
970
+
971
+ def get_ids_area(masks, scores, dedup=False):
972
+ # This helper function creates the final panoptic segmentation image
973
+ # It also returns the area of the masks that appears on the image
974
+
975
+ m_id = masks.transpose(0, 1).softmax(-1)
976
+
977
+ if m_id.shape[-1] == 0:
978
+ # We didn't detect any mask :(
979
+ m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
980
+ else:
981
+ m_id = m_id.argmax(-1).view(h, w)
982
+
983
+ if dedup:
984
+ # Merge the masks corresponding to the same stuff class
985
+ for equiv in stuff_equiv_classes.values():
986
+ if len(equiv) > 1:
987
+ for eq_id in equiv:
988
+ m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
989
+
990
+ final_h, final_w = to_tuple(target_size)
991
+
992
+ seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
993
+ seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST)
994
+
995
+ np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
996
+ np_seg_img = np_seg_img.view(final_h, final_w, 3)
997
+ np_seg_img = np_seg_img.numpy()
998
+
999
+ m_id = torch.from_numpy(rgb_to_id(np_seg_img))
1000
+
1001
+ area = []
1002
+ for i in range(len(scores)):
1003
+ area.append(m_id.eq(i).sum().item())
1004
+ return area, seg_img
1005
+
1006
+ area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
1007
+ if cur_labels.numel() > 0:
1008
+                # We now filter empty masks as long as we find some
1009
+ while True:
1010
+ filtered_small = torch.as_tensor(
1011
+ [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device
1012
+ )
1013
+ if filtered_small.any().item():
1014
+ cur_scores = cur_scores[~filtered_small]
1015
+ cur_labels = cur_labels[~filtered_small]
1016
+ cur_masks = cur_masks[~filtered_small]
1017
+ area, seg_img = get_ids_area(cur_masks, cur_scores)
1018
+ else:
1019
+ break
1020
+
1021
+ else:
1022
+ cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device)
1023
+
1024
+ segments_info = []
1025
+ for i, a in enumerate(area):
1026
+ cat = cur_labels[i].item()
1027
+ segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
1028
+ del cur_labels
1029
+
1030
+ with io.BytesIO() as out:
1031
+ seg_img.save(out, format="PNG")
1032
+ predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
1033
+ preds.append(predictions)
1034
+ return preds
1035
+
1036
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_object_detection
1037
+ def post_process_object_detection(
1038
+ self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
1039
+ ):
1040
+ """
1041
+ Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
1042
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
1043
+
1044
+ Args:
1045
+ outputs ([`DetrObjectDetectionOutput`]):
1046
+ Raw outputs of the model.
1047
+ threshold (`float`, *optional*):
1048
+ Score threshold to keep object detection predictions.
1049
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
1050
+ Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
1051
+ `(height, width)` of each image in the batch. If unset, predictions will not be resized.
1052
+ Returns:
1053
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
1054
+ in the batch as predicted by the model.
1055
+ """
1056
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
1057
+
1058
+ if target_sizes is not None:
1059
+ if len(out_logits) != len(target_sizes):
1060
+ raise ValueError(
1061
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
1062
+ )
1063
+
1064
+ prob = nn.functional.softmax(out_logits, -1)
1065
+ scores, labels = prob[..., :-1].max(-1)
1066
+
1067
+ # Convert to [x0, y0, x1, y1] format
1068
+ boxes = center_to_corners_format(out_bbox)
1069
+
1070
+ # Convert from relative [0, 1] to absolute [0, height] coordinates
1071
+ if target_sizes is not None:
1072
+ if isinstance(target_sizes, List):
1073
+ img_h = torch.Tensor([i[0] for i in target_sizes])
1074
+ img_w = torch.Tensor([i[1] for i in target_sizes])
1075
+ else:
1076
+ img_h, img_w = target_sizes.unbind(1)
1077
+
1078
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
1079
+ boxes = boxes * scale_fct[:, None, :]
1080
+
1081
+ results = []
1082
+ for s, l, b in zip(scores, labels, boxes):
1083
+ score = s[s > threshold]
1084
+ label = l[s > threshold]
1085
+ box = b[s > threshold]
1086
+ results.append({"scores": score, "labels": label, "boxes": box})
1087
+
1088
+ return results
1089
+
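For reference, a minimal usage sketch of the object-detection post-processing above (illustrative only, not part of the diffed file; it assumes a hypothetical local image `cats.jpg` and the public `facebook/detr-resnet-50` checkpoint):

import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor

image = Image.open("cats.jpg")  # hypothetical local image
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes holds the original (height, width) so boxes are rescaled to pixel coordinates
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)
for score, label, box in zip(results[0]["scores"], results[0]["labels"], results[0]["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(v, 1) for v in box.tolist()])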
1090
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation
1091
+ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
1092
+ """
1093
+ Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
1094
+
1095
+ Args:
1096
+ outputs ([`DetrForSegmentation`]):
1097
+ Raw outputs of the model.
1098
+ target_sizes (`List[Tuple[int, int]]`, *optional*):
1099
+ A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
1100
+ batch. If unset, predictions will not be resized.
1101
+ Returns:
1102
+ `List[torch.Tensor]`:
1103
+ A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
1104
+ corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
1105
+ `torch.Tensor` correspond to a semantic class id.
1106
+ """
1107
+ class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1]
1108
+ masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width]
1109
+
1110
+ # Remove the null class `[..., :-1]`
1111
+ masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
1112
+ masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width]
1113
+
1114
+ # Semantic segmentation logits of shape (batch_size, num_classes, height, width)
1115
+ segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
1116
+ batch_size = class_queries_logits.shape[0]
1117
+
1118
+ # Resize logits and compute semantic segmentation maps
1119
+ if target_sizes is not None:
1120
+ if batch_size != len(target_sizes):
1121
+ raise ValueError(
1122
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
1123
+ )
1124
+
1125
+ semantic_segmentation = []
1126
+ for idx in range(batch_size):
1127
+ resized_logits = nn.functional.interpolate(
1128
+ segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
1129
+ )
1130
+ semantic_map = resized_logits[0].argmax(dim=0)
1131
+ semantic_segmentation.append(semantic_map)
1132
+ else:
1133
+ semantic_segmentation = segmentation.argmax(dim=1)
1134
+ semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
1135
+
1136
+ return semantic_segmentation
1137
+
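As an illustration (again not part of the file), a hedged sketch of calling the semantic-segmentation post-processing with the public `facebook/detr-resnet-50-panoptic` checkpoint and a hypothetical local image:

import torch
from PIL import Image
from transformers import DetrForSegmentation, DetrImageProcessor

image = Image.open("street.jpg")  # hypothetical local image
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One (height, width) tuple per image; each returned map is resized to it
maps = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])
print(maps[0].shape)  # (height, width); every entry is a semantic class id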
1138
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance_segmentation
1139
+ def post_process_instance_segmentation(
1140
+ self,
1141
+ outputs,
1142
+ threshold: float = 0.5,
1143
+ mask_threshold: float = 0.5,
1144
+ overlap_mask_area_threshold: float = 0.8,
1145
+ target_sizes: Optional[List[Tuple[int, int]]] = None,
1146
+ return_coco_annotation: Optional[bool] = False,
1147
+ ) -> List[Dict]:
1148
+ """
1149
+ Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.
1150
+
1151
+ Args:
1152
+ outputs ([`DetrForSegmentation`]):
1153
+ Raw outputs of the model.
1154
+ threshold (`float`, *optional*, defaults to 0.5):
1155
+ The probability score threshold to keep predicted instance masks.
1156
+ mask_threshold (`float`, *optional*, defaults to 0.5):
1157
+ Threshold to use when turning the predicted masks into binary values.
1158
+ overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
1159
+ The overlap mask area threshold to merge or discard small disconnected parts within each binary
1160
+ instance mask.
1161
+ target_sizes (`List[Tuple]`, *optional*):
1162
+                List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
1163
+ final size (height, width) of each prediction. If unset, predictions will not be resized.
1164
+ return_coco_annotation (`bool`, *optional*):
1165
+ Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
1166
+ format.
1167
+ Returns:
1168
+ `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
1169
+ - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
1170
+ `List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
1171
+              `True`. Set to `None` if no mask is found above `threshold`.
1172
+ - **segments_info** -- A dictionary that contains additional information on each segment.
1173
+ - **id** -- An integer representing the `segment_id`.
1174
+ - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
1175
+ - **score** -- Prediction score of segment with `segment_id`.
1176
+ """
1177
+ class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1]
1178
+ masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width]
1179
+
1180
+ batch_size = class_queries_logits.shape[0]
1181
+ num_labels = class_queries_logits.shape[-1] - 1
1182
+
1183
+ mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width]
1184
+
1185
+ # Predicted label and score of each query (batch_size, num_queries)
1186
+ pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
1187
+
1188
+ # Loop over items in batch size
1189
+ results: List[Dict[str, TensorType]] = []
1190
+
1191
+ for i in range(batch_size):
1192
+ mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
1193
+ mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
1194
+ )
1195
+
1196
+ # No mask found
1197
+ if mask_probs_item.shape[0] <= 0:
1198
+ height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
1199
+ segmentation = torch.zeros((height, width)) - 1
1200
+ results.append({"segmentation": segmentation, "segments_info": []})
1201
+ continue
1202
+
1203
+ # Get segmentation map and segment information of batch item
1204
+ target_size = target_sizes[i] if target_sizes is not None else None
1205
+ segmentation, segments = compute_segments(
1206
+ mask_probs=mask_probs_item,
1207
+ pred_scores=pred_scores_item,
1208
+ pred_labels=pred_labels_item,
1209
+ mask_threshold=mask_threshold,
1210
+ overlap_mask_area_threshold=overlap_mask_area_threshold,
1211
+ label_ids_to_fuse=[],
1212
+ target_size=target_size,
1213
+ )
1214
+
1215
+ # Return segmentation map in run-length encoding (RLE) format
1216
+ if return_coco_annotation:
1217
+ segmentation = convert_segmentation_to_rle(segmentation)
1218
+
1219
+ results.append({"segmentation": segmentation, "segments_info": segments})
1220
+ return results
1221
+
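A short, hedged example of the RLE option described above (illustrative only; it reuses the `processor` and `outputs` from the semantic-segmentation sketch earlier in this section, and the target size is hypothetical):

# Assumes `processor` and `outputs` were produced as in the semantic-segmentation sketch above.
instance_results = processor.post_process_instance_segmentation(
    outputs,
    threshold=0.5,
    target_sizes=[(480, 640)],        # hypothetical original image size
    return_coco_annotation=True,      # segmentation maps come back as COCO run-length encodings
)
for segment in instance_results[0]["segments_info"]:
    print(segment["id"], segment["label_id"], round(segment["score"], 3))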
1222
+ # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic_segmentation
1223
+ def post_process_panoptic_segmentation(
1224
+ self,
1225
+ outputs,
1226
+ threshold: float = 0.5,
1227
+ mask_threshold: float = 0.5,
1228
+ overlap_mask_area_threshold: float = 0.8,
1229
+ label_ids_to_fuse: Optional[Set[int]] = None,
1230
+ target_sizes: Optional[List[Tuple[int, int]]] = None,
1231
+ ) -> List[Dict]:
1232
+ """
1233
+ Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports
1234
+ PyTorch.
1235
+
1236
+ Args:
1237
+ outputs ([`DetrForSegmentation`]):
1238
+ The outputs from [`DetrForSegmentation`].
1239
+ threshold (`float`, *optional*, defaults to 0.5):
1240
+ The probability score threshold to keep predicted instance masks.
1241
+ mask_threshold (`float`, *optional*, defaults to 0.5):
1242
+ Threshold to use when turning the predicted masks into binary values.
1243
+ overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
1244
+ The overlap mask area threshold to merge or discard small disconnected parts within each binary
1245
+ instance mask.
1246
+ label_ids_to_fuse (`Set[int]`, *optional*):
1247
+                The labels in this set will have all their instances fused together. For instance we could say
1248
+ there can only be one sky in an image, but several persons, so the label ID for sky would be in that
1249
+ set, but not the one for person.
1250
+ target_sizes (`List[Tuple]`, *optional*):
1251
+                List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
1252
+ final size (height, width) of each prediction in batch. If unset, predictions will not be resized.
1253
+ Returns:
1254
+ `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
1255
+ - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or
1256
+              `None` if no mask is found above `threshold`. If `target_sizes` is specified, segmentation is resized to
1257
+ the corresponding `target_sizes` entry.
1258
+ - **segments_info** -- A dictionary that contains additional information on each segment.
1259
+ - **id** -- an integer representing the `segment_id`.
1260
+ - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
1261
+ - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise.
1262
+ Multiple instances of the same class / label were fused and assigned a single `segment_id`.
1263
+ - **score** -- Prediction score of segment with `segment_id`.
1264
+ """
1265
+
1266
+ if label_ids_to_fuse is None:
1267
+ logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.")
1268
+ label_ids_to_fuse = set()
1269
+
1270
+ class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1]
1271
+ masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width]
1272
+
1273
+ batch_size = class_queries_logits.shape[0]
1274
+ num_labels = class_queries_logits.shape[-1] - 1
1275
+
1276
+ mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width]
1277
+
1278
+ # Predicted label and score of each query (batch_size, num_queries)
1279
+ pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
1280
+
1281
+ # Loop over items in batch size
1282
+ results: List[Dict[str, TensorType]] = []
1283
+
1284
+ for i in range(batch_size):
1285
+ mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
1286
+ mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
1287
+ )
1288
+
1289
+ # No mask found
1290
+ if mask_probs_item.shape[0] <= 0:
1291
+ height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
1292
+ segmentation = torch.zeros((height, width)) - 1
1293
+ results.append({"segmentation": segmentation, "segments_info": []})
1294
+ continue
1295
+
1296
+ # Get segmentation map and segment information of batch item
1297
+ target_size = target_sizes[i] if target_sizes is not None else None
1298
+ segmentation, segments = compute_segments(
1299
+ mask_probs=mask_probs_item,
1300
+ pred_scores=pred_scores_item,
1301
+ pred_labels=pred_labels_item,
1302
+ mask_threshold=mask_threshold,
1303
+ overlap_mask_area_threshold=overlap_mask_area_threshold,
1304
+ label_ids_to_fuse=label_ids_to_fuse,
1305
+ target_size=target_size,
1306
+ )
1307
+
1308
+ results.append({"segmentation": segmentation, "segments_info": segments})
1309
+ return results
1310
+
1311
+
1312
+ __all__ = ["DetrImageProcessorFast"]
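For completeness, a hedged usage sketch of the panoptic variant closing out this image processor (illustrative, not part of the file; it continues from the `DetrForSegmentation` sketch above, and the fused label id and target size are hypothetical):

# Assumes `processor` and `outputs` from the semantic-segmentation sketch above.
panoptic = processor.post_process_panoptic_segmentation(
    outputs,
    threshold=0.5,
    label_ids_to_fuse={186},          # hypothetical "stuff" id whose instances should be merged
    target_sizes=[(480, 640)],        # hypothetical original image size
)
segmentation_map = panoptic[0]["segmentation"]       # (480, 640) tensor of segment ids
for segment in panoptic[0]["segments_info"]:
    print(segment["id"], segment["label_id"], segment["was_fused"], round(segment["score"], 3))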
docs/transformers/build/lib/transformers/models/detr/modeling_detr.py ADDED
@@ -0,0 +1,1815 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch DETR model."""
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import Dict, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import Tensor, nn
23
+
24
+ from ...activations import ACT2FN
25
+ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
26
+ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
27
+ from ...modeling_utils import PreTrainedModel
28
+ from ...utils import (
29
+ ModelOutput,
30
+ add_start_docstrings,
31
+ add_start_docstrings_to_model_forward,
32
+ is_timm_available,
33
+ logging,
34
+ replace_return_docstrings,
35
+ requires_backends,
36
+ )
37
+ from ...utils.backbone_utils import load_backbone
38
+ from .configuration_detr import DetrConfig
39
+
40
+
41
+ if is_timm_available():
42
+ from timm import create_model
43
+
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+ _CONFIG_FOR_DOC = "DetrConfig"
48
+ _CHECKPOINT_FOR_DOC = "facebook/detr-resnet-50"
49
+
50
+
51
+ @dataclass
52
+ class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
53
+ """
54
+ Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
55
+ namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
56
+ gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
57
+
58
+ Args:
59
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
60
+ Sequence of hidden-states at the output of the last layer of the model.
61
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
62
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
63
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
64
+ plus the initial embedding outputs.
65
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
66
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
67
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
68
+ the self-attention heads.
69
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
70
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
71
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
72
+ used to compute the weighted average in the cross-attention heads.
73
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
74
+ Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
75
+ layernorm.
76
+ """
77
+
78
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
79
+
80
+
81
+ @dataclass
82
+ class DetrModelOutput(Seq2SeqModelOutput):
83
+ """
84
+ Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
85
+ namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
86
+ gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
87
+
88
+ Args:
89
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
90
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
91
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
92
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
93
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
94
+ layer plus the initial embedding outputs.
95
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
96
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
97
+ sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
98
+ weighted average in the self-attention heads.
99
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
100
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
101
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
102
+ used to compute the weighted average in the cross-attention heads.
103
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
104
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
105
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
106
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
107
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
108
+ layer plus the initial embedding outputs.
109
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
110
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
111
+ sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
112
+ weighted average in the self-attention heads.
113
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
114
+ Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
115
+ layernorm.
116
+ """
117
+
118
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
119
+
120
+
121
+ @dataclass
122
+ class DetrObjectDetectionOutput(ModelOutput):
123
+ """
124
+ Output type of [`DetrForObjectDetection`].
125
+
126
+ Args:
127
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
128
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
129
+ bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
130
+ scale-invariant IoU loss.
131
+ loss_dict (`Dict`, *optional*):
132
+ A dictionary containing the individual losses. Useful for logging.
133
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
134
+ Classification logits (including no-object) for all queries.
135
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
136
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
137
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
138
+ possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
139
+ unnormalized bounding boxes.
140
+ auxiliary_outputs (`list[Dict]`, *optional*):
141
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
142
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
143
+ `pred_boxes`) for each decoder layer.
144
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
145
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
146
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
147
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
148
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
149
+ layer plus the initial embedding outputs.
150
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
151
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
152
+ sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
153
+ weighted average in the self-attention heads.
154
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
155
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
156
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
157
+ used to compute the weighted average in the cross-attention heads.
158
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
159
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
160
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
161
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
162
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
163
+ layer plus the initial embedding outputs.
164
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
165
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
166
+ sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
167
+ weighted average in the self-attention heads.
168
+ """
169
+
170
+ loss: Optional[torch.FloatTensor] = None
171
+ loss_dict: Optional[Dict] = None
172
+ logits: Optional[torch.FloatTensor] = None
173
+ pred_boxes: Optional[torch.FloatTensor] = None
174
+ auxiliary_outputs: Optional[List[Dict]] = None
175
+ last_hidden_state: Optional[torch.FloatTensor] = None
176
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
177
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
178
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
179
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
180
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
181
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
182
+
183
+
184
+ @dataclass
185
+ class DetrSegmentationOutput(ModelOutput):
186
+ """
187
+ Output type of [`DetrForSegmentation`].
188
+
189
+ Args:
190
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
191
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
192
+ bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
193
+ scale-invariant IoU loss.
194
+ loss_dict (`Dict`, *optional*):
195
+ A dictionary containing the individual losses. Useful for logging.
196
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
197
+ Classification logits (including no-object) for all queries.
198
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
199
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
200
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
201
+ possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
202
+ unnormalized bounding boxes.
203
+ pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
204
+ Segmentation masks logits for all queries. See also
205
+ [`~DetrImageProcessor.post_process_semantic_segmentation`] or
206
+ [`~DetrImageProcessor.post_process_instance_segmentation`]
207
+            [`~DetrImageProcessor.post_process_instance_segmentation`] or
208
+ segmentation masks respectively.
209
+ auxiliary_outputs (`list[Dict]`, *optional*):
210
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
211
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
212
+ `pred_boxes`) for each decoder layer.
213
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
214
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
215
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
216
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
217
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
218
+ layer plus the initial embedding outputs.
219
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
220
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
221
+ sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
222
+ weighted average in the self-attention heads.
223
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
224
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
225
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
226
+ used to compute the weighted average in the cross-attention heads.
227
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
228
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
229
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
230
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
231
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
232
+ layer plus the initial embedding outputs.
233
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
234
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
235
+ sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
236
+ weighted average in the self-attention heads.
237
+ """
238
+
239
+ loss: Optional[torch.FloatTensor] = None
240
+ loss_dict: Optional[Dict] = None
241
+ logits: Optional[torch.FloatTensor] = None
242
+ pred_boxes: Optional[torch.FloatTensor] = None
243
+ pred_masks: Optional[torch.FloatTensor] = None
244
+ auxiliary_outputs: Optional[List[Dict]] = None
245
+ last_hidden_state: Optional[torch.FloatTensor] = None
246
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
247
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
248
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
249
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
250
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
251
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
252
+
253
+
254
+ # BELOW: utilities copied from
255
+ # https://github.com/facebookresearch/detr/blob/master/backbone.py
256
+ class DetrFrozenBatchNorm2d(nn.Module):
257
+ """
258
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
259
+
260
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which models other than
261
+ torchvision.models.resnet[18,34,50,101] produce nans.
262
+ """
263
+
264
+ def __init__(self, n):
265
+ super().__init__()
266
+ self.register_buffer("weight", torch.ones(n))
267
+ self.register_buffer("bias", torch.zeros(n))
268
+ self.register_buffer("running_mean", torch.zeros(n))
269
+ self.register_buffer("running_var", torch.ones(n))
270
+
271
+ def _load_from_state_dict(
272
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
273
+ ):
274
+ num_batches_tracked_key = prefix + "num_batches_tracked"
275
+ if num_batches_tracked_key in state_dict:
276
+ del state_dict[num_batches_tracked_key]
277
+
278
+ super()._load_from_state_dict(
279
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
280
+ )
281
+
282
+ def forward(self, x):
283
+ # move reshapes to the beginning
284
+        # to make it fuser-friendly
285
+ weight = self.weight.reshape(1, -1, 1, 1)
286
+ bias = self.bias.reshape(1, -1, 1, 1)
287
+ running_var = self.running_var.reshape(1, -1, 1, 1)
288
+ running_mean = self.running_mean.reshape(1, -1, 1, 1)
289
+ epsilon = 1e-5
290
+ scale = weight * (running_var + epsilon).rsqrt()
291
+ bias = bias - running_mean * scale
292
+ return x * scale + bias
293
+
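A small, self-contained check (an illustrative sketch, not part of the file) that the scale/shift folding above reproduces an eval-mode `nn.BatchNorm2d` with the same `eps`:

import torch
from torch import nn

bn = nn.BatchNorm2d(8).eval()                # uses running statistics; eps defaults to 1e-5
x = torch.randn(2, 8, 4, 4)

eps = 1e-5
scale = bn.weight.reshape(1, -1, 1, 1) * (bn.running_var.reshape(1, -1, 1, 1) + eps).rsqrt()
shift = bn.bias.reshape(1, -1, 1, 1) - bn.running_mean.reshape(1, -1, 1, 1) * scale

print(torch.allclose(x * scale + shift, bn(x), atol=1e-5))  # expected: True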
294
+
295
+ def replace_batch_norm(model):
296
+ r"""
297
+ Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.
298
+
299
+ Args:
300
+ model (torch.nn.Module):
301
+ input model
302
+ """
303
+ for name, module in model.named_children():
304
+ if isinstance(module, nn.BatchNorm2d):
305
+ new_module = DetrFrozenBatchNorm2d(module.num_features)
306
+
307
+ if not module.weight.device == torch.device("meta"):
308
+ new_module.weight.data.copy_(module.weight)
309
+ new_module.bias.data.copy_(module.bias)
310
+ new_module.running_mean.data.copy_(module.running_mean)
311
+ new_module.running_var.data.copy_(module.running_var)
312
+
313
+ model._modules[name] = new_module
314
+
315
+ if len(list(module.children())) > 0:
316
+ replace_batch_norm(module)
317
+
318
+
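An illustrative sanity check (a sketch under the assumption that `replace_batch_norm` and `DetrFrozenBatchNorm2d` from this module are in scope) showing that the replacement recurses into nested submodules:

import torch
from torch import nn

net = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3),
    nn.BatchNorm2d(8),
    nn.Sequential(nn.BatchNorm2d(8)),   # nested module to exercise the recursion
)
with torch.no_grad():
    replace_batch_norm(net)

# No plain BatchNorm2d layers should remain anywhere in the module tree
print(any(isinstance(m, nn.BatchNorm2d) for m in net.modules()))  # expected: False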
319
+ class DetrConvEncoder(nn.Module):
320
+ """
321
+ Convolutional backbone, using either the AutoBackbone API or one from the timm library.
322
+
323
+ nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.
324
+
325
+ """
326
+
327
+ def __init__(self, config):
328
+ super().__init__()
329
+
330
+ self.config = config
331
+
332
+ # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API
333
+ if config.use_timm_backbone:
334
+ # We default to values which were previously hard-coded. This enables configurability from the config
335
+ # using backbone arguments, while keeping the default behavior the same.
336
+ requires_backends(self, ["timm"])
337
+ kwargs = getattr(config, "backbone_kwargs", {})
338
+ kwargs = {} if kwargs is None else kwargs.copy()
339
+ out_indices = kwargs.pop("out_indices", (1, 2, 3, 4))
340
+ num_channels = kwargs.pop("in_chans", config.num_channels)
341
+ if config.dilation:
342
+ kwargs["output_stride"] = kwargs.get("output_stride", 16)
343
+ backbone = create_model(
344
+ config.backbone,
345
+ pretrained=config.use_pretrained_backbone,
346
+ features_only=True,
347
+ out_indices=out_indices,
348
+ in_chans=num_channels,
349
+ **kwargs,
350
+ )
351
+ else:
352
+ backbone = load_backbone(config)
353
+
354
+ # replace batch norm by frozen batch norm
355
+ with torch.no_grad():
356
+ replace_batch_norm(backbone)
357
+ self.model = backbone
358
+ self.intermediate_channel_sizes = (
359
+ self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
360
+ )
361
+
362
+ backbone_model_type = None
363
+ if config.backbone is not None:
364
+ backbone_model_type = config.backbone
365
+ elif config.backbone_config is not None:
366
+ backbone_model_type = config.backbone_config.model_type
367
+ else:
368
+ raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
369
+
370
+ if "resnet" in backbone_model_type:
371
+ for name, parameter in self.model.named_parameters():
372
+ if config.use_timm_backbone:
373
+ if "layer2" not in name and "layer3" not in name and "layer4" not in name:
374
+ parameter.requires_grad_(False)
375
+ else:
376
+ if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
377
+ parameter.requires_grad_(False)
378
+
379
+ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
380
+ # send pixel_values through the model to get list of feature maps
381
+ features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
382
+
383
+ out = []
384
+ for feature_map in features:
385
+ # downsample pixel_mask to match shape of corresponding feature_map
386
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
387
+ out.append((feature_map, mask))
388
+ return out
389
+
390
+
391
+ class DetrConvModel(nn.Module):
392
+ """
393
+ This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
394
+ """
395
+
396
+ def __init__(self, conv_encoder, position_embedding):
397
+ super().__init__()
398
+ self.conv_encoder = conv_encoder
399
+ self.position_embedding = position_embedding
400
+
401
+ def forward(self, pixel_values, pixel_mask):
402
+ # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples
403
+ out = self.conv_encoder(pixel_values, pixel_mask)
404
+ pos = []
405
+ for feature_map, mask in out:
406
+ # position encoding
407
+ pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
408
+
409
+ return out, pos
410
+
411
+
412
+ class DetrSinePositionEmbedding(nn.Module):
413
+ """
414
+ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
415
+ need paper, generalized to work on images.
416
+ """
417
+
418
+ def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
419
+ super().__init__()
420
+ self.embedding_dim = embedding_dim
421
+ self.temperature = temperature
422
+ self.normalize = normalize
423
+ if scale is not None and normalize is False:
424
+ raise ValueError("normalize should be True if scale is passed")
425
+ if scale is None:
426
+ scale = 2 * math.pi
427
+ self.scale = scale
428
+
429
+ def forward(self, pixel_values, pixel_mask):
430
+ if pixel_mask is None:
431
+ raise ValueError("No pixel mask provided")
432
+ y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
433
+ x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
434
+ if self.normalize:
435
+ y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
436
+ x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
437
+
438
+ dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
439
+ dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
440
+
441
+ pos_x = x_embed[:, :, :, None] / dim_t
442
+ pos_y = y_embed[:, :, :, None] / dim_t
443
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
444
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
445
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
446
+ return pos
447
+
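A brief shape sketch (illustrative; it assumes the `DetrSinePositionEmbedding` class defined above is in scope): with `embedding_dim=128` per axis, the concatenated y/x embeddings yield a channel dimension of 256, matching DETR's default `d_model`.

import torch

pos_embed = DetrSinePositionEmbedding(embedding_dim=128, normalize=True)
pixel_values = torch.randn(2, 3, 32, 32)
pixel_mask = torch.ones(2, 32, 32, dtype=torch.long)   # all pixels valid
print(pos_embed(pixel_values, pixel_mask).shape)       # torch.Size([2, 256, 32, 32])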
448
+
449
+ class DetrLearnedPositionEmbedding(nn.Module):
450
+ """
451
+ This module learns positional embeddings up to a fixed maximum size.
452
+ """
453
+
454
+ def __init__(self, embedding_dim=256):
455
+ super().__init__()
456
+ self.row_embeddings = nn.Embedding(50, embedding_dim)
457
+ self.column_embeddings = nn.Embedding(50, embedding_dim)
458
+
459
+ def forward(self, pixel_values, pixel_mask=None):
460
+ height, width = pixel_values.shape[-2:]
461
+ width_values = torch.arange(width, device=pixel_values.device)
462
+ height_values = torch.arange(height, device=pixel_values.device)
463
+ x_emb = self.column_embeddings(width_values)
464
+ y_emb = self.row_embeddings(height_values)
465
+ pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
466
+ pos = pos.permute(2, 0, 1)
467
+ pos = pos.unsqueeze(0)
468
+ pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
469
+ return pos
470
+
471
+
472
+ def build_position_encoding(config):
473
+ n_steps = config.d_model // 2
474
+ if config.position_embedding_type == "sine":
475
+ # TODO find a better way of exposing other arguments
476
+ position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True)
477
+ elif config.position_embedding_type == "learned":
478
+ position_embedding = DetrLearnedPositionEmbedding(n_steps)
479
+ else:
480
+ raise ValueError(f"Not supported {config.position_embedding_type}")
481
+
482
+ return position_embedding
483
+
484
+
485
+ class DetrAttention(nn.Module):
486
+ """
487
+ Multi-headed attention from 'Attention Is All You Need' paper.
488
+
489
+ Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
490
+ """
491
+
492
+ def __init__(
493
+ self,
494
+ embed_dim: int,
495
+ num_heads: int,
496
+ dropout: float = 0.0,
497
+ bias: bool = True,
498
+ ):
499
+ super().__init__()
500
+ self.embed_dim = embed_dim
501
+ self.num_heads = num_heads
502
+ self.dropout = dropout
503
+ self.head_dim = embed_dim // num_heads
504
+ if self.head_dim * num_heads != self.embed_dim:
505
+ raise ValueError(
506
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
507
+ f" {num_heads})."
508
+ )
509
+ self.scaling = self.head_dim**-0.5
510
+
511
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
512
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
513
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
514
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
515
+
516
+ def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
517
+ return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
518
+
519
+ def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]):
520
+ return tensor if object_queries is None else tensor + object_queries
521
+
522
+ def forward(
523
+ self,
524
+ hidden_states: torch.Tensor,
525
+ attention_mask: Optional[torch.Tensor] = None,
526
+ object_queries: Optional[torch.Tensor] = None,
527
+ key_value_states: Optional[torch.Tensor] = None,
528
+ spatial_position_embeddings: Optional[torch.Tensor] = None,
529
+ output_attentions: bool = False,
530
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
531
+ """Input shape: Batch x Time x Channel"""
532
+ # if key_value_states are provided this layer is used as a cross-attention layer
533
+ # for the decoder
534
+ is_cross_attention = key_value_states is not None
535
+ batch_size, target_len, embed_dim = hidden_states.size()
536
+
537
+ # add position embeddings to the hidden states before projecting to queries and keys
538
+ if object_queries is not None:
539
+ hidden_states_original = hidden_states
540
+ hidden_states = self.with_pos_embed(hidden_states, object_queries)
541
+
542
+ # add key-value position embeddings to the key value states
543
+ if spatial_position_embeddings is not None:
544
+ key_value_states_original = key_value_states
545
+ key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings)
546
+
547
+ # get query proj
548
+ query_states = self.q_proj(hidden_states) * self.scaling
549
+ # get key, value proj
550
+ if is_cross_attention:
551
+ # cross_attentions
552
+ key_states = self._shape(self.k_proj(key_value_states), -1, batch_size)
553
+ value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size)
554
+ else:
555
+ # self_attention
556
+ key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
557
+ value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
558
+
559
+ proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
560
+ query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
561
+ key_states = key_states.view(*proj_shape)
562
+ value_states = value_states.view(*proj_shape)
563
+
564
+ source_len = key_states.size(1)
565
+
566
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
567
+
568
+ if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
569
+ raise ValueError(
570
+ f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
571
+ f" {attn_weights.size()}"
572
+ )
573
+
574
+ if attention_mask is not None:
575
+ if attention_mask.size() != (batch_size, 1, target_len, source_len):
576
+ raise ValueError(
577
+ f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
578
+ f" {attention_mask.size()}"
579
+ )
580
+ attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
581
+ attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
582
+
583
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
584
+
585
+ if output_attentions:
586
+ # this operation is a bit awkward, but it's required to
587
+ # make sure that attn_weights keeps its gradient.
588
+            # In order to do so, attn_weights have to be reshaped
589
+ # twice and have to be reused in the following
590
+ attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
591
+ attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
592
+ else:
593
+ attn_weights_reshaped = None
594
+
595
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
596
+
597
+ attn_output = torch.bmm(attn_probs, value_states)
598
+
599
+ if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
600
+ raise ValueError(
601
+                f"`attn_output` should be of size {(batch_size * self.num_heads, target_len, self.head_dim)}, but is"
602
+ f" {attn_output.size()}"
603
+ )
604
+
605
+ attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
606
+ attn_output = attn_output.transpose(1, 2)
607
+ attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
608
+
609
+ attn_output = self.out_proj(attn_output)
610
+
611
+ return attn_output, attn_weights_reshaped
612
+
613
+
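An illustrative shape check for this attention block (a sketch assuming the `DetrAttention` class above is in scope; the values mirror DETR's 100 object queries and 8 heads):

import torch

attn = DetrAttention(embed_dim=256, num_heads=8)
hidden_states = torch.randn(2, 100, 256)        # e.g. 100 object queries per image
object_queries = torch.randn(2, 100, 256)       # position embeddings, added to queries/keys only
out, weights = attn(hidden_states, object_queries=object_queries, output_attentions=True)
print(out.shape, weights.shape)                 # (2, 100, 256) and (2, 8, 100, 100)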
614
+ class DetrEncoderLayer(nn.Module):
615
+ def __init__(self, config: DetrConfig):
616
+ super().__init__()
617
+ self.embed_dim = config.d_model
618
+ self.self_attn = DetrAttention(
619
+ embed_dim=self.embed_dim,
620
+ num_heads=config.encoder_attention_heads,
621
+ dropout=config.attention_dropout,
622
+ )
623
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
624
+ self.dropout = config.dropout
625
+ self.activation_fn = ACT2FN[config.activation_function]
626
+ self.activation_dropout = config.activation_dropout
627
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
628
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
629
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
630
+
631
+ def forward(
632
+ self,
633
+ hidden_states: torch.Tensor,
634
+ attention_mask: torch.Tensor,
635
+ object_queries: Optional[torch.Tensor] = None,
636
+ output_attentions: bool = False,
637
+ ):
638
+ """
639
+ Args:
640
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
641
+ attention_mask (`torch.FloatTensor`): attention mask of size
642
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
643
+ values.
644
+ object_queries (`torch.FloatTensor`, *optional*):
645
+ Object queries (also called content embeddings), to be added to the hidden states.
646
+ output_attentions (`bool`, *optional*):
647
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
648
+ returned tensors for more detail.
649
+ """
650
+ residual = hidden_states
651
+ hidden_states, attn_weights = self.self_attn(
652
+ hidden_states=hidden_states,
653
+ attention_mask=attention_mask,
654
+ object_queries=object_queries,
655
+ output_attentions=output_attentions,
656
+ )
657
+
658
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
659
+ hidden_states = residual + hidden_states
660
+ hidden_states = self.self_attn_layer_norm(hidden_states)
661
+
662
+ residual = hidden_states
663
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
664
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
665
+
666
+ hidden_states = self.fc2(hidden_states)
667
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
668
+
669
+ hidden_states = residual + hidden_states
670
+ hidden_states = self.final_layer_norm(hidden_states)
671
+
672
+ if self.training:
673
+ if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
674
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
675
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
676
+
677
+ outputs = (hidden_states,)
678
+
679
+ if output_attentions:
680
+ outputs += (attn_weights,)
681
+
682
+ return outputs
683
+
684
+
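A minimal forward-pass sketch for a single encoder layer (illustrative; it assumes the `DetrEncoderLayer` class above is in scope and uses `DetrConfig` defaults, i.e. `d_model=256` with 8 encoder attention heads):

import torch
from transformers import DetrConfig

config = DetrConfig()                                  # default d_model=256
layer = DetrEncoderLayer(config)
hidden_states = torch.randn(1, 850, config.d_model)    # flattened backbone feature-map tokens
object_queries = torch.randn(1, 850, config.d_model)   # sine position embeddings for those tokens
outputs = layer(hidden_states, attention_mask=None, object_queries=object_queries)
print(outputs[0].shape)                                # torch.Size([1, 850, 256])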
685
+ class DetrDecoderLayer(nn.Module):
686
+ def __init__(self, config: DetrConfig):
687
+ super().__init__()
688
+ self.embed_dim = config.d_model
689
+
690
+ self.self_attn = DetrAttention(
691
+ embed_dim=self.embed_dim,
692
+ num_heads=config.decoder_attention_heads,
693
+ dropout=config.attention_dropout,
694
+ )
695
+ self.dropout = config.dropout
696
+ self.activation_fn = ACT2FN[config.activation_function]
697
+ self.activation_dropout = config.activation_dropout
698
+
699
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
700
+ self.encoder_attn = DetrAttention(
701
+ self.embed_dim,
702
+ config.decoder_attention_heads,
703
+ dropout=config.attention_dropout,
704
+ )
705
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
706
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
707
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
708
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
709
+
710
+ def forward(
711
+ self,
712
+ hidden_states: torch.Tensor,
713
+ attention_mask: Optional[torch.Tensor] = None,
714
+ object_queries: Optional[torch.Tensor] = None,
715
+ query_position_embeddings: Optional[torch.Tensor] = None,
716
+ encoder_hidden_states: Optional[torch.Tensor] = None,
717
+ encoder_attention_mask: Optional[torch.Tensor] = None,
718
+ output_attentions: Optional[bool] = False,
719
+ ):
720
+ """
721
+ Args:
722
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
723
+ attention_mask (`torch.FloatTensor`): attention mask of size
724
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
725
+ values.
726
+ object_queries (`torch.FloatTensor`, *optional*):
727
+ object_queries that are added to the hidden states
728
+ in the cross-attention layer.
729
+ query_position_embeddings (`torch.FloatTensor`, *optional*):
730
+ position embeddings that are added to the queries and keys
731
+ in the self-attention layer.
732
+ encoder_hidden_states (`torch.FloatTensor`):
733
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
734
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
735
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
736
+ values.
737
+ output_attentions (`bool`, *optional*):
738
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
739
+ returned tensors for more detail.
740
+ """
741
+ residual = hidden_states
742
+
743
+ # Self Attention
744
+ hidden_states, self_attn_weights = self.self_attn(
745
+ hidden_states=hidden_states,
746
+ object_queries=query_position_embeddings,
747
+ attention_mask=attention_mask,
748
+ output_attentions=output_attentions,
749
+ )
750
+
751
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
752
+ hidden_states = residual + hidden_states
753
+ hidden_states = self.self_attn_layer_norm(hidden_states)
754
+
755
+ # Cross-Attention Block
756
+ cross_attn_weights = None
757
+ if encoder_hidden_states is not None:
758
+ residual = hidden_states
759
+
760
+ hidden_states, cross_attn_weights = self.encoder_attn(
761
+ hidden_states=hidden_states,
762
+ object_queries=query_position_embeddings,
763
+ key_value_states=encoder_hidden_states,
764
+ attention_mask=encoder_attention_mask,
765
+ spatial_position_embeddings=object_queries,
766
+ output_attentions=output_attentions,
767
+ )
768
+
769
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
770
+ hidden_states = residual + hidden_states
771
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
772
+
773
+ # Fully Connected
774
+ residual = hidden_states
775
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
776
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
777
+ hidden_states = self.fc2(hidden_states)
778
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
779
+ hidden_states = residual + hidden_states
780
+ hidden_states = self.final_layer_norm(hidden_states)
781
+
782
+ outputs = (hidden_states,)
783
+
784
+ if output_attentions:
785
+ outputs += (self_attn_weights, cross_attn_weights)
786
+
787
+ return outputs
788
+
789
+
790
+ class DetrPreTrainedModel(PreTrainedModel):
791
+ config_class = DetrConfig
792
+ base_model_prefix = "model"
793
+ main_input_name = "pixel_values"
794
+ _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"]
795
+
796
+ def _init_weights(self, module):
797
+ std = self.config.init_std
798
+ xavier_std = self.config.init_xavier_std
799
+
800
+ if isinstance(module, DetrMHAttentionMap):
801
+ nn.init.zeros_(module.k_linear.bias)
802
+ nn.init.zeros_(module.q_linear.bias)
803
+ nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
804
+ nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
805
+ elif isinstance(module, DetrLearnedPositionEmbedding):
806
+ nn.init.uniform_(module.row_embeddings.weight)
807
+ nn.init.uniform_(module.column_embeddings.weight)
808
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
809
+ # Slightly different from the TF version which uses truncated_normal for initialization
810
+ # cf https://github.com/pytorch/pytorch/pull/5617
811
+ module.weight.data.normal_(mean=0.0, std=std)
812
+ if module.bias is not None:
813
+ module.bias.data.zero_()
814
+ elif isinstance(module, nn.Embedding):
815
+ module.weight.data.normal_(mean=0.0, std=std)
816
+ if module.padding_idx is not None:
817
+ module.weight.data[module.padding_idx].zero_()
818
+
819
+
820
+ DETR_START_DOCSTRING = r"""
821
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
822
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
823
+ etc.)
824
+
825
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
826
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
827
+ and behavior.
828
+
829
+ Parameters:
830
+ config ([`DetrConfig`]):
831
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
832
+ load the weights associated with the model, only the configuration. Check out the
833
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
834
+ """
835
+
836
+ DETR_INPUTS_DOCSTRING = r"""
837
+ Args:
838
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
839
+ Pixel values. Padding will be ignored by default should you provide it.
840
+
841
+ Pixel values can be obtained using [`AutoImageProcessor`]. See [`DetrImageProcessor.__call__`] for details.
842
+
843
+ pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
844
+ Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
845
+
846
+ - 1 for pixels that are real (i.e. **not masked**),
847
+ - 0 for pixels that are padding (i.e. **masked**).
848
+
849
+ [What are attention masks?](../glossary#attention-mask)
850
+
851
+ decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
852
+ Not used by default. Can be used to mask object queries.
853
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
854
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
855
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
856
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
857
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
858
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
859
+ can choose to directly pass a flattened representation of an image.
860
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
861
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
862
+ embedded representation.
863
+ output_attentions (`bool`, *optional*):
864
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
865
+ tensors for more detail.
866
+ output_hidden_states (`bool`, *optional*):
867
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
868
+ more detail.
869
+ return_dict (`bool`, *optional*):
870
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
871
+ """
872
+
873
+
874
+ class DetrEncoder(DetrPreTrainedModel):
875
+ """
876
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
877
+ [`DetrEncoderLayer`].
878
+
879
+ The encoder updates the flattened feature map through multiple self-attention layers.
880
+
881
+ Small tweak for DETR:
882
+
883
+ - object_queries are added to the forward pass.
884
+
885
+ Args:
886
+ config: DetrConfig
887
+ """
888
+
889
+ def __init__(self, config: DetrConfig):
890
+ super().__init__(config)
891
+
892
+ self.dropout = config.dropout
893
+ self.layerdrop = config.encoder_layerdrop
894
+
895
+ self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])
896
+
897
+ # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
898
+
899
+ # Initialize weights and apply final processing
900
+ self.post_init()
901
+
902
+ def forward(
903
+ self,
904
+ inputs_embeds=None,
905
+ attention_mask=None,
906
+ object_queries=None,
907
+ output_attentions=None,
908
+ output_hidden_states=None,
909
+ return_dict=None,
910
+ ):
911
+ r"""
912
+ Args:
913
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
914
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
915
+
916
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
917
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
918
+
919
+ - 1 for pixel features that are real (i.e. **not masked**),
920
+ - 0 for pixel features that are padding (i.e. **masked**).
921
+
922
+ [What are attention masks?](../glossary#attention-mask)
923
+
924
+ object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
925
+ Object queries that are added to the queries in each self-attention layer.
926
+
927
+ output_attentions (`bool`, *optional*):
928
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
929
+ returned tensors for more detail.
930
+ output_hidden_states (`bool`, *optional*):
931
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
932
+ for more detail.
933
+ return_dict (`bool`, *optional*):
934
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
935
+ """
936
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
937
+ output_hidden_states = (
938
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
939
+ )
940
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
941
+
942
+ hidden_states = inputs_embeds
943
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
944
+
945
+ # expand attention_mask
946
+ if attention_mask is not None:
947
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
948
+ attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
949
+
950
+ encoder_states = () if output_hidden_states else None
951
+ all_attentions = () if output_attentions else None
952
+ for i, encoder_layer in enumerate(self.layers):
953
+ if output_hidden_states:
954
+ encoder_states = encoder_states + (hidden_states,)
955
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
956
+ to_drop = False
957
+ if self.training:
958
+ dropout_probability = torch.rand([])
959
+ if dropout_probability < self.layerdrop: # skip the layer
960
+ to_drop = True
961
+
962
+ if to_drop:
963
+ layer_outputs = (None, None)
964
+ else:
965
+ # we add object_queries as extra input to the encoder_layer
966
+ layer_outputs = encoder_layer(
967
+ hidden_states,
968
+ attention_mask,
969
+ object_queries=object_queries,
970
+ output_attentions=output_attentions,
971
+ )
972
+
973
+ hidden_states = layer_outputs[0]
974
+
975
+ if output_attentions:
976
+ all_attentions = all_attentions + (layer_outputs[1],)
977
+
978
+ if output_hidden_states:
979
+ encoder_states = encoder_states + (hidden_states,)
980
+
981
+ if not return_dict:
982
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
983
+ return BaseModelOutput(
984
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
985
+ )
986
+
987
+
988
+ class DetrDecoder(DetrPreTrainedModel):
989
+ """
990
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].
991
+
992
+ The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
993
+
994
+ Some small tweaks for DETR:
995
+
996
+ - object_queries and query_position_embeddings are added to the forward pass.
997
+ - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
998
+
999
+ Args:
1000
+ config: DetrConfig
1001
+ """
1002
+
1003
+ def __init__(self, config: DetrConfig):
1004
+ super().__init__(config)
1005
+ self.dropout = config.dropout
1006
+ self.layerdrop = config.decoder_layerdrop
1007
+
1008
+ self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])
1009
+ # in DETR, the decoder uses layernorm after the last decoder layer output
1010
+ self.layernorm = nn.LayerNorm(config.d_model)
1011
+
1012
+ self.gradient_checkpointing = False
1013
+ # Initialize weights and apply final processing
1014
+ self.post_init()
1015
+
1016
+ def forward(
1017
+ self,
1018
+ inputs_embeds=None,
1019
+ attention_mask=None,
1020
+ encoder_hidden_states=None,
1021
+ encoder_attention_mask=None,
1022
+ object_queries=None,
1023
+ query_position_embeddings=None,
1024
+ output_attentions=None,
1025
+ output_hidden_states=None,
1026
+ return_dict=None,
1027
+ ):
1028
+ r"""
1029
+ Args:
1030
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
1031
+ The query embeddings that are passed into the decoder.
1032
+
1033
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1034
+ Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
1035
+
1036
+ - 1 for queries that are **not masked**,
1037
+ - 0 for queries that are **masked**.
1038
+
1039
+ [What are attention masks?](../glossary#attention-mask)
1040
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
1041
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
1042
+ of the decoder.
1043
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
1044
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
1045
+ in `[0, 1]`:
1046
+
1047
+ - 1 for pixels that are real (i.e. **not masked**),
1048
+ - 0 for pixels that are padding (i.e. **masked**).
1049
+
1050
+ object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1051
+ Object queries that are added to the queries and keys in each cross-attention layer.
1052
+ query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
1053
+ Position embeddings that are added to the queries and keys in each self-attention layer.
1054
+
1055
+ output_attentions (`bool`, *optional*):
1056
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1057
+ returned tensors for more detail.
1058
+ output_hidden_states (`bool`, *optional*):
1059
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1060
+ for more detail.
1061
+ return_dict (`bool`, *optional*):
1062
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1063
+ """
1064
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1065
+ output_hidden_states = (
1066
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1067
+ )
1068
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1069
+
1070
+ if inputs_embeds is not None:
1071
+ hidden_states = inputs_embeds
1072
+ input_shape = inputs_embeds.size()[:-1]
1073
+
1074
+ combined_attention_mask = None
1075
+
1076
+ if attention_mask is not None and combined_attention_mask is not None:
1077
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
1078
+ combined_attention_mask = combined_attention_mask + _prepare_4d_attention_mask(
1079
+ attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1080
+ )
1081
+
1082
+ # expand encoder attention mask
1083
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
1084
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
1085
+ encoder_attention_mask = _prepare_4d_attention_mask(
1086
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1087
+ )
1088
+
1089
+ # optional intermediate hidden states
1090
+ intermediate = () if self.config.auxiliary_loss else None
1091
+
1092
+ # decoder layers
1093
+ all_hidden_states = () if output_hidden_states else None
1094
+ all_self_attns = () if output_attentions else None
1095
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
1096
+
1097
+ for idx, decoder_layer in enumerate(self.layers):
1098
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1099
+ if output_hidden_states:
1100
+ all_hidden_states += (hidden_states,)
1101
+ if self.training:
1102
+ dropout_probability = torch.rand([])
1103
+ if dropout_probability < self.layerdrop:
1104
+ continue
1105
+
1106
+ if self.gradient_checkpointing and self.training:
1107
+ layer_outputs = self._gradient_checkpointing_func(
1108
+ decoder_layer.__call__,
1109
+ hidden_states,
1110
+ combined_attention_mask,
1111
+ encoder_hidden_states,
1112
+ encoder_attention_mask,
1113
+ None,
1114
+ )
1115
+ else:
1116
+ layer_outputs = decoder_layer(
1117
+ hidden_states,
1118
+ attention_mask=combined_attention_mask,
1119
+ object_queries=object_queries,
1120
+ query_position_embeddings=query_position_embeddings,
1121
+ encoder_hidden_states=encoder_hidden_states,
1122
+ encoder_attention_mask=encoder_attention_mask,
1123
+ output_attentions=output_attentions,
1124
+ )
1125
+
1126
+ hidden_states = layer_outputs[0]
1127
+
1128
+ if self.config.auxiliary_loss:
1129
+ hidden_states = self.layernorm(hidden_states)
1130
+ intermediate += (hidden_states,)
1131
+
1132
+ if output_attentions:
1133
+ all_self_attns += (layer_outputs[1],)
1134
+
1135
+ if encoder_hidden_states is not None:
1136
+ all_cross_attentions += (layer_outputs[2],)
1137
+
1138
+ # finally, apply layernorm
1139
+ hidden_states = self.layernorm(hidden_states)
1140
+
1141
+ # add hidden states from the last decoder layer
1142
+ if output_hidden_states:
1143
+ all_hidden_states += (hidden_states,)
1144
+
1145
+ # stack intermediate decoder activations
1146
+ if self.config.auxiliary_loss:
1147
+ intermediate = torch.stack(intermediate)
1148
+
1149
+ if not return_dict:
1150
+ return tuple(
1151
+ v
1152
+ for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate]
1153
+ if v is not None
1154
+ )
1155
+ return DetrDecoderOutput(
1156
+ last_hidden_state=hidden_states,
1157
+ hidden_states=all_hidden_states,
1158
+ attentions=all_self_attns,
1159
+ cross_attentions=all_cross_attentions,
1160
+ intermediate_hidden_states=intermediate,
1161
+ )
1162
+
1163
+
1164
+ @add_start_docstrings(
1165
+ """
1166
+ The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
1167
+ any specific head on top.
1168
+ """,
1169
+ DETR_START_DOCSTRING,
1170
+ )
1171
+ class DetrModel(DetrPreTrainedModel):
1172
+ def __init__(self, config: DetrConfig):
1173
+ super().__init__(config)
1174
+
1175
+ # Create backbone + positional encoding
1176
+ backbone = DetrConvEncoder(config)
1177
+ object_queries = build_position_encoding(config)
1178
+ self.backbone = DetrConvModel(backbone, object_queries)
1179
+
1180
+ # Create projection layer
1181
+ self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
1182
+
1183
+ self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
1184
+
1185
+ self.encoder = DetrEncoder(config)
1186
+ self.decoder = DetrDecoder(config)
1187
+
1188
+ # Initialize weights and apply final processing
1189
+ self.post_init()
1190
+
1191
+ def get_encoder(self):
1192
+ return self.encoder
1193
+
1194
+ def get_decoder(self):
1195
+ return self.decoder
1196
+
1197
+ def freeze_backbone(self):
1198
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
1199
+ param.requires_grad_(False)
1200
+
1201
+ def unfreeze_backbone(self):
1202
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
1203
+ param.requires_grad_(True)
1204
+
1205
+ @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
1206
+ @replace_return_docstrings(output_type=DetrModelOutput, config_class=_CONFIG_FOR_DOC)
1207
+ def forward(
1208
+ self,
1209
+ pixel_values: torch.FloatTensor,
1210
+ pixel_mask: Optional[torch.LongTensor] = None,
1211
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
1212
+ encoder_outputs: Optional[torch.FloatTensor] = None,
1213
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1214
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1215
+ output_attentions: Optional[bool] = None,
1216
+ output_hidden_states: Optional[bool] = None,
1217
+ return_dict: Optional[bool] = None,
1218
+ ) -> Union[Tuple[torch.FloatTensor], DetrModelOutput]:
1219
+ r"""
1220
+ Returns:
1221
+
1222
+ Examples:
1223
+
1224
+ ```python
1225
+ >>> from transformers import AutoImageProcessor, DetrModel
1226
+ >>> from PIL import Image
1227
+ >>> import requests
1228
+
1229
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1230
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1231
+
1232
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
1233
+ >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")
1234
+
1235
+ >>> # prepare image for the model
1236
+ >>> inputs = image_processor(images=image, return_tensors="pt")
1237
+
1238
+ >>> # forward pass
1239
+ >>> outputs = model(**inputs)
1240
+
1241
+ >>> # the last hidden states are the final query embeddings of the Transformer decoder
1242
+ >>> # these are of shape (batch_size, num_queries, hidden_size)
1243
+ >>> last_hidden_states = outputs.last_hidden_state
1244
+ >>> list(last_hidden_states.shape)
1245
+ [1, 100, 256]
1246
+ ```"""
1247
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1248
+ output_hidden_states = (
1249
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1250
+ )
1251
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1252
+
1253
+ batch_size, num_channels, height, width = pixel_values.shape
1254
+ device = pixel_values.device
1255
+
1256
+ if pixel_mask is None:
1257
+ pixel_mask = torch.ones(((batch_size, height, width)), device=device)
1258
+
1259
+ # First, send pixel_values + pixel_mask through the backbone to obtain the features
1260
+ # pixel_values should be of shape (batch_size, num_channels, height, width)
1261
+ # pixel_mask should be of shape (batch_size, height, width)
1262
+ features, object_queries_list = self.backbone(pixel_values, pixel_mask)
1263
+
1264
+ # get final feature map and downsampled mask
1265
+ feature_map, mask = features[-1]
1266
+
1267
+ if mask is None:
1268
+ raise ValueError("Backbone does not return downsampled pixel mask")
1269
+
1270
+ # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
1271
+ projected_feature_map = self.input_projection(feature_map)
1272
+
1273
+ # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
1274
+ # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
1275
+ flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
1276
+ object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
1277
+
1278
+ flattened_mask = mask.flatten(1)
1279
+
1280
+ # Fourth, send flattened_features + flattened_mask + position embeddings through the encoder
1281
+ # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
1282
+ # flattened_mask is a Tensor of shape (batch_size, height*width)
1283
+ if encoder_outputs is None:
1284
+ encoder_outputs = self.encoder(
1285
+ inputs_embeds=flattened_features,
1286
+ attention_mask=flattened_mask,
1287
+ object_queries=object_queries,
1288
+ output_attentions=output_attentions,
1289
+ output_hidden_states=output_hidden_states,
1290
+ return_dict=return_dict,
1291
+ )
1292
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
1293
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1294
+ encoder_outputs = BaseModelOutput(
1295
+ last_hidden_state=encoder_outputs[0],
1296
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1297
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1298
+ )
1299
+
1300
+ # Fifth, send query embeddings + object_queries through the decoder (which is conditioned on the encoder output)
1301
+ query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
1302
+ queries = torch.zeros_like(query_position_embeddings)
1303
+
1304
+ # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
1305
+ decoder_outputs = self.decoder(
1306
+ inputs_embeds=queries,
1307
+ attention_mask=None,
1308
+ object_queries=object_queries,
1309
+ query_position_embeddings=query_position_embeddings,
1310
+ encoder_hidden_states=encoder_outputs[0],
1311
+ encoder_attention_mask=flattened_mask,
1312
+ output_attentions=output_attentions,
1313
+ output_hidden_states=output_hidden_states,
1314
+ return_dict=return_dict,
1315
+ )
1316
+
1317
+ if not return_dict:
1318
+ return decoder_outputs + encoder_outputs
1319
+
1320
+ return DetrModelOutput(
1321
+ last_hidden_state=decoder_outputs.last_hidden_state,
1322
+ decoder_hidden_states=decoder_outputs.hidden_states,
1323
+ decoder_attentions=decoder_outputs.attentions,
1324
+ cross_attentions=decoder_outputs.cross_attentions,
1325
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1326
+ encoder_hidden_states=encoder_outputs.hidden_states,
1327
+ encoder_attentions=encoder_outputs.attentions,
1328
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
1329
+ )
1330
+
1331
+
1332
+ # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
1333
+ class DetrMLPPredictionHead(nn.Module):
1334
+ """
1335
+ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
1336
+ height and width of a bounding box w.r.t. an image.
1337
+
1338
+ Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
1339
+
1340
+ """
1341
+
1342
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
1343
+ super().__init__()
1344
+ self.num_layers = num_layers
1345
+ h = [hidden_dim] * (num_layers - 1)
1346
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
1347
+
1348
+ def forward(self, x):
1349
+ for i, layer in enumerate(self.layers):
1350
+ x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
1351
+ return x
1352
+
1353
+
1354
+ @add_start_docstrings(
1355
+ """
1356
+ DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
1357
+ such as COCO detection.
1358
+ """,
1359
+ DETR_START_DOCSTRING,
1360
+ )
1361
+ class DetrForObjectDetection(DetrPreTrainedModel):
1362
+ def __init__(self, config: DetrConfig):
1363
+ super().__init__(config)
1364
+
1365
+ # DETR encoder-decoder model
1366
+ self.model = DetrModel(config)
1367
+
1368
+ # Object detection heads
1369
+ self.class_labels_classifier = nn.Linear(
1370
+ config.d_model, config.num_labels + 1
1371
+ ) # We add one for the "no object" class
1372
+ self.bbox_predictor = DetrMLPPredictionHead(
1373
+ input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
1374
+ )
1375
+
1376
+ # Initialize weights and apply final processing
1377
+ self.post_init()
1378
+
1379
+ @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
1380
+ @replace_return_docstrings(output_type=DetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
1381
+ def forward(
1382
+ self,
1383
+ pixel_values: torch.FloatTensor,
1384
+ pixel_mask: Optional[torch.LongTensor] = None,
1385
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
1386
+ encoder_outputs: Optional[torch.FloatTensor] = None,
1387
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1388
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1389
+ labels: Optional[List[dict]] = None,
1390
+ output_attentions: Optional[bool] = None,
1391
+ output_hidden_states: Optional[bool] = None,
1392
+ return_dict: Optional[bool] = None,
1393
+ ) -> Union[Tuple[torch.FloatTensor], DetrObjectDetectionOutput]:
1394
+ r"""
1395
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
1396
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
1397
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
1398
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
1399
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
1400
+
1401
+ Returns:
1402
+
1403
+ Examples:
1404
+
1405
+ ```python
1406
+ >>> from transformers import AutoImageProcessor, DetrForObjectDetection
1407
+ >>> import torch
1408
+ >>> from PIL import Image
1409
+ >>> import requests
1410
+
1411
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1412
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1413
+
1414
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
1415
+ >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
1416
+
1417
+ >>> inputs = image_processor(images=image, return_tensors="pt")
1418
+ >>> outputs = model(**inputs)
1419
+
1420
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
1421
+ >>> target_sizes = torch.tensor([image.size[::-1]])
1422
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
1423
+ ... 0
1424
+ ... ]
1425
+
1426
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
1427
+ ... box = [round(i, 2) for i in box.tolist()]
1428
+ ... print(
1429
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
1430
+ ... f"{round(score.item(), 3)} at location {box}"
1431
+ ... )
1432
+ Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
1433
+ Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
1434
+ Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
1435
+ Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
1436
+ Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]
1437
+ ```"""
1438
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1439
+
1440
+ # First, send images through the DETR base model to obtain encoder + decoder outputs
1441
+ outputs = self.model(
1442
+ pixel_values,
1443
+ pixel_mask=pixel_mask,
1444
+ decoder_attention_mask=decoder_attention_mask,
1445
+ encoder_outputs=encoder_outputs,
1446
+ inputs_embeds=inputs_embeds,
1447
+ decoder_inputs_embeds=decoder_inputs_embeds,
1448
+ output_attentions=output_attentions,
1449
+ output_hidden_states=output_hidden_states,
1450
+ return_dict=return_dict,
1451
+ )
1452
+
1453
+ sequence_output = outputs[0]
1454
+
1455
+ # class logits + predicted bounding boxes
1456
+ logits = self.class_labels_classifier(sequence_output)
1457
+ pred_boxes = self.bbox_predictor(sequence_output).sigmoid()
1458
+
1459
+ loss, loss_dict, auxiliary_outputs = None, None, None
1460
+ if labels is not None:
1461
+ outputs_class, outputs_coord = None, None
1462
+ if self.config.auxiliary_loss:
1463
+ intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
1464
+ outputs_class = self.class_labels_classifier(intermediate)
1465
+ outputs_coord = self.bbox_predictor(intermediate).sigmoid()
1466
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
1467
+ logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
1468
+ )
1469
+
1470
+ if not return_dict:
1471
+ if auxiliary_outputs is not None:
1472
+ output = (logits, pred_boxes) + auxiliary_outputs + outputs
1473
+ else:
1474
+ output = (logits, pred_boxes) + outputs
1475
+ return ((loss, loss_dict) + output) if loss is not None else output
1476
+
1477
+ return DetrObjectDetectionOutput(
1478
+ loss=loss,
1479
+ loss_dict=loss_dict,
1480
+ logits=logits,
1481
+ pred_boxes=pred_boxes,
1482
+ auxiliary_outputs=auxiliary_outputs,
1483
+ last_hidden_state=outputs.last_hidden_state,
1484
+ decoder_hidden_states=outputs.decoder_hidden_states,
1485
+ decoder_attentions=outputs.decoder_attentions,
1486
+ cross_attentions=outputs.cross_attentions,
1487
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1488
+ encoder_hidden_states=outputs.encoder_hidden_states,
1489
+ encoder_attentions=outputs.encoder_attentions,
1490
+ )
1491
+
1492
+
1493
+ @add_start_docstrings(
1494
+ """
1495
+ DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
1496
+ such as COCO panoptic.
1497
+
1498
+ """,
1499
+ DETR_START_DOCSTRING,
1500
+ )
1501
+ class DetrForSegmentation(DetrPreTrainedModel):
1502
+ def __init__(self, config: DetrConfig):
1503
+ super().__init__(config)
1504
+
1505
+ # object detection model
1506
+ self.detr = DetrForObjectDetection(config)
1507
+
1508
+ # segmentation head
1509
+ hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
1510
+ intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes
1511
+
1512
+ self.mask_head = DetrMaskHeadSmallConv(
1513
+ hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
1514
+ )
1515
+
1516
+ self.bbox_attention = DetrMHAttentionMap(
1517
+ hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
1518
+ )
1519
+ # Initialize weights and apply final processing
1520
+ self.post_init()
1521
+
1522
+ @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
1523
+ @replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
1524
+ def forward(
1525
+ self,
1526
+ pixel_values: torch.FloatTensor,
1527
+ pixel_mask: Optional[torch.LongTensor] = None,
1528
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
1529
+ encoder_outputs: Optional[torch.FloatTensor] = None,
1530
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1531
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1532
+ labels: Optional[List[dict]] = None,
1533
+ output_attentions: Optional[bool] = None,
1534
+ output_hidden_states: Optional[bool] = None,
1535
+ return_dict: Optional[bool] = None,
1536
+ ) -> Union[Tuple[torch.FloatTensor], DetrSegmentationOutput]:
1537
+ r"""
1538
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
1539
+ Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
1540
+ dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
1541
+ bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
1542
+ should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
1543
+ `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
1544
+ `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.
1545
+
1546
+ Returns:
1547
+
1548
+ Examples:
1549
+
1550
+ ```python
1551
+ >>> import io
1552
+ >>> import requests
1553
+ >>> from PIL import Image
1554
+ >>> import torch
1555
+ >>> import numpy
1556
+
1557
+ >>> from transformers import AutoImageProcessor, DetrForSegmentation
1558
+ >>> from transformers.image_transforms import rgb_to_id
1559
+
1560
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1561
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1562
+
1563
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
1564
+ >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")
1565
+
1566
+ >>> # prepare image for the model
1567
+ >>> inputs = image_processor(images=image, return_tensors="pt")
1568
+
1569
+ >>> # forward pass
1570
+ >>> outputs = model(**inputs)
1571
+
1572
+ >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps
1573
+ >>> # Segmentation results are returned as a list of dictionaries
1574
+ >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])
1575
+
1576
+ >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
1577
+ >>> panoptic_seg = result[0]["segmentation"]
1578
+ >>> # Get prediction score and segment_id to class_id mapping of each segment
1579
+ >>> panoptic_segments_info = result[0]["segments_info"]
1580
+ ```"""
1581
+
1582
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1583
+
1584
+ batch_size, num_channels, height, width = pixel_values.shape
1585
+ device = pixel_values.device
1586
+
1587
+ if pixel_mask is None:
1588
+ pixel_mask = torch.ones((batch_size, height, width), device=device)
1589
+
1590
+ # First, get list of feature maps and position embeddings
1591
+ features, object_queries_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask)
1592
+
1593
+ # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
1594
+ feature_map, mask = features[-1]
1595
+ batch_size, num_channels, height, width = feature_map.shape
1596
+ projected_feature_map = self.detr.model.input_projection(feature_map)
1597
+
1598
+ # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
1599
+ # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
1600
+ flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
1601
+ object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
1602
+
1603
+ flattened_mask = mask.flatten(1)
1604
+
1605
+ # Fourth, send flattened_features + flattened_mask + position embeddings through the encoder
1606
+ # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
1607
+ # flattened_mask is a Tensor of shape (batch_size, height*width)
1608
+ if encoder_outputs is None:
1609
+ encoder_outputs = self.detr.model.encoder(
1610
+ inputs_embeds=flattened_features,
1611
+ attention_mask=flattened_mask,
1612
+ object_queries=object_queries,
1613
+ output_attentions=output_attentions,
1614
+ output_hidden_states=output_hidden_states,
1615
+ return_dict=return_dict,
1616
+ )
1617
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
1618
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1619
+ encoder_outputs = BaseModelOutput(
1620
+ last_hidden_state=encoder_outputs[0],
1621
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1622
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1623
+ )
1624
+
1625
+ # Fifth, send query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
1626
+ query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat(
1627
+ batch_size, 1, 1
1628
+ )
1629
+ queries = torch.zeros_like(query_position_embeddings)
1630
+
1631
+ # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
1632
+ decoder_outputs = self.detr.model.decoder(
1633
+ inputs_embeds=queries,
1634
+ attention_mask=None,
1635
+ object_queries=object_queries,
1636
+ query_position_embeddings=query_position_embeddings,
1637
+ encoder_hidden_states=encoder_outputs[0],
1638
+ encoder_attention_mask=flattened_mask,
1639
+ output_attentions=output_attentions,
1640
+ output_hidden_states=output_hidden_states,
1641
+ return_dict=return_dict,
1642
+ )
1643
+
1644
+ sequence_output = decoder_outputs[0]
1645
+
1646
+ # Sixth, compute logits, pred_boxes and pred_masks
1647
+ logits = self.detr.class_labels_classifier(sequence_output)
1648
+ pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid()
1649
+
1650
+ memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width)
1651
+ mask = flattened_mask.view(batch_size, height, width)
1652
+
1653
+ # FIXME h_boxes takes the last one computed, keep this in mind
1654
+ # important: we need to reverse the mask, since in the original implementation the mask works reversed
1655
+ # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32)
1656
+ bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask)
1657
+
1658
+ seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]])
1659
+
1660
+ pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
1661
+
1662
+ loss, loss_dict, auxiliary_outputs = None, None, None
1663
+ if labels is not None:
1664
+ outputs_class, outputs_coord = None, None
1665
+ if self.config.auxiliary_loss:
1666
+ intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1]
1667
+ outputs_class = self.detr.class_labels_classifier(intermediate)
1668
+ outputs_coord = self.detr.bbox_predictor(intermediate).sigmoid()
1669
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
1670
+ logits, labels, device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord
1671
+ )
1672
+
1673
+ if not return_dict:
1674
+ if auxiliary_outputs is not None:
1675
+ output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs
1676
+ else:
1677
+ output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs
1678
+ return ((loss, loss_dict) + output) if loss is not None else output
1679
+
1680
+ return DetrSegmentationOutput(
1681
+ loss=loss,
1682
+ loss_dict=loss_dict,
1683
+ logits=logits,
1684
+ pred_boxes=pred_boxes,
1685
+ pred_masks=pred_masks,
1686
+ auxiliary_outputs=auxiliary_outputs,
1687
+ last_hidden_state=decoder_outputs.last_hidden_state,
1688
+ decoder_hidden_states=decoder_outputs.hidden_states,
1689
+ decoder_attentions=decoder_outputs.attentions,
1690
+ cross_attentions=decoder_outputs.cross_attentions,
1691
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1692
+ encoder_hidden_states=encoder_outputs.hidden_states,
1693
+ encoder_attentions=encoder_outputs.attentions,
1694
+ )
1695
+
1696
+
1697
+ def _expand(tensor, length: int):
1698
+ return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
1699
+
1700
+
1701
+ # taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py
1702
+ class DetrMaskHeadSmallConv(nn.Module):
1703
+ """
1704
+ Simple convolutional head, using group norm. Upsampling is done using a FPN approach
1705
+ """
1706
+
1707
+ def __init__(self, dim, fpn_dims, context_dim):
1708
+ super().__init__()
1709
+
1710
+ if dim % 8 != 0:
1711
+ raise ValueError(
1712
+ "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in"
1713
+ " GroupNorm is set to 8"
1714
+ )
1715
+
1716
+ inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
1717
+
1718
+ self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
1719
+ self.gn1 = nn.GroupNorm(8, dim)
1720
+ self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
1721
+ self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
1722
+ self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
1723
+ self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
1724
+ self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
1725
+ self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
1726
+ self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
1727
+ self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
1728
+ self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
1729
+
1730
+ self.dim = dim
1731
+
1732
+ self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
1733
+ self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
1734
+ self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
1735
+
1736
+ for m in self.modules():
1737
+ if isinstance(m, nn.Conv2d):
1738
+ nn.init.kaiming_uniform_(m.weight, a=1)
1739
+ nn.init.constant_(m.bias, 0)
1740
+
1741
+ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
1742
+ # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with
1743
+ # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32).
1744
+ # We expand the projected feature map to match the number of heads.
1745
+ x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
1746
+
1747
+ x = self.lay1(x)
1748
+ x = self.gn1(x)
1749
+ x = nn.functional.relu(x)
1750
+ x = self.lay2(x)
1751
+ x = self.gn2(x)
1752
+ x = nn.functional.relu(x)
1753
+
1754
+ cur_fpn = self.adapter1(fpns[0])
1755
+ if cur_fpn.size(0) != x.size(0):
1756
+ cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
1757
+ x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
1758
+ x = self.lay3(x)
1759
+ x = self.gn3(x)
1760
+ x = nn.functional.relu(x)
1761
+
1762
+ cur_fpn = self.adapter2(fpns[1])
1763
+ if cur_fpn.size(0) != x.size(0):
1764
+ cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
1765
+ x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
1766
+ x = self.lay4(x)
1767
+ x = self.gn4(x)
1768
+ x = nn.functional.relu(x)
1769
+
1770
+ cur_fpn = self.adapter3(fpns[2])
1771
+ if cur_fpn.size(0) != x.size(0):
1772
+ cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
1773
+ x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
1774
+ x = self.lay5(x)
1775
+ x = self.gn5(x)
1776
+ x = nn.functional.relu(x)
1777
+
1778
+ x = self.out_lay(x)
1779
+ return x
1780
+
1781
+
1782
+ class DetrMHAttentionMap(nn.Module):
1783
+ """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
1784
+
1785
+ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
1786
+ super().__init__()
1787
+ self.num_heads = num_heads
1788
+ self.hidden_dim = hidden_dim
1789
+ self.dropout = nn.Dropout(dropout)
1790
+
1791
+ self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
1792
+ self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
1793
+
1794
+ self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
1795
+
1796
+ def forward(self, q, k, mask: Optional[Tensor] = None):
1797
+ q = self.q_linear(q)
1798
+ k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
1799
+ queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
1800
+ keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
1801
+ weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
1802
+
1803
+ if mask is not None:
1804
+ weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
1805
+ weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())
1806
+ weights = self.dropout(weights)
1807
+ return weights
1808
+
1809
+
1810
+ __all__ = [
1811
+ "DetrForObjectDetection",
1812
+ "DetrForSegmentation",
1813
+ "DetrModel",
1814
+ "DetrPreTrainedModel",
1815
+ ]
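
The segmentation code above relies on `DetrMHAttentionMap` returning one spatial attention map per query and per head, rather than attended values. A minimal shape-check sketch (illustrative only, not part of this diff; it assumes a transformers installation that ships this module and uses random tensors in place of real decoder and encoder outputs):

```python
import torch

from transformers.models.detr.modeling_detr import DetrMHAttentionMap

# d_model=256 and 8 heads mirror the DetrConfig defaults used by DetrForSegmentation above
attention_map = DetrMHAttentionMap(query_dim=256, hidden_dim=256, num_heads=8)

decoder_output = torch.randn(1, 100, 256)     # (batch_size, num_queries, d_model)
encoder_memory = torch.randn(1, 256, 25, 34)  # (batch_size, d_model, height/32, width/32)

# one spatial attention map per query and per head, softmaxed over the spatial dimensions
weights = attention_map(decoder_output, encoder_memory)
print(weights.shape)  # torch.Size([1, 100, 8, 25, 34])
```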
docs/transformers/build/lib/transformers/models/dialogpt/__init__.py ADDED
File without changes
docs/transformers/build/lib/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ import torch
19
+
20
+ from transformers.utils import WEIGHTS_NAME
21
+
22
+
23
+ DIALOGPT_MODELS = ["small", "medium", "large"]
24
+
25
+ OLD_KEY = "lm_head.decoder.weight"
26
+ NEW_KEY = "lm_head.weight"
27
+
28
+
29
+ def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
30
+ d = torch.load(checkpoint_path, weights_only=True)
31
+ d[NEW_KEY] = d.pop(OLD_KEY)
32
+ os.makedirs(pytorch_dump_folder_path, exist_ok=True)
33
+ torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
34
+
35
+
36
+ if __name__ == "__main__":
37
+ parser = argparse.ArgumentParser()
38
+ parser.add_argument("--dialogpt_path", default=".", type=str)
39
+ args = parser.parse_args()
40
+ for MODEL in DIALOGPT_MODELS:
41
+ checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
42
+ pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
43
+ convert_dialogpt_checkpoint(
44
+ checkpoint_path,
45
+ pytorch_dump_folder_path,
46
+ )
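
The conversion script above amounts to a single key rename per checkpoint. A rough sketch of one iteration of its loop (hedged: `small_ft.pkl` is a hypothetical local checkpoint path, and `pytorch_model.bin` is the value of `WEIGHTS_NAME` that the script saves under):

```python
import os

import torch

# Load the fine-tuned DialoGPT state dict and rename the tied LM head key so it
# matches the naming expected by the transformers GPT-2 implementation.
state_dict = torch.load("small_ft.pkl", weights_only=True)  # hypothetical local path
state_dict["lm_head.weight"] = state_dict.pop("lm_head.decoder.weight")

os.makedirs("./DialoGPT-small", exist_ok=True)
torch.save(state_dict, os.path.join("./DialoGPT-small", "pytorch_model.bin"))
```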
docs/transformers/build/lib/transformers/models/diffllama/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_diffllama import *
22
+ from .modeling_diffllama import *
23
+ else:
24
+ import sys
25
+
26
+ _file = globals()["__file__"]
27
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
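
Because the package module is swapped for a `_LazyModule`, submodules are only imported when their attributes are first accessed. A small illustration (assuming a transformers build that includes this model; the printed values follow the defaults documented in the configuration file added below):

```python
# Accessing DiffLlamaConfig triggers the import of configuration_diffllama only;
# the torch-heavy modeling_diffllama module stays untouched until it is needed.
from transformers.models.diffllama import DiffLlamaConfig

config = DiffLlamaConfig()       # all defaults
print(config.hidden_size)        # 2048
print(config.num_hidden_layers)  # 16
```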
docs/transformers/build/lib/transformers/models/diffllama/configuration_diffllama.py ADDED
@@ -0,0 +1,199 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on Llama implementations in this library and Microsoft's
5
+ # Differential Transformer implementations.
6
+
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """DiffLlama model configuration"""
19
+
20
+ from ...configuration_utils import PretrainedConfig
21
+ from ...modeling_rope_utils import rope_config_validation
22
+
23
+
24
+ class DiffLlamaConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate an DiffLlama
27
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults
28
+ will yield a similar configuration to that of the [kajuma/DiffLlama-0.3B-handcut](https://huggingface.co/kajuma/DiffLlama-0.3B-handcut).
29
+
30
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
+ documentation from [`PretrainedConfig`] for more information.
32
+
33
+
34
+ Args:
35
+ vocab_size (`int`, *optional*, defaults to 32000):
36
+ Vocabulary size of the DiffLlama model. Defines the number of different tokens that can be represented by the
37
+ `inputs_ids` passed when calling [`DiffLlamaModel`]
38
+ hidden_size (`int`, *optional*, defaults to 2048):
39
+ Dimension of the hidden representations.
40
+ intermediate_size (`int`, *optional*, defaults to 8192):
41
+ Dimension of the MLP representations.
42
+ num_hidden_layers (`int`, *optional*, defaults to 16):
43
+ Number of hidden layers in the Transformer decoder.
44
+ num_attention_heads (`int`, *optional*, defaults to 32):
45
+ Number of attention heads for each attention layer in the Transformer decoder.
46
+ num_key_value_heads (`int`, *optional*):
47
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
48
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
49
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
50
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
51
+ by meanpooling all the original heads within that group. For more details checkout [this
52
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
53
+ `num_attention_heads`.
54
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
55
+ The non-linear activation function (function or string) in the decoder.
56
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
57
+ The maximum sequence length that this model might ever be used with.
58
+ initializer_range (`float`, *optional*, defaults to 0.02):
59
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
60
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
61
+ The epsilon used by the rms normalization layers.
62
+ use_cache (`bool`, *optional*, defaults to `True`):
63
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
64
+ relevant if `config.is_decoder=True`.
65
+ pad_token_id (`int`, *optional*):
66
+ Padding token id.
67
+ bos_token_id (`int`, *optional*, defaults to 1):
68
+ Beginning of stream token id.
69
+ eos_token_id (`int`, *optional*, defaults to 2):
70
+ End of stream token id.
71
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
72
+ Whether to tie weight embeddings
73
+ rope_theta (`float`, *optional*, defaults to 10000.0):
74
+ The base period of the RoPE embeddings.
75
+ rope_scaling (`Dict`, *optional*):
76
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
77
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
78
+ accordingly.
79
+ Expected contents:
80
+ `rope_type` (`str`):
81
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
82
+ 'diffllama3'], with 'default' being the original RoPE implementation.
83
+ `factor` (`float`, *optional*):
84
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
85
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
86
+ original maximum pre-trained length.
87
+ `original_max_position_embeddings` (`int`, *optional*):
88
+ Used with 'dynamic', 'longrope' and 'diffllama3'. The original max position embeddings used during
89
+ pretraining.
90
+ `attention_factor` (`float`, *optional*):
91
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
92
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
93
+ `factor` field to infer the suggested value.
94
+ `beta_fast` (`float`, *optional*):
95
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
96
+ ramp function. If unspecified, it defaults to 32.
97
+ `beta_slow` (`float`, *optional*):
98
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
99
+ ramp function. If unspecified, it defaults to 1.
100
+ `short_factor` (`List[float]`, *optional*):
101
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
102
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
103
+ size divided by the number of attention heads divided by 2
104
+ `long_factor` (`List[float]`, *optional*):
105
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
106
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
107
+ size divided by the number of attention heads divided by 2
108
+ `low_freq_factor` (`float`, *optional*):
109
+ Only used with 'diffllama3'. Scaling factor applied to low frequency components of the RoPE
110
+ `high_freq_factor` (`float`, *optional*):
111
+ Only used with 'diffllama3'. Scaling factor applied to high frequency components of the RoPE
112
+ attention_bias (`bool`, *optional*, defaults to `False`):
113
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
114
+ attention_dropout (`float`, *optional*, defaults to 0.0):
115
+ The dropout ratio for the attention probabilities.
116
+ lambda_std_dev (`float`, *optional*, defaults to 0.1):
117
+ The standard deviation for initialization of parameter lambda in attention layer.
118
+ head_dim (`int`, *optional*):
119
+ The attention head dimension. If None, it will default to hidden_size // num_heads
120
+
121
+ ```python
122
+ >>> from transformers import DiffLlamaModel, DiffLlamaConfig
123
+
124
+ >>> # Initializing a DiffLlama diffllama-7b style configuration
125
+ >>> configuration = DiffLlamaConfig()
126
+
127
+ >>> # Initializing a model from the diffllama-7b style configuration
128
+ >>> model = DiffLlamaModel(configuration)
129
+
130
+ >>> # Accessing the model configuration
131
+ >>> configuration = model.config
132
+ ```"""
133
+
134
+ model_type = "diffllama"
135
+ keys_to_ignore_at_inference = ["past_key_values"]
136
+
137
+ def __init__(
138
+ self,
139
+ vocab_size=32000,
140
+ hidden_size=2048,
141
+ intermediate_size=8192,
142
+ num_hidden_layers=16,
143
+ num_attention_heads=32,
144
+ num_key_value_heads=None,
145
+ hidden_act="silu",
146
+ max_position_embeddings=2048,
147
+ initializer_range=0.02,
148
+ rms_norm_eps=1e-5,
149
+ use_cache=True,
150
+ pad_token_id=None,
151
+ bos_token_id=1,
152
+ eos_token_id=2,
153
+ tie_word_embeddings=False,
154
+ rope_theta=10000.0,
155
+ rope_scaling=None,
156
+ attention_bias=False,
157
+ attention_dropout=0.0,
158
+ lambda_std_dev=0.1,
159
+ head_dim=None,
160
+ **kwargs,
161
+ ):
162
+ self.vocab_size = vocab_size
163
+ self.max_position_embeddings = max_position_embeddings
164
+ self.hidden_size = hidden_size
165
+ self.intermediate_size = intermediate_size
166
+ self.num_hidden_layers = num_hidden_layers
167
+ self.num_attention_heads = num_attention_heads
168
+
169
+ # for backward compatibility
170
+ if num_key_value_heads is None:
171
+ num_key_value_heads = num_attention_heads
172
+
173
+ self.num_key_value_heads = num_key_value_heads
174
+ self.hidden_act = hidden_act
175
+ self.initializer_range = initializer_range
176
+ self.rms_norm_eps = rms_norm_eps
177
+ self.use_cache = use_cache
178
+ self.rope_theta = rope_theta
179
+ self.rope_scaling = rope_scaling
180
+ self.attention_bias = attention_bias
181
+ self.attention_dropout = attention_dropout
182
+ self.lambda_std_dev = lambda_std_dev
183
+ self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
184
+ # Validate the correctness of rotary position embeddings parameters
185
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
186
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
187
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
188
+ rope_config_validation(self)
189
+
190
+ super().__init__(
191
+ pad_token_id=pad_token_id,
192
+ bos_token_id=bos_token_id,
193
+ eos_token_id=eos_token_id,
194
+ tie_word_embeddings=tie_word_embeddings,
195
+ **kwargs,
196
+ )
197
+
198
+
199
+ __all__ = ["DiffLlamaConfig"]
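
The grouped-query and `head_dim` defaults above can be exercised directly. A minimal sketch, assuming a `transformers` build that ships the DiffLlama model added in this commit:

```python
# Minimal sketch (assumption: DiffLlamaConfig is importable from this transformers build).
from transformers import DiffLlamaConfig

config = DiffLlamaConfig(
    hidden_size=2048,
    num_attention_heads=32,
    num_key_value_heads=8,  # fewer KV heads than attention heads -> grouped-query attention
)

# head_dim falls back to hidden_size // num_attention_heads when left unset
assert config.head_dim == 2048 // 32  # 64
assert config.num_key_value_heads == 8
```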
docs/transformers/build/lib/transformers/models/esm/openfold_utils/rigid_utils.py ADDED
@@ -0,0 +1,1242 @@
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ from functools import lru_cache
19
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
+
25
+ def rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
26
+ """
27
+ Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting.
28
+
29
+ Args:
30
+ a: [*, 3, 3] left multiplicand
31
+ b: [*, 3, 3] right multiplicand
32
+ Returns:
33
+ The product ab
34
+ """
35
+
36
+ def row_mul(i: int) -> torch.Tensor:
37
+ return torch.stack(
38
+ [
39
+ a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0],
40
+ a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1],
41
+ a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2],
42
+ ],
43
+ dim=-1,
44
+ )
45
+
46
+ return torch.stack(
47
+ [
48
+ row_mul(0),
49
+ row_mul(1),
50
+ row_mul(2),
51
+ ],
52
+ dim=-2,
53
+ )
54
+
55
+
56
+ def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
57
+ """
58
+ Applies a rotation to a vector. Written out by hand to avoid AMP downcasting.
59
+
60
+ Args:
61
+ r: [*, 3, 3] rotation matrices
62
+ t: [*, 3] coordinate tensors
63
+ Returns:
64
+ [*, 3] rotated coordinates
65
+ """
66
+ x, y, z = torch.unbind(t, dim=-1)
67
+ return torch.stack(
68
+ [
69
+ r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z,
70
+ r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z,
71
+ r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z,
72
+ ],
73
+ dim=-1,
74
+ )
75
+
76
+
77
+ @lru_cache(maxsize=None)
78
+ def identity_rot_mats(
79
+ batch_dims: Tuple[int, ...],
80
+ dtype: Optional[torch.dtype] = None,
81
+ device: Optional[torch.device] = None,
82
+ requires_grad: bool = True,
83
+ ) -> torch.Tensor:
84
+ rots = torch.eye(3, dtype=dtype, device=device, requires_grad=requires_grad)
85
+ rots = rots.view(*((1,) * len(batch_dims)), 3, 3)
86
+ rots = rots.expand(*batch_dims, -1, -1)
87
+ rots = rots.contiguous()
88
+
89
+ return rots
90
+
91
+
92
+ @lru_cache(maxsize=None)
93
+ def identity_trans(
94
+ batch_dims: Tuple[int, ...],
95
+ dtype: Optional[torch.dtype] = None,
96
+ device: Optional[torch.device] = None,
97
+ requires_grad: bool = True,
98
+ ) -> torch.Tensor:
99
+ trans = torch.zeros((*batch_dims, 3), dtype=dtype, device=device, requires_grad=requires_grad)
100
+ return trans
101
+
102
+
103
+ @lru_cache(maxsize=None)
104
+ def identity_quats(
105
+ batch_dims: Tuple[int, ...],
106
+ dtype: Optional[torch.dtype] = None,
107
+ device: Optional[torch.device] = None,
108
+ requires_grad: bool = True,
109
+ ) -> torch.Tensor:
110
+ quat = torch.zeros((*batch_dims, 4), dtype=dtype, device=device, requires_grad=requires_grad)
111
+
112
+ with torch.no_grad():
113
+ quat[..., 0] = 1
114
+
115
+ return quat
116
+
117
+
118
+ _quat_elements: List[str] = ["a", "b", "c", "d"]
119
+ _qtr_keys: List[str] = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements]
120
+ _qtr_ind_dict: Dict[str, int] = {key: ind for ind, key in enumerate(_qtr_keys)}
121
+
122
+
123
+ def _to_mat(pairs: List[Tuple[str, int]]) -> np.ndarray:
124
+ mat = np.zeros((4, 4))
125
+ for key, value in pairs:
126
+ ind = _qtr_ind_dict[key]
127
+ mat[ind // 4][ind % 4] = value
128
+
129
+ return mat
130
+
131
+
132
+ _QTR_MAT = np.zeros((4, 4, 3, 3))
133
+ _QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)])
134
+ _QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)])
135
+ _QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)])
136
+ _QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)])
137
+ _QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)])
138
+ _QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)])
139
+ _QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)])
140
+ _QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)])
141
+ _QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)])
142
+
143
+
144
+ def quat_to_rot(quat: torch.Tensor) -> torch.Tensor:
145
+ """
146
+ Converts a quaternion to a rotation matrix.
147
+
148
+ Args:
149
+ quat: [*, 4] quaternions
150
+ Returns:
151
+ [*, 3, 3] rotation matrices
152
+ """
153
+ # [*, 4, 4]
154
+ quat = quat[..., None] * quat[..., None, :]
155
+
156
+ # [4, 4, 3, 3]
157
+ mat = _get_quat("_QTR_MAT", dtype=quat.dtype, device=quat.device)
158
+
159
+ # [*, 4, 4, 3, 3]
160
+ shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape)
161
+ quat = quat[..., None, None] * shaped_qtr_mat
162
+
163
+ # [*, 3, 3]
164
+ return torch.sum(quat, dim=(-3, -4))
165
+
166
+
167
+ def rot_to_quat(rot: torch.Tensor) -> torch.Tensor:
168
+ if rot.shape[-2:] != (3, 3):
169
+ raise ValueError("Input rotation is incorrectly shaped")
170
+
171
+ [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = [[rot[..., i, j] for j in range(3)] for i in range(3)]
172
+
173
+ k = [
174
+ [
175
+ xx + yy + zz,
176
+ zy - yz,
177
+ xz - zx,
178
+ yx - xy,
179
+ ],
180
+ [
181
+ zy - yz,
182
+ xx - yy - zz,
183
+ xy + yx,
184
+ xz + zx,
185
+ ],
186
+ [
187
+ xz - zx,
188
+ xy + yx,
189
+ yy - xx - zz,
190
+ yz + zy,
191
+ ],
192
+ [
193
+ yx - xy,
194
+ xz + zx,
195
+ yz + zy,
196
+ zz - xx - yy,
197
+ ],
198
+ ]
199
+
200
+ _, vectors = torch.linalg.eigh((1.0 / 3.0) * torch.stack([torch.stack(t, dim=-1) for t in k], dim=-2))
201
+ return vectors[..., -1]
202
+
203
+
204
+ _QUAT_MULTIPLY = np.zeros((4, 4, 4))
205
+ _QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]]
206
+
207
+ _QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]]
208
+
209
+ _QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]]
210
+
211
+ _QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]]
212
+
213
+ _QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :]
214
+
215
+ _CACHED_QUATS: Dict[str, np.ndarray] = {
216
+ "_QTR_MAT": _QTR_MAT,
217
+ "_QUAT_MULTIPLY": _QUAT_MULTIPLY,
218
+ "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC,
219
+ }
220
+
221
+
222
+ @lru_cache(maxsize=None)
223
+ def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
224
+ return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device)
225
+
226
+
227
+ def quat_multiply(quat1: torch.Tensor, quat2: torch.Tensor) -> torch.Tensor:
228
+ """Multiply a quaternion by another quaternion."""
229
+ mat = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device)
230
+ reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape)
231
+ return torch.sum(reshaped_mat * quat1[..., :, None, None] * quat2[..., None, :, None], dim=(-3, -2))
232
+
233
+
234
+ def quat_multiply_by_vec(quat: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
235
+ """Multiply a quaternion by a pure-vector quaternion."""
236
+ mat = _get_quat("_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device)
237
+ reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape)
238
+ return torch.sum(reshaped_mat * quat[..., :, None, None] * vec[..., None, :, None], dim=(-3, -2))
239
+
240
+
241
+ def invert_rot_mat(rot_mat: torch.Tensor) -> torch.Tensor:
242
+ return rot_mat.transpose(-1, -2)
243
+
244
+
245
+ def invert_quat(quat: torch.Tensor) -> torch.Tensor:
246
+ quat_prime = quat.clone()
247
+ quat_prime[..., 1:] *= -1
248
+ inv = quat_prime / torch.sum(quat**2, dim=-1, keepdim=True)
249
+ return inv
250
+
251
+
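
The cached `_QTR_MAT` table and the eigen-decomposition in `rot_to_quat` give a quaternion/matrix round trip that is exact only up to the sign of the quaternion. A minimal sketch, assuming this file is importable as `transformers.models.esm.openfold_utils.rigid_utils`:

```python
# Minimal sketch (assumption: this module ships at the path below).
import torch
from transformers.models.esm.openfold_utils.rigid_utils import quat_to_rot, rot_to_quat

# Unit quaternion (w, x, y, z) for a 90-degree rotation about the z axis
q = torch.tensor([[0.7071068, 0.0, 0.0, 0.7071068]])

r = quat_to_rot(q)       # [1, 3, 3] rotation matrix
q_back = rot_to_quat(r)  # eigenvector-based reconstruction, sign-ambiguous

# Quaternions are only recovered up to sign
assert torch.allclose(q.abs(), q_back.abs(), atol=1e-4)
```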
252
+ class Rotation:
253
+ """
254
+ A 3D rotation. Depending on how the object is initialized, the rotation is represented by either a rotation matrix
255
+ or a quaternion, though both formats are made available by helper functions. To simplify gradient computation, the
256
+ underlying format of the rotation cannot be changed in-place. Like Rigid, the class is designed to mimic the
257
+ behavior of a torch Tensor, almost as if each Rotation object were a tensor of rotations, in one format or another.
258
+ """
259
+
260
+ def __init__(
261
+ self,
262
+ rot_mats: Optional[torch.Tensor] = None,
263
+ quats: Optional[torch.Tensor] = None,
264
+ normalize_quats: bool = True,
265
+ ):
266
+ """
267
+ Args:
268
+ rot_mats:
269
+ A [*, 3, 3] rotation matrix tensor. Mutually exclusive with quats
270
+ quats:
271
+ A [*, 4] quaternion. Mutually exclusive with rot_mats. If normalize_quats is not True, must be a unit
272
+ quaternion
273
+ normalize_quats:
274
+ If quats is specified, whether to normalize quats
275
+ """
276
+ if (rot_mats is None and quats is None) or (rot_mats is not None and quats is not None):
277
+ raise ValueError("Exactly one input argument must be specified")
278
+
279
+ if (rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or (quats is not None and quats.shape[-1] != 4):
280
+ raise ValueError("Incorrectly shaped rotation matrix or quaternion")
281
+
282
+ # Force full-precision
283
+ if quats is not None:
284
+ quats = quats.to(dtype=torch.float32)
285
+ if rot_mats is not None:
286
+ rot_mats = rot_mats.to(dtype=torch.float32)
287
+
288
+ if quats is not None and normalize_quats:
289
+ quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True)
290
+
291
+ self._rot_mats = rot_mats
292
+ self._quats = quats
293
+
294
+ @staticmethod
295
+ def identity(
296
+ shape,
297
+ dtype: Optional[torch.dtype] = None,
298
+ device: Optional[torch.device] = None,
299
+ requires_grad: bool = True,
300
+ fmt: str = "quat",
301
+ ) -> Rotation:
302
+ """
303
+ Returns an identity Rotation.
304
+
305
+ Args:
306
+ shape:
307
+ The "shape" of the resulting Rotation object. See documentation for the shape property
308
+ dtype:
309
+ The torch dtype for the rotation
310
+ device:
311
+ The torch device for the new rotation
312
+ requires_grad:
313
+ Whether the underlying tensors in the new rotation object should require gradient computation
314
+ fmt:
315
+ One of "quat" or "rot_mat". Determines the underlying format of the new object's rotation
316
+ Returns:
317
+ A new identity rotation
318
+ """
319
+ if fmt == "rot_mat":
320
+ rot_mats = identity_rot_mats(
321
+ shape,
322
+ dtype,
323
+ device,
324
+ requires_grad,
325
+ )
326
+ return Rotation(rot_mats=rot_mats, quats=None)
327
+ elif fmt == "quat":
328
+ quats = identity_quats(shape, dtype, device, requires_grad)
329
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
330
+ else:
331
+ raise ValueError(f"Invalid format: {fmt}")
332
+
333
+ # Magic methods
334
+
335
+ def __getitem__(self, index: Any) -> Rotation:
336
+ """
337
+ Allows torch-style indexing over the virtual shape of the rotation object. See documentation for the shape
338
+ property.
339
+
340
+ Args:
341
+ index:
342
+ A torch index. E.g. (1, 3, 2), or (slice(None,))
343
+ Returns:
344
+ The indexed rotation
345
+ """
346
+ if type(index) is not tuple:
347
+ index = (index,)
348
+
349
+ if self._rot_mats is not None:
350
+ rot_mats = self._rot_mats[index + (slice(None), slice(None))]
351
+ return Rotation(rot_mats=rot_mats)
352
+ elif self._quats is not None:
353
+ quats = self._quats[index + (slice(None),)]
354
+ return Rotation(quats=quats, normalize_quats=False)
355
+ else:
356
+ raise ValueError("Both rotations are None")
357
+
358
+ def __mul__(self, right: torch.Tensor) -> Rotation:
359
+ """
360
+ Pointwise left multiplication of the rotation with a tensor. Can be used to e.g. mask the Rotation.
361
+
362
+ Args:
363
+ right:
364
+ The tensor multiplicand
365
+ Returns:
366
+ The product
367
+ """
368
+ if not (isinstance(right, torch.Tensor)):
369
+ raise TypeError("The other multiplicand must be a Tensor")
370
+
371
+ if self._rot_mats is not None:
372
+ rot_mats = self._rot_mats * right[..., None, None]
373
+ return Rotation(rot_mats=rot_mats, quats=None)
374
+ elif self._quats is not None:
375
+ quats = self._quats * right[..., None]
376
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
377
+ else:
378
+ raise ValueError("Both rotations are None")
379
+
380
+ def __rmul__(self, left: torch.Tensor) -> Rotation:
381
+ """
382
+ Reverse pointwise multiplication of the rotation with a tensor.
383
+
384
+ Args:
385
+ left:
386
+ The left multiplicand
387
+ Returns:
388
+ The product
389
+ """
390
+ return self.__mul__(left)
391
+
392
+ # Properties
393
+
394
+ @property
395
+ def shape(self) -> torch.Size:
396
+ """
397
+ Returns the virtual shape of the rotation object. This shape is defined as the batch dimensions of the
398
+ underlying rotation matrix or quaternion. If the Rotation was initialized with a [10, 3, 3] rotation matrix
399
+ tensor, for example, the resulting shape would be [10].
400
+
401
+ Returns:
402
+ The virtual shape of the rotation object
403
+ """
404
+ if self._rot_mats is not None:
405
+ return self._rot_mats.shape[:-2]
406
+ elif self._quats is not None:
407
+ return self._quats.shape[:-1]
408
+ else:
409
+ raise ValueError("Both rotations are None")
410
+
411
+ @property
412
+ def dtype(self) -> torch.dtype:
413
+ """
414
+ Returns the dtype of the underlying rotation.
415
+
416
+ Returns:
417
+ The dtype of the underlying rotation
418
+ """
419
+ if self._rot_mats is not None:
420
+ return self._rot_mats.dtype
421
+ elif self._quats is not None:
422
+ return self._quats.dtype
423
+ else:
424
+ raise ValueError("Both rotations are None")
425
+
426
+ @property
427
+ def device(self) -> torch.device:
428
+ """
429
+ The device of the underlying rotation
430
+
431
+ Returns:
432
+ The device of the underlying rotation
433
+ """
434
+ if self._rot_mats is not None:
435
+ return self._rot_mats.device
436
+ elif self._quats is not None:
437
+ return self._quats.device
438
+ else:
439
+ raise ValueError("Both rotations are None")
440
+
441
+ @property
442
+ def requires_grad(self) -> bool:
443
+ """
444
+ Returns the requires_grad property of the underlying rotation
445
+
446
+ Returns:
447
+ The requires_grad property of the underlying tensor
448
+ """
449
+ if self._rot_mats is not None:
450
+ return self._rot_mats.requires_grad
451
+ elif self._quats is not None:
452
+ return self._quats.requires_grad
453
+ else:
454
+ raise ValueError("Both rotations are None")
455
+
456
+ def get_rot_mats(self) -> torch.Tensor:
457
+ """
458
+ Returns the underlying rotation as a rotation matrix tensor.
459
+
460
+ Returns:
461
+ The rotation as a rotation matrix tensor
462
+ """
463
+ if self._rot_mats is not None:
464
+ return self._rot_mats
465
+ elif self._quats is not None:
466
+ return quat_to_rot(self._quats)
467
+ else:
468
+ raise ValueError("Both rotations are None")
469
+
470
+ def get_quats(self) -> torch.Tensor:
471
+ """
472
+ Returns the underlying rotation as a quaternion tensor.
473
+
474
+ Depending on whether the Rotation was initialized with a quaternion, this function may call torch.linalg.eigh.
475
+
476
+ Returns:
477
+ The rotation as a quaternion tensor.
478
+ """
479
+ if self._rot_mats is not None:
480
+ return rot_to_quat(self._rot_mats)
481
+ elif self._quats is not None:
482
+ return self._quats
483
+ else:
484
+ raise ValueError("Both rotations are None")
485
+
486
+ def get_cur_rot(self) -> torch.Tensor:
487
+ """
488
+ Return the underlying rotation in its current form
489
+
490
+ Returns:
491
+ The stored rotation
492
+ """
493
+ if self._rot_mats is not None:
494
+ return self._rot_mats
495
+ elif self._quats is not None:
496
+ return self._quats
497
+ else:
498
+ raise ValueError("Both rotations are None")
499
+
500
+ # Rotation functions
501
+
502
+ def compose_q_update_vec(self, q_update_vec: torch.Tensor, normalize_quats: bool = True) -> Rotation:
503
+ """
504
+ Returns a new quaternion Rotation after updating the current object's underlying rotation with a quaternion
505
+ update, formatted as a [*, 3] tensor whose final three columns represent x, y, z such that (1, x, y, z) is the
506
+ desired (not necessarily unit) quaternion update.
507
+
508
+ Args:
509
+ q_update_vec:
510
+ A [*, 3] quaternion update tensor
511
+ normalize_quats:
512
+ Whether to normalize the output quaternion
513
+ Returns:
514
+ An updated Rotation
515
+ """
516
+ quats = self.get_quats()
517
+ new_quats = quats + quat_multiply_by_vec(quats, q_update_vec)
518
+ return Rotation(
519
+ rot_mats=None,
520
+ quats=new_quats,
521
+ normalize_quats=normalize_quats,
522
+ )
523
+
524
+ def compose_r(self, r: Rotation) -> Rotation:
525
+ """
526
+ Compose the rotation matrices of the current Rotation object with those of another.
527
+
528
+ Args:
529
+ r:
530
+ An update rotation object
531
+ Returns:
532
+ An updated rotation object
533
+ """
534
+ r1 = self.get_rot_mats()
535
+ r2 = r.get_rot_mats()
536
+ new_rot_mats = rot_matmul(r1, r2)
537
+ return Rotation(rot_mats=new_rot_mats, quats=None)
538
+
539
+ def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation:
540
+ """
541
+ Compose the quaternions of the current Rotation object with those of another.
542
+
543
+ Depending on whether either Rotation was initialized with quaternions, this function may call
544
+ torch.linalg.eigh.
545
+
546
+ Args:
547
+ r:
548
+ An update rotation object
549
+ Returns:
550
+ An updated rotation object
551
+ """
552
+ q1 = self.get_quats()
553
+ q2 = r.get_quats()
554
+ new_quats = quat_multiply(q1, q2)
555
+ return Rotation(rot_mats=None, quats=new_quats, normalize_quats=normalize_quats)
556
+
557
+ def apply(self, pts: torch.Tensor) -> torch.Tensor:
558
+ """
559
+ Apply the current Rotation as a rotation matrix to a set of 3D coordinates.
560
+
561
+ Args:
562
+ pts:
563
+ A [*, 3] set of points
564
+ Returns:
565
+ [*, 3] rotated points
566
+ """
567
+ rot_mats = self.get_rot_mats()
568
+ return rot_vec_mul(rot_mats, pts)
569
+
570
+ def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
571
+ """
572
+ The inverse of the apply() method.
573
+
574
+ Args:
575
+ pts:
576
+ A [*, 3] set of points
577
+ Returns:
578
+ [*, 3] inverse-rotated points
579
+ """
580
+ rot_mats = self.get_rot_mats()
581
+ inv_rot_mats = invert_rot_mat(rot_mats)
582
+ return rot_vec_mul(inv_rot_mats, pts)
583
+
584
+ def invert(self) -> Rotation:
585
+ """
586
+ Returns the inverse of the current Rotation.
587
+
588
+ Returns:
589
+ The inverse of the current Rotation
590
+ """
591
+ if self._rot_mats is not None:
592
+ return Rotation(rot_mats=invert_rot_mat(self._rot_mats), quats=None)
593
+ elif self._quats is not None:
594
+ return Rotation(
595
+ rot_mats=None,
596
+ quats=invert_quat(self._quats),
597
+ normalize_quats=False,
598
+ )
599
+ else:
600
+ raise ValueError("Both rotations are None")
601
+
602
+ # "Tensor" stuff
603
+
604
+ def unsqueeze(self, dim: int) -> Rotation:
605
+ """
606
+ Analogous to torch.unsqueeze. The dimension is relative to the shape of the Rotation object.
607
+
608
+ Args:
609
+ dim: A positive or negative dimension index.
610
+ Returns:
611
+ The unsqueezed Rotation.
612
+ """
613
+ if dim >= len(self.shape):
614
+ raise ValueError("Invalid dimension")
615
+
616
+ if self._rot_mats is not None:
617
+ rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2)
618
+ return Rotation(rot_mats=rot_mats, quats=None)
619
+ elif self._quats is not None:
620
+ quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1)
621
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
622
+ else:
623
+ raise ValueError("Both rotations are None")
624
+
625
+ @staticmethod
626
+ def cat(rs: Sequence[Rotation], dim: int) -> Rotation:
627
+ """
628
+ Concatenates rotations along one of the batch dimensions. Analogous to torch.cat().
629
+
630
+ Note that the output of this operation is always a rotation matrix, regardless of the format of input
631
+ rotations.
632
+
633
+ Args:
634
+ rs:
635
+ A list of rotation objects
636
+ dim:
637
+ The dimension along which the rotations should be concatenated
638
+ Returns:
639
+ A concatenated Rotation object in rotation matrix format
640
+ """
641
+ rot_mats = torch.cat(
642
+ [r.get_rot_mats() for r in rs],
643
+ dim=dim if dim >= 0 else dim - 2,
644
+ )
645
+
646
+ return Rotation(rot_mats=rot_mats, quats=None)
647
+
648
+ def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rotation:
649
+ """
650
+ Apply a Tensor -> Tensor function to underlying rotation tensors, mapping over the rotation dimension(s). Can
651
+ be used e.g. to sum out a one-hot batch dimension.
652
+
653
+ Args:
654
+ fn:
655
+ A Tensor -> Tensor function to be mapped over the Rotation
656
+ Returns:
657
+ The transformed Rotation object
658
+ """
659
+ if self._rot_mats is not None:
660
+ rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,))
661
+ rot_mats = torch.stack(list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1)
662
+ rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3))
663
+ return Rotation(rot_mats=rot_mats, quats=None)
664
+ elif self._quats is not None:
665
+ quats = torch.stack(list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1)
666
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
667
+ else:
668
+ raise ValueError("Both rotations are None")
669
+
670
+ def cuda(self) -> Rotation:
671
+ """
672
+ Analogous to the cuda() method of torch Tensors
673
+
674
+ Returns:
675
+ A copy of the Rotation in CUDA memory
676
+ """
677
+ if self._rot_mats is not None:
678
+ return Rotation(rot_mats=self._rot_mats.cuda(), quats=None)
679
+ elif self._quats is not None:
680
+ return Rotation(rot_mats=None, quats=self._quats.cuda(), normalize_quats=False)
681
+ else:
682
+ raise ValueError("Both rotations are None")
683
+
684
+ def to(self, device: Optional[torch.device], dtype: Optional[torch.dtype]) -> Rotation:
685
+ """
686
+ Analogous to the to() method of torch Tensors
687
+
688
+ Args:
689
+ device:
690
+ A torch device
691
+ dtype:
692
+ A torch dtype
693
+ Returns:
694
+ A copy of the Rotation using the new device and dtype
695
+ """
696
+ if self._rot_mats is not None:
697
+ return Rotation(
698
+ rot_mats=self._rot_mats.to(device=device, dtype=dtype),
699
+ quats=None,
700
+ )
701
+ elif self._quats is not None:
702
+ return Rotation(
703
+ rot_mats=None,
704
+ quats=self._quats.to(device=device, dtype=dtype),
705
+ normalize_quats=False,
706
+ )
707
+ else:
708
+ raise ValueError("Both rotations are None")
709
+
710
+ def detach(self) -> Rotation:
711
+ """
712
+ Returns a copy of the Rotation whose underlying Tensor has been detached from its torch graph.
713
+
714
+ Returns:
715
+ A copy of the Rotation whose underlying Tensor has been detached from its torch graph
716
+ """
717
+ if self._rot_mats is not None:
718
+ return Rotation(rot_mats=self._rot_mats.detach(), quats=None)
719
+ elif self._quats is not None:
720
+ return Rotation(
721
+ rot_mats=None,
722
+ quats=self._quats.detach(),
723
+ normalize_quats=False,
724
+ )
725
+ else:
726
+ raise ValueError("Both rotations are None")
727
+
728
+
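
A minimal sketch of the `Rotation` API above (identity construction, `apply`, and composition with the inverse), under the same module-path assumption as the previous snippet:

```python
# Minimal sketch (assumption: module path as below).
import torch
from transformers.models.esm.openfold_utils.rigid_utils import Rotation

rot = Rotation.identity(
    (2,), dtype=torch.float32, device=torch.device("cpu"), requires_grad=False, fmt="quat"
)

pts = torch.randn(2, 3)
# The identity rotation leaves points unchanged
assert torch.allclose(rot.apply(pts), pts, atol=1e-6)

mats = rot.get_rot_mats()  # quaternion storage converted on demand
assert mats.shape == (2, 3, 3)

# Composing a rotation with its inverse yields the identity matrix
composed = rot.compose_r(rot.invert())
assert torch.allclose(composed.get_rot_mats(), torch.eye(3).expand(2, 3, 3), atol=1e-6)
```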
729
+ class Rigid:
730
+ """
731
+ A class representing a rigid transformation. Little more than a wrapper around two objects: a Rotation object and a
732
+ [*, 3] translation. Designed to behave approximately like a single torch tensor with the shape of the shared batch
733
+ dimensions of its component parts.
734
+ """
735
+
736
+ def __init__(self, rots: Optional[Rotation], trans: Optional[torch.Tensor]):
737
+ """
738
+ Args:
739
+ rots: A [*, 3, 3] rotation tensor
740
+ trans: A corresponding [*, 3] translation tensor
741
+ """
742
+ # (we need device, dtype, etc. from at least one input)
743
+
744
+ batch_dims, dtype, device, requires_grad = None, None, None, None
745
+ if trans is not None:
746
+ batch_dims = trans.shape[:-1]
747
+ dtype = trans.dtype
748
+ device = trans.device
749
+ requires_grad = trans.requires_grad
750
+ elif rots is not None:
751
+ batch_dims = rots.shape
752
+ dtype = rots.dtype
753
+ device = rots.device
754
+ requires_grad = rots.requires_grad
755
+ else:
756
+ raise ValueError("At least one input argument must be specified")
757
+
758
+ if rots is None:
759
+ rots = Rotation.identity(
760
+ batch_dims,
761
+ dtype,
762
+ device,
763
+ requires_grad,
764
+ )
765
+ elif trans is None:
766
+ trans = identity_trans(
767
+ batch_dims,
768
+ dtype,
769
+ device,
770
+ requires_grad,
771
+ )
772
+
773
+ assert rots is not None
774
+ assert trans is not None
775
+
776
+ if (rots.shape != trans.shape[:-1]) or (rots.device != trans.device):
777
+ raise ValueError("Rots and trans incompatible")
778
+
779
+ # Force full precision. Happens to the rotations automatically.
780
+ trans = trans.to(dtype=torch.float32)
781
+
782
+ self._rots = rots
783
+ self._trans = trans
784
+
785
+ @staticmethod
786
+ def identity(
787
+ shape: Tuple[int, ...],
788
+ dtype: Optional[torch.dtype] = None,
789
+ device: Optional[torch.device] = None,
790
+ requires_grad: bool = True,
791
+ fmt: str = "quat",
792
+ ) -> Rigid:
793
+ """
794
+ Constructs an identity transformation.
795
+
796
+ Args:
797
+ shape:
798
+ The desired shape
799
+ dtype:
800
+ The dtype of both internal tensors
801
+ device:
802
+ The device of both internal tensors
803
+ requires_grad:
804
+ Whether grad should be enabled for the internal tensors
805
+ Returns:
806
+ The identity transformation
807
+ """
808
+ return Rigid(
809
+ Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt),
810
+ identity_trans(shape, dtype, device, requires_grad),
811
+ )
812
+
813
+ def __getitem__(self, index: Any) -> Rigid:
814
+ """
815
+ Indexes the affine transformation with PyTorch-style indices. The index is applied to the shared dimensions of
816
+ both the rotation and the translation.
817
+
818
+ E.g.::
819
+
820
+ r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None)
+ t = Rigid(r, torch.rand(10, 10, 3))
+ indexed = t[3, 4:6]
+ assert(indexed.shape == (2,))
+ assert(indexed.get_rots().shape == (2,))
+ assert(indexed.get_trans().shape == (2, 3))
823
+
824
+ Args:
825
+ index: A standard torch tensor index. E.g. 8, (10, None, 3),
826
+ or (3, slice(0, 1, None))
827
+ Returns:
828
+ The indexed tensor
829
+ """
830
+ if type(index) is not tuple:
831
+ index = (index,)
832
+
833
+ return Rigid(
834
+ self._rots[index],
835
+ self._trans[index + (slice(None),)],
836
+ )
837
+
838
+ def __mul__(self, right: torch.Tensor) -> Rigid:
839
+ """
840
+ Pointwise left multiplication of the transformation with a tensor. Can be used to e.g. mask the Rigid.
841
+
842
+ Args:
843
+ right:
844
+ The tensor multiplicand
845
+ Returns:
846
+ The product
847
+ """
848
+ if not (isinstance(right, torch.Tensor)):
849
+ raise TypeError("The other multiplicand must be a Tensor")
850
+
851
+ new_rots = self._rots * right
852
+ new_trans = self._trans * right[..., None]
853
+
854
+ return Rigid(new_rots, new_trans)
855
+
856
+ def __rmul__(self, left: torch.Tensor) -> Rigid:
857
+ """
858
+ Reverse pointwise multiplication of the transformation with a tensor.
859
+
860
+ Args:
861
+ left:
862
+ The left multiplicand
863
+ Returns:
864
+ The product
865
+ """
866
+ return self.__mul__(left)
867
+
868
+ @property
869
+ def shape(self) -> torch.Size:
870
+ """
871
+ Returns the shape of the shared dimensions of the rotation and the translation.
872
+
873
+ Returns:
874
+ The shape of the transformation
875
+ """
876
+ return self._trans.shape[:-1]
877
+
878
+ @property
879
+ def device(self) -> torch.device:
880
+ """
881
+ Returns the device on which the Rigid's tensors are located.
882
+
883
+ Returns:
884
+ The device on which the Rigid's tensors are located
885
+ """
886
+ return self._trans.device
887
+
888
+ def get_rots(self) -> Rotation:
889
+ """
890
+ Getter for the rotation.
891
+
892
+ Returns:
893
+ The rotation object
894
+ """
895
+ return self._rots
896
+
897
+ def get_trans(self) -> torch.Tensor:
898
+ """
899
+ Getter for the translation.
900
+
901
+ Returns:
902
+ The stored translation
903
+ """
904
+ return self._trans
905
+
906
+ def compose_q_update_vec(self, q_update_vec: torch.Tensor) -> Rigid:
907
+ """
908
+ Composes the transformation with a quaternion update vector of shape [*, 6], where the final 6 columns
909
+ represent the x, y, and z values of a quaternion of form (1, x, y, z) followed by a 3D translation.
910
+
911
+ Args:
912
+ q_update_vec: The quaternion update vector.
913
+ Returns:
914
+ The composed transformation.
915
+ """
916
+ q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:]
917
+ new_rots = self._rots.compose_q_update_vec(q_vec)
918
+
919
+ trans_update = self._rots.apply(t_vec)
920
+ new_translation = self._trans + trans_update
921
+
922
+ return Rigid(new_rots, new_translation)
923
+
924
+ def compose(self, r: Rigid) -> Rigid:
925
+ """
926
+ Composes the current rigid object with another.
927
+
928
+ Args:
929
+ r:
930
+ Another Rigid object
931
+ Returns:
932
+ The composition of the two transformations
933
+ """
934
+ new_rot = self._rots.compose_r(r._rots)
935
+ new_trans = self._rots.apply(r._trans) + self._trans
936
+ return Rigid(new_rot, new_trans)
937
+
938
+ def apply(self, pts: torch.Tensor) -> torch.Tensor:
939
+ """
940
+ Applies the transformation to a coordinate tensor.
941
+
942
+ Args:
943
+ pts: A [*, 3] coordinate tensor.
944
+ Returns:
945
+ The transformed points.
946
+ """
947
+ rotated = self._rots.apply(pts)
948
+ return rotated + self._trans
949
+
950
+ def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
951
+ """
952
+ Applies the inverse of the transformation to a coordinate tensor.
953
+
954
+ Args:
955
+ pts: A [*, 3] coordinate tensor
956
+ Returns:
957
+ The transformed points.
958
+ """
959
+ pts = pts - self._trans
960
+ return self._rots.invert_apply(pts)
961
+
962
+ def invert(self) -> Rigid:
963
+ """
964
+ Inverts the transformation.
965
+
966
+ Returns:
967
+ The inverse transformation.
968
+ """
969
+ rot_inv = self._rots.invert()
970
+ trn_inv = rot_inv.apply(self._trans)
971
+
972
+ return Rigid(rot_inv, -1 * trn_inv)
973
+
974
+ def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
975
+ """
976
+ Apply a Tensor -> Tensor function to underlying translation and rotation tensors, mapping over the
977
+ translation/rotation dimensions respectively.
978
+
979
+ Args:
980
+ fn:
981
+ A Tensor -> Tensor function to be mapped over the Rigid
982
+ Returns:
983
+ The transformed Rigid object
984
+ """
985
+ new_rots = self._rots.map_tensor_fn(fn)
986
+ new_trans = torch.stack(list(map(fn, torch.unbind(self._trans, dim=-1))), dim=-1)
987
+
988
+ return Rigid(new_rots, new_trans)
989
+
990
+ def to_tensor_4x4(self) -> torch.Tensor:
991
+ """
992
+ Converts a transformation to a homogeneous transformation tensor.
993
+
994
+ Returns:
995
+ A [*, 4, 4] homogeneous transformation tensor
996
+ """
997
+ tensor = self._trans.new_zeros((*self.shape, 4, 4))
998
+ tensor[..., :3, :3] = self._rots.get_rot_mats()
999
+ tensor[..., :3, 3] = self._trans
1000
+ tensor[..., 3, 3] = 1
1001
+ return tensor
1002
+
1003
+ @staticmethod
1004
+ def from_tensor_4x4(t: torch.Tensor) -> Rigid:
1005
+ """
1006
+ Constructs a transformation from a homogeneous transformation tensor.
1007
+
1008
+ Args:
1009
+ t: [*, 4, 4] homogeneous transformation tensor
1010
+ Returns:
1011
+ T object with shape [*]
1012
+ """
1013
+ if t.shape[-2:] != (4, 4):
1014
+ raise ValueError("Incorrectly shaped input tensor")
1015
+
1016
+ rots = Rotation(rot_mats=t[..., :3, :3], quats=None)
1017
+ trans = t[..., :3, 3]
1018
+
1019
+ return Rigid(rots, trans)
1020
+
1021
+ def to_tensor_7(self) -> torch.Tensor:
1022
+ """
1023
+ Converts a transformation to a tensor with 7 final columns, four for the quaternion followed by three for the
1024
+ translation.
1025
+
1026
+ Returns:
1027
+ A [*, 7] tensor representation of the transformation
1028
+ """
1029
+ tensor = self._trans.new_zeros((*self.shape, 7))
1030
+ tensor[..., :4] = self._rots.get_quats()
1031
+ tensor[..., 4:] = self._trans
1032
+
1033
+ return tensor
1034
+
1035
+ @staticmethod
1036
+ def from_tensor_7(t: torch.Tensor, normalize_quats: bool = False) -> Rigid:
1037
+ if t.shape[-1] != 7:
1038
+ raise ValueError("Incorrectly shaped input tensor")
1039
+
1040
+ quats, trans = t[..., :4], t[..., 4:]
1041
+
1042
+ rots = Rotation(rot_mats=None, quats=quats, normalize_quats=normalize_quats)
1043
+
1044
+ return Rigid(rots, trans)
1045
+
1046
+ @staticmethod
1047
+ def from_3_points(
1048
+ p_neg_x_axis: torch.Tensor, origin: torch.Tensor, p_xy_plane: torch.Tensor, eps: float = 1e-8
1049
+ ) -> Rigid:
1050
+ """
1051
+ Implements algorithm 21. Constructs transformations from sets of 3 points using the Gram-Schmidt algorithm.
1052
+
1053
+ Args:
1054
+ p_neg_x_axis: [*, 3] coordinates
1055
+ origin: [*, 3] coordinates used as frame origins
1056
+ p_xy_plane: [*, 3] coordinates
1057
+ eps: Small epsilon value
1058
+ Returns:
1059
+ A transformation object of shape [*]
1060
+ """
1061
+ p_neg_x_axis_unbound = torch.unbind(p_neg_x_axis, dim=-1)
1062
+ origin_unbound = torch.unbind(origin, dim=-1)
1063
+ p_xy_plane_unbound = torch.unbind(p_xy_plane, dim=-1)
1064
+
1065
+ e0 = [c1 - c2 for c1, c2 in zip(origin_unbound, p_neg_x_axis_unbound)]
1066
+ e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane_unbound, origin_unbound)]
1067
+
1068
+ denom = torch.sqrt(sum(c * c for c in e0) + eps * torch.ones_like(e0[0]))
1069
+ e0 = [c / denom for c in e0]
1070
+ dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
1071
+ e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
1072
+ denom = torch.sqrt(sum((c * c for c in e1)) + eps * torch.ones_like(e1[0]))
1073
+ e1 = [c / denom for c in e1]
1074
+ e2 = [
1075
+ e0[1] * e1[2] - e0[2] * e1[1],
1076
+ e0[2] * e1[0] - e0[0] * e1[2],
1077
+ e0[0] * e1[1] - e0[1] * e1[0],
1078
+ ]
1079
+
1080
+ rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1)
1081
+ rots = rots.reshape(rots.shape[:-1] + (3, 3))
1082
+
1083
+ rot_obj = Rotation(rot_mats=rots, quats=None)
1084
+
1085
+ return Rigid(rot_obj, torch.stack(origin_unbound, dim=-1))
1086
+
1087
+ def unsqueeze(self, dim: int) -> Rigid:
1088
+ """
1089
+ Analogous to torch.unsqueeze. The dimension is relative to the shared dimensions of the rotation/translation.
1090
+
1091
+ Args:
1092
+ dim: A positive or negative dimension index.
1093
+ Returns:
1094
+ The unsqueezed transformation.
1095
+ """
1096
+ if dim >= len(self.shape):
1097
+ raise ValueError("Invalid dimension")
1098
+ rots = self._rots.unsqueeze(dim)
1099
+ trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1)
1100
+
1101
+ return Rigid(rots, trans)
1102
+
1103
+ @staticmethod
1104
+ def cat(ts: Sequence[Rigid], dim: int) -> Rigid:
1105
+ """
1106
+ Concatenates transformations along one of the batch dimensions.
1107
+
1108
+ Args:
1109
+ ts:
1110
+ A list of T objects
1111
+ dim:
1112
+ The dimension along which the transformations should be concatenated
1113
+ Returns:
1114
+ A concatenated transformation object
1115
+ """
1116
+ rots = Rotation.cat([t._rots for t in ts], dim)
1117
+ trans = torch.cat([t._trans for t in ts], dim=dim if dim >= 0 else dim - 1)
1118
+
1119
+ return Rigid(rots, trans)
1120
+
1121
+ def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid:
1122
+ """
1123
+ Applies a Rotation -> Rotation function to the stored rotation object.
1124
+
1125
+ Args:
1126
+ fn: A function of type Rotation -> Rotation
1127
+ Returns:
1128
+ A transformation object with a transformed rotation.
1129
+ """
1130
+ return Rigid(fn(self._rots), self._trans)
1131
+
1132
+ def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
1133
+ """
1134
+ Applies a Tensor -> Tensor function to the stored translation.
1135
+
1136
+ Args:
1137
+ fn:
1138
+ A function of type Tensor -> Tensor to be applied to the translation
1139
+ Returns:
1140
+ A transformation object with a transformed translation.
1141
+ """
1142
+ return Rigid(self._rots, fn(self._trans))
1143
+
1144
+ def scale_translation(self, trans_scale_factor: float) -> Rigid:
1145
+ """
1146
+ Scales the translation by a constant factor.
1147
+
1148
+ Args:
1149
+ trans_scale_factor:
1150
+ The constant factor
1151
+ Returns:
1152
+ A transformation object with a scaled translation.
1153
+ """
1154
+ return self.apply_trans_fn(lambda t: t * trans_scale_factor)
1155
+
1156
+ def stop_rot_gradient(self) -> Rigid:
1157
+ """
1158
+ Detaches the underlying rotation object
1159
+
1160
+ Returns:
1161
+ A transformation object with detached rotations
1162
+ """
1163
+ return self.apply_rot_fn(lambda r: r.detach())
1164
+
1165
+ @staticmethod
1166
+ def make_transform_from_reference(
1167
+ n_xyz: torch.Tensor, ca_xyz: torch.Tensor, c_xyz: torch.Tensor, eps: float = 1e-20
1168
+ ) -> Rigid:
1169
+ """
1170
+ Returns a transformation object from reference coordinates.
1171
+
1172
+ Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard
1173
+ way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You
1174
+ need to take care of such cases in your code.
1175
+
1176
+ Args:
1177
+ n_xyz: A [*, 3] tensor of nitrogen xyz coordinates.
1178
+ ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates.
1179
+ c_xyz: A [*, 3] tensor of carbon xyz coordinates.
1180
+ Returns:
1181
+ A transformation object. After applying the translation and rotation to the reference backbone, the
1182
+ coordinates will approximately equal the input coordinates.
1183
+ """
1184
+ translation = -1 * ca_xyz
1185
+ n_xyz = n_xyz + translation
1186
+ c_xyz = c_xyz + translation
1187
+
1188
+ c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]
1189
+ norm = torch.sqrt(eps + c_x**2 + c_y**2)
1190
+ sin_c1 = -c_y / norm
1191
+ cos_c1 = c_x / norm
1192
+
1193
+ c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3))
1194
+ c1_rots[..., 0, 0] = cos_c1
1195
+ c1_rots[..., 0, 1] = -1 * sin_c1
1196
+ c1_rots[..., 1, 0] = sin_c1
1197
+ c1_rots[..., 1, 1] = cos_c1
1198
+ c1_rots[..., 2, 2] = 1
1199
+
1200
+ norm = torch.sqrt(eps + c_x**2 + c_y**2 + c_z**2)
1201
+ sin_c2 = c_z / norm
1202
+ cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm
1203
+
1204
+ c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
1205
+ c2_rots[..., 0, 0] = cos_c2
1206
+ c2_rots[..., 0, 2] = sin_c2
1207
+ c2_rots[..., 1, 1] = 1
1208
+ c2_rots[..., 2, 0] = -1 * sin_c2
1209
+ c2_rots[..., 2, 2] = cos_c2
1210
+
1211
+ c_rots = rot_matmul(c2_rots, c1_rots)
1212
+ n_xyz = rot_vec_mul(c_rots, n_xyz)
1213
+
1214
+ _, n_y, n_z = [n_xyz[..., i] for i in range(3)]
1215
+ norm = torch.sqrt(eps + n_y**2 + n_z**2)
1216
+ sin_n = -n_z / norm
1217
+ cos_n = n_y / norm
1218
+
1219
+ n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
1220
+ n_rots[..., 0, 0] = 1
1221
+ n_rots[..., 1, 1] = cos_n
1222
+ n_rots[..., 1, 2] = -1 * sin_n
1223
+ n_rots[..., 2, 1] = sin_n
1224
+ n_rots[..., 2, 2] = cos_n
1225
+
1226
+ rots = rot_matmul(n_rots, c_rots)
1227
+
1228
+ rots = rots.transpose(-1, -2)
1229
+ translation = -1 * translation
1230
+
1231
+ rot_obj = Rotation(rot_mats=rots, quats=None)
1232
+
1233
+ return Rigid(rot_obj, translation)
1234
+
1235
+ def cuda(self) -> Rigid:
1236
+ """
1237
+ Moves the transformation object to GPU memory
1238
+
1239
+ Returns:
1240
+ A version of the transformation on GPU
1241
+ """
1242
+ return Rigid(self._rots.cuda(), self._trans.cuda())
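
A minimal sketch of the `Rigid` wrapper above, checking that `apply`/`invert_apply` cancel and that the 4x4 homogeneous round trip preserves the translation; the module path is again an assumption:

```python
# Minimal sketch (assumption: module path as below).
import torch
from transformers.models.esm.openfold_utils.rigid_utils import Rigid

rigid = Rigid.identity((4,), dtype=torch.float32, device=torch.device("cpu"), requires_grad=False)

pts = torch.randn(4, 3)
# apply() followed by invert_apply() recovers the original points
recovered = rigid.invert_apply(rigid.apply(pts))
assert torch.allclose(recovered, pts, atol=1e-5)

# Round trip through the [*, 4, 4] homogeneous representation keeps the translation
t4 = rigid.to_tensor_4x4()
rigid2 = Rigid.from_tensor_4x4(t4)
assert torch.allclose(rigid2.get_trans(), rigid.get_trans())
```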
docs/transformers/build/lib/transformers/models/falcon/configuration_falcon.py ADDED
@@ -0,0 +1,211 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Falcon configuration"""
16
+
17
+ from ...configuration_utils import PretrainedConfig
18
+ from ...utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class FalconConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
27
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
28
+ defaults will yield a similar configuration to that of the
29
+ [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+
35
+ Args:
36
+ vocab_size (`int`, *optional*, defaults to 65024):
37
+ Vocabulary size of the Falcon model. Defines the number of different tokens that can be represented by the
38
+ `inputs_ids` passed when calling [`FalconModel`]
39
+ hidden_size (`int`, *optional*, defaults to 4544):
40
+ Dimension of the hidden representations.
41
+ num_hidden_layers (`int`, *optional*, defaults to 32):
42
+ Number of hidden layers in the Transformer decoder.
43
+ num_attention_heads (`int`, *optional*, defaults to 71):
44
+ Number of attention heads for each attention layer in the Transformer encoder.
45
+ num_ln_in_parallel_attn (`int`, *optional*):
46
+ Set to 2 if separate layer norms are to be used for the MLP and the attention output when using parallel
47
+ attention, otherwise, 1.
48
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
49
+ The epsilon used by the layer normalization layers.
50
+ initializer_range (`float`, *optional*, defaults to 0.02):
51
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
52
+ use_cache (`bool`, *optional*, defaults to `True`):
53
+ Whether the model should return the last key/values attentions (not used by all models). Only relevant if
54
+ `config.is_decoder=True`.
55
+ hidden_dropout (`float`, *optional*, defaults to 0.0):
56
+ The dropout probability for MLP layers.
57
+ attention_dropout (`float`, *optional*, defaults to 0.0):
58
+ The dropout probability for attention layers.
59
+ num_kv_heads (`int`, *optional*):
60
+ Number of key-value heads to use per attention layer. If unset, defaults to the same value as
61
+ `num_attention_heads`.
62
+ alibi (`bool`, *optional*, defaults to `False`):
63
+ Whether to use ALiBi positional biases during self-attention.
64
+ new_decoder_architecture (`bool`, *optional*, defaults to `False`):
65
+ Whether to use the new (Falcon-40B) decoder architecture. If `True`, the `multi_query` and `parallel_attn`
66
+ arguments are ignored, as the new decoder always uses parallel attention.
67
+ multi_query (`bool`, *optional*, defaults to `True`):
68
+ Whether to use multi-query attention in the decoder. Ignored when `new_decoder_architecture` is `True`.
69
+ parallel_attn (`bool`, *optional*, defaults to `True`):
70
+ Whether to compute attention in parallel with the feedforward layer. If False, they are consecutive
71
+ instead, as in the original Transformer architecture. Ignored when `new_decoder_architecture` is `True`.
72
+ bias (`bool`, *optional*, defaults to `False`):
73
+ Whether to use bias on Linear layers.
74
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
75
+ The maximum sequence length that this model might ever be used with, when `alibi` is `False`. Pretrained
76
+ Falcon models with RoPE support up to 2048 tokens.
77
+ rope_theta (`float`, *optional*, defaults to 10000.0):
78
+ The base period of the RoPE embeddings.
79
+ rope_scaling (`Dict`, *optional*):
80
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
81
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
82
+ accordingly.
83
+ Expected contents:
84
+ `rope_type` (`str`):
85
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
86
+ 'llama3'], with 'default' being the original RoPE implementation.
87
+ `factor` (`float`, *optional*):
88
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
89
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
90
+ original maximum pre-trained length.
91
+ `original_max_position_embeddings` (`int`, *optional*):
92
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
93
+ pretraining.
94
+ `attention_factor` (`float`, *optional*):
95
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
96
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
97
+ `factor` field to infer the suggested value.
98
+ `beta_fast` (`float`, *optional*):
99
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
100
+ ramp function. If unspecified, it defaults to 32.
101
+ `beta_slow` (`float`, *optional*):
102
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
103
+ ramp function. If unspecified, it defaults to 1.
104
+ `short_factor` (`List[float]`, *optional*):
105
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
106
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
107
+ size divided by the number of attention heads divided by 2.
108
+ `long_factor` (`List[float]`, *optional*):
109
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
110
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
111
+ size divided by the number of attention heads divided by 2.
112
+ `low_freq_factor` (`float`, *optional*):
113
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
114
+ `high_freq_factor` (`float`, *optional*):
115
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
116
+ bos_token_id (`int`, *optional*, defaults to 11):
117
+ The id of the "beginning-of-sequence" token.
118
+ eos_token_id (`int`, *optional*, defaults to 11):
119
+ The id of the "end-of-sequence" token.
120
+ ffn_hidden_size (`int`, *optional*):
121
+ The hidden size of the feedforward layer in the Transformer decoder. If unset, it defaults to
122
+ 4 times `hidden_size`.
123
+ activation (`str`, *optional*, defaults to `"gelu"`):
124
+ The activation function used in the feedforward layer.
125
+
126
+ Example:
127
+
128
+ ```python
129
+ >>> from transformers import FalconModel, FalconConfig
130
+
131
+ >>> # Initializing a small (2-layer) Falcon configuration
132
+ >>> configuration = FalconConfig(num_hidden_layers=2)
133
+
134
+ >>> # Initializing a model from the small configuration
135
+ >>> model = FalconModel(configuration)
136
+
137
+ >>> # Accessing the model configuration
138
+ >>> configuration = model.config
139
+ ```"""
140
+
141
+ model_type = "falcon"
142
+ keys_to_ignore_at_inference = ["past_key_values"]
143
+
144
+ def __init__(
145
+ self,
146
+ vocab_size=65024,
147
+ hidden_size=4544,
148
+ num_hidden_layers=32,
149
+ num_attention_heads=71,
150
+ num_ln_in_parallel_attn=None,
151
+ layer_norm_epsilon=1e-5,
152
+ initializer_range=0.02,
153
+ use_cache=True,
154
+ hidden_dropout=0.0,
155
+ attention_dropout=0.0,
156
+ num_kv_heads=None,
157
+ alibi=False,
158
+ new_decoder_architecture=False,
159
+ multi_query=True,
160
+ parallel_attn=True,
161
+ bias=False,
162
+ max_position_embeddings=2048,
163
+ rope_theta=10000.0,
164
+ rope_scaling=None,
165
+ bos_token_id=11,
166
+ eos_token_id=11,
167
+ ffn_hidden_size=None,
168
+ activation="gelu",
169
+ **kwargs,
170
+ ):
171
+ self.vocab_size = vocab_size
172
+ # Backward compatibility with n_embed kwarg
173
+ n_embed = kwargs.pop("n_embed", None)
174
+ self.hidden_size = hidden_size if n_embed is None else n_embed
175
+ self.num_hidden_layers = num_hidden_layers
176
+ self.num_attention_heads = num_attention_heads
177
+ self.layer_norm_epsilon = layer_norm_epsilon
178
+ self.initializer_range = initializer_range
179
+ self.use_cache = use_cache
180
+ self.hidden_dropout = hidden_dropout
181
+ self.attention_dropout = attention_dropout
182
+ self.bos_token_id = bos_token_id
183
+ self.eos_token_id = eos_token_id
184
+ self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads
185
+ self.alibi = alibi
186
+ self.new_decoder_architecture = new_decoder_architecture
187
+ self.multi_query = multi_query # Ignored when new_decoder_architecture is True
188
+ self.parallel_attn = parallel_attn
189
+ self.bias = bias
190
+ self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
191
+ self.max_position_embeddings = max_position_embeddings
192
+ self.rope_theta = rope_theta
193
+ self.rope_scaling = rope_scaling
194
+ self.activation = activation
195
+ if ffn_hidden_size is None:
196
+ self.ffn_hidden_size = hidden_size * 4
197
+ else:
198
+ self.ffn_hidden_size = ffn_hidden_size
199
+
200
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
201
+
202
+ @property
203
+ def head_dim(self):
204
+ return self.hidden_size // self.num_attention_heads
205
+
206
+ @property
207
+ def rotary(self):
208
+ return not self.alibi
209
+
210
+
211
+ __all__ = ["FalconConfig"]
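
To complement the docstring example above, the following snippet is a minimal illustrative sketch (not part of the upstream file; all values are arbitrary) that builds a small rotary-embedding configuration with linear RoPE scaling and inspects the two convenience properties defined at the end of the class.

```python
from transformers import FalconConfig

# A small, illustrative configuration: 2 layers, rotary embeddings (alibi=False),
# multi-query attention, and linear RoPE scaling with a factor of 2.
config = FalconConfig(
    num_hidden_layers=2,
    hidden_size=256,
    num_attention_heads=4,
    alibi=False,
    multi_query=True,
    max_position_embeddings=2048,
    rope_scaling={"rope_type": "linear", "factor": 2.0},
)

print(config.head_dim)  # 256 // 4 == 64
print(config.rotary)    # True, because alibi is False
```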
docs/transformers/build/lib/transformers/models/falcon/convert_custom_code_checkpoint.py ADDED
@@ -0,0 +1,74 @@
1
+ import json
2
+ from argparse import ArgumentParser
3
+ from pathlib import Path
4
+
5
+
6
+ """
7
+ This script converts Falcon custom code checkpoints to modern Falcon checkpoints that use code in the Transformers
8
+ library. After conversion, performance (especially for generation) should improve and the checkpoint can be loaded
9
+ without needing trust_remote_code=True.
10
+ """
11
+
12
+ if __name__ == "__main__":
13
+ parser = ArgumentParser()
14
+ parser.add_argument(
15
+ "--checkpoint_dir",
16
+ type=Path,
17
+ required=True,
18
+ help="Directory containing a custom code checkpoint to convert to a modern Falcon checkpoint.",
19
+ )
20
+ args = parser.parse_args()
21
+
22
+ if not args.checkpoint_dir.is_dir():
23
+ raise ValueError("--checkpoint_dir argument should be a directory!")
24
+
25
+ if (
26
+ not (args.checkpoint_dir / "configuration_RW.py").is_file()
27
+ or not (args.checkpoint_dir / "modelling_RW.py").is_file()
28
+ ):
29
+ raise ValueError(
30
+ "The model directory should contain configuration_RW.py and modelling_RW.py files! Are you sure this is a custom code checkpoint?"
31
+ )
32
+ (args.checkpoint_dir / "configuration_RW.py").unlink()
33
+ (args.checkpoint_dir / "modelling_RW.py").unlink()
34
+
35
+ config = args.checkpoint_dir / "config.json"
36
+ text = config.read_text()
37
+ text = text.replace("RWForCausalLM", "FalconForCausalLM")
38
+ text = text.replace("RefinedWebModel", "falcon")
39
+ text = text.replace("RefinedWeb", "falcon")
40
+ json_config = json.loads(text)
41
+ del json_config["auto_map"]
42
+
43
+ if "n_head" in json_config:
44
+ json_config["num_attention_heads"] = json_config.pop("n_head")
45
+ if "n_layer" in json_config:
46
+ json_config["num_hidden_layers"] = json_config.pop("n_layer")
47
+ if "n_head_kv" in json_config:
48
+ json_config["num_kv_heads"] = json_config.pop("n_head_kv")
49
+ json_config["new_decoder_architecture"] = True
50
+ else:
51
+ json_config["new_decoder_architecture"] = False
52
+ bos_token_id = json_config.get("bos_token_id", 1)
53
+ eos_token_id = json_config.get("eos_token_id", 2)
54
+ config.unlink()
55
+ config.write_text(json.dumps(json_config, indent=2, sort_keys=True))
56
+
57
+ tokenizer_config = args.checkpoint_dir / "tokenizer_config.json"
58
+ if tokenizer_config.is_file():
59
+ text = tokenizer_config.read_text()
60
+ json_config = json.loads(text)
61
+ if json_config["tokenizer_class"] == "PreTrainedTokenizerFast":
62
+ json_config["model_input_names"] = ["input_ids", "attention_mask"]
63
+ tokenizer_config.unlink()
64
+ tokenizer_config.write_text(json.dumps(json_config, indent=2, sort_keys=True))
65
+
66
+ generation_config_path = args.checkpoint_dir / "generation_config.json"
67
+ generation_dict = {
68
+ "_from_model_config": True,
69
+ "bos_token_id": bos_token_id,
70
+ "eos_token_id": eos_token_id,
71
+ "transformers_version": "4.33.0.dev0",
72
+ }
73
+ generation_config_path.write_text(json.dumps(generation_dict, indent=2, sort_keys=True))
74
+ print("Done! Please double-check that the new checkpoint works as expected.")
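
After the script above has rewritten the checkpoint in place, the directory can be loaded with the stock Falcon classes. The snippet below is a hedged usage sketch; the local path is a placeholder for whatever directory was passed to `--checkpoint_dir`.

```python
from transformers import AutoTokenizer, FalconForCausalLM

checkpoint_dir = "path/to/converted_checkpoint"  # placeholder: the directory given to --checkpoint_dir

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = FalconForCausalLM.from_pretrained(checkpoint_dir)  # no trust_remote_code=True needed anymore

inputs = tokenizer("Hello, Falcon!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```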
docs/transformers/build/lib/transformers/models/falcon/modeling_falcon.py ADDED
@@ -0,0 +1,1566 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch Falcon model."""
16
+
17
+ import math
18
+ from typing import TYPE_CHECKING, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.utils.checkpoint
22
+ from torch import nn
23
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
24
+ from torch.nn import functional as F
25
+
26
+ from ...activations import get_activation
27
+ from ...cache_utils import Cache, DynamicCache, StaticCache
28
+ from ...generation import GenerationMixin
29
+ from ...modeling_attn_mask_utils import (
30
+ AttentionMaskConverter,
31
+ )
32
+ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
33
+ from ...modeling_outputs import (
34
+ BaseModelOutputWithPastAndCrossAttentions,
35
+ CausalLMOutputWithCrossAttentions,
36
+ QuestionAnsweringModelOutput,
37
+ SequenceClassifierOutputWithPast,
38
+ TokenClassifierOutput,
39
+ )
40
+ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
41
+ from ...modeling_utils import PreTrainedModel
42
+ from ...utils import (
43
+ add_code_sample_docstrings,
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ logging,
47
+ )
48
+ from .configuration_falcon import FalconConfig
49
+
50
+
51
+ if TYPE_CHECKING:
52
+ from ...configuration_utils import PretrainedConfig
53
+
54
+ if is_flash_attn_available():
55
+ from ...modeling_flash_attention_utils import _flash_attention_forward
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+
60
+ _CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b"
61
+ _CONFIG_FOR_DOC = "FalconConfig"
62
+
63
+
64
+ # NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training; this means that there's one additional quantization to bfloat16 between the operations.
65
+ # In order not to degrade the quality of our HF-port, we keep these characteristics in the final model.
66
+ class FalconLinear(nn.Linear):
67
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
68
+ hidden_states = input @ self.weight.T
69
+ if self.bias is None:
70
+ return hidden_states
71
+ return hidden_states + self.bias
72
+
73
+
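
As a small illustration of the note above, the sketch below (illustrative only; it assumes this module is importable as `transformers.models.falcon.modeling_falcon`) compares the unfused matmul-plus-bias of `FalconLinear` against `torch.nn.functional.linear` on random data.

```python
import torch
from torch.nn import functional as F
from transformers.models.falcon.modeling_falcon import FalconLinear  # assumed import path

torch.manual_seed(0)
layer = FalconLinear(8, 4, bias=True)  # arbitrary toy shapes
x = torch.randn(2, 8)

# In float32 the two computations agree to within rounding error; the extra
# bfloat16 rounding mentioned in the note only matters in reduced precision.
diff = (layer(x) - F.linear(x, layer.weight, layer.bias)).abs().max()
print(f"max abs difference: {diff:.3e}")
```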
74
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
75
+ def rotate_half(x):
76
+ """Rotates half the hidden dims of the input."""
77
+ x1 = x[..., : x.shape[-1] // 2]
78
+ x2 = x[..., x.shape[-1] // 2 :]
79
+ return torch.cat((-x2, x1), dim=-1)
80
+
81
+
82
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
83
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
84
+ """Applies Rotary Position Embedding to the query and key tensors.
85
+
86
+ Args:
87
+ q (`torch.Tensor`): The query tensor.
88
+ k (`torch.Tensor`): The key tensor.
89
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
90
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
91
+ position_ids (`torch.Tensor`, *optional*):
92
+ Deprecated and unused.
93
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
94
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
95
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
96
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
97
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
98
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
99
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
100
+ Returns:
101
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
102
+ """
103
+ cos = cos.unsqueeze(unsqueeze_dim)
104
+ sin = sin.unsqueeze(unsqueeze_dim)
105
+ q_embed = (q * cos) + (rotate_half(q) * sin)
106
+ k_embed = (k * cos) + (rotate_half(k) * sin)
107
+ return q_embed, k_embed
108
+
109
+
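
To make the broadcasting contract in the `apply_rotary_pos_emb` docstring concrete, here is a self-contained toy sketch (arbitrary shapes; it assumes the function is importable as shown) that builds `cos`/`sin` from an explicit inverse-frequency schedule and rotates random query/key tensors laid out as `[batch, heads, seq, head_dim]`.

```python
import torch
from transformers.models.falcon.modeling_falcon import apply_rotary_pos_emb  # assumed import path

batch, heads, seq, head_dim = 1, 2, 5, 8
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)

# cos/sin normally come from the rotary embedding module with shape [batch, seq, head_dim];
# here we fake them with an explicit inverse-frequency schedule (theta = 10000.0).
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
positions = torch.arange(seq).float()
freqs = torch.outer(positions, inv_freq)             # [seq, head_dim // 2]
emb = torch.cat((freqs, freqs), dim=-1)[None, :, :]  # [1, seq, head_dim]
cos, sin = emb.cos(), emb.sin()

# unsqueeze_dim=1 (the default) broadcasts cos/sin over the heads dimension.
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 2, 5, 8]) twice
```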
110
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Falcon
111
+ class FalconRotaryEmbedding(nn.Module):
112
+ def __init__(self, config: FalconConfig, device=None):
113
+ super().__init__()
114
+ # BC: "rope_type" was originally "type"
115
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
116
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
117
+ else:
118
+ self.rope_type = "default"
119
+ self.max_seq_len_cached = config.max_position_embeddings
120
+ self.original_max_seq_len = config.max_position_embeddings
121
+
122
+ self.config = config
123
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
124
+
125
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
126
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
127
+ self.original_inv_freq = self.inv_freq
128
+
129
+ @torch.no_grad()
130
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
131
+ def forward(self, x, position_ids):
132
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
133
+ position_ids_expanded = position_ids[:, None, :].float()
134
+
135
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
136
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
137
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
138
+ emb = torch.cat((freqs, freqs), dim=-1)
139
+ cos = emb.cos() * self.attention_scaling
140
+ sin = emb.sin() * self.attention_scaling
141
+
142
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
143
+
144
+
145
+ def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
146
+ batch_size, seq_length = attention_mask.shape
147
+ closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
148
+ base = torch.tensor(
149
+ 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
150
+ )
151
+ powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
152
+ slopes = torch.pow(base, powers)
153
+
154
+ if closest_power_of_2 != num_heads:
155
+ extra_base = torch.tensor(
156
+ 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
157
+ )
158
+ num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
159
+ extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32)
160
+ slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
161
+
162
+ # Note: alibi will be added to the attention bias that will be applied to the query, key product of attention
163
+ # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
164
+ # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
165
+ # => the query_length dimension will then be broadcasted correctly
166
+ # This is more or less identical to T5's relative position bias:
167
+ # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
168
+ arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
169
+ alibi = slopes[..., None].bfloat16() * arange_tensor
170
+ return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
171
+
172
+
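
As a quick sanity check of the shape contract spelled out in the comments of `build_alibi_tensor` (output `[batch_size * num_heads, 1, seq_length]`), the toy sketch below exercises the function directly; the import path is assumed from the module location.

```python
import torch
from transformers.models.falcon.modeling_falcon import build_alibi_tensor  # assumed import path

batch_size, seq_length, num_heads = 2, 6, 4
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)
attention_mask[1, :2] = 0  # simulate left padding on the second sequence

alibi = build_alibi_tensor(attention_mask, num_heads, dtype=torch.float32)
print(alibi.shape)  # torch.Size([8, 1, 6]) == [batch_size * num_heads, 1, seq_length]
```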
173
+ # Copied from transformers.models.bloom.modeling_bloom.dropout_add
174
+ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
175
+ """
176
+ Dropout add function
177
+
178
+ Args:
179
+ x (`torch.tensor`):
180
+ input tensor
181
+ residual (`torch.tensor`):
182
+ residual tensor
183
+ prob (`float`):
184
+ dropout probability
185
+ training (`bool`):
186
+ training mode
187
+ """
188
+ out = F.dropout(x, p=prob, training=training)
189
+ out = residual + out
190
+ return out
191
+
192
+
193
+ class FalconAttention(nn.Module):
194
+ def __init__(self, config: FalconConfig, layer_idx=None):
195
+ super().__init__()
196
+
197
+ self.config = config
198
+ self.hidden_size = config.hidden_size
199
+ self.num_heads = config.num_attention_heads
200
+ self.head_dim = self.hidden_size // self.num_heads
201
+ self.split_size = self.hidden_size
202
+ self.hidden_dropout = config.hidden_dropout
203
+ self.max_position_embeddings = config.max_position_embeddings
204
+ self.rope_theta = config.rope_theta
205
+ self.is_causal = True
206
+ self._use_sdpa = config._attn_implementation == "sdpa"
207
+ self.layer_idx = layer_idx
208
+ if layer_idx is None:
209
+ logger.warning_once(
210
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
211
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
212
+ "when creating this class."
213
+ )
214
+
215
+ if self.head_dim * self.num_heads != self.hidden_size:
216
+ raise ValueError(
217
+ f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
218
+ f" {self.num_heads})."
219
+ )
220
+
221
+ # Layer-wise attention scaling
222
+ self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
223
+ self.beta = self.inv_norm_factor
224
+ if config.new_decoder_architecture:
225
+ qkv_out_dim = (config.num_kv_heads * 2 + config.num_attention_heads) * self.head_dim
226
+ elif config.multi_query:
227
+ qkv_out_dim = self.hidden_size + 2 * self.head_dim
228
+ else:
229
+ qkv_out_dim = 3 * self.hidden_size
230
+ self.query_key_value = FalconLinear(self.hidden_size, qkv_out_dim, bias=config.bias)
231
+ self.new_decoder_architecture = config.new_decoder_architecture
232
+ self.multi_query = config.multi_query
233
+ self.dense = FalconLinear(self.hidden_size, self.hidden_size, bias=config.bias)
234
+ self.attention_dropout = nn.Dropout(config.attention_dropout)
235
+ self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1
236
+
237
+ # TODO (raushan): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
238
+ if config.rotary:
239
+ self.rotary_emb = FalconRotaryEmbedding(config=self.config)
240
+
241
+ def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
242
+ """
243
+ Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`
244
+
245
+ Args:
246
+ fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
247
+
248
+ Returns:
249
+ query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
250
+ value: [batch_size, seq_length, num_heads, head_dim]
251
+ """
252
+ if self.new_decoder_architecture:
253
+ batch, seq_len, _ = fused_qkv.shape
254
+ qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv_heads + 2, self.head_dim)
255
+ query = qkv[:, :, :, :-2]
256
+ key = qkv[:, :, :, [-2]]
257
+ value = qkv[:, :, :, [-1]]
258
+ key = torch.broadcast_to(key, query.shape)
259
+ value = torch.broadcast_to(value, query.shape)
260
+
261
+ query, key, value = [x.flatten(2, 3) for x in (query, key, value)]
262
+ return query, key, value
263
+ elif not self.multi_query:
264
+ batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
265
+ fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
266
+ return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]
267
+ else:
268
+ batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
269
+ fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim)
270
+ return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :]
271
+
272
+ # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads
273
+ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
274
+ """
275
+ Merge heads together over the last dimension
276
+
277
+ Args:
278
+ x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]
279
+
280
+ Returns:
281
+ torch.tensor: [batch_size, seq_length, num_heads * head_dim]
282
+ """
283
+ # What we want to achieve is:
284
+ # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
285
+ batch_size_and_num_heads, seq_length, _ = x.shape
286
+ batch_size = batch_size_and_num_heads // self.num_heads
287
+
288
+ # First view to decompose the batch size
289
+ # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
290
+ x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)
291
+
292
+ # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
293
+ x = x.permute(0, 2, 1, 3)
294
+
295
+ # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
296
+ return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)
297
+
298
+ def forward(
299
+ self,
300
+ hidden_states: torch.Tensor,
301
+ alibi: Optional[torch.Tensor],
302
+ attention_mask: torch.Tensor,
303
+ position_ids: Optional[torch.LongTensor] = None,
304
+ layer_past: Optional[Cache] = None,
305
+ head_mask: Optional[torch.Tensor] = None,
306
+ use_cache: bool = False,
307
+ output_attentions: bool = False,
308
+ cache_position: Optional[torch.LongTensor] = None,
309
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
310
+ ):
311
+ fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
312
+ num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads
313
+ # 3 x [batch_size, seq_length, num_heads, head_dim]
314
+ (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
315
+
316
+ batch_size, query_length, _, _ = query_layer.shape
317
+
318
+ query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim)
319
+ key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
320
+ value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
321
+
322
+ if alibi is None:
323
+ cos, sin = position_embeddings
324
+ query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin)
325
+
326
+ if layer_past is not None:
327
+ cache_kwargs = {"cache_position": cache_position}
328
+ if alibi is None:
329
+ cache_kwargs.update({"sin": sin, "cos": cos})
330
+ key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs)
331
+
332
+ kv_length = key_layer.shape[-2]
333
+ if self._use_sdpa and query_layer.device.type == "cuda" and attention_mask is not None:
334
+ # For torch<=2.1.2, SDPA with memory-efficient backend is bugged with non-contiguous inputs with custom attn_mask,
335
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
336
+ query_layer = query_layer.contiguous()
337
+ key_layer = key_layer.contiguous()
338
+ value_layer = value_layer.contiguous()
339
+
340
+ if attention_mask is not None:
341
+ attention_mask = attention_mask[:, :, :, : key_layer.shape[-2]]
342
+
343
+ if alibi is None:
344
+ if self._use_sdpa and not output_attentions:
345
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
346
+ # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
347
+ # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not
348
+ # create a causal mask in case query_length == 1.
349
+ is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False
350
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
351
+ query_layer,
352
+ key_layer,
353
+ value_layer,
354
+ attn_mask=attention_mask,
355
+ dropout_p=0.0,
356
+ is_causal=is_causal,
357
+ )
358
+ attention_scores = None
359
+ else:
360
+ attention_scores = query_layer @ key_layer.transpose(-1, -2)
361
+ attention_scores /= math.sqrt(self.head_dim)
362
+
363
+ attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype)
364
+ # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi).
365
+ attn_output = attention_scores @ value_layer
366
+
367
+ attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim)
368
+ attn_output = attn_output.permute(0, 2, 1, 3)
369
+ attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
370
+
371
+ attn_output = self.dense(attn_output)
372
+
373
+ if output_attentions:
374
+ return attn_output, layer_past, attention_scores
375
+ else:
376
+ return attn_output, layer_past
377
+
378
+ else:
379
+ if self._use_sdpa and not output_attentions and head_mask is None:
380
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
381
+ # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
382
+ is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False
383
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
384
+ query_layer,
385
+ key_layer,
386
+ value_layer,
387
+ attn_mask=attention_mask,
388
+ dropout_p=self.attention_dropout.p if self.training else 0.0,
389
+ is_causal=is_causal,
390
+ )
391
+ attn_output = attn_output.transpose(1, 2)
392
+ attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
393
+
394
+ attn_output = self.dense(attn_output)
395
+ else:
396
+ matmul_result = query_layer @ key_layer.transpose(-1, -2)
397
+
398
+ # change view to [batch_size, num_heads, q_length, kv_length]
399
+ attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length)
400
+
401
+ # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
402
+ input_dtype = attention_scores.dtype
403
+ # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
404
+ if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
405
+ attention_scores = attention_scores.to(torch.float32)
406
+
407
+ attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)
408
+ attention_logits *= self.inv_norm_factor
409
+ attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype)
410
+ # [batch_size, num_heads, q_length, kv_length]
411
+ attention_probs = self.attention_dropout(attention_probs)
412
+
413
+ if head_mask is not None:
414
+ attention_probs = attention_probs * head_mask
415
+
416
+ # change view [batch_size, num_heads, q_length, kv_length]
417
+ attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length)
418
+
419
+ # matmul: [batch_size * num_heads, q_length, head_dim]
420
+ attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1)
421
+
422
+ # change view [batch_size, q_length, num_heads * head_dim]
423
+ attn_output = self._merge_heads(attn_output)
424
+
425
+ attn_output = self.dense(attn_output)
426
+
427
+ if output_attentions:
428
+ return attn_output, layer_past, attention_probs
429
+ else:
430
+ return attn_output, layer_past
431
+
432
+
433
+ class FalconFlashAttention2(FalconAttention):
434
+ """
435
+ Falcon flash attention module. This module inherits from `FalconAttention`, as the weights of the module stay
436
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
437
+ flash attention and deal with padding tokens if the input contains any.
438
+ """
439
+
440
+ def __init__(self, *args, **kwargs):
441
+ super().__init__(*args, **kwargs)
442
+
443
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
444
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
445
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
446
+ self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
447
+
448
+ def forward(
449
+ self,
450
+ hidden_states: torch.Tensor,
451
+ alibi: Optional[torch.Tensor],
452
+ attention_mask: torch.Tensor,
453
+ position_ids: Optional[torch.LongTensor] = None,
454
+ layer_past: Optional[Cache] = None,
455
+ head_mask: Optional[torch.Tensor] = None,
456
+ use_cache: bool = False,
457
+ output_attentions: bool = False,
458
+ cache_position: Optional[torch.LongTensor] = None,
459
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
460
+ ):
461
+ fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
462
+ num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads
463
+ # 3 x [batch_size, seq_length, num_heads, head_dim]
464
+ (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
465
+
466
+ batch_size, query_length, _, _ = query_layer.shape
467
+
468
+ query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim)
469
+ key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
470
+ value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
471
+
472
+ if alibi is None:
473
+ cos, sin = position_embeddings
474
+ query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin)
475
+
476
+ if layer_past is not None:
477
+ cache_kwargs = {"cache_position": cache_position}
478
+ if alibi is None:
479
+ cache_kwargs.update({"sin": sin, "cos": cos})
480
+ key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs)
481
+
482
+ # TODO: These transposes are quite inefficient, but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
483
+ # to be able to avoid many of these transpose/reshape/view.
484
+ query_layer = query_layer.transpose(1, 2)
485
+ key_layer = key_layer.transpose(1, 2)
486
+ value_layer = value_layer.transpose(1, 2)
487
+
488
+ if alibi is not None:
489
+ raise ValueError("`alibi` is not supported when `use_flash_attn` is True")
490
+
491
+ attn_dropout = self.config.attention_dropout if self.training else 0.0
492
+
493
+ # In PEFT, we usually cast the layer norms to float32 for training stability reasons;
494
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
495
+ # cast them back to float16 just to be sure everything works as expected.
496
+ input_dtype = query_layer.dtype
497
+ if input_dtype == torch.float32:
498
+ if torch.is_autocast_enabled():
499
+ target_dtype = torch.get_autocast_gpu_dtype()
500
+ # Handle the case where the model is quantized
501
+ elif hasattr(self.config, "_pre_quantization_dtype"):
502
+ target_dtype = self.config._pre_quantization_dtype
503
+ else:
504
+ target_dtype = self.query_key_value.weight.dtype
505
+
506
+ logger.warning_once(
507
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
508
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
509
+ f" {target_dtype}."
510
+ )
511
+
512
+ query_layer = query_layer.to(target_dtype)
513
+ key_layer = key_layer.to(target_dtype)
514
+ value_layer = value_layer.to(target_dtype)
515
+
516
+ attn_output = _flash_attention_forward(
517
+ query_layer,
518
+ key_layer,
519
+ value_layer,
520
+ attention_mask,
521
+ query_length,
522
+ position_ids=position_ids,
523
+ dropout=attn_dropout,
524
+ is_causal=self.is_causal,
525
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
526
+ )
527
+
528
+ attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
529
+ attn_output = self.dense(attn_weights)
530
+
531
+ if not output_attentions:
532
+ attn_weights = None
533
+
534
+ return attn_output, layer_past, attn_weights
535
+
536
+
537
+ class FalconMLP(nn.Module):
538
+ def __init__(self, config: FalconConfig):
539
+ super().__init__()
540
+ hidden_size = config.hidden_size
541
+
542
+ self.dense_h_to_4h = FalconLinear(hidden_size, config.ffn_hidden_size, bias=config.bias)
543
+ self.act = get_activation(config.activation)
544
+ self.dense_4h_to_h = FalconLinear(config.ffn_hidden_size, hidden_size, bias=config.bias)
545
+ self.hidden_dropout = config.hidden_dropout
546
+
547
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
548
+ x = self.act(self.dense_h_to_4h(x))
549
+ x = self.dense_4h_to_h(x)
550
+ return x
551
+
552
+
553
+ FALCON_ATTENTION_CLASSES = {
554
+ "eager": FalconAttention,
555
+ "sdpa": FalconAttention, # FalconAttention originally implemented both a forward with & without SDPA
556
+ "flash_attention_2": FalconFlashAttention2,
557
+ }
558
+
559
+
560
+ class FalconDecoderLayer(nn.Module):
561
+ def __init__(self, config: FalconConfig, layer_idx=None):
562
+ super().__init__()
563
+ hidden_size = config.hidden_size
564
+ self.num_heads = config.num_attention_heads
565
+
566
+ self.self_attention = FALCON_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
567
+ self.mlp = FalconMLP(config)
568
+ self.hidden_dropout = config.hidden_dropout
569
+ self.config = config
570
+
571
+ if config.num_ln_in_parallel_attn is None and config.new_decoder_architecture:
572
+ config.num_ln_in_parallel_attn = 2
573
+
574
+ if not config.parallel_attn:
575
+ self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
576
+ self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
577
+ else:
578
+ if config.num_ln_in_parallel_attn == 2:
579
+ # The layer norm before self-attention
580
+ self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
581
+ # The layer norm before the MLP
582
+ self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
583
+ else:
584
+ self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
585
+
586
+ def forward(
587
+ self,
588
+ hidden_states: torch.Tensor,
589
+ alibi: Optional[torch.Tensor],
590
+ attention_mask: torch.Tensor,
591
+ position_ids: Optional[torch.LongTensor] = None,
592
+ layer_past: Optional[Union[Cache, Tuple[torch.Tensor, torch.Tensor]]] = None,
593
+ head_mask: Optional[torch.Tensor] = None,
594
+ use_cache: bool = False,
595
+ output_attentions: bool = False,
596
+ cache_position: Optional[torch.LongTensor] = None,
597
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
598
+ **kwargs,
599
+ ):
600
+ residual = hidden_states
601
+
602
+ if self.config.new_decoder_architecture and self.config.num_ln_in_parallel_attn == 2:
603
+ attention_layernorm_out = self.ln_attn(hidden_states)
604
+ mlp_layernorm_out = self.ln_mlp(hidden_states)
605
+ else:
606
+ attention_layernorm_out = self.input_layernorm(hidden_states)
607
+
608
+ # Self attention.
609
+ attn_outputs = self.self_attention(
610
+ attention_layernorm_out,
611
+ layer_past=layer_past,
612
+ attention_mask=attention_mask,
613
+ position_ids=position_ids,
614
+ alibi=alibi,
615
+ head_mask=head_mask,
616
+ use_cache=use_cache,
617
+ output_attentions=output_attentions,
618
+ cache_position=cache_position,
619
+ position_embeddings=position_embeddings,
620
+ )
621
+
622
+ attention_output = attn_outputs[0]
623
+
624
+ if not self.config.new_decoder_architecture:
625
+ if self.config.parallel_attn:
626
+ mlp_layernorm_out = attention_layernorm_out
627
+ else:
628
+ residual = dropout_add(
629
+ attention_output, residual, self.config.attention_dropout, training=self.training
630
+ )
631
+ mlp_layernorm_out = self.post_attention_layernorm(residual)
632
+
633
+ if (
634
+ self.config.new_decoder_architecture
635
+ and self.config.parallel_attn
636
+ and self.config.num_ln_in_parallel_attn == 1
637
+ ):
638
+ mlp_layernorm_out = attention_layernorm_out
639
+
640
+ outputs = attn_outputs[1:]
641
+
642
+ # MLP.
643
+ mlp_output = self.mlp(mlp_layernorm_out)
644
+
645
+ if self.config.new_decoder_architecture or self.config.parallel_attn:
646
+ mlp_output += attention_output
647
+
648
+ output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training)
649
+
650
+ if use_cache:
651
+ outputs = (output,) + outputs
652
+ else:
653
+ outputs = (output,) + outputs[1:]
654
+
655
+ return outputs # hidden_states, past_kv, attentions
656
+
657
+
658
+ FALCON_START_DOCSTRING = r"""
659
+
660
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
661
+ library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)
662
+
663
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
664
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
665
+ and behavior.
666
+
667
+ Parameters:
668
+ config ([`FalconConfig`]): Model configuration class with all the parameters of the model.
669
+ Initializing with a config file does not load the weights associated with the model, only the
670
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
671
+ """
672
+
673
+ FALCON_INPUTS_DOCSTRING = r"""
674
+ Args:
675
+ input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
676
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
677
+ (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
678
+
679
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
680
+ `input_ids`.
681
+
682
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
683
+ [`PreTrainedTokenizer.__call__`] for details.
684
+
685
+ [What are input IDs?](../glossary#input-ids)
686
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
687
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
688
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
689
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
690
+
691
+ Two formats are allowed:
692
+ - a [`~cache_utils.Cache`] instance, see our
693
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
694
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
695
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
696
+ cache format.
697
+
698
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
699
+ legacy cache format will be returned.
700
+
701
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
702
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
703
+ of shape `(batch_size, sequence_length)`.
704
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
705
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
706
+
707
+ - 1 for tokens that are **not masked**,
708
+ - 0 for tokens that are **masked**.
709
+
710
+ [What are attention masks?](../glossary#attention-mask)
711
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
712
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
713
+ config.n_positions - 1]`.
714
+
715
+ [What are position IDs?](../glossary#position-ids)
716
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
717
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
718
+
719
+ - 1 indicates the head is **not masked**,
720
+ - 0 indicates the head is **masked**.
721
+
722
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
723
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
724
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
725
+ model's internal embedding lookup matrix.
726
+
727
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
728
+ `past_key_values`).
729
+ use_cache (`bool`, *optional*):
730
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
731
+ `past_key_values`).
732
+ output_attentions (`bool`, *optional*):
733
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
734
+ tensors for more detail.
735
+ output_hidden_states (`bool`, *optional*):
736
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
737
+ more detail.
738
+ return_dict (`bool`, *optional*):
739
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
740
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
741
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
742
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
743
+ the complete sequence length.
744
+ """
745
+
746
+
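
The `past_key_values` documentation above lists the two accepted cache formats. The following hedged sketch (a tiny randomly initialized model rather than a pretrained checkpoint, purely for illustration) uses the recommended `Cache` path with `DynamicCache` and feeds only the newly produced token on the second step.

```python
import torch
from transformers import DynamicCache, FalconConfig, FalconForCausalLM

config = FalconConfig(num_hidden_layers=2, hidden_size=64, num_attention_heads=4, vocab_size=128)
model = FalconForCausalLM(config).eval()  # random weights, for illustration only

cache = DynamicCache()
input_ids = torch.randint(0, config.vocab_size, (1, 5))
with torch.no_grad():
    out = model(input_ids, past_key_values=cache, use_cache=True)
    # On the next step, pass only the new token; the cache carries the previous ones.
    next_token = out.logits[:, -1:].argmax(-1)
    out = model(next_token, past_key_values=out.past_key_values, use_cache=True)

print(out.past_key_values.get_seq_length())  # 6 tokens cached
```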
747
+ class FalconPreTrainedModel(PreTrainedModel):
748
+ """
749
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
750
+ models.
751
+ """
752
+
753
+ config_class = FalconConfig
754
+ base_model_prefix = "transformer"
755
+ supports_gradient_checkpointing = True
756
+ _no_split_modules = ["FalconDecoderLayer"]
757
+ _supports_flash_attn_2 = True
758
+ _supports_sdpa = True
759
+ _supports_cache_class = True
760
+ _supports_quantized_cache = True
761
+ _supports_static_cache = True
762
+
763
+ def __init__(self, *inputs, **kwargs):
764
+ super().__init__(*inputs, **kwargs)
765
+
766
+ def _init_weights(self, module: nn.Module):
767
+ """Initialize the weights."""
768
+ if isinstance(module, nn.Linear) or isinstance(module, FalconLinear):
769
+ # Slightly different from the TF version which uses truncated_normal for initialization
770
+ # cf https://github.com/pytorch/pytorch/pull/5617
771
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
772
+ if module.bias is not None:
773
+ module.bias.data.zero_()
774
+ elif isinstance(module, nn.Embedding):
775
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
776
+ if module.padding_idx is not None:
777
+ module.weight.data[module.padding_idx].zero_()
778
+ elif isinstance(module, LayerNorm):
779
+ module.bias.data.zero_()
780
+ module.weight.data.fill_(1.0)
781
+
782
+ # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
783
+ @classmethod
784
+ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> "PretrainedConfig":
785
+ _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
786
+ if _is_bettertransformer:
787
+ return config
788
+
789
+ if not hard_check_only:
790
+ config._attn_implementation = "sdpa"
791
+ return config
792
+
793
+
794
+ @add_start_docstrings(
795
+ "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.",
796
+ FALCON_START_DOCSTRING,
797
+ )
798
+ class FalconModel(FalconPreTrainedModel):
799
+ def __init__(self, config: FalconConfig):
800
+ super().__init__(config)
801
+
802
+ self.embed_dim = config.hidden_size
803
+ self.num_heads = config.num_attention_heads
804
+ self.use_alibi = config.alibi
805
+
806
+ # Embedding + LN Embedding
807
+ self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
808
+
809
+ # Transformer blocks
810
+ self.h = nn.ModuleList([FalconDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
811
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
812
+ self._use_sdpa = config._attn_implementation == "sdpa"
813
+
814
+ # Final Layer Norm
815
+ self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
816
+
817
+ self.rotary_emb = FalconRotaryEmbedding(config=config)
818
+
819
+ self.gradient_checkpointing = False
820
+
821
+ # Initialize weights and apply final processing
822
+ self.post_init()
823
+
824
+ def get_input_embeddings(self):
825
+ return self.word_embeddings
826
+
827
+ def set_input_embeddings(self, new_embeddings: torch.Tensor):
828
+ self.word_embeddings = new_embeddings
829
+
830
+ @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING)
831
+ @add_code_sample_docstrings(
832
+ checkpoint=_CHECKPOINT_FOR_DOC,
833
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
834
+ config_class=_CONFIG_FOR_DOC,
835
+ )
836
+ def forward(
837
+ self,
838
+ input_ids: Optional[torch.LongTensor] = None,
839
+ past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None,
840
+ attention_mask: Optional[torch.Tensor] = None,
841
+ position_ids: Optional[torch.LongTensor] = None,
842
+ head_mask: Optional[torch.LongTensor] = None,
843
+ inputs_embeds: Optional[torch.LongTensor] = None,
844
+ use_cache: Optional[bool] = None,
845
+ output_attentions: Optional[bool] = None,
846
+ output_hidden_states: Optional[bool] = None,
847
+ return_dict: Optional[bool] = None,
848
+ cache_position: Optional[torch.LongTensor] = None,
849
+ ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
850
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
851
+ output_hidden_states = (
852
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
853
+ )
854
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
855
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
856
+
857
+ if (input_ids is None) ^ (inputs_embeds is not None):
858
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
859
+
860
+ if self.gradient_checkpointing and self.training:
861
+ if use_cache:
862
+ logger.warning_once(
863
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
864
+ )
865
+ use_cache = False
866
+
867
+ if inputs_embeds is None:
868
+ inputs_embeds = self.word_embeddings(input_ids)
869
+
870
+ # kept for BC (non `Cache` `past_key_values` inputs)
871
+ return_legacy_cache = False
872
+ if use_cache and not isinstance(past_key_values, Cache):
873
+ return_legacy_cache = True
874
+ if past_key_values is None:
875
+ past_key_values = DynamicCache()
876
+ else:
877
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
878
+ logger.warning_once(
879
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
880
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
881
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
882
+ )
883
+
884
+ # Compute alibi tensor: check build_alibi_tensor documentation
885
+ alibi = None
886
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
887
+ batch_size, seq_length, _ = inputs_embeds.shape
888
+ if self.use_alibi:
889
+ mask = (
890
+ torch.ones(
891
+ (batch_size, seq_length + past_key_values_length), device=inputs_embeds.device, dtype=torch.long
892
+ )
893
+ if attention_mask is None
894
+ else attention_mask
895
+ )
896
+ alibi = build_alibi_tensor(mask, self.num_heads, dtype=inputs_embeds.dtype)
897
+
898
+ if cache_position is None:
899
+ cache_position = torch.arange(
900
+ past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
901
+ )
902
+
903
+ if position_ids is None:
904
+ position_ids = cache_position.unsqueeze(0)
905
+
906
+ causal_mask = self._update_causal_mask(
907
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions, head_mask, alibi
908
+ )
909
+
910
+ # Prepare head mask if needed
911
+ # 1.0 in head_mask indicate we keep the head
912
+ # attention_probs has shape batch_size x num_heads x N x N
913
+ # head_mask has shape n_layer x batch x num_heads x N x N
914
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
915
+ hidden_states = inputs_embeds
916
+
917
+ # create position embeddings to be shared across the decoder layers
918
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
919
+
920
+ next_decoder_cache = None
921
+ all_self_attentions = () if output_attentions else None
922
+ all_hidden_states = () if output_hidden_states else None
923
+
924
+ for i, block in enumerate(self.h):
925
+ if output_hidden_states:
926
+ all_hidden_states = all_hidden_states + (hidden_states,)
927
+
928
+ if self.gradient_checkpointing and self.training:
929
+ outputs = self._gradient_checkpointing_func(
930
+ block.__call__,
931
+ hidden_states,
932
+ alibi,
933
+ causal_mask,
934
+ position_ids,
935
+ head_mask[i],
936
+ past_key_values,
937
+ use_cache,
938
+ output_attentions,
939
+ cache_position,
940
+ position_embeddings,
941
+ )
942
+ else:
943
+ outputs = block(
944
+ hidden_states,
945
+ layer_past=past_key_values,
946
+ attention_mask=causal_mask,
947
+ position_ids=position_ids,
948
+ head_mask=head_mask[i],
949
+ use_cache=use_cache,
950
+ output_attentions=output_attentions,
951
+ alibi=alibi,
952
+ cache_position=cache_position,
953
+ position_embeddings=position_embeddings,
954
+ )
955
+
956
+ hidden_states = outputs[0]
957
+ if use_cache is True:
958
+ next_decoder_cache = outputs[1]
959
+
960
+ if output_attentions:
961
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
962
+
963
+ # Add last hidden state
964
+ hidden_states = self.ln_f(hidden_states)
965
+
966
+ if output_hidden_states:
967
+ all_hidden_states = all_hidden_states + (hidden_states,)
968
+
969
+ next_cache = next_decoder_cache if use_cache else None
970
+ if return_legacy_cache:
971
+ next_cache = next_cache.to_legacy_cache()
972
+
973
+ if not return_dict:
974
+ return tuple(
975
+ v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
976
+ )
977
+
978
+ return BaseModelOutputWithPastAndCrossAttentions(
979
+ last_hidden_state=hidden_states,
980
+ past_key_values=next_cache,
981
+ hidden_states=all_hidden_states,
982
+ attentions=all_self_attentions,
983
+ )
984
+
985
+ def _update_causal_mask(
986
+ self,
987
+ attention_mask: torch.Tensor,
988
+ input_tensor: torch.Tensor,
989
+ cache_position: torch.Tensor,
990
+ past_key_values: Cache,
991
+ output_attentions: bool,
992
+ head_mask: torch.Tensor,
993
+ alibi: torch.Tensor,
994
+ ):
995
+ # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
996
+ # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
997
+ # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
998
+ # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
999
+
1000
+ if self.config._attn_implementation == "flash_attention_2":
1001
+ if attention_mask is not None and 0.0 in attention_mask:
1002
+ return attention_mask
1003
+ return None
1004
+
1005
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1006
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1007
+ # to infer the attention mask.
1008
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1009
+ using_static_cache = isinstance(past_key_values, StaticCache)
1010
+
1011
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1012
+ if (
1013
+ self.config._attn_implementation == "sdpa"
1014
+ and not using_static_cache
1015
+ and not output_attentions
1016
+ and head_mask is None
1017
+ and alibi is None
1018
+ ):
1019
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1020
+ attention_mask,
1021
+ inputs_embeds=input_tensor,
1022
+ past_key_values_length=past_seen_tokens,
1023
+ is_training=self.training,
1024
+ ):
1025
+ return None
1026
+
1027
+ dtype, device = input_tensor.dtype, input_tensor.device
1028
+ min_dtype = torch.finfo(dtype).min
1029
+ batch_size, sequence_length, _ = input_tensor.shape
1030
+ if using_static_cache:
1031
+ target_length = past_key_values.get_max_cache_shape()
1032
+ else:
1033
+ target_length = (
1034
+ attention_mask.shape[-1]
1035
+ if isinstance(attention_mask, torch.Tensor)
1036
+ else past_seen_tokens + sequence_length
1037
+ )
1038
+
1039
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1040
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
1041
+ attention_mask,
1042
+ sequence_length=sequence_length,
1043
+ target_length=target_length,
1044
+ dtype=dtype,
1045
+ device=device,
1046
+ cache_position=cache_position,
1047
+ batch_size=input_tensor.shape[0],
1048
+ )
1049
+
1050
+ # We take care to integrate alibi bias in the causal_mask here
1051
+ if head_mask is None and alibi is not None:
1052
+ alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:])
1053
+ causal_mask = torch.masked_fill(
1054
+ alibi / math.sqrt(self.config.hidden_size // self.num_heads),
1055
+ causal_mask < -1,
1056
+ min_dtype,
1057
+ )
1058
+
1059
+ if (
1060
+ self.config._attn_implementation == "sdpa"
1061
+ and attention_mask is not None
1062
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
1063
+ and not output_attentions
1064
+ ):
1065
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1066
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1067
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1068
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1069
+
1070
+ return causal_mask
1071
+
1072
+ @staticmethod
1073
+ # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position
1074
+ def _prepare_4d_causal_attention_mask_with_cache_position(
1075
+ attention_mask: torch.Tensor,
1076
+ sequence_length: int,
1077
+ target_length: int,
1078
+ dtype: torch.dtype,
1079
+ device: torch.device,
1080
+ cache_position: torch.Tensor,
1081
+ batch_size: int,
1082
+ **kwargs,
1083
+ ):
1084
+ """
1085
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1086
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1087
+
1088
+ Args:
1089
+ attention_mask (`torch.Tensor`):
1090
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
1091
+ `(batch_size, 1, query_length, key_value_length)`.
1092
+ sequence_length (`int`):
1093
+ The sequence length being processed.
1094
+ target_length (`int`):
1095
+ The target length: when generating with static cache, the mask should be as long as the static cache,
1096
+ to account for the 0 padding (the part of the cache that is not yet filled).
1097
+ dtype (`torch.dtype`):
1098
+ The dtype to use for the 4D attention mask.
1099
+ device (`torch.device`):
1100
+ The device to place the 4D attention mask on.
1101
+ cache_position (`torch.Tensor`):
1102
+ Indices depicting the position of the input sequence tokens in the sequence.
1103
+ batch_size (`torch.Tensor`):
1104
+ Batch size.
1105
+ """
1106
+ if attention_mask is not None and attention_mask.dim() == 4:
1107
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1108
+ causal_mask = attention_mask
1109
+ else:
1110
+ min_dtype = torch.finfo(dtype).min
1111
+ causal_mask = torch.full(
1112
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1113
+ )
1114
+ if sequence_length != 1:
1115
+ causal_mask = torch.triu(causal_mask, diagonal=1)
1116
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1117
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1118
+ if attention_mask is not None:
1119
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1120
+ mask_length = attention_mask.shape[-1]
1121
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
1122
+ causal_mask.device
1123
+ )
1124
+ padding_mask = padding_mask == 0
1125
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1126
+ padding_mask, min_dtype
1127
+ )
1128
+
1129
+ return causal_mask
1130
+
1131
+
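The mask construction in `_prepare_4d_causal_attention_mask_with_cache_position` above can be hard to follow from the tensor ops alone. Below is a minimal sketch (the helper name and toy sizes are ours; batch expansion and padding handling are omitted) showing what the `triu` plus `cache_position` comparison produces for two new tokens decoded against a static cache of length 4:

```python
import torch

def build_causal_mask(sequence_length, target_length, cache_position, dtype=torch.float32):
    # Simplified re-implementation of the steps above (no batch dim, no padding mask).
    min_dtype = torch.finfo(dtype).min
    mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
    if sequence_length != 1:
        mask = torch.triu(mask, diagonal=1)
    # After this step, key position j stays visible to query row i only when j <= cache_position[i].
    mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
    return mask

mask = build_causal_mask(sequence_length=2, target_length=4, cache_position=torch.tensor([2, 3]))
# Row 0 (query at absolute position 2) attends to key positions 0-2; position 3 stays at min_dtype.
# Row 1 (query at absolute position 3) attends to all four key positions.
```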
1132
+ @add_start_docstrings(
1133
+ "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).",
1134
+ FALCON_START_DOCSTRING,
1135
+ )
1136
+ class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
1137
+ _tied_weights_keys = ["lm_head.weight"]
1138
+
1139
+ def __init__(self, config: FalconConfig):
1140
+ super().__init__(config)
1141
+ self.transformer = FalconModel(config)
1142
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1143
+
1144
+ # Initialize weights and apply final processing
1145
+ self.post_init()
1146
+
1147
+ def get_output_embeddings(self):
1148
+ return self.lm_head
1149
+
1150
+ def set_output_embeddings(self, new_embeddings: torch.Tensor):
1151
+ self.lm_head = new_embeddings
1152
+
1153
+ @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING)
1154
+ @add_code_sample_docstrings(
1155
+ checkpoint=_CHECKPOINT_FOR_DOC,
1156
+ output_type=CausalLMOutputWithCrossAttentions,
1157
+ config_class=_CONFIG_FOR_DOC,
1158
+ )
1159
+ def forward(
1160
+ self,
1161
+ input_ids: Optional[torch.LongTensor] = None,
1162
+ past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None,
1163
+ attention_mask: Optional[torch.Tensor] = None,
1164
+ position_ids: Optional[torch.LongTensor] = None,
1165
+ head_mask: Optional[torch.Tensor] = None,
1166
+ inputs_embeds: Optional[torch.Tensor] = None,
1167
+ labels: Optional[torch.Tensor] = None,
1168
+ use_cache: Optional[bool] = None,
1169
+ output_attentions: Optional[bool] = None,
1170
+ output_hidden_states: Optional[bool] = None,
1171
+ return_dict: Optional[bool] = None,
1172
+ cache_position: Optional[torch.LongTensor] = None,
1173
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1174
+ **kwargs,
1175
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
1176
+ r"""
1177
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1178
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
1179
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
1180
+ are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
1181
+
1182
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
1183
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
1184
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1185
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1186
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
1187
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
1188
+ """
1189
+
1190
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1191
+
1192
+ transformer_outputs = self.transformer(
1193
+ input_ids,
1194
+ past_key_values=past_key_values,
1195
+ attention_mask=attention_mask,
1196
+ position_ids=position_ids,
1197
+ head_mask=head_mask,
1198
+ inputs_embeds=inputs_embeds,
1199
+ use_cache=use_cache,
1200
+ output_attentions=output_attentions,
1201
+ output_hidden_states=output_hidden_states,
1202
+ return_dict=return_dict,
1203
+ cache_position=cache_position,
1204
+ )
1205
+ hidden_states = transformer_outputs[0]
1206
+
1207
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1208
+ lm_logits = self.lm_head(hidden_states[:, slice_indices, :])
1209
+
1210
+ loss = None
1211
+ if labels is not None:
1212
+ loss = self.loss_function(
1213
+ lm_logits,
1214
+ labels,
1215
+ vocab_size=self.config.vocab_size,
1216
+ **kwargs,
1217
+ )
1218
+
1219
+ if not return_dict:
1220
+ output = (lm_logits,) + transformer_outputs[1:]
1221
+ return ((loss,) + output) if loss is not None else output
1222
+
1223
+ return CausalLMOutputWithCrossAttentions(
1224
+ loss=loss,
1225
+ logits=lm_logits,
1226
+ past_key_values=transformer_outputs.past_key_values,
1227
+ hidden_states=transformer_outputs.hidden_states,
1228
+ attentions=transformer_outputs.attentions,
1229
+ )
1230
+
1231
+ def _reorder_cache(
1232
+ self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
1233
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
1234
+ """
1235
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
1236
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
1237
+ beam_idx at every generation step.
1238
+
1239
+ Output shares the same memory storage as `past`.
1240
+ """
1241
+
1242
+ # Get a copy of `beam_idx` on all the devices where we need those indices.
1243
+ device_to_beam_idx = {
1244
+ past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
1245
+ }
1246
+ reordered_past = tuple(
1247
+ (
1248
+ layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
1249
+ layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
1250
+ )
1251
+ for layer_past in past
1252
+ )
1253
+ return reordered_past
1254
+
1255
+
1256
+ @add_start_docstrings(
1257
+ """
1258
+ The Falcon Model transformer with a sequence classification head on top (linear layer).
1259
+
1260
+ [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1261
+ (e.g. GPT-1) do.
1262
+
1263
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1264
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1265
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1266
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1267
+ each row of the batch).
1268
+ """,
1269
+ FALCON_START_DOCSTRING,
1270
+ )
1271
+ class FalconForSequenceClassification(FalconPreTrainedModel):
1272
+ def __init__(self, config: FalconConfig):
1273
+ super().__init__(config)
1274
+ self.num_labels = config.num_labels
1275
+ self.transformer = FalconModel(config)
1276
+ self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
1277
+
1278
+ # Initialize weights and apply final processing
1279
+ self.post_init()
1280
+
1281
+ @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING)
1282
+ @add_code_sample_docstrings(
1283
+ checkpoint=_CHECKPOINT_FOR_DOC,
1284
+ output_type=SequenceClassifierOutputWithPast,
1285
+ config_class=_CONFIG_FOR_DOC,
1286
+ )
1287
+ def forward(
1288
+ self,
1289
+ input_ids: Optional[torch.LongTensor] = None,
1290
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1291
+ attention_mask: Optional[torch.Tensor] = None,
1292
+ head_mask: Optional[torch.Tensor] = None,
1293
+ inputs_embeds: Optional[torch.Tensor] = None,
1294
+ labels: Optional[torch.Tensor] = None,
1295
+ use_cache: Optional[bool] = None,
1296
+ output_attentions: Optional[bool] = None,
1297
+ output_hidden_states: Optional[bool] = None,
1298
+ return_dict: Optional[bool] = None,
1299
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
1300
+ r"""
1301
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1302
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1303
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
1304
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1305
+ """
1306
+
1307
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1308
+
1309
+ transformer_outputs = self.transformer(
1310
+ input_ids,
1311
+ past_key_values=past_key_values,
1312
+ attention_mask=attention_mask,
1313
+ head_mask=head_mask,
1314
+ inputs_embeds=inputs_embeds,
1315
+ use_cache=use_cache,
1316
+ output_attentions=output_attentions,
1317
+ output_hidden_states=output_hidden_states,
1318
+ return_dict=return_dict,
1319
+ )
1320
+
1321
+ hidden_states = transformer_outputs[0]
1322
+ logits = self.score(hidden_states)
1323
+
1324
+ if input_ids is not None:
1325
+ batch_size = input_ids.shape[0]
1326
+ else:
1327
+ batch_size = inputs_embeds.shape[0]
1328
+
1329
+ if self.config.pad_token_id is None and batch_size != 1:
1330
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1331
+ if self.config.pad_token_id is None:
1332
+ last_non_pad_token = -1
1333
+ elif input_ids is not None:
1334
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
1335
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
1336
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
1337
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
1338
+ else:
1339
+ last_non_pad_token = -1
1340
+ logger.warning_once(
1341
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
1342
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
1343
+ )
1344
+
1345
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
1346
+
1347
+ loss = None
1348
+ if labels is not None:
1349
+ if self.config.problem_type is None:
1350
+ if self.num_labels == 1:
1351
+ self.config.problem_type = "regression"
1352
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1353
+ self.config.problem_type = "single_label_classification"
1354
+ else:
1355
+ self.config.problem_type = "multi_label_classification"
1356
+
1357
+ if self.config.problem_type == "regression":
1358
+ loss_fct = MSELoss()
1359
+ if self.num_labels == 1:
1360
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1361
+ else:
1362
+ loss = loss_fct(pooled_logits, labels)
1363
+ elif self.config.problem_type == "single_label_classification":
1364
+ loss_fct = CrossEntropyLoss()
1365
+ loss = loss_fct(pooled_logits, labels)
1366
+ elif self.config.problem_type == "multi_label_classification":
1367
+ loss_fct = BCEWithLogitsLoss()
1368
+ loss = loss_fct(pooled_logits, labels)
1369
+ if not return_dict:
1370
+ output = (pooled_logits,) + transformer_outputs[1:]
1371
+ return ((loss,) + output) if loss is not None else output
1372
+
1373
+ return SequenceClassifierOutputWithPast(
1374
+ loss=loss,
1375
+ logits=pooled_logits,
1376
+ past_key_values=transformer_outputs.past_key_values,
1377
+ hidden_states=transformer_outputs.hidden_states,
1378
+ attentions=transformer_outputs.attentions,
1379
+ )
1380
+
1381
+
1382
+ @add_start_docstrings(
1383
+ """
1384
+ Falcon Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1385
+ Named-Entity-Recognition (NER) tasks.
1386
+ """,
1387
+ FALCON_START_DOCSTRING,
1388
+ )
1389
+ class FalconForTokenClassification(FalconPreTrainedModel):
1390
+ def __init__(self, config: FalconConfig):
1391
+ super().__init__(config)
1392
+ self.num_labels = config.num_labels
1393
+
1394
+ self.transformer = FalconModel(config)
1395
+ if getattr(config, "classifier_dropout", None) is not None:
1396
+ classifier_dropout = config.classifier_dropout
1397
+ elif getattr(config, "hidden_dropout", None) is not None:
1398
+ classifier_dropout = config.hidden_dropout
1399
+ else:
1400
+ classifier_dropout = 0.1
1401
+ self.dropout = nn.Dropout(classifier_dropout)
1402
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1403
+
1404
+ # Initialize weights and apply final processing
1405
+ self.post_init()
1406
+
1407
+ @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING)
1408
+ @add_code_sample_docstrings(
1409
+ checkpoint=_CHECKPOINT_FOR_DOC,
1410
+ output_type=TokenClassifierOutput,
1411
+ config_class=_CONFIG_FOR_DOC,
1412
+ )
1413
+ def forward(
1414
+ self,
1415
+ input_ids: Optional[torch.LongTensor] = None,
1416
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1417
+ attention_mask: Optional[torch.Tensor] = None,
1418
+ head_mask: Optional[torch.Tensor] = None,
1419
+ inputs_embeds: Optional[torch.Tensor] = None,
1420
+ labels: Optional[torch.Tensor] = None,
1421
+ use_cache: Optional[bool] = None,
1422
+ output_attentions: Optional[bool] = None,
1423
+ output_hidden_states: Optional[bool] = None,
1424
+ return_dict: Optional[bool] = None,
1425
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1426
+ r"""
1427
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1428
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
1429
+ config.num_labels - 1]`. A cross-entropy loss is computed over the `config.num_labels` classes for
1430
+ each token.
1431
+ """
1432
+
1433
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1434
+
1435
+ transformer_outputs = self.transformer(
1436
+ input_ids,
1437
+ past_key_values=past_key_values,
1438
+ attention_mask=attention_mask,
1439
+ head_mask=head_mask,
1440
+ inputs_embeds=inputs_embeds,
1441
+ use_cache=use_cache,
1442
+ output_attentions=output_attentions,
1443
+ output_hidden_states=output_hidden_states,
1444
+ return_dict=return_dict,
1445
+ )
1446
+
1447
+ hidden_states = transformer_outputs[0]
1448
+ hidden_states = self.dropout(hidden_states)
1449
+ logits = self.classifier(hidden_states)
1450
+
1451
+ loss = None
1452
+ if labels is not None:
1453
+ batch_size, seq_length = labels.shape
1454
+ loss_fct = CrossEntropyLoss()
1455
+ loss = loss_fct(
1456
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
1457
+ )
1458
+
1459
+ if not return_dict:
1460
+ output = (logits,) + transformer_outputs[2:]
1461
+ return ((loss,) + output) if loss is not None else output
1462
+
1463
+ return TokenClassifierOutput(
1464
+ loss=loss,
1465
+ logits=logits,
1466
+ hidden_states=transformer_outputs.hidden_states,
1467
+ attentions=transformer_outputs.attentions,
1468
+ )
1469
+
1470
+
1471
+ @add_start_docstrings(
1472
+ """
1473
+ The Falcon Model transformer with a span classification head on top for extractive question-answering tasks like
1474
+ SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
1475
+ """,
1476
+ FALCON_START_DOCSTRING,
1477
+ )
1478
+ class FalconForQuestionAnswering(FalconPreTrainedModel):
1479
+ def __init__(self, config):
1480
+ super().__init__(config)
1481
+ self.transformer = FalconModel(config)
1482
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1483
+
1484
+ # Initialize weights and apply final processing
1485
+ self.post_init()
1486
+
1487
+ @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING)
1488
+ def forward(
1489
+ self,
1490
+ input_ids: Optional[torch.LongTensor] = None,
1491
+ attention_mask: Optional[torch.FloatTensor] = None,
1492
+ head_mask: Optional[torch.FloatTensor] = None,
1493
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1494
+ start_positions: Optional[torch.LongTensor] = None,
1495
+ end_positions: Optional[torch.LongTensor] = None,
1496
+ output_attentions: Optional[bool] = None,
1497
+ output_hidden_states: Optional[bool] = None,
1498
+ return_dict: Optional[bool] = None,
1499
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1500
+ r"""
1501
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1502
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1503
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1504
+ are not taken into account for computing the loss.
1505
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1506
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1507
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1508
+ are not taken into account for computing the loss.
1509
+ """
1510
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1511
+
1512
+ outputs = self.transformer(
1513
+ input_ids,
1514
+ attention_mask=attention_mask,
1515
+ head_mask=head_mask,
1516
+ inputs_embeds=inputs_embeds,
1517
+ output_attentions=output_attentions,
1518
+ output_hidden_states=output_hidden_states,
1519
+ return_dict=return_dict,
1520
+ )
1521
+
1522
+ sequence_output = outputs[0]
1523
+
1524
+ logits = self.qa_outputs(sequence_output)
1525
+ start_logits, end_logits = logits.split(1, dim=-1)
1526
+ start_logits = start_logits.squeeze(-1).contiguous()
1527
+ end_logits = end_logits.squeeze(-1).contiguous()
1528
+
1529
+ total_loss = None
1530
+ if start_positions is not None and end_positions is not None:
1531
+ # If we are on multi-GPU, split add a dimension
1532
+ if len(start_positions.size()) > 1:
1533
+ start_positions = start_positions.squeeze(-1)
1534
+ if len(end_positions.size()) > 1:
1535
+ end_positions = end_positions.squeeze(-1)
1536
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1537
+ ignored_index = start_logits.size(1)
1538
+ start_positions = start_positions.clamp(0, ignored_index)
1539
+ end_positions = end_positions.clamp(0, ignored_index)
1540
+
1541
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1542
+ start_loss = loss_fct(start_logits, start_positions)
1543
+ end_loss = loss_fct(end_logits, end_positions)
1544
+ total_loss = (start_loss + end_loss) / 2
1545
+
1546
+ if not return_dict:
1547
+ output = (start_logits, end_logits) + outputs[2:]
1548
+ return ((total_loss,) + output) if total_loss is not None else output
1549
+
1550
+ return QuestionAnsweringModelOutput(
1551
+ loss=total_loss,
1552
+ start_logits=start_logits,
1553
+ end_logits=end_logits,
1554
+ hidden_states=outputs.hidden_states,
1555
+ attentions=outputs.attentions,
1556
+ )
1557
+
1558
+
1559
+ __all__ = [
1560
+ "FalconForCausalLM",
1561
+ "FalconModel",
1562
+ "FalconPreTrainedModel",
1563
+ "FalconForSequenceClassification",
1564
+ "FalconForTokenClassification",
1565
+ "FalconForQuestionAnswering",
1566
+ ]
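One detail in `FalconForSequenceClassification` above that is easy to misread: `(token_indices * non_pad_mask).argmax(-1)` selects the rightmost non-padding position of every row, so the same pooling works for both left- and right-padded batches. A small standalone sketch (the `pad_token_id` value and the toy batch are hypothetical, for illustration only):

```python
import torch

pad_token_id = 0  # hypothetical value, for illustration only
input_ids = torch.tensor([[5, 6, 0, 0],   # right-padded row
                          [0, 7, 8, 9]])  # left-padded row
non_pad_mask = (input_ids != pad_token_id).int()
token_indices = torch.arange(input_ids.shape[-1])
last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
# tensor([1, 3]): the score head pools the hidden state at these positions for each row.
```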
docs/transformers/build/lib/transformers/models/falcon_mamba/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_falcon_mamba import *
22
+ from .modeling_falcon_mamba import *
23
+ else:
24
+ import sys
25
+
26
+ _file = globals()["__file__"]
27
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
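The `__init__.py` above registers a `_LazyModule`, so the heavy configuration and modeling submodules are only imported when one of their attributes is first accessed. As a rough illustration of the idea (a simplified analogue using PEP 562 module-level `__getattr__`, not the actual transformers implementation):

```python
# lazy_init_sketch.py -- simplified analogue of the lazy-import pattern, for illustration only.
import importlib

_LAZY_ATTRS = {
    "FalconMambaConfig": ".configuration_falcon_mamba",
    "FalconMambaModel": ".modeling_falcon_mamba",
}

def __getattr__(name):
    # Import the submodule only when the attribute is actually requested.
    if name in _LAZY_ATTRS:
        module = importlib.import_module(_LAZY_ATTRS[name], __package__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```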
docs/transformers/build/lib/transformers/models/falcon_mamba/configuration_falcon_mamba.py ADDED
@@ -0,0 +1,162 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """FALCONMAMBA configuration"""
16
+
17
+ import math
18
+
19
+ from ...configuration_utils import PretrainedConfig
20
+ from ...utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ class FalconMambaConfig(PretrainedConfig):
27
+ """
28
+ This is the configuration class to store the configuration of a [`FalconMambaModel`]. It is used to instantiate a FALCON_MAMBA
29
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
30
+ defaults will yield a similar configuration to that of the FALCON_MAMBA
31
+ [tiiuae/falcon-mamba-7b](https://huggingface.co/tiiuae/falcon-mamba-7b) architecture.
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+
37
+ Args:
38
+ vocab_size (`int`, *optional*, defaults to 50280):
39
+ Vocabulary size of the FALCON_MAMBA model. Defines the number of different tokens that can be represented by the
40
+ `inputs_ids` passed when calling [`FalconMambaModel`].
41
+ hidden_size (`int`, *optional*, defaults to 768):
42
+ Dimensionality of the embeddings and hidden states.
43
+ state_size (`int`, *optional*, defaults to 16): shape of the state space latents.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
+ Number of hidden layers in the model.
46
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
47
+ The epsilon to use in the layer normalization layers.
48
+ pad_token_id (`int`, *optional*, defaults to 0):
49
+ Padding token id.
50
+ bos_token_id (`int`, *optional*, defaults to 0):
51
+ The id of the beginning of sentence token in the vocabulary.
52
+ eos_token_id (`int`, *optional*, defaults to 0):
53
+ The id of the end of sentence token in the vocabulary.
54
+ expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
55
+ conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel.
56
+ use_bias (`bool`, *optional*, defaults to `False`):
57
+ Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block
58
+ use_conv_bias (`bool`, *optional*, defaults to `True`):
59
+ Whether or not to use bias in the convolution layer of the mixer block.
60
+ hidden_act (`str`, *optional*, defaults to `"silu"`):
61
+ The non-linear activation function (function or string) in the decoder.
62
+ initializer_range (`float`, *optional*, defaults to 0.1):
63
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
+ residual_in_fp32 (`bool`, *optional*, defaults to `True`):
65
+ Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model
66
+ time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
67
+ Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
68
+ time_step_scale (`float`, *optional*, defaults to 1.0):
69
+ Scale used used to scale `dt_proj.bias`.
70
+ time_step_min (`float`, *optional*, defaults to 0.001):
71
+ Minimum `time_step` used to bound `dt_proj.bias`.
72
+ time_step_max (`float`, *optional*, defaults to 0.1):
73
+ Maximum `time_step` used to bound `dt_proj.bias`.
74
+ time_step_init_scheme (`float`, *optional*, defaults to `"random"`):
75
+ Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]`
76
+ time_step_floor (`float`, *optional*, defaults to 0.0001):
77
+ Minimum clamping value of the `dt_proj.bias` layer initialization.
78
+ rescale_prenorm_residual (`bool`, *optional*, defaults to `False`):
79
+ Whether or not to rescale `out_proj` weights when initializing.
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the cache should be used.
82
+ use_mambapy (`bool`, *optional*, defaults to `False`):
83
+ Determines the fallback strategy during training if the CUDA-based official implementation of FalconMamba is not available. If `True`, the falcon_mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
84
+ mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
85
+ The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
86
+ Example:
87
+
88
+ ```python
89
+ >>> from transformers import FalconMambaConfig, FalconMambaModel
90
+
91
+ >>> # Initializing a FalconMamba configuration
92
+ >>> configuration = FalconMambaConfig()
93
+
94
+ >>> # Initializing a model (with random weights) from the configuration
95
+ >>> model = FalconMambaModel(configuration)
96
+
97
+ >>> # Accessing the model configuration
98
+ >>> configuration = model.config
99
+ ```"""
100
+
101
+ model_type = "falcon_mamba"
102
+
103
+ def __init__(
104
+ self,
105
+ vocab_size=50280,
106
+ hidden_size=768,
107
+ state_size=16,
108
+ num_hidden_layers=32,
109
+ layer_norm_epsilon=1e-5,
110
+ pad_token_id=0,
111
+ bos_token_id=0,
112
+ eos_token_id=0,
113
+ expand=2,
114
+ conv_kernel=4,
115
+ use_bias=False,
116
+ use_conv_bias=True,
117
+ hidden_act="silu",
118
+ initializer_range=0.1,
119
+ residual_in_fp32=True,
120
+ time_step_rank="auto",
121
+ time_step_scale=1.0,
122
+ time_step_min=0.001,
123
+ time_step_max=0.1,
124
+ time_step_init_scheme="random",
125
+ time_step_floor=1e-4,
126
+ rescale_prenorm_residual=False,
127
+ use_cache=True,
128
+ use_mambapy=False,
129
+ mixer_rms_eps=1e-6,
130
+ **kwargs,
131
+ ):
132
+ self.vocab_size = vocab_size
133
+ self.hidden_size = hidden_size
134
+ self.state_size = state_size
135
+ self.num_hidden_layers = num_hidden_layers
136
+ self.layer_norm_epsilon = layer_norm_epsilon
137
+ self.conv_kernel = conv_kernel
138
+ self.expand = expand
139
+ self.intermediate_size = int(expand * self.hidden_size)
140
+ self.bos_token_id = bos_token_id
141
+ self.eos_token_id = eos_token_id
142
+ self.pad_token_id = pad_token_id
143
+ self.use_bias = use_bias
144
+ self.use_conv_bias = use_conv_bias
145
+ self.hidden_act = hidden_act
146
+ self.initializer_range = initializer_range
147
+ self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
148
+ self.time_step_scale = time_step_scale
149
+ self.time_step_min = time_step_min
150
+ self.time_step_max = time_step_max
151
+ self.time_step_init_scheme = time_step_init_scheme
152
+ self.time_step_floor = time_step_floor
153
+ self.rescale_prenorm_residual = rescale_prenorm_residual
154
+ self.residual_in_fp32 = residual_in_fp32
155
+ self.use_cache = use_cache
156
+ self.use_mambapy = use_mambapy
157
+ self.mixer_rms_eps = mixer_rms_eps
158
+
159
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
160
+
161
+
162
+ __all__ = ["FalconMambaConfig"]
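Two attributes of `FalconMambaConfig` are derived rather than taken verbatim from the arguments: `intermediate_size` is `expand * hidden_size`, and `time_step_rank` resolves to `ceil(hidden_size / 16)` when left at `"auto"`. A quick sanity check, assuming a transformers version that ships FalconMamba:

```python
import math

from transformers import FalconMambaConfig

config = FalconMambaConfig()  # library defaults documented above
assert config.intermediate_size == int(config.expand * config.hidden_size)  # 2 * 768 = 1536
assert config.time_step_rank == math.ceil(config.hidden_size / 16)          # ceil(768 / 16) = 48
```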
docs/transformers/build/lib/transformers/models/falcon_mamba/modeling_falcon_mamba.py ADDED
@@ -0,0 +1,873 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch FALCONMAMBA model."""
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.nn import CrossEntropyLoss
25
+
26
+ from ...activations import ACT2FN
27
+ from ...cache_utils import MambaCache
28
+ from ...generation import GenerationMixin
29
+ from ...modeling_utils import PreTrainedModel
30
+ from ...utils import (
31
+ ModelOutput,
32
+ add_code_sample_docstrings,
33
+ add_start_docstrings,
34
+ add_start_docstrings_to_model_forward,
35
+ logging,
36
+ )
37
+ from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available
38
+ from .configuration_falcon_mamba import FalconMambaConfig
39
+
40
+
41
+ logger = logging.get_logger(__name__)
42
+
43
+ if is_mambapy_available():
44
+ from mambapy.pscan import pscan
45
+ else:
46
+ pscan = None
47
+
48
+ if is_mamba_ssm_available():
49
+ from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
50
+ from mamba_ssm.ops.triton.selective_state_update import selective_state_update
51
+
52
+ from ...kernels.falcon_mamba import mamba_inner_fn
53
+ else:
54
+ selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
55
+
56
+ if is_causal_conv1d_available():
57
+ from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
58
+ else:
59
+ causal_conv1d_update, causal_conv1d_fn = None, None
60
+
61
+ is_fast_path_available = all(
62
+ (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
63
+ )
64
+
65
+ _CHECKPOINT_FOR_DOC = "tiiuae/falcon-mamba-7b"
66
+ _CONFIG_FOR_DOC = "FalconMambaConfig"
67
+
68
+
69
+ def rms_forward(hidden_states, variance_epsilon=1e-6):
70
+ """
71
+ Calculates simple RMSNorm with no learnable weights. `MambaRMSNorm` will
72
+ leverage this in order to multiply the final result with the RMSNorm weight
73
+
74
+ Args:
75
+ hidden_states (`torch.Tensor`):
76
+ Hidden states to normalize
77
+ variance_epsilon (`float`):
78
+ The eps value to add in the square root scaling factor
79
+ """
80
+ input_dtype = hidden_states.dtype
81
+ hidden_states = hidden_states.to(torch.float32)
82
+
83
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
84
+ hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
85
+ return hidden_states.to(input_dtype)
86
+
87
+
88
+ class FalconMambaMixer(nn.Module):
89
+ """
90
+ Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
91
+ A, D are input independent (see FalconMamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
92
+ ∆, B, C are input-dependent (this is a key difference between FalconMamba and the linear time invariant S4,
93
+ and is why FalconMamba is called **selective** state spaces)
94
+ """
95
+
96
+ def __init__(self, config: FalconMambaConfig, layer_idx: int):
97
+ super().__init__()
98
+ self.config = config
99
+ self.hidden_size = config.hidden_size
100
+ self.ssm_state_size = config.state_size
101
+ self.conv_kernel_size = config.conv_kernel
102
+ self.intermediate_size = config.intermediate_size
103
+ self.time_step_rank = int(config.time_step_rank)
104
+ self.layer_idx = layer_idx
105
+ self.use_conv_bias = config.use_conv_bias
106
+ self.conv1d = nn.Conv1d(
107
+ in_channels=self.intermediate_size,
108
+ out_channels=self.intermediate_size,
109
+ bias=config.use_conv_bias,
110
+ kernel_size=config.conv_kernel,
111
+ groups=self.intermediate_size,
112
+ padding=config.conv_kernel - 1,
113
+ )
114
+
115
+ self.activation = config.hidden_act
116
+ self.act = ACT2FN[config.hidden_act]
117
+
118
+ self.use_mambapy = config.use_mambapy
119
+
120
+ # projection of the input hidden states
121
+ self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
122
+ # selective projection used to make dt, B and C input dependent
123
+ self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
124
+ # time step projection (discretization)
125
+ self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
126
+
127
+ # S4D real initialization. These are not discretized!
128
+ # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
129
+ A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
130
+ A = A.expand(self.intermediate_size, -1).contiguous()
131
+
132
+ self.A_log = nn.Parameter(torch.log(A))
133
+ self.D = nn.Parameter(torch.ones(self.intermediate_size))
134
+ self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
135
+ self.use_bias = config.use_bias
136
+
137
+ # Triton expects to pass RMS weights even if they are non learnable, thus we need to create these weights here
138
+ self.register_buffer(
139
+ "b_c_rms", torch.nn.Parameter(torch.ones(self.ssm_state_size), requires_grad=False), persistent=False
140
+ )
141
+ self.register_buffer(
142
+ "dt_rms", torch.nn.Parameter(torch.ones(self.intermediate_size), requires_grad=False), persistent=False
143
+ )
144
+ self.rms_eps = config.mixer_rms_eps
145
+
146
+ if not is_fast_path_available:
147
+ if self.use_mambapy:
148
+ if is_mambapy_available():
149
+ logger.warning_once(
150
+ "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
151
+ " is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and"
152
+ " https://github.com/Dao-AILab/causal-conv1d"
153
+ )
154
+ else:
155
+ raise ImportError(
156
+ "use_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py."
157
+ )
158
+ else:
159
+ logger.warning_once(
160
+ "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
161
+ " is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and"
162
+ " https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py."
163
+ )
164
+
165
+ def cuda_kernels_forward(
166
+ self,
167
+ hidden_states: torch.Tensor,
168
+ cache_params: Optional[MambaCache] = None,
169
+ cache_position: Optional[torch.LongTensor] = None,
170
+ attention_mask: Optional[torch.LongTensor] = None,
171
+ ):
172
+ # 1. Gated MLP's linear projection
173
+ projected_states = self.in_proj(hidden_states).transpose(1, 2)
174
+
175
+ if self.training and cache_params is None: # Doesn't support outputting the states -> used for training
176
+ contextualized_states = mamba_inner_fn(
177
+ projected_states,
178
+ self.conv1d.weight,
179
+ self.conv1d.bias if self.use_conv_bias else None,
180
+ self.x_proj.weight,
181
+ self.dt_proj.weight,
182
+ self.out_proj.weight,
183
+ self.out_proj.bias.float() if self.use_bias else None,
184
+ -torch.exp(self.A_log.float()),
185
+ None, # input-dependent B
186
+ None, # input-dependent C
187
+ self.D.float(),
188
+ delta_bias=self.dt_proj.bias.float(),
189
+ delta_softplus=True,
190
+ b_rms_weight=self.b_c_rms,
191
+ c_rms_weight=self.b_c_rms,
192
+ dt_rms_weight=self.dt_rms,
193
+ b_c_dt_rms_eps=self.rms_eps,
194
+ )
195
+
196
+ else:
197
+ hidden_states, gate = projected_states.chunk(2, dim=1)
198
+
199
+ if attention_mask is not None:
200
+ hidden_states = hidden_states * attention_mask.unsqueeze(1)
201
+
202
+ # 2. Convolution sequence transformation
203
+ conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
204
+ if cache_params is not None and cache_position[0] > 0:
205
+ hidden_states = causal_conv1d_update(
206
+ hidden_states.squeeze(-1),
207
+ cache_params.conv_states[self.layer_idx],
208
+ conv_weights,
209
+ self.conv1d.bias,
210
+ self.activation,
211
+ )
212
+ hidden_states = hidden_states.unsqueeze(-1)
213
+ else:
214
+ if cache_params is not None:
215
+ conv_states = nn.functional.pad(
216
+ hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
217
+ )
218
+ cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
219
+ hidden_states = causal_conv1d_fn(
220
+ hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
221
+ )
222
+
223
+ if attention_mask is not None:
224
+ hidden_states = hidden_states * attention_mask.unsqueeze(1)
225
+
226
+ # 3. State Space Model sequence transformation
227
+ # 3.a. input varying initialization of time_step, B and C
228
+ ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
229
+ time_step, B, C = torch.split(
230
+ ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
231
+ )
232
+
233
+ B = rms_forward(B, variance_epsilon=self.rms_eps)
234
+ C = rms_forward(C, variance_epsilon=self.rms_eps)
235
+ time_step = rms_forward(time_step, variance_epsilon=self.rms_eps)
236
+
237
+ # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
238
+ # at the price of a small overhead.
239
+ if hasattr(self.config, "_pre_quantization_dtype"):
240
+ discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
241
+ else:
242
+ discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
243
+
244
+ A = -torch.exp(self.A_log.float())
245
+ # 3.c perform the recurrence y ← SSM(A, B, C)(x)
246
+ time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None
247
+ if cache_params is not None and cache_position[0] > 0:
248
+ scan_outputs = selective_state_update(
249
+ cache_params.ssm_states[self.layer_idx],
250
+ hidden_states[..., 0],
251
+ discrete_time_step[..., 0],
252
+ A,
253
+ B[:, 0],
254
+ C[:, 0],
255
+ self.D,
256
+ gate[..., 0],
257
+ time_proj_bias,
258
+ dt_softplus=True,
259
+ ).unsqueeze(-1)
260
+ else:
261
+ scan_outputs, ssm_state = selective_scan_fn(
262
+ hidden_states,
263
+ discrete_time_step,
264
+ A,
265
+ B.transpose(1, 2),
266
+ C.transpose(1, 2),
267
+ self.D.float(),
268
+ gate,
269
+ time_proj_bias,
270
+ delta_softplus=True,
271
+ return_last_state=True,
272
+ )
273
+ if ssm_state is not None and cache_params is not None:
274
+ cache_params.update_ssm_state(self.layer_idx, ssm_state)
275
+
276
+ # 4. Final linear projection
277
+ contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
278
+ return contextualized_states
279
+
280
+ def slow_forward(
281
+ self,
282
+ input_states,
283
+ cache_params: Optional[MambaCache] = None,
284
+ cache_position: Optional[torch.LongTensor] = None,
285
+ attention_mask: Optional[torch.LongTensor] = None,
286
+ ):
287
+ batch_size, seq_len, _ = input_states.shape
288
+ dtype = input_states.dtype
289
+ # 1. Gated MLP's linear projection
290
+ projected_states = self.in_proj(input_states).transpose(1, 2) # [batch, 2 * intermediate_size, seq_len]
291
+ hidden_states, gate = projected_states.chunk(2, dim=1)
292
+
293
+ if attention_mask is not None:
294
+ hidden_states = hidden_states * attention_mask.unsqueeze(1)
295
+
296
+ # 2. Convolution sequence transformation
297
+ if cache_params is not None:
298
+ ssm_state = cache_params.ssm_states[self.layer_idx].clone()
299
+ ssm_state = ssm_state.to(hidden_states.device)
300
+ # use `cache_position.shape[0]` to check whether we are in prefill
301
+ # stage, it's equivalent to check `cache_position[0] == 0`, which
302
+ # breaks dynamo fullgraph constraints
303
+ if cache_position is not None and cache_position.shape[0] == self.conv_kernel_size:
304
+ conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
305
+
306
+ cache_params.update_conv_state(self.layer_idx, conv_state, cache_position)
307
+ hidden_states = self.act(
308
+ self.conv1d(hidden_states)[..., :seq_len]
309
+ ) # [batch, intermediate_size, seq_len]
310
+ else:
311
+ conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position)
312
+ conv_state = conv_state.to(self.conv1d.weight.device)
313
+ hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
314
+ if self.use_conv_bias:
315
+ hidden_states += self.conv1d.bias
316
+ hidden_states = (
317
+ self.act(hidden_states).to(dtype).unsqueeze(-1)
318
+ ) # [batch, intermediate_size, 1] : decoding
319
+ else:
320
+ ssm_state = torch.zeros(
321
+ (batch_size, self.intermediate_size, self.ssm_state_size), device=hidden_states.device, dtype=dtype
322
+ )
323
+ hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len]
324
+
325
+ if attention_mask is not None:
326
+ hidden_states = hidden_states * attention_mask.unsqueeze(1)
327
+
328
+ # 3. State Space Model sequence transformation
329
+ # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
330
+ ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
331
+ time_step, B, C = torch.split(
332
+ ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
333
+ )
334
+
335
+ B = rms_forward(B, variance_epsilon=self.rms_eps)
336
+ C = rms_forward(C, variance_epsilon=self.rms_eps)
337
+ time_step = rms_forward(time_step, variance_epsilon=self.rms_eps)
338
+
339
+ discrete_time_step = self.dt_proj(time_step) # [batch, seq_len, intermediate_size]
340
+ discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(
341
+ 1, 2
342
+ ) # [batch, intermediate_size, seq_len]
343
+
344
+ # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
345
+ A = -torch.exp(self.A_log.float()) # [intermediate_size, ssm_state_size]
346
+ discrete_A = torch.exp(
347
+ A[None, :, None, :] * discrete_time_step[:, :, :, None]
348
+ ) # [batch, intermediate_size, seq_len, ssm_state_size]
349
+ discrete_B = (
350
+ discrete_time_step[:, :, :, None] * B[:, None, :, :].float()
351
+ ) # [batch, intermediate_size, seq_len, ssm_state_size]
352
+ deltaB_u = discrete_B * hidden_states[:, :, :, None].float()
353
+
354
+ # 3.c perform the recurrence y ← SSM(A, B, C)(x)
355
+ if self.use_mambapy and self.training and cache_params is None:
356
+ hs = pscan(
357
+ discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2)
358
+ ) # [batch, seq_len, intermediate_size, ssm_state_size]
359
+ scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2) # [batch, intermediate_size, seq_len]
360
+ scan_output = scan_output + hidden_states * self.D[None, :, None]
361
+ scan_output = scan_output * self.act(gate)
362
+ else:
363
+ scan_outputs = []
364
+ for i in range(seq_len):
365
+ ssm_state = (
366
+ discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]
367
+ ) # [batch, intermediate_size, ssm_state]
368
+ scan_output = torch.matmul(
369
+ ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)
370
+ ) # [batch, intermediate_size, 1]
371
+ scan_outputs.append(scan_output[:, :, 0])
372
+ scan_output = torch.stack(scan_outputs, dim=-1) # [batch, intermediate_size, seq_len]
373
+ scan_output = scan_output + (hidden_states * self.D[None, :, None])
374
+ scan_output = scan_output * self.act(gate)
375
+
376
+ if cache_params is not None:
377
+ cache_params.update_ssm_state(self.layer_idx, ssm_state)
378
+
379
+ # 4. Final linear projection
380
+ contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size]
381
+ return contextualized_states
382
+
383
+ # Copied from transformers.models.mamba.modeling_mamba.MambaMixer.forward
384
+ def forward(
385
+ self,
386
+ hidden_states,
387
+ cache_params: Optional[MambaCache] = None,
388
+ cache_position: Optional[torch.LongTensor] = None,
389
+ attention_mask: Optional[torch.LongTensor] = None,
390
+ ):
391
+ if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling():
392
+ return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
393
+ return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask)
394
+
395
+
396
+ # Copied from transformers.models.mamba.modeling_mamba.MambaRMSNorm with Mamba->FalconMamba
397
+ class FalconMambaRMSNorm(nn.Module):
398
+ def __init__(self, hidden_size, eps=1e-6):
399
+ """
400
+ FalconMambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
401
+ """
402
+ super().__init__()
403
+ self.weight = nn.Parameter(torch.ones(hidden_size))
404
+ self.variance_epsilon = eps
405
+
406
+ def extra_repr(self):
407
+ return f"{self.weight.shape[0]}, eps={self.variance_epsilon}"
408
+
409
+ # Ignore copy
410
+ def forward(self, hidden_states):
411
+ return self.weight.to(hidden_states.device) * rms_forward(
412
+ hidden_states, variance_epsilon=self.variance_epsilon
413
+ )
414
+
415
+
416
+ # Copied from transformers.models.mamba.modeling_mamba.MambaBlock with Mamba->FalconMamba,FalconMambaCache->MambaCache
417
+ class FalconMambaBlock(nn.Module):
418
+ def __init__(self, config, layer_idx):
419
+ super().__init__()
420
+ self.config = config
421
+ self.layer_idx = layer_idx
422
+ self.residual_in_fp32 = config.residual_in_fp32
423
+ self.norm = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
424
+ self.mixer = FalconMambaMixer(config, layer_idx=layer_idx)
425
+
426
+ def forward(
427
+ self,
428
+ hidden_states,
429
+ cache_params: Optional[MambaCache] = None,
430
+ cache_position: Optional[torch.LongTensor] = None,
431
+ attention_mask: Optional[torch.LongTensor] = None,
432
+ ):
433
+ residual = hidden_states
434
+ hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
435
+ if self.residual_in_fp32:
436
+ residual = residual.to(torch.float32)
437
+
438
+ hidden_states = self.mixer(
439
+ hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask
440
+ )
441
+ hidden_states = residual + hidden_states
442
+ return hidden_states
443
+
444
+
445
+ # Copied from transformers.models.mamba.modeling_mamba.MambaPreTrainedModel with Mamba->FalconMamba
446
+ class FalconMambaPreTrainedModel(PreTrainedModel):
447
+ """
448
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
449
+ models.
450
+ """
451
+
452
+ config_class = FalconMambaConfig
453
+ base_model_prefix = "backbone"
454
+ _no_split_modules = ["FalconMambaBlock", "FalconMambaMixer"]
455
+ supports_gradient_checkpointing = True
456
+ _is_stateful = True
457
+
458
+ def _init_weights(self, module):
459
+ """Initialize the weights."""
460
+ if isinstance(module, FalconMambaMixer):
461
+ module.A_log._no_weight_decay = True
462
+ module.D._no_weight_decay = True
463
+
464
+ dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
465
+ if self.config.time_step_init_scheme == "constant":
466
+ nn.init.constant_(module.dt_proj.weight, dt_init_std)
467
+ elif self.config.time_step_init_scheme == "random":
468
+ nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)
469
+
470
+ dt = torch.exp(
471
+ torch.rand(self.config.intermediate_size)
472
+ * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
473
+ + math.log(self.config.time_step_min)
474
+ ).clamp(min=self.config.time_step_floor)
475
+ # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
476
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
477
+ with torch.no_grad():
478
+ module.dt_proj.bias.copy_(inv_dt)
479
+ module.dt_proj.bias._no_reinit = True
480
+
481
+ if isinstance(module, nn.Linear):
482
+ if module.bias is not None:
483
+ if not getattr(module.bias, "_no_reinit", False):
484
+ nn.init.zeros_(module.bias)
485
+ elif isinstance(module, nn.Embedding):
486
+ nn.init.normal_(module.weight, std=self.config.initializer_range)
487
+
488
+ if self.config.rescale_prenorm_residual:
489
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
490
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
491
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
492
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
493
+ #
494
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
495
+ for name, p in module.named_parameters():
496
+ if name in ["out_proj.weight"]:
497
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
498
+ # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
499
+ # We need to reinit p since this code could be called multiple times
500
+ # Having just p *= scale would repeatedly scale it down
501
+ nn.init.kaiming_uniform_(p, a=math.sqrt(5))
502
+ with torch.no_grad():
503
+ p /= math.sqrt(self.config.num_hidden_layers)
504
+
505
+
506
+ @dataclass
507
+ # Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->FALCONMAMBA,Mamba->FalconMamba,FalconMambaCache->MambaCache
508
+ class FalconMambaOutput(ModelOutput):
509
+ """
510
+ Class for the FALCONMAMBA model outputs.
511
+
512
+ Args:
513
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
514
+ Sequence of hidden-states at the output of the last layer of the model.
515
+ cache_params (`MambaCache`):
516
+ The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
517
+ avoid providing the old `input_ids`.
518
+
519
+ Includes both the State space model state matrices after the selective scan, and the Convolutional states
520
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
521
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
522
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
523
+
524
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
525
+ """
526
+
527
+ last_hidden_state: Optional[torch.FloatTensor] = None
528
+ cache_params: Optional[MambaCache] = None
529
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
530
+
531
+
532
+ @dataclass
533
+ # Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->FalconMamba,FalconMambaCache->MambaCache
534
+ class FalconMambaCausalLMOutput(ModelOutput):
535
+ """
536
+ Base class for causal language model (or autoregressive) outputs.
537
+
538
+ Args:
539
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
540
+ Language modeling loss (for next-token prediction).
541
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
542
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
543
+ cache_params (`MambaCache`):
544
+ The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
545
+ avoid providing the old `input_ids`.
546
+
547
+ Includes both the state-space model state matrices after the selective scan and the convolutional states.
548
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
549
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
550
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
551
+
552
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
553
+ """
554
+
555
+ loss: Optional[torch.FloatTensor] = None
556
+ logits: Optional[torch.FloatTensor] = None
557
+ cache_params: Optional[MambaCache] = None
558
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
559
+
560
+
561
+ FALCONMAMBA_START_DOCSTRING = r"""
562
+
563
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
564
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
565
+ etc.)
566
+
567
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
568
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
569
+ and behavior.
570
+
571
+ Parameters:
572
+ config ([`FalconMambaConfig`]): Model configuration class with all the parameters of the model.
573
+ Initializing with a config file does not load the weights associated with the model, only the
574
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
575
+ """
576
+
577
+ FALCONMAMBA_INPUTS_DOCSTRING = r"""
578
+ Args:
579
+ input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
580
+ Indices of input sequence tokens in the vocabulary.
581
+
582
+ If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as
583
+ `input_ids`.
584
+
585
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
586
+ [`PreTrainedTokenizer.__call__`] for details.
587
+
588
+ [What are input IDs?](../glossary#input-ids)
589
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
590
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
591
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
592
+ model's internal embedding lookup matrix.
593
+ cache_params (`MambaCache`, *optional*):
594
+ If passed along, the model uses the previous state in all the blocks (which will give the output for the
595
+ `input_ids` provided as if the model added `state_input_ids + input_ids` as context).
596
+ use_cache (`bool`, *optional*):
597
+ If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
598
+ output_hidden_states (`bool`, *optional*):
599
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
600
+ more detail.
601
+ return_dict (`bool`, *optional*):
602
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
603
+ """
604
+
605
+
606
+ @add_start_docstrings(
607
+ "The bare FALCONMAMBA Model transformer outputting raw hidden-states without any specific head on top.",
608
+ FALCONMAMBA_START_DOCSTRING,
609
+ )
610
+ class FalconMambaModel(FalconMambaPreTrainedModel):
611
+ def __init__(self, config):
612
+ super().__init__(config)
613
+
614
+ self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
615
+ self.layers = nn.ModuleList(
616
+ [FalconMambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]
617
+ )
618
+
619
+ self.gradient_checkpointing = False
620
+ self.norm_f = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
621
+ # Initialize weights and apply final processing
622
+ self.post_init()
623
+
624
+ def get_input_embeddings(self):
625
+ return self.embeddings
626
+
627
+ def set_input_embeddings(self, new_embeddings):
628
+ self.embeddings = new_embeddings
629
+
630
+ @add_start_docstrings_to_model_forward(FALCONMAMBA_INPUTS_DOCSTRING)
631
+ @add_code_sample_docstrings(
632
+ checkpoint=_CHECKPOINT_FOR_DOC,
633
+ output_type=FalconMambaOutput,
634
+ config_class=_CONFIG_FOR_DOC,
635
+ )
636
+ def forward(
637
+ self,
638
+ input_ids: Optional[torch.LongTensor] = None,
639
+ inputs_embeds: Optional[torch.LongTensor] = None,
640
+ cache_params: Optional[MambaCache] = None,
641
+ use_cache: Optional[bool] = None,
642
+ output_hidden_states: Optional[bool] = None,
643
+ return_dict: Optional[bool] = None,
644
+ cache_position: Optional[torch.LongTensor] = None,
645
+ attention_mask: Optional[torch.LongTensor] = None,
646
+ ) -> Union[Tuple, FalconMambaOutput]:
647
+ output_hidden_states = (
648
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
649
+ )
650
+ use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
651
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
652
+
653
+ if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor
654
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
655
+
656
+ if inputs_embeds is None:
657
+ inputs_embeds = self.embeddings(input_ids)
658
+
659
+ if self.gradient_checkpointing and self.training and use_cache:
660
+ use_cache = False
661
+
662
+ if use_cache:
663
+ if cache_params is None:
664
+ cache_params = MambaCache(
665
+ self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
666
+ )
667
+ cache_position = torch.arange(0, self.config.conv_kernel, device=inputs_embeds.device)
668
+ elif cache_position is None:
669
+ # cases when we do manual forward instead of using `model.generate` which will initiate
670
+ # `cache_position` and makes sure it is not None, throw error here instead of doing some
671
+ # hack to conjecture the current cache position
672
+ raise ValueError(
673
+ "You have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, "
674
+ "you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will "
675
+ "be initialized for you automatically"
676
+ )
677
+ else:
678
+ cache_params = None
679
+ hidden_states = inputs_embeds
680
+ all_hidden_states = () if output_hidden_states else None
681
+ for mixer_block in self.layers:
682
+ if self.gradient_checkpointing and self.training:
683
+ hidden_states = self._gradient_checkpointing_func(
684
+ mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask
685
+ )
686
+ else:
687
+ hidden_states = mixer_block(
688
+ hidden_states,
689
+ cache_params=cache_params,
690
+ cache_position=cache_position,
691
+ attention_mask=attention_mask,
692
+ )
693
+
694
+ if output_hidden_states:
695
+ all_hidden_states = all_hidden_states + (hidden_states,)
696
+
697
+ hidden_states = self.norm_f(hidden_states)
698
+
699
+ if output_hidden_states:
700
+ all_hidden_states = all_hidden_states + (hidden_states,)
701
+
702
+ if not return_dict:
703
+ return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)
704
+
705
+ return FalconMambaOutput(
706
+ last_hidden_state=hidden_states,
707
+ cache_params=cache_params if use_cache else None,
708
+ hidden_states=all_hidden_states,
709
+ )
710
+
711
+
712
+ @add_start_docstrings(
713
+ """
714
+ The FALCONMAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
715
+ embeddings).
716
+ """,
717
+ FALCONMAMBA_START_DOCSTRING,
718
+ )
719
+ # Copied from transformers.models.mamba.modeling_mamba.MambaForCausalLM with MAMBA->FALCONMAMBA,Mamba->FalconMamba,mamba->falcon_mamba,FalconMambaCache->MambaCache
720
+ class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin):
721
+ _tied_weights_keys = ["lm_head.weight"]
722
+
723
+ def __init__(self, config):
724
+ super().__init__(config)
725
+ self.backbone = FalconMambaModel(config)
726
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
727
+ # Initialize weights and apply final processing
728
+ self.post_init()
729
+
730
+ def get_output_embeddings(self):
731
+ return self.lm_head
732
+
733
+ def set_output_embeddings(self, new_embeddings):
734
+ self.lm_head = new_embeddings
735
+
736
+ def get_input_embeddings(self):
737
+ return self.backbone.get_input_embeddings()
738
+
739
+ def set_input_embeddings(self, new_embeddings):
740
+ return self.backbone.set_input_embeddings(new_embeddings)
741
+
742
+ def _update_model_kwargs_for_generation(
743
+ self, outputs: ModelOutput, model_kwargs: Dict[str, Any], num_new_tokens: int = 1, **kwargs
744
+ ) -> Dict[str, Any]:
745
+ model_kwargs["cache_params"] = outputs.get("cache_params", None)
746
+ if (
747
+ model_kwargs.get("use_cache", True)
748
+ and "cache_position" in model_kwargs
749
+ and model_kwargs["cache_position"] is not None
750
+ ):
751
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
752
+
753
+ if "attention_mask" in model_kwargs:
754
+ attention_mask = model_kwargs["attention_mask"]
755
+ model_kwargs["attention_mask"] = torch.cat(
756
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
757
+ )
758
+
759
+ return model_kwargs
760
+
761
+ def prepare_inputs_for_generation(
762
+ self,
763
+ input_ids,
764
+ inputs_embeds=None,
765
+ use_cache=None,
766
+ cache_params: Optional[MambaCache] = None,
767
+ cache_position: Optional[torch.LongTensor] = None,
768
+ attention_mask: Optional[torch.LongTensor] = None,
769
+ **kwargs,
770
+ ):
771
+ # Overwritten -- uses `cache_params` as opposed to `past_key_values`
772
+
773
+ if use_cache:
774
+ # `cache_position` should have been initialized in `generate`
775
+ if cache_position is None:
776
+ raise ValueError(
777
+ "`cache_position` should not be None as it should have been initialized in "
778
+ "`model.generate`, you are responsible for passing in a valid `cache_position` if "
779
+ "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`"
780
+ )
781
+ if cache_position[0] > 0:
782
+ input_ids = input_ids[:, -1].unsqueeze(-1)
783
+
784
+ if attention_mask is not None:
785
+ attention_mask = None
786
+
787
+ else:
788
+ # we initialize the `cache_position` to full size of `conv_states` at prefill stage
789
+ # considering padding will be applied when input length is shorter, and truncation
790
+ # will be applied when it is longer, so it will be equivalent to always have it match
791
+ # the length of `cache_params.conv_states`, which is `config.conv_kernel`
792
+ cache_position = torch.arange(0, self.config.conv_kernel, device=input_ids.device)
793
+
794
+ if inputs_embeds is not None and cache_params is None:
795
+ model_inputs = {"inputs_embeds": inputs_embeds}
796
+ else:
797
+ model_inputs = {"input_ids": input_ids.contiguous()}
798
+
799
+ model_inputs.update(
800
+ {
801
+ "cache_params": cache_params,
802
+ "use_cache": use_cache,
803
+ "cache_position": cache_position,
804
+ "attention_mask": attention_mask,
805
+ }
806
+ )
807
+ return model_inputs
808
+
809
+ @add_start_docstrings_to_model_forward(FALCONMAMBA_INPUTS_DOCSTRING)
810
+ @add_code_sample_docstrings(
811
+ checkpoint=_CHECKPOINT_FOR_DOC,
812
+ output_type=FalconMambaCausalLMOutput,
813
+ config_class=_CONFIG_FOR_DOC,
814
+ )
815
+ def forward(
816
+ self,
817
+ input_ids: Optional[torch.LongTensor] = None,
818
+ attention_mask: Optional[torch.LongTensor] = None,
819
+ inputs_embeds: Optional[torch.FloatTensor] = None,
820
+ cache_params: Optional[MambaCache] = None,
821
+ labels: Optional[torch.LongTensor] = None,
822
+ output_hidden_states: Optional[bool] = None,
823
+ return_dict: Optional[bool] = None,
824
+ use_cache: Optional[bool] = None,
825
+ cache_position: Optional[torch.Tensor] = None,
826
+ **kwargs, # for now we need this for generation
827
+ ) -> Union[Tuple, FalconMambaCausalLMOutput]:
828
+ r"""
829
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
830
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
831
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
832
+ are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
833
+ """
834
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
835
+
836
+ falcon_mamba_outputs = self.backbone(
837
+ input_ids,
838
+ cache_params=cache_params,
839
+ inputs_embeds=inputs_embeds,
840
+ output_hidden_states=output_hidden_states,
841
+ return_dict=return_dict,
842
+ use_cache=use_cache,
843
+ cache_position=cache_position,
844
+ attention_mask=attention_mask,
845
+ )
846
+ hidden_states = falcon_mamba_outputs[0]
847
+
848
+ logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
849
+
850
+ loss = None
851
+ if labels is not None:
852
+ # move labels to correct device to enable model parallelism
853
+ labels = labels.to(logits.device)
854
+ # Shift so that tokens < n predict n
855
+ shift_logits = logits[..., :-1, :].contiguous()
856
+ shift_labels = labels[..., 1:].contiguous()
857
+ # Flatten the tokens
858
+ loss_fct = CrossEntropyLoss()
859
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
860
+
861
+ if not return_dict:
862
+ output = (logits,) + falcon_mamba_outputs[1:]
863
+ return ((loss,) + output) if loss is not None else output
864
+
865
+ return FalconMambaCausalLMOutput(
866
+ loss=loss,
867
+ logits=logits,
868
+ cache_params=falcon_mamba_outputs.cache_params,
869
+ hidden_states=falcon_mamba_outputs.hidden_states,
870
+ )
871
+
872
+
873
+ __all__ = ["FalconMambaForCausalLM", "FalconMambaModel", "FalconMambaPreTrainedModel"]
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_fastspeech2_conformer import *
22
+ from .modeling_fastspeech2_conformer import *
23
+ from .tokenization_fastspeech2_conformer import *
24
+ else:
25
+ import sys
26
+
27
+ _file = globals()["__file__"]
28
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
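A brief, hedged note on the lazy-import wiring above: consumers never touch `_LazyModule` directly; the exported names simply resolve on first access.

```python
# The heavy modeling/tokenization modules are imported only when one of the
# exported names is actually accessed.
from transformers.models.fastspeech2_conformer import FastSpeech2ConformerConfig

print(FastSpeech2ConformerConfig().model_type)  # "fastspeech2_conformer"
```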
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py ADDED
@@ -0,0 +1,480 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """FastSpeech2Conformer model configuration"""
16
+
17
+ from typing import Dict
18
+
19
+ from ...configuration_utils import PretrainedConfig
20
+ from ...utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ class FastSpeech2ConformerConfig(PretrainedConfig):
27
+ r"""
28
+ This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to
29
+ instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture.
30
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the
31
+ FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer)
32
+ architecture.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+ Args:
38
+ hidden_size (`int`, *optional*, defaults to 384):
39
+ The dimensionality of the hidden layers.
40
+ vocab_size (`int`, *optional*, defaults to 78):
41
+ The size of the vocabulary.
42
+ num_mel_bins (`int`, *optional*, defaults to 80):
43
+ The number of mel filters used in the filter bank.
44
+ encoder_num_attention_heads (`int`, *optional*, defaults to 2):
45
+ The number of attention heads in the encoder.
46
+ encoder_layers (`int`, *optional*, defaults to 4):
47
+ The number of layers in the encoder.
48
+ encoder_linear_units (`int`, *optional*, defaults to 1536):
49
+ The number of units in the linear layer of the encoder.
50
+ decoder_layers (`int`, *optional*, defaults to 4):
51
+ The number of layers in the decoder.
52
+ decoder_num_attention_heads (`int`, *optional*, defaults to 2):
53
+ The number of attention heads in the decoder.
54
+ decoder_linear_units (`int`, *optional*, defaults to 1536):
55
+ The number of units in the linear layer of the decoder.
56
+ speech_decoder_postnet_layers (`int`, *optional*, defaults to 5):
57
+ The number of layers in the post-net of the speech decoder.
58
+ speech_decoder_postnet_units (`int`, *optional*, defaults to 256):
59
+ The number of units in the post-net layers of the speech decoder.
60
+ speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5):
61
+ The kernel size in the post-net of the speech decoder.
62
+ positionwise_conv_kernel_size (`int`, *optional*, defaults to 3):
63
+ The size of the convolution kernel used in the position-wise layer.
64
+ encoder_normalize_before (`bool`, *optional*, defaults to `False`):
65
+ Specifies whether to normalize before encoder layers.
66
+ decoder_normalize_before (`bool`, *optional*, defaults to `False`):
67
+ Specifies whether to normalize before decoder layers.
68
+ encoder_concat_after (`bool`, *optional*, defaults to `False`):
69
+ Specifies whether to concatenate after encoder layers.
70
+ decoder_concat_after (`bool`, *optional*, defaults to `False`):
71
+ Specifies whether to concatenate after decoder layers.
72
+ reduction_factor (`int`, *optional*, defaults to 1):
73
+ The factor by which the speech frame rate is reduced.
74
+ speaking_speed (`float`, *optional*, defaults to 1.0):
75
+ The speed of the speech produced.
76
+ use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`):
77
+ Specifies whether to use macaron style in the conformer.
78
+ use_cnn_in_conformer (`bool`, *optional*, defaults to `True`):
79
+ Specifies whether to use convolutional neural networks in the conformer.
80
+ encoder_kernel_size (`int`, *optional*, defaults to 7):
81
+ The kernel size used in the encoder.
82
+ decoder_kernel_size (`int`, *optional*, defaults to 31):
83
+ The kernel size used in the decoder.
84
+ duration_predictor_layers (`int`, *optional*, defaults to 2):
85
+ The number of layers in the duration predictor.
86
+ duration_predictor_channels (`int`, *optional*, defaults to 256):
87
+ The number of channels in the duration predictor.
88
+ duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
89
+ The kernel size used in the duration predictor.
90
+ energy_predictor_layers (`int`, *optional*, defaults to 2):
91
+ The number of layers in the energy predictor.
92
+ energy_predictor_channels (`int`, *optional*, defaults to 256):
93
+ The number of channels in the energy predictor.
94
+ energy_predictor_kernel_size (`int`, *optional*, defaults to 3):
95
+ The kernel size used in the energy predictor.
96
+ energy_predictor_dropout (`float`, *optional*, defaults to 0.5):
97
+ The dropout rate in the energy predictor.
98
+ energy_embed_kernel_size (`int`, *optional*, defaults to 1):
99
+ The kernel size used in the energy embed layer.
100
+ energy_embed_dropout (`float`, *optional*, defaults to 0.0):
101
+ The dropout rate in the energy embed layer.
102
+ stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`):
103
+ Specifies whether to stop gradients from the energy predictor.
104
+ pitch_predictor_layers (`int`, *optional*, defaults to 5):
105
+ The number of layers in the pitch predictor.
106
+ pitch_predictor_channels (`int`, *optional*, defaults to 256):
107
+ The number of channels in the pitch predictor.
108
+ pitch_predictor_kernel_size (`int`, *optional*, defaults to 5):
109
+ The kernel size used in the pitch predictor.
110
+ pitch_predictor_dropout (`float`, *optional*, defaults to 0.5):
111
+ The dropout rate in the pitch predictor.
112
+ pitch_embed_kernel_size (`int`, *optional*, defaults to 1):
113
+ The kernel size used in the pitch embed layer.
114
+ pitch_embed_dropout (`float`, *optional*, defaults to 0.0):
115
+ The dropout rate in the pitch embed layer.
116
+ stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`):
117
+ Specifies whether to stop gradients from the pitch predictor.
118
+ encoder_dropout_rate (`float`, *optional*, defaults to 0.2):
119
+ The dropout rate in the encoder.
120
+ encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
121
+ The positional dropout rate in the encoder.
122
+ encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
123
+ The attention dropout rate in the encoder.
124
+ decoder_dropout_rate (`float`, *optional*, defaults to 0.2):
125
+ The dropout rate in the decoder.
126
+ decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
127
+ The positional dropout rate in the decoder.
128
+ decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
129
+ The attention dropout rate in the decoder.
130
+ duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2):
131
+ The dropout rate in the duration predictor.
132
+ speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5):
133
+ The dropout rate in the speech decoder postnet.
134
+ max_source_positions (`int`, *optional*, defaults to 5000):
135
+ if `"relative"` position embeddings are used, defines the maximum source input positions.
136
+ use_masking (`bool`, *optional*, defaults to `True`):
137
+ Specifies whether to use masking in the model.
138
+ use_weighted_masking (`bool`, *optional*, defaults to `False`):
139
+ Specifies whether to use weighted masking in the model.
140
+ num_speakers (`int`, *optional*):
141
+ Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use
142
+ speaker id embedding layer.
143
+ num_languages (`int`, *optional*):
144
+ Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the
145
+ language id embedding layer.
146
+ speaker_embed_dim (`int`, *optional*):
147
+ Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
148
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
149
+ Specifies whether the model is an encoder-decoder.
150
+
151
+ Example:
152
+
153
+ ```python
154
+ >>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig
155
+
156
+ >>> # Initializing a FastSpeech2Conformer style configuration
157
+ >>> configuration = FastSpeech2ConformerConfig()
158
+
159
+ >>> # Initializing a model from the FastSpeech2Conformer style configuration
160
+ >>> model = FastSpeech2ConformerModel(configuration)
161
+
162
+ >>> # Accessing the model configuration
163
+ >>> configuration = model.config
164
+ ```"""
165
+
166
+ model_type = "fastspeech2_conformer"
167
+ base_config_key = "model_config"
168
+ attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
169
+
170
+ def __init__(
171
+ self,
172
+ hidden_size=384,
173
+ vocab_size=78,
174
+ num_mel_bins=80,
175
+ encoder_num_attention_heads=2,
176
+ encoder_layers=4,
177
+ encoder_linear_units=1536,
178
+ decoder_layers=4,
179
+ decoder_num_attention_heads=2,
180
+ decoder_linear_units=1536,
181
+ speech_decoder_postnet_layers=5,
182
+ speech_decoder_postnet_units=256,
183
+ speech_decoder_postnet_kernel=5,
184
+ positionwise_conv_kernel_size=3,
185
+ encoder_normalize_before=False,
186
+ decoder_normalize_before=False,
187
+ encoder_concat_after=False,
188
+ decoder_concat_after=False,
189
+ reduction_factor=1,
190
+ speaking_speed=1.0,
191
+ use_macaron_style_in_conformer=True,
192
+ use_cnn_in_conformer=True,
193
+ encoder_kernel_size=7,
194
+ decoder_kernel_size=31,
195
+ duration_predictor_layers=2,
196
+ duration_predictor_channels=256,
197
+ duration_predictor_kernel_size=3,
198
+ energy_predictor_layers=2,
199
+ energy_predictor_channels=256,
200
+ energy_predictor_kernel_size=3,
201
+ energy_predictor_dropout=0.5,
202
+ energy_embed_kernel_size=1,
203
+ energy_embed_dropout=0.0,
204
+ stop_gradient_from_energy_predictor=False,
205
+ pitch_predictor_layers=5,
206
+ pitch_predictor_channels=256,
207
+ pitch_predictor_kernel_size=5,
208
+ pitch_predictor_dropout=0.5,
209
+ pitch_embed_kernel_size=1,
210
+ pitch_embed_dropout=0.0,
211
+ stop_gradient_from_pitch_predictor=True,
212
+ encoder_dropout_rate=0.2,
213
+ encoder_positional_dropout_rate=0.2,
214
+ encoder_attention_dropout_rate=0.2,
215
+ decoder_dropout_rate=0.2,
216
+ decoder_positional_dropout_rate=0.2,
217
+ decoder_attention_dropout_rate=0.2,
218
+ duration_predictor_dropout_rate=0.2,
219
+ speech_decoder_postnet_dropout=0.5,
220
+ max_source_positions=5000,
221
+ use_masking=True,
222
+ use_weighted_masking=False,
223
+ num_speakers=None,
224
+ num_languages=None,
225
+ speaker_embed_dim=None,
226
+ is_encoder_decoder=True,
227
+ **kwargs,
228
+ ):
229
+ if positionwise_conv_kernel_size % 2 == 0:
230
+ raise ValueError(
231
+ f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead."
232
+ )
233
+ if encoder_kernel_size % 2 == 0:
234
+ raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.")
235
+ if decoder_kernel_size % 2 == 0:
236
+ raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.")
237
+ if duration_predictor_kernel_size % 2 == 0:
238
+ raise ValueError(
239
+ f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead."
240
+ )
241
+ if energy_predictor_kernel_size % 2 == 0:
242
+ raise ValueError(
243
+ f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead."
244
+ )
245
+ if energy_embed_kernel_size % 2 == 0:
246
+ raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.")
247
+ if pitch_predictor_kernel_size % 2 == 0:
248
+ raise ValueError(
249
+ f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead."
250
+ )
251
+ if pitch_embed_kernel_size % 2 == 0:
252
+ raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.")
253
+ if hidden_size % encoder_num_attention_heads != 0:
254
+ raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.")
255
+ if hidden_size % decoder_num_attention_heads != 0:
256
+ raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.")
257
+ if use_masking and use_weighted_masking:
258
+ raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
259
+
260
+ self.hidden_size = hidden_size
261
+ self.vocab_size = vocab_size
262
+ self.num_mel_bins = num_mel_bins
263
+ self.encoder_config = {
264
+ "num_attention_heads": encoder_num_attention_heads,
265
+ "layers": encoder_layers,
266
+ "kernel_size": encoder_kernel_size,
267
+ "attention_dropout_rate": encoder_attention_dropout_rate,
268
+ "dropout_rate": encoder_dropout_rate,
269
+ "positional_dropout_rate": encoder_positional_dropout_rate,
270
+ "linear_units": encoder_linear_units,
271
+ "normalize_before": encoder_normalize_before,
272
+ "concat_after": encoder_concat_after,
273
+ }
274
+ self.decoder_config = {
275
+ "num_attention_heads": decoder_num_attention_heads,
276
+ "layers": decoder_layers,
277
+ "kernel_size": decoder_kernel_size,
278
+ "attention_dropout_rate": decoder_attention_dropout_rate,
279
+ "dropout_rate": decoder_dropout_rate,
280
+ "positional_dropout_rate": decoder_positional_dropout_rate,
281
+ "linear_units": decoder_linear_units,
282
+ "normalize_before": decoder_normalize_before,
283
+ "concat_after": decoder_concat_after,
284
+ }
285
+ self.encoder_num_attention_heads = encoder_num_attention_heads
286
+ self.encoder_layers = encoder_layers
287
+ self.duration_predictor_channels = duration_predictor_channels
288
+ self.duration_predictor_kernel_size = duration_predictor_kernel_size
289
+ self.duration_predictor_layers = duration_predictor_layers
290
+ self.energy_embed_dropout = energy_embed_dropout
291
+ self.energy_embed_kernel_size = energy_embed_kernel_size
292
+ self.energy_predictor_channels = energy_predictor_channels
293
+ self.energy_predictor_dropout = energy_predictor_dropout
294
+ self.energy_predictor_kernel_size = energy_predictor_kernel_size
295
+ self.energy_predictor_layers = energy_predictor_layers
296
+ self.pitch_embed_dropout = pitch_embed_dropout
297
+ self.pitch_embed_kernel_size = pitch_embed_kernel_size
298
+ self.pitch_predictor_channels = pitch_predictor_channels
299
+ self.pitch_predictor_dropout = pitch_predictor_dropout
300
+ self.pitch_predictor_kernel_size = pitch_predictor_kernel_size
301
+ self.pitch_predictor_layers = pitch_predictor_layers
302
+ self.positionwise_conv_kernel_size = positionwise_conv_kernel_size
303
+ self.speech_decoder_postnet_units = speech_decoder_postnet_units
304
+ self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout
305
+ self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel
306
+ self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
307
+ self.reduction_factor = reduction_factor
308
+ self.speaking_speed = speaking_speed
309
+ self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
310
+ self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
311
+ self.max_source_positions = max_source_positions
312
+ self.use_cnn_in_conformer = use_cnn_in_conformer
313
+ self.use_macaron_style_in_conformer = use_macaron_style_in_conformer
314
+ self.use_masking = use_masking
315
+ self.use_weighted_masking = use_weighted_masking
316
+ self.num_speakers = num_speakers
317
+ self.num_languages = num_languages
318
+ self.speaker_embed_dim = speaker_embed_dim
319
+ self.duration_predictor_dropout_rate = duration_predictor_dropout_rate
320
+ self.is_encoder_decoder = is_encoder_decoder
321
+
322
+ super().__init__(
323
+ is_encoder_decoder=is_encoder_decoder,
324
+ **kwargs,
325
+ )
326
+
327
+
328
+ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
329
+ r"""
330
+ This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGanModel`]. It is used to
331
+ instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model
332
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
333
+ FastSpeech2Conformer
334
+ [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture.
335
+
336
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
337
+ documentation from [`PretrainedConfig`] for more information.
338
+
339
+ Args:
340
+ model_in_dim (`int`, *optional*, defaults to 80):
341
+ The number of frequency bins in the input log-mel spectrogram.
342
+ upsample_initial_channel (`int`, *optional*, defaults to 512):
343
+ The number of input channels into the upsampling network.
344
+ upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
345
+ A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
346
+ length of *upsample_rates* defines the number of convolutional layers and has to match the length of
347
+ *upsample_kernel_sizes*.
348
+ upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
349
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
350
+ length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
351
+ *upsample_rates*.
352
+ resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
353
+ A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
354
+ fusion (MRF) module.
355
+ resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
356
+ A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
357
+ multi-receptive field fusion (MRF) module.
358
+ initializer_range (`float`, *optional*, defaults to 0.01):
359
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
360
+ leaky_relu_slope (`float`, *optional*, defaults to 0.1):
361
+ The angle of the negative slope used by the leaky ReLU activation.
362
+ normalize_before (`bool`, *optional*, defaults to `True`):
363
+ Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
364
+
365
+ Example:
366
+
367
+ ```python
368
+ >>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig
369
+
370
+ >>> # Initializing a FastSpeech2ConformerHifiGan configuration
371
+ >>> configuration = FastSpeech2ConformerHifiGanConfig()
372
+
373
+ >>> # Initializing a model (with random weights) from the configuration
374
+ >>> model = FastSpeech2ConformerHifiGan(configuration)
375
+
376
+ >>> # Accessing the model configuration
377
+ >>> configuration = model.config
378
+ ```"""
379
+
380
+ model_type = "hifigan"
381
+ base_config_key = "vocoder_config"
382
+
383
+ def __init__(
384
+ self,
385
+ model_in_dim=80,
386
+ upsample_initial_channel=512,
387
+ upsample_rates=[8, 8, 2, 2],
388
+ upsample_kernel_sizes=[16, 16, 4, 4],
389
+ resblock_kernel_sizes=[3, 7, 11],
390
+ resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
391
+ initializer_range=0.01,
392
+ leaky_relu_slope=0.1,
393
+ normalize_before=True,
394
+ **kwargs,
395
+ ):
396
+ self.model_in_dim = model_in_dim
397
+ self.upsample_initial_channel = upsample_initial_channel
398
+ self.upsample_rates = upsample_rates
399
+ self.upsample_kernel_sizes = upsample_kernel_sizes
400
+ self.resblock_kernel_sizes = resblock_kernel_sizes
401
+ self.resblock_dilation_sizes = resblock_dilation_sizes
402
+ self.initializer_range = initializer_range
403
+ self.leaky_relu_slope = leaky_relu_slope
404
+ self.normalize_before = normalize_before
405
+ super().__init__(**kwargs)
406
+
407
+
408
+ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
409
+ """
410
+ This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to
411
+ instantiate a `FastSpeech2ConformerWithHifiGanModel` model according to the specified sub-models configurations,
412
+ defining the model architecture.
413
+
414
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the
415
+ FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and
416
+ FastSpeech2ConformerHifiGan
417
+ [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures.
418
+
419
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
420
+ documentation from [`PretrainedConfig`] for more information.
421
+
422
+ Args:
423
+ model_config (`typing.Dict`, *optional*):
424
+ Configuration of the text-to-speech model (see [`FastSpeech2ConformerConfig`]).
425
+ vocoder_config (`typing.Dict`, *optional*):
426
+ Configuration of the vocoder model (see [`FastSpeech2ConformerHifiGanConfig`]).
431
+
432
+ Example:
433
+
434
+ ```python
435
+ >>> from transformers import (
436
+ ... FastSpeech2ConformerConfig,
437
+ ... FastSpeech2ConformerHifiGanConfig,
438
+ ... FastSpeech2ConformerWithHifiGanConfig,
439
+ ... FastSpeech2ConformerWithHifiGan,
440
+ ... )
441
+
442
+ >>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations.
443
+ >>> model_config = FastSpeech2ConformerConfig()
444
+ >>> vocoder_config = FastSpeech2ConformerHifiGanConfig()
445
+
446
+ >>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration
447
+ >>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict())
448
+
449
+ >>> # Initializing a model (with random weights)
450
+ >>> model = FastSpeech2ConformerWithHifiGan(configuration)
451
+
452
+ >>> # Accessing the model configuration
453
+ >>> configuration = model.config
454
+ ```
455
+ """
456
+
457
+ model_type = "fastspeech2_conformer_with_hifigan"
458
+ sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig}
459
+
460
+ def __init__(
461
+ self,
462
+ model_config: Dict = None,
463
+ vocoder_config: Dict = None,
464
+ **kwargs,
465
+ ):
466
+ if model_config is None:
467
+ model_config = {}
468
+ logger.info("model_config is None. initializing the model with default values.")
469
+
470
+ if vocoder_config is None:
471
+ vocoder_config = {}
472
+ logger.info("vocoder_config is None. initializing the coarse model with default values.")
473
+
474
+ self.model_config = FastSpeech2ConformerConfig(**model_config)
475
+ self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config)
476
+
477
+ super().__init__(**kwargs)
478
+
479
+
480
+ __all__ = ["FastSpeech2ConformerConfig", "FastSpeech2ConformerHifiGanConfig", "FastSpeech2ConformerWithHifiGanConfig"]
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,210 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert FastSpeech2Conformer checkpoint."""
16
+
17
+ import argparse
18
+ import json
19
+ import re
20
+ from pathlib import Path
21
+ from tempfile import TemporaryDirectory
22
+
23
+ import torch
24
+ import yaml
25
+
26
+ from transformers import (
27
+ FastSpeech2ConformerConfig,
28
+ FastSpeech2ConformerModel,
29
+ FastSpeech2ConformerTokenizer,
30
+ logging,
31
+ )
32
+
33
+
34
+ logging.set_verbosity_info()
35
+ logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
36
+
37
+ CONFIG_MAPPING = {
38
+ "adim": "hidden_size",
39
+ "aheads": "num_attention_heads",
40
+ "conformer_dec_kernel_size": "decoder_kernel_size",
41
+ "conformer_enc_kernel_size": "encoder_kernel_size",
42
+ "decoder_normalize_before": "decoder_normalize_before",
43
+ "dlayers": "decoder_layers",
44
+ "dunits": "decoder_linear_units",
45
+ "duration_predictor_chans": "duration_predictor_channels",
46
+ "duration_predictor_kernel_size": "duration_predictor_kernel_size",
47
+ "duration_predictor_layers": "duration_predictor_layers",
48
+ "elayers": "encoder_layers",
49
+ "encoder_normalize_before": "encoder_normalize_before",
50
+ "energy_embed_dropout": "energy_embed_dropout",
51
+ "energy_embed_kernel_size": "energy_embed_kernel_size",
52
+ "energy_predictor_chans": "energy_predictor_channels",
53
+ "energy_predictor_dropout": "energy_predictor_dropout",
54
+ "energy_predictor_kernel_size": "energy_predictor_kernel_size",
55
+ "energy_predictor_layers": "energy_predictor_layers",
56
+ "eunits": "encoder_linear_units",
57
+ "pitch_embed_dropout": "pitch_embed_dropout",
58
+ "pitch_embed_kernel_size": "pitch_embed_kernel_size",
59
+ "pitch_predictor_chans": "pitch_predictor_channels",
60
+ "pitch_predictor_dropout": "pitch_predictor_dropout",
61
+ "pitch_predictor_kernel_size": "pitch_predictor_kernel_size",
62
+ "pitch_predictor_layers": "pitch_predictor_layers",
63
+ "positionwise_conv_kernel_size": "positionwise_conv_kernel_size",
64
+ "postnet_chans": "speech_decoder_postnet_units",
65
+ "postnet_filts": "speech_decoder_postnet_kernel",
66
+ "postnet_layers": "speech_decoder_postnet_layers",
67
+ "reduction_factor": "reduction_factor",
68
+ "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor",
69
+ "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor",
70
+ "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate",
71
+ "transformer_dec_dropout_rate": "decoder_dropout_rate",
72
+ "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate",
73
+ "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate",
74
+ "transformer_enc_dropout_rate": "encoder_dropout_rate",
75
+ "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate",
76
+ "use_cnn_in_conformer": "use_cnn_in_conformer",
77
+ "use_macaron_style_in_conformer": "use_macaron_style_in_conformer",
78
+ "use_masking": "use_masking",
79
+ "use_weighted_masking": "use_weighted_masking",
80
+ "idim": "input_dim",
81
+ "odim": "num_mel_bins",
82
+ "spk_embed_dim": "speaker_embed_dim",
83
+ "langs": "num_languages",
84
+ "spks": "num_speakers",
85
+ }
86
+
87
+
88
+ def remap_model_yaml_config(yaml_config_path):
89
+ with Path(yaml_config_path).open("r", encoding="utf-8") as f:
90
+ args = yaml.safe_load(f)
91
+ args = argparse.Namespace(**args)
92
+
93
+ remapped_config = {}
94
+
95
+ model_params = args.tts_conf["text2mel_params"]
96
+ # espnet_config_key -> hf_config_key, any keys not included are ignored
97
+ for espnet_config_key, hf_config_key in CONFIG_MAPPING.items():
98
+ if espnet_config_key in model_params:
99
+ remapped_config[hf_config_key] = model_params[espnet_config_key]
100
+
101
+ return remapped_config, args.g2p, args.token_list
102
+
103
+
104
+ def convert_espnet_state_dict_to_hf(state_dict):
105
+ new_state_dict = {}
106
+ for key in state_dict:
107
+ if "tts.generator.text2mel." in key:
108
+ new_key = key.replace("tts.generator.text2mel.", "")
109
+ if "postnet" in key:
110
+ new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers")
111
+ new_key = new_key.replace(".0.weight", ".conv.weight")
112
+ new_key = new_key.replace(".1.weight", ".batch_norm.weight")
113
+ new_key = new_key.replace(".1.bias", ".batch_norm.bias")
114
+ new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean")
115
+ new_key = new_key.replace(".1.running_var", ".batch_norm.running_var")
116
+ new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked")
117
+ if "feat_out" in key:
118
+ if "weight" in key:
119
+ new_key = "speech_decoder_postnet.feat_out.weight"
120
+ if "bias" in key:
121
+ new_key = "speech_decoder_postnet.feat_out.bias"
122
+ if "encoder.embed.0.weight" in key:
123
+ new_key = new_key.replace("0.", "")
124
+ if "w_1" in key:
125
+ new_key = new_key.replace("w_1", "conv1")
126
+ if "w_2" in key:
127
+ new_key = new_key.replace("w_2", "conv2")
128
+ if "predictor.conv" in key:
129
+ new_key = new_key.replace(".conv", ".conv_layers")
130
+ pattern = r"(\d)\.(\d)"
131
+ replacement = (
132
+ r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm"
133
+ )
134
+ new_key = re.sub(pattern, replacement, new_key)
135
+ if "pitch_embed" in key or "energy_embed" in key:
136
+ new_key = new_key.replace("0", "conv")
137
+ if "encoders" in key:
138
+ new_key = new_key.replace("encoders", "conformer_layers")
139
+ new_key = new_key.replace("norm_final", "final_layer_norm")
140
+ new_key = new_key.replace("norm_mha", "self_attn_layer_norm")
141
+ new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm")
142
+ new_key = new_key.replace("norm_ff", "ff_layer_norm")
143
+ new_key = new_key.replace("norm_conv", "conv_layer_norm")
144
+ if "lid_emb" in key:
145
+ new_key = new_key.replace("lid_emb", "language_id_embedding")
146
+ if "sid_emb" in key:
147
+ new_key = new_key.replace("sid_emb", "speaker_id_embedding")
148
+
149
+ new_state_dict[new_key] = state_dict[key]
150
+
151
+ return new_state_dict
152
+
153
+
154
+ @torch.no_grad()
155
+ def convert_FastSpeech2ConformerModel_checkpoint(
156
+ checkpoint_path,
157
+ yaml_config_path,
158
+ pytorch_dump_folder_path,
159
+ repo_id=None,
160
+ ):
161
+ model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path)
162
+ config = FastSpeech2ConformerConfig(**model_params)
163
+
164
+ # Prepare the model
165
+ model = FastSpeech2ConformerModel(config)
166
+
167
+ espnet_checkpoint = torch.load(checkpoint_path, weights_only=True)
168
+ hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
169
+
170
+ model.load_state_dict(hf_compatible_state_dict)
171
+
172
+ model.save_pretrained(pytorch_dump_folder_path)
173
+
174
+ # Prepare the tokenizer
175
+ with TemporaryDirectory() as tempdir:
176
+ vocab = {token: id for id, token in enumerate(vocab)}
177
+ vocab_file = Path(tempdir) / "vocab.json"
178
+ with open(vocab_file, "w") as f:
179
+ json.dump(vocab, f)
180
+ should_strip_spaces = "no_space" in tokenizer_name
181
+ tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces)
182
+
183
+ tokenizer.save_pretrained(pytorch_dump_folder_path)
184
+
185
+ if repo_id:
186
+ print("Pushing to the hub...")
187
+ model.push_to_hub(repo_id)
188
+ tokenizer.push_to_hub(repo_id)
189
+
190
+
191
+ if __name__ == "__main__":
192
+ parser = argparse.ArgumentParser()
193
+ parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
194
+ parser.add_argument(
195
+ "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
196
+ )
197
+ parser.add_argument(
198
+ "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
199
+ )
200
+ parser.add_argument(
201
+ "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
202
+ )
203
+
204
+ args = parser.parse_args()
205
+ convert_FastSpeech2ConformerModel_checkpoint(
206
+ args.checkpoint_path,
207
+ args.yaml_config_path,
208
+ args.pytorch_dump_folder_path,
209
+ args.push_to_hub,
210
+ )
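If you prefer to drive the conversion from Python rather than through the CLI entry point above, the whole script reduces to one function call. The paths below are placeholders (not files shipped with this repo), and the tokenizer step assumes the optional g2p dependency is installed:

```python
# Hypothetical paths -- substitute your own ESPnet checkpoint, its config.yaml,
# and an output directory.
from transformers.models.fastspeech2_conformer.convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import (
    convert_FastSpeech2ConformerModel_checkpoint,
)

convert_FastSpeech2ConformerModel_checkpoint(
    checkpoint_path="espnet_fastspeech2.pth",
    yaml_config_path="config.yaml",
    pytorch_dump_folder_path="./fastspeech2_conformer_hf",
    repo_id=None,  # set to "user/model-name" to push the converted weights to the Hub
)
```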
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_hifigan.py ADDED
@@ -0,0 +1,134 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert FastSpeech2Conformer HiFi-GAN checkpoint."""
16
+
17
+ import argparse
18
+ from pathlib import Path
19
+
20
+ import torch
21
+ import yaml
22
+
23
+ from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging
24
+
25
+
26
+ logging.set_verbosity_info()
27
+ logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
28
+
29
+
30
+ def load_weights(checkpoint, hf_model, config):
31
+ vocoder_key_prefix = "tts.generator.vocoder."
32
+ checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k}
33
+
34
+ hf_model.apply_weight_norm()
35
+
36
+ hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
37
+ hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
38
+ hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
39
+
40
+ for i in range(len(config.upsample_rates)):
41
+ hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
42
+ hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
43
+ hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
44
+
45
+ for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
46
+ for j in range(len(config.resblock_dilation_sizes)):
47
+ hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
48
+ hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
49
+ hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
50
+
51
+ hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
52
+ hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
53
+ hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
54
+
55
+ hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
56
+ hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
57
+ hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
58
+
59
+ hf_model.remove_weight_norm()
60
+
61
+
62
+ def remap_hifigan_yaml_config(yaml_config_path):
63
+ with Path(yaml_config_path).open("r", encoding="utf-8") as f:
64
+ args = yaml.safe_load(f)
65
+ args = argparse.Namespace(**args)
66
+
67
+ vocoder_type = args.tts_conf["vocoder_type"]
68
+ if vocoder_type != "hifigan_generator":
69
+ raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}")
70
+
71
+ remapped_dict = {}
72
+ vocoder_params = args.tts_conf["vocoder_params"]
73
+
74
+ # espnet_config_key -> hf_config_key
75
+ key_mappings = {
76
+ "channels": "upsample_initial_channel",
77
+ "in_channels": "model_in_dim",
78
+ "resblock_dilations": "resblock_dilation_sizes",
79
+ "resblock_kernel_sizes": "resblock_kernel_sizes",
80
+ "upsample_kernel_sizes": "upsample_kernel_sizes",
81
+ "upsample_scales": "upsample_rates",
82
+ }
83
+ for espnet_config_key, hf_config_key in key_mappings.items():
84
+ remapped_dict[hf_config_key] = vocoder_params[espnet_config_key]
85
+ remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"]
86
+ remapped_dict["normalize_before"] = False
87
+ remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"]
88
+
89
+ return remapped_dict
90
+
91
+
92
+ @torch.no_grad()
93
+ def convert_hifigan_checkpoint(
94
+ checkpoint_path,
95
+ pytorch_dump_folder_path,
96
+ yaml_config_path=None,
97
+ repo_id=None,
98
+ ):
99
+ if yaml_config_path is not None:
100
+ config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
101
+ config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
102
+ else:
103
+ config = FastSpeech2ConformerHifiGanConfig()
104
+
105
+ model = FastSpeech2ConformerHifiGan(config)
106
+
107
+ orig_checkpoint = torch.load(checkpoint_path, weights_only=True)
108
+ load_weights(orig_checkpoint, model, config)
109
+
110
+ model.save_pretrained(pytorch_dump_folder_path)
111
+
112
+ if repo_id:
113
+ print("Pushing to the hub...")
114
+ model.push_to_hub(repo_id)
115
+
116
+
117
+ if __name__ == "__main__":
118
+ parser = argparse.ArgumentParser()
119
+ parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
120
+ parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert")
121
+ parser.add_argument(
122
+ "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
123
+ )
124
+ parser.add_argument(
125
+ "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
126
+ )
127
+
128
+ args = parser.parse_args()
129
+ convert_hifigan_checkpoint(
130
+ args.checkpoint_path,
131
+ args.pytorch_dump_folder_path,
132
+ args.yaml_config_path,
133
+ args.push_to_hub,
134
+ )
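The conversion can also be driven from Python instead of the CLI; a minimal sketch with placeholder paths:

    convert_hifigan_checkpoint(
        checkpoint_path="train.total_count.ave_10best.pth",        # hypothetical ESPnet vocoder checkpoint
        pytorch_dump_folder_path="./fastspeech2_conformer_hifigan",
        yaml_config_path="config.yaml",                            # optional; default config values are used if omitted
        repo_id=None,                                              # e.g. "username/model-name" to push to the Hub
    )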
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py ADDED
@@ -0,0 +1,102 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert FastSpeech2Conformer checkpoint."""
16
+
17
+ import argparse
18
+
19
+ import torch
20
+
21
+ from transformers import (
22
+ FastSpeech2ConformerConfig,
23
+ FastSpeech2ConformerHifiGan,
24
+ FastSpeech2ConformerHifiGanConfig,
25
+ FastSpeech2ConformerModel,
26
+ FastSpeech2ConformerWithHifiGan,
27
+ FastSpeech2ConformerWithHifiGanConfig,
28
+ logging,
29
+ )
30
+
31
+ from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import (
32
+ convert_espnet_state_dict_to_hf,
33
+ remap_model_yaml_config,
34
+ )
35
+ from .convert_hifigan import load_weights, remap_hifigan_yaml_config
36
+
37
+
38
+ logging.set_verbosity_info()
39
+ logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
40
+
41
+
42
+ def convert_FastSpeech2ConformerWithHifiGan_checkpoint(
43
+ checkpoint_path,
44
+ yaml_config_path,
45
+ pytorch_dump_folder_path,
46
+ repo_id=None,
47
+ ):
48
+ # Prepare the model
49
+ model_params, *_ = remap_model_yaml_config(yaml_config_path)
50
+ model_config = FastSpeech2ConformerConfig(**model_params)
51
+
52
+ model = FastSpeech2ConformerModel(model_config)
53
+
54
+ espnet_checkpoint = torch.load(checkpoint_path, weights_only=True)
55
+ hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
56
+ model.load_state_dict(hf_compatible_state_dict)
57
+
58
+ # Prepare the vocoder
59
+ config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
60
+ vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
61
+
62
+ vocoder = FastSpeech2ConformerHifiGan(vocoder_config)
63
+ load_weights(espnet_checkpoint, vocoder, vocoder_config)
64
+
65
+ # Prepare the model + vocoder
66
+ config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config)
67
+ with_hifigan_model = FastSpeech2ConformerWithHifiGan(config)
68
+ with_hifigan_model.model = model
69
+ with_hifigan_model.vocoder = vocoder
70
+
71
+ with_hifigan_model.save_pretrained(pytorch_dump_folder_path)
72
+
73
+ if repo_id:
74
+ print("Pushing to the hub...")
75
+ with_hifigan_model.push_to_hub(repo_id)
76
+
77
+
78
+ if __name__ == "__main__":
79
+ parser = argparse.ArgumentParser()
80
+ parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
81
+ parser.add_argument(
82
+ "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
83
+ )
84
+ parser.add_argument(
85
+ "--pytorch_dump_folder_path",
86
+ required=True,
87
+ default=None,
88
+ type=str,
89
+ help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
90
+ )
91
+ parser.add_argument(
92
+ "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
93
+ )
94
+
95
+ args = parser.parse_args()
96
+
97
+ convert_FastSpeech2ConformerWithHifiGan_checkpoint(
98
+ args.checkpoint_path,
99
+ args.yaml_config_path,
100
+ args.pytorch_dump_folder_path,
101
+ args.push_to_hub,
102
+ )
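Because this script uses relative imports, the CLI form is typically run as a module (e.g. `python -m transformers.models.fastspeech2_conformer.convert_model_with_hifigan ...`). The programmatic equivalent, with placeholder paths:

    convert_FastSpeech2ConformerWithHifiGan_checkpoint(
        checkpoint_path="train.loss.ave_5best.pth",   # hypothetical ESPnet checkpoint holding model + vocoder weights
        yaml_config_path="config.yaml",               # required here, unlike in convert_hifigan.py
        pytorch_dump_folder_path="./fastspeech2_conformer_with_hifigan",
        repo_id=None,
    )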
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py ADDED
@@ -0,0 +1,1697 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch FastSpeech2Conformer model."""
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import nn
23
+
24
+ from ...modeling_outputs import BaseModelOutput
25
+ from ...modeling_utils import PreTrainedModel
26
+ from ...utils import ModelOutput, add_start_docstrings, logging, replace_return_docstrings
27
+ from .configuration_fastspeech2_conformer import (
28
+ FastSpeech2ConformerConfig,
29
+ FastSpeech2ConformerHifiGanConfig,
30
+ FastSpeech2ConformerWithHifiGanConfig,
31
+ )
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ @dataclass
38
+ class FastSpeech2ConformerModelOutput(ModelOutput):
39
+ """
40
+ Output type of [`FastSpeech2ConformerModel`].
41
+
42
+ Args:
43
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
44
+ Spectrogram generation loss.
45
+ spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
46
+ The predicted spectrogram.
47
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
48
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
49
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
50
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
51
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
52
+
53
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
54
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
55
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
56
+ sequence_length)`.
57
+
58
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
59
+ self-attention heads.
60
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
61
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
62
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
63
+
64
+ Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
65
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
66
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
67
+ sequence_length)`.
68
+
69
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
70
+ self-attention heads.
71
+ duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
72
+ Outputs of the duration predictor.
73
+ pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
74
+ Outputs of the pitch predictor.
75
+ energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
76
+ Outputs of the energy predictor.
77
+
78
+ """
79
+
80
+ loss: Optional[torch.FloatTensor] = None
81
+ spectrogram: Optional[torch.FloatTensor] = None
82
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
83
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
84
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
85
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
86
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
87
+ duration_outputs: Optional[torch.LongTensor] = None
88
+ pitch_outputs: Optional[torch.FloatTensor] = None
89
+ energy_outputs: Optional[torch.FloatTensor] = None
90
+
91
+
92
+ @dataclass
93
+ class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
94
+ """
95
+ Output type of [`FastSpeech2ConformerWithHifiGan`].
96
+
97
+ Args:
98
+ waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
99
+ Speech output as a result of passing the predicted mel spectrogram through the vocoder.
100
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
101
+ Spectrogram generation loss.
102
+ spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
103
+ The predicted spectrogram.
104
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
105
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
106
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
107
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
108
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
109
+
110
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
111
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
112
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
113
+ sequence_length)`.
114
+
115
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
116
+ self-attention heads.
117
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
118
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
119
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
120
+
121
+ Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
122
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
123
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
124
+ sequence_length)`.
125
+
126
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
127
+ self-attention heads.
128
+ duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
129
+ Outputs of the duration predictor.
130
+ pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
131
+ Outputs of the pitch predictor.
132
+ energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
133
+ Outputs of the energy predictor.
134
+ """
135
+
136
+ waveform: Optional[torch.FloatTensor] = None
137
+
138
+
139
+ _CONFIG_FOR_DOC = "FastSpeech2ConformerConfig"
140
+
141
+ FASTSPEECH2_CONFORMER_START_DOCSTRING = r"""
142
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
143
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
144
+ etc.)
145
+
146
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
147
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
148
+ and behavior.
149
+
150
+ Parameters:
151
+ config ([`FastSpeech2ConformerConfig`]):
152
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
153
+ load the weights associated with the model, only the configuration. Check out the
154
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
155
+ """
156
+
157
+
158
+ HIFIGAN_START_DOCSTRING = r"""
159
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
160
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
161
+ etc.)
162
+
163
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
164
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
165
+ and behavior.
166
+
167
+ Parameters:
168
+ config ([`FastSpeech2ConformerHifiGanConfig`]):
169
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
170
+ load the weights associated with the model, only the configuration. Check out the
171
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
172
+ """
173
+
174
+ FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING = r"""
175
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
176
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
177
+ etc.)
178
+
179
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
180
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
181
+ and behavior.
182
+
183
+ Parameters:
184
+ config ([`FastSpeech2ConformerWithHifiGanConfig`]):
185
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
186
+ load the weights associated with the model, only the configuration. Check out the
187
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
188
+ """
189
+
190
+
191
+ def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
192
+ """
193
+ Length regulator for feed-forward Transformer.
194
+
195
+ This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
196
+ https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to
197
+ frame-level by repeating each feature based on the corresponding predicted durations.
198
+
199
+ Args:
200
+ encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
201
+ Batch of sequences of char or phoneme embeddings.
202
+ duration_labels (`torch.LongTensor` of shape `(batch_size, time)`):
203
+ Batch of durations of each frame.
204
+ speaking_speed (`float`, *optional*, defaults to 1.0):
205
+ Value to control speed of speech.
206
+
207
+ Returns:
208
+ `torch.Tensor`:
209
+ Replicated input tensor based on durations (batch_size, time*, embedding_dim).
210
+ """
211
+
212
+ if speaking_speed <= 0:
213
+ raise ValueError("`speaking_speed` must be greater than 0.")
214
+ elif speaking_speed != 1.0:
215
+ duration_labels = torch.round(duration_labels.float() * speaking_speed).long()
216
+
217
+ if duration_labels.sum() == 0:
218
+ duration_labels[duration_labels.sum(dim=1).eq(0)] = 1
219
+
220
+ # Calculate the maximum length needed
221
+ max_len = torch.sum(duration_labels, dim=1).max()
222
+
223
+ # Create a padded tensor to hold the results
224
+ hidden_states = torch.zeros(
225
+ (encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
226
+ dtype=torch.float,
227
+ device=encoded_embeddings.device,
228
+ )
229
+
230
+ # Loop through the batch and fill in the data
231
+ for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
232
+ repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
233
+ hidden_states[i, : repeated.size(0)] = repeated
234
+
235
+ return hidden_states
236
+
237
+
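A toy illustration of the length regulator above (tensor values invented): with per-token durations summing to 5 frames, the batch is padded to that length.

    import torch

    embeddings = torch.arange(6, dtype=torch.float).view(1, 3, 2)  # (batch=1, text_len=3, dim=2)
    durations = torch.tensor([[2, 0, 3]])                          # frames assigned to each token
    expanded = length_regulator(embeddings, durations)
    print(expanded.shape)                                          # torch.Size([1, 5, 2])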
238
+ class FastSpeech2ConformerDurationPredictor(nn.Module):
239
+ """
240
+ Duration predictor module.
241
+
242
+ This is a module of duration predictor described in the paper 'FastSpeech: Fast, Robust and Controllable Text to
243
+ Speech' https://arxiv.org/pdf/1905.09263.pdf. The duration predictor predicts the duration of each frame in the log domain
244
+ from the hidden embeddings of the encoder.
245
+
246
+ Note:
247
+ The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`, the
248
+ outputs are calculated in log domain but in `inference`, those are calculated in linear domain.
249
+
250
+ """
251
+
252
+ def __init__(self, config: FastSpeech2ConformerConfig):
253
+ super().__init__()
254
+
255
+ self.conv_layers = nn.ModuleList()
256
+ self.log_domain_offset = 1.0
257
+
258
+ for layer_idx in range(config.duration_predictor_layers):
259
+ num_chans = config.duration_predictor_channels
260
+ input_channels = config.hidden_size if layer_idx == 0 else num_chans
261
+ layer = FastSpeech2ConformerPredictorLayer(
262
+ input_channels,
263
+ num_chans,
264
+ config.duration_predictor_kernel_size,
265
+ config.duration_predictor_dropout_rate,
266
+ )
267
+ self.conv_layers.append(layer)
268
+ self.linear = nn.Linear(config.duration_predictor_channels, 1)
269
+
270
+ def forward(self, encoder_hidden_states):
271
+ """
272
+ Args:
273
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
274
+ Batch of input sequences.
275
+ padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
276
+ Batch of masks indicating padded part.
277
+
278
+ Returns:
279
+ `torch.Tensor`: Batch of predicted durations in log domain `(batch_size, max_text_length)`.
280
+
281
+ """
282
+ # (batch_size, input_dim, max_text_length)
283
+ hidden_states = encoder_hidden_states.transpose(1, -1)
284
+ for layer in self.conv_layers:
285
+ hidden_states = layer(hidden_states)
286
+
287
+ # NOTE: calculate in log domain, (batch_size, max_text_length)
288
+ hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1)
289
+
290
+ if not self.training:
291
+ # NOTE: calculate in linear domain
292
+ hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long()
293
+
294
+ return hidden_states
295
+
296
+
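A short usage sketch of the duration predictor above, assuming a hypothetical `config`: in training mode it returns float log-durations, while in eval mode it rounds `exp(x) - 1` and clamps at zero to get integer frame counts.

    predictor = FastSpeech2ConformerDurationPredictor(config)  # hypothetical config
    encoder_states = torch.randn(2, 17, config.hidden_size)    # (batch, max_text_length, hidden_size)
    log_durations = predictor(encoder_states)                  # float tensor, shape (2, 17)
    predictor.eval()
    frame_counts = predictor(encoder_states)                   # LongTensor, shape (2, 17)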
297
+ # Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer
298
+ class FastSpeech2ConformerBatchNormConvLayer(nn.Module):
299
+ def __init__(self, config, layer_id=0):
300
+ super().__init__()
301
+
302
+ if layer_id == 0:
303
+ in_conv_dim = config.num_mel_bins
304
+ else:
305
+ in_conv_dim = config.speech_decoder_postnet_units
306
+
307
+ if layer_id == config.speech_decoder_postnet_layers - 1:
308
+ out_conv_dim = config.num_mel_bins
309
+ else:
310
+ out_conv_dim = config.speech_decoder_postnet_units
311
+
312
+ self.conv = nn.Conv1d(
313
+ in_conv_dim,
314
+ out_conv_dim,
315
+ kernel_size=config.speech_decoder_postnet_kernel,
316
+ stride=1,
317
+ padding=(config.speech_decoder_postnet_kernel - 1) // 2,
318
+ bias=False,
319
+ )
320
+ self.batch_norm = nn.BatchNorm1d(out_conv_dim)
321
+
322
+ if layer_id < config.speech_decoder_postnet_layers - 1:
323
+ self.activation = nn.Tanh()
324
+ else:
325
+ self.activation = None
326
+
327
+ self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
328
+
329
+ def forward(self, hidden_states):
330
+ hidden_states = self.conv(hidden_states)
331
+ hidden_states = self.batch_norm(hidden_states)
332
+ if self.activation is not None:
333
+ hidden_states = self.activation(hidden_states)
334
+ hidden_states = self.dropout(hidden_states)
335
+ return hidden_states
336
+
337
+
338
+ class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module):
339
+ def __init__(self, config):
340
+ super().__init__()
341
+ self.config = config
342
+ self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
343
+ self.layers = nn.ModuleList(
344
+ [FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
345
+ )
346
+
347
+ def forward(self, hidden_states: torch.Tensor):
348
+ outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
349
+ layer_output = outputs_before_postnet.transpose(1, 2)
350
+ for layer in self.layers:
351
+ layer_output = layer(layer_output)
352
+ outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2)
353
+ return outputs_before_postnet, outputs_after_postnet
354
+
355
+
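Shape-wise, the postnet above first projects each decoder state to `num_mel_bins * reduction_factor` values and reshapes, then adds a residual refinement from the conv stack; a hedged sketch with a hypothetical `config`:

    postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)  # hypothetical config
    decoder_states = torch.randn(1, 100, config.hidden_size)
    before, after = postnet(decoder_states)
    print(before.shape)  # (1, 100 * config.reduction_factor, config.num_mel_bins)
    print(after.shape)   # same shape; `after` adds the postnet residual to `before`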
356
+ class FastSpeech2ConformerPredictorLayer(nn.Module):
357
+ def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
358
+ super().__init__()
359
+ self.conv = nn.Conv1d(
360
+ input_channels,
361
+ num_chans,
362
+ kernel_size,
363
+ stride=1,
364
+ padding=(kernel_size - 1) // 2,
365
+ )
366
+ self.activation = nn.ReLU()
367
+ self.layer_norm = nn.LayerNorm(num_chans)
368
+ self.dropout = nn.Dropout(dropout_rate)
369
+
370
+ def forward(self, hidden_states):
371
+ hidden_states = self.conv(hidden_states)
372
+ hidden_states = self.activation(hidden_states)
373
+
374
+ # Perform layer norm on dimension 1
375
+ hidden_states = hidden_states.transpose(1, -1)
376
+ hidden_states = self.layer_norm(hidden_states)
377
+ hidden_states = hidden_states.transpose(1, -1)
378
+
379
+ hidden_states = self.dropout(hidden_states)
380
+
381
+ return hidden_states
382
+
383
+
384
+ class FastSpeech2ConformerVariancePredictor(nn.Module):
385
+ def __init__(
386
+ self,
387
+ config: FastSpeech2ConformerConfig,
388
+ num_layers=2,
389
+ num_chans=384,
390
+ kernel_size=3,
391
+ dropout_rate=0.5,
392
+ ):
393
+ """
394
+ Initialize variance predictor module.
395
+
396
+ Args:
397
+ input_dim (`int`): Input dimension.
398
+ num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers.
399
+ num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers.
400
+ kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers.
401
+ dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate.
402
+ """
403
+ super().__init__()
404
+ self.conv_layers = nn.ModuleList()
405
+ for idx in range(num_layers):
406
+ input_channels = config.hidden_size if idx == 0 else num_chans
407
+ layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate)
408
+ self.conv_layers.append(layer)
409
+ self.linear = nn.Linear(num_chans, 1)
410
+
411
+ def forward(self, encoder_hidden_states, padding_masks=None):
412
+ """
413
+ Calculate forward propagation.
414
+
415
+ Args:
416
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
417
+ Batch of input sequences.
418
+ padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
419
+ Batch of masks indicating padded part.
420
+
421
+ Returns:
422
+ Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
423
+ """
424
+ # (batch_size, input_dim, max_text_length)
425
+ hidden_states = encoder_hidden_states.transpose(1, -1)
426
+ for layer in self.conv_layers:
427
+ hidden_states = layer(hidden_states)
428
+
429
+ hidden_states = self.linear(hidden_states.transpose(1, 2))
430
+
431
+ if padding_masks is not None:
432
+ hidden_states = hidden_states.masked_fill(padding_masks, 0.0)
433
+
434
+ return hidden_states
435
+
436
+
437
+ class FastSpeech2ConformerVarianceEmbedding(nn.Module):
438
+ def __init__(
439
+ self,
440
+ in_channels=1,
441
+ out_channels=384,
442
+ kernel_size=1,
443
+ padding=0,
444
+ dropout_rate=0.0,
445
+ ):
446
+ super().__init__()
447
+ self.conv = nn.Conv1d(
448
+ in_channels=in_channels,
449
+ out_channels=out_channels,
450
+ kernel_size=kernel_size,
451
+ padding=padding,
452
+ )
453
+ self.dropout = nn.Dropout(dropout_rate)
454
+
455
+ def forward(self, hidden_states):
456
+ hidden_states = hidden_states.transpose(1, 2)
457
+ hidden_states = self.conv(hidden_states)
458
+ hidden_states = self.dropout(hidden_states)
459
+ hidden_states = hidden_states.transpose(1, 2)
460
+ return hidden_states
461
+
462
+
463
+ class FastSpeech2ConformerAttention(nn.Module):
464
+ """
465
+ Multi-Head attention layer with relative position encoding. Details can be found in
466
+ https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860.
467
+ """
468
+
469
+ def __init__(self, config: FastSpeech2ConformerConfig, module_config):
470
+ """Construct an FastSpeech2ConformerAttention object."""
471
+ super().__init__()
472
+ # We assume d_v always equals dim_key
473
+ self.num_heads = module_config["num_attention_heads"]
474
+ self.hidden_size = config.hidden_size
475
+ self.dim_key = self.hidden_size // self.num_heads
476
+ self.head_dim = self.hidden_size // self.num_heads
477
+ self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
478
+ self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
479
+ self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
480
+ self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
481
+ self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])
482
+
483
+ # linear transformation for positional encoding
484
+ self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
485
+ # these two learnable biases are used in matrix c and matrix d
486
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
487
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
488
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
489
+
490
+ def shift_relative_position_tensor(self, pos_tensor):
491
+ """
492
+ Args:
493
+ pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.
494
+ """
495
+ zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
496
+ pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)
497
+
498
+ pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
499
+ # only keep the positions from 0 to time2
500
+ pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]
501
+
502
+ return pos_tensor
503
+
504
+ def forward(
505
+ self,
506
+ hidden_states: torch.Tensor,
507
+ attention_mask: Optional[torch.Tensor] = None,
508
+ pos_emb: Optional[torch.Tensor] = None,
509
+ output_attentions: Optional[torch.Tensor] = False,
510
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
511
+ """
512
+ Compute 'Scaled Dot Product Attention' with rel. positional encoding.
513
+
514
+ Args:
515
+ hidden_states (`torch.Tensor` of shape `(batch, time2, size)`): Values of the hidden states
516
+ attention_mask (`torch.Tensor` of shape `(batch, time1, time2)`): Mask tensor.
517
+ pos_emb (`torch.Tensor` of shape `(batch, 2*time1-1, size)`): Positional embedding tensor.
518
+ output_attentions (`bool`, *optional*):
519
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
520
+ returned tensors for more detail.
521
+ Returns:
522
+ `torch.Tensor`: Output tensor of shape `(batch, time1, d_model)`.
523
+ """
524
+ bsz, q_len, _ = hidden_states.size()
525
+ query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
526
+ key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
527
+ value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
528
+
529
+ bsz_pos = pos_emb.size(0)
530
+ pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim)
531
+
532
+ # (batch_size, head, time1, dim_key)
533
+ query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2)
534
+ # (batch_size, head, time1, dim_key)
535
+ query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2)
536
+
537
+ # compute attention score
538
+ # first compute matrix a and matrix c
539
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
540
+ # (batch_size, head, time1, time2)
541
+ matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1))
542
+
543
+ # compute matrix b and matrix d
544
+ # (batch_size, head, time1, 2*time1-1)
545
+ matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1))
546
+ matrix_bd = self.shift_relative_position_tensor(matrix_bd)
547
+
548
+ # (batch_size, head, time1, time2)
549
+ scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key)
550
+
551
+ # Forward attention
552
+ if attention_mask is not None:
553
+ expected_size = (bsz, 1, q_len)
554
+ if attention_mask.size() != expected_size:
555
+ raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}")
556
+ attention_mask = attention_mask.unsqueeze(1).eq(0)
557
+ min_value = float(torch.finfo(scores.dtype).min)
558
+ scores = scores.masked_fill(attention_mask, min_value)
559
+ attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0)
560
+ else:
561
+ attn_weights = torch.softmax(scores, dim=-1)
562
+
563
+ attn_weights = self.dropout(attn_weights)
564
+ attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2))
565
+ attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
566
+
567
+ attn_output = self.linear_out(attn_output)
568
+
569
+ if not output_attentions:
570
+ attn_weights = None
571
+
572
+ return attn_output, attn_weights
573
+
574
+
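`shift_relative_position_tensor` above is the Transformer-XL style relative shift: the last axis of its input indexes relative positions from +(time1-1) down to -(time1-1), and after the shift entry `[..., i, j]` holds the score for relative distance `i - j`, giving a square time-by-time tensor. A hedged shape check, assuming hypothetical config objects:

    attention = FastSpeech2ConformerAttention(config, config.encoder_config)  # hypothetical configs
    scores = torch.randn(1, attention.num_heads, 4, 2 * 4 - 1)                # (batch, heads, time1, 2*time1 - 1)
    shifted = attention.shift_relative_position_tensor(scores)
    print(shifted.shape)                                                      # (1, num_heads, 4, 4)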
575
+ class FastSpeech2ConformerConvolutionModule(nn.Module):
576
+ def __init__(self, config: FastSpeech2ConformerConfig, module_config):
577
+ super().__init__()
578
+ # kernel_size should be an odd number for 'SAME' padding
579
+ channels = config.hidden_size
580
+ kernel_size = module_config["kernel_size"]
581
+ self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
582
+ self.depthwise_conv = nn.Conv1d(
583
+ channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
584
+ )
585
+ self.norm = nn.BatchNorm1d(channels)
586
+ self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
587
+
588
+ def forward(self, hidden_states):
589
+ """
590
+ Compute convolution module.
591
+
592
+ Args:
593
+ hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
594
+
595
+ Returns:
596
+ `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
597
+
598
+ """
599
+ # exchange the temporal dimension and the feature dimension
600
+ hidden_states = hidden_states.transpose(1, 2)
601
+
602
+ # GLU mechanism, (batch_size, 2*channel, dim)
603
+ hidden_states = self.pointwise_conv1(hidden_states)
604
+ # (batch_size, channel, dim)
605
+ hidden_states = nn.functional.glu(hidden_states, dim=1)
606
+
607
+ # 1D Depthwise Conv
608
+ hidden_states = self.depthwise_conv(hidden_states)
609
+ hidden_states = self.norm(hidden_states)
610
+
611
+ hidden_states = hidden_states * torch.sigmoid(hidden_states)
612
+
613
+ hidden_states = self.pointwise_conv2(hidden_states)
614
+
615
+ return hidden_states.transpose(1, 2)
616
+
617
+
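In the convolution module above, the first pointwise conv doubles the channels so the GLU can gate them back down, the depthwise conv mixes information along time only, and `x * torch.sigmoid(x)` is the Swish/SiLU activation; the `(batch, time, channels)` shape is preserved end to end. A hedged sketch with hypothetical configs:

    conv_module = FastSpeech2ConformerConvolutionModule(config, config.encoder_config)  # hypothetical configs
    x = torch.randn(2, 40, config.hidden_size)
    print(conv_module(x).shape)  # torch.Size([2, 40, hidden_size]) -- same shape as the input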
618
+ class FastSpeech2ConformerEncoderLayer(nn.Module):
619
+ def __init__(self, config: FastSpeech2ConformerConfig, module_config):
620
+ super().__init__()
621
+
622
+ # self-attention module definition
623
+ self.self_attn = FastSpeech2ConformerAttention(config, module_config)
624
+
625
+ # feed-forward module definition
626
+ self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
627
+
628
+ self.macaron_style = config.use_macaron_style_in_conformer
629
+ if self.macaron_style:
630
+ self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
631
+ self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
632
+ self.ff_scale = 0.5
633
+ else:
634
+ self.ff_scale = 1.0
635
+
636
+ # convolution module definition
637
+ self.use_cnn_module = config.use_cnn_in_conformer
638
+ if self.use_cnn_module:
639
+ self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
640
+ self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
641
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size)
642
+
643
+ self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
644
+
645
+ self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
646
+
647
+ self.dropout = nn.Dropout(module_config["dropout_rate"])
648
+ self.size = config.hidden_size
649
+ self.normalize_before = module_config["normalize_before"]
650
+ self.concat_after = module_config["concat_after"]
651
+ if self.concat_after:
652
+ self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)
653
+
654
+ def forward(
655
+ self,
656
+ hidden_states: torch.Tensor,
657
+ pos_emb: Optional[torch.Tensor] = None,
658
+ attention_mask: Optional[torch.Tensor] = None,
659
+ output_attentions: Optional[torch.Tensor] = False,
660
+ ):
661
+ """
662
+ Compute encoded features.
663
+
664
+ Args:
665
+ hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor.
666
+ pos_emb (`torch.Tensor` of shape `(1, time, size)`): Positional embeddings tensor.
667
+ attention_mask (`torch.Tensor` of shape `(batch, time)`): Attention mask tensor for the input.
668
+ output_attentions (`bool`, *optional*):
669
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
670
+ returned tensors for more detail.
671
+ Returns:
672
+ `torch.Tensor`: Output tensor of shape `(batch, time, size)`.
673
+
674
+ """
675
+ # whether to use macaron style
676
+ if self.macaron_style:
677
+ residual = hidden_states
678
+ if self.normalize_before:
679
+ hidden_states = self.ff_macaron_layer_norm(hidden_states)
680
+ hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states))
681
+ if not self.normalize_before:
682
+ hidden_states = self.ff_macaron_layer_norm(hidden_states)
683
+
684
+ # multi-headed self-attention module
685
+ residual = hidden_states
686
+ if self.normalize_before:
687
+ hidden_states = self.self_attn_layer_norm(hidden_states)
688
+
689
+ attention_output, attention_scores = self.self_attn(
690
+ hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions
691
+ )
692
+
693
+ if self.concat_after:
694
+ x_concat = torch.cat((hidden_states, attention_output), dim=-1)
695
+ hidden_states = self.concat_linear(x_concat)
696
+ hidden_states = residual + hidden_states
697
+ else:
698
+ hidden_states = self.dropout(attention_output)
699
+ hidden_states = residual + hidden_states
700
+ if not self.normalize_before:
701
+ hidden_states = self.self_attn_layer_norm(hidden_states)
702
+
703
+ # convolution module
704
+ if self.use_cnn_module:
705
+ residual = hidden_states
706
+ if self.normalize_before:
707
+ hidden_states = self.conv_layer_norm(hidden_states)
708
+ hidden_states = self.conv_module(hidden_states)
709
+ hidden_states = self.dropout(hidden_states)
710
+ hidden_states = residual + hidden_states
711
+ if not self.normalize_before:
712
+ hidden_states = self.conv_layer_norm(hidden_states)
713
+
714
+ # feed forward module
715
+ residual = hidden_states
716
+ if self.normalize_before:
717
+ hidden_states = self.ff_layer_norm(hidden_states)
718
+ hidden_states = self.feed_forward(hidden_states)
719
+ hidden_states = self.dropout(hidden_states)
720
+ hidden_states = residual + self.ff_scale * hidden_states
721
+ if not self.normalize_before:
722
+ hidden_states = self.ff_layer_norm(hidden_states)
723
+
724
+ if self.conv_module is not None:
725
+ hidden_states = self.final_layer_norm(hidden_states)
726
+
727
+ outputs = (hidden_states,)
728
+
729
+ if output_attentions:
730
+ outputs += (attention_scores,)
731
+
732
+ return outputs
733
+
734
+
735
+ class FastSpeech2ConformerMultiLayeredConv1d(nn.Module):
736
+ """
737
+ Multi-layered conv1d for Transformer block.
738
+
739
+ This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer
740
+ block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech'
741
+ https://arxiv.org/pdf/1905.09263.pdf
742
+ """
743
+
744
+ def __init__(self, config: FastSpeech2ConformerConfig, module_config):
745
+ """
746
+ Initialize FastSpeech2ConformerMultiLayeredConv1d module.
747
+
748
+ Args:
749
+ input_channels (`int`): Number of input channels.
750
+ hidden_channels (`int`): Number of hidden channels.
751
+ kernel_size (`int`): Kernel size of conv1d.
752
+ dropout_rate (`float`): Dropout rate.
753
+ """
754
+ super().__init__()
755
+ input_channels = config.hidden_size
756
+ hidden_channels = module_config["linear_units"]
757
+ kernel_size = config.positionwise_conv_kernel_size
758
+ self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
759
+ self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
760
+ self.dropout = nn.Dropout(module_config["dropout_rate"])
761
+
762
+ def forward(self, hidden_states):
763
+ """
764
+ Calculate forward propagation.
765
+
766
+ Args:
767
+ hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels).
768
+
769
+ Returns:
770
+ torch.Tensor: Batch of output tensors (batch_size, time, input_channels).
771
+ """
772
+ hidden_states = hidden_states.transpose(-1, 1)
773
+ hidden_states = self.conv1(hidden_states)
774
+ hidden_states = torch.relu(hidden_states)
775
+ hidden_states = self.dropout(hidden_states)
776
+ hidden_states = self.conv2(hidden_states)
777
+ hidden_states = hidden_states.transpose(-1, 1)
778
+ return hidden_states
779
+
780
+
781
+ class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
782
+ """
783
+ Relative positional encoding module (new implementation). Details can be found in
784
+ https://github.com/espnet/espnet/pull/2816. See: Appendix B in https://arxiv.org/abs/1901.02860
785
+ Args:
786
+ config (`FastSpeech2ConformerConfig`):
787
+ FastSpeech2ConformerConfig instance.
788
+ module_config (`dict`):
789
+ Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
790
+ """
791
+
792
+ def __init__(self, config: FastSpeech2ConformerConfig, module_config):
793
+ """
794
+ Construct a PositionalEncoding object.
795
+ """
796
+ super().__init__()
797
+ self.embed_dim = config.hidden_size
798
+ self.input_scale = math.sqrt(self.embed_dim)
799
+ self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
800
+ self.pos_enc = None
801
+ self.max_len = 5000
802
+ self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
803
+
804
+ def extend_pos_enc(self, x):
805
+ """Reset the positional encodings."""
806
+ if self.pos_enc is not None:
807
+ # self.pos_enc contains both positive and negative parts
808
+ # the length of self.pos_enc is 2 * input_len - 1
809
+ if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
810
+ if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
811
+ self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
812
+ return
813
+ # Suppose `i` means the position of the query vector and `j` means the
814
+ # position of the key vector. We use positive relative positions when keys
815
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
816
+ pos_enc_positive = torch.zeros(x.size(1), self.embed_dim)
817
+ pos_enc_negative = torch.zeros(x.size(1), self.embed_dim)
818
+ position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
819
+ div_term = torch.exp(
820
+ torch.arange(0, self.embed_dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.embed_dim)
821
+ )
822
+ pos_enc_positive[:, 0::2] = torch.sin(position * div_term)
823
+ pos_enc_positive[:, 1::2] = torch.cos(position * div_term)
824
+ pos_enc_negative[:, 0::2] = torch.sin(-1 * position * div_term)
825
+ pos_enc_negative[:, 1::2] = torch.cos(-1 * position * div_term)
826
+
827
+ # Reverse the order of positive indices and concat both positive and
828
+ # negative indices. This is used to support the shifting trick
829
+ # as in https://arxiv.org/abs/1901.02860
830
+ pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
831
+ pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
832
+ pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
833
+ self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
834
+
835
+ def forward(self, feature_representation):
836
+ """
837
+ Args:
838
+ feature_representation (`torch.Tensor` of shape (batch_size, time, `*`)):
839
+ Input tensor.
840
+
841
+ Returns:
842
+ `torch.Tensor`: Encoded tensor (batch_size, time, `*`).
843
+ """
844
+ self.extend_pos_enc(feature_representation)
845
+ hidden_states = feature_representation * self.input_scale
846
+ center_idx = self.pos_enc.size(1) // 2
847
+ pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
848
+ return self.dropout(hidden_states), self.dropout(pos_emb)
849
+
850
+
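One property of the encoding above worth spelling out: for an input with `time` frames, the returned `pos_emb` spans all `2 * time - 1` relative offsets, which is exactly what the attention's relative shift expects. A hedged sketch with hypothetical configs:

    pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, config.encoder_config)  # hypothetical configs
    x = torch.randn(1, 50, config.hidden_size)
    scaled, pos_emb = pos_enc(x)
    print(scaled.shape, pos_emb.shape)  # (1, 50, hidden_size) (1, 99, hidden_size)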
851
+ class FastSpeech2ConformerEncoder(nn.Module):
852
+ """
853
+ FastSpeech2ConformerEncoder encoder module.
854
+
855
+ Args:
856
+ config (`FastSpeech2ConformerConfig`):
857
+ FastSpeech2ConformerConfig instance.
858
+ module_config (`dict`):
859
+ Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
860
+ use_encoder_input_layer (`bool`, *optional*, defaults to `False`):
861
+ Whether to prepend an embedding input layer (used when the inputs are token ids rather than hidden states).
862
+ """
863
+
864
+ def __init__(
865
+ self,
866
+ config: FastSpeech2ConformerConfig,
867
+ module_config,
868
+ use_encoder_input_layer=False,
869
+ ):
870
+ super().__init__()
871
+
872
+ self.embed = None
873
+ if use_encoder_input_layer:
874
+ self.embed = nn.Embedding(
875
+ num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=0
876
+ )
877
+
878
+ self.pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, module_config)
879
+
880
+ self.conformer_layers = nn.ModuleList(
881
+ [FastSpeech2ConformerEncoderLayer(config, module_config) for _ in range(module_config["layers"])]
882
+ )
883
+
884
+ def forward(
885
+ self,
886
+ input_tensor: torch.LongTensor,
887
+ attention_mask: Optional[bool] = None,
888
+ output_hidden_states: Optional[bool] = None,
889
+ output_attentions: Optional[bool] = False,
890
+ return_dict: Optional[bool] = None,
891
+ ):
892
+ """
893
+ Args:
894
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
895
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
896
+ provide it.
897
+
898
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
899
+ [`PreTrainedTokenizer.__call__`] for details.
900
+
901
+ [What are input IDs?](../glossary#input-ids)
902
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
903
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
904
+
905
+ - 1 for tokens that are **not masked**,
906
+ - 0 for tokens that are **masked**.
907
+
908
+ [What are attention masks?](../glossary#attention-mask)
909
+ output_hidden_states (`bool`, *optional*):
910
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
911
+ for more detail.
912
+ output_attentions (`bool`, *optional*):
913
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
914
+ returned tensors for more detail.
915
+ return_dict (`bool`, *optional*):
916
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
917
+ Returns:
918
+ `torch.Tensor`:
919
+ Output tensor of shape `(batch, time, attention_dim)`.
920
+ """
921
+ feature_representation = input_tensor
922
+ if self.embed is not None:
923
+ feature_representation = self.embed(feature_representation)
924
+
925
+ hidden_states, pos_emb = self.pos_enc(feature_representation)
926
+
927
+ all_hidden_states = () if output_hidden_states else None
928
+ all_self_attentions = () if output_attentions else None
929
+
930
+ for conformer_layer in self.conformer_layers:
931
+ if output_hidden_states:
932
+ all_hidden_states = all_hidden_states + (hidden_states,)
933
+
934
+ layer_outputs = conformer_layer(hidden_states, pos_emb, attention_mask, output_attentions)
935
+ hidden_states = layer_outputs[0]
936
+
937
+ if output_attentions:
938
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
939
+
940
+ # Add last layer
941
+ if output_hidden_states:
942
+ all_hidden_states = all_hidden_states + (hidden_states,)
943
+
944
+ if not return_dict:
945
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
946
+ return BaseModelOutput(
947
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
948
+ )
949
+
950
+
951
+ class FastSpeech2ConformerLoss(nn.Module):
952
+ def __init__(self, config: FastSpeech2ConformerConfig):
953
+ super().__init__()
954
+
955
+ use_masking = config.use_masking
956
+ use_weighted_masking = config.use_weighted_masking
957
+
958
+ if use_masking and use_weighted_masking:
959
+ raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
960
+
961
+ self.use_masking = use_masking
962
+ self.use_weighted_masking = use_weighted_masking
963
+
964
+ # define criterions
965
+ reduction = "none" if self.use_weighted_masking else "mean"
966
+ self.l1_criterion = nn.L1Loss(reduction=reduction)
967
+ self.mse_criterion = nn.MSELoss(reduction=reduction)
968
+ self.duration_criterion = nn.MSELoss(reduction=reduction)
969
+ self.log_domain_offset = 1.0
970
+
971
+ def forward(
972
+ self,
973
+ outputs_after_postnet,
974
+ outputs_before_postnet,
975
+ duration_outputs,
976
+ pitch_outputs,
977
+ energy_outputs,
978
+ spectrogram_labels,
979
+ duration_labels,
980
+ pitch_labels,
981
+ energy_labels,
982
+ duration_mask,
983
+ spectrogram_mask,
984
+ ):
985
+ """
986
+ Args:
987
+ outputs_after_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
988
+ Batch of outputs after postnet.
989
+ outputs_before_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
990
+ Batch of outputs before postnet.
991
+ duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length)`):
992
+ Batch of outputs of duration predictor.
993
+ pitch_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
994
+ Batch of outputs of pitch predictor.
995
+ energy_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
996
+ Batch of outputs of energy predictor.
997
+ spectrogram_labels (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
998
+ Batch of target features.
999
+ duration_labels (`torch.LongTensor` of shape `(batch_size, max_text_length)`): Batch of durations.
1000
+ pitch_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
1001
+ Batch of target token-averaged pitch.
1002
+ energy_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
1003
+ Batch of target token-averaged energy.
1004
+ duration_mask (`torch.LongTensor`):
1005
+ Mask used to discern which values the duration loss should be calculated for.
1006
+ spectrogram_mask (`torch.LongTensor`):
1007
+ Mask used to discern which values the spectrogram loss should be calculated for.
1008
+
1009
+ Returns:
1010
+ `tuple(torch.FloatTensor)`: Tuple of tensors containing, in order, the L1 loss value, duration predictor
1011
+ loss value, pitch predictor loss value, and energy predictor loss value.
1012
+
1013
+ """
1014
+ pitch_and_energy_masks = duration_mask.unsqueeze(-1)
1015
+
1016
+ # apply mask to remove padded part
1017
+ if self.use_masking:
1018
+ outputs_before_postnet = outputs_before_postnet.masked_select(spectrogram_mask)
1019
+ if outputs_after_postnet is not None:
1020
+ outputs_after_postnet = outputs_after_postnet.masked_select(spectrogram_mask)
1021
+ spectrogram_labels = spectrogram_labels.masked_select(spectrogram_mask)
1022
+ duration_outputs = duration_outputs.masked_select(duration_mask)
1023
+ duration_labels = duration_labels.masked_select(duration_mask)
1024
+ pitch_outputs = pitch_outputs.masked_select(pitch_and_energy_masks)
1025
+ energy_outputs = energy_outputs.masked_select(pitch_and_energy_masks)
1026
+ pitch_labels = pitch_labels.masked_select(pitch_and_energy_masks)
1027
+ energy_labels = energy_labels.masked_select(pitch_and_energy_masks)
1028
+
1029
+ # calculate loss
1030
+ l1_loss = self.l1_criterion(outputs_before_postnet, spectrogram_labels)
1031
+ if outputs_after_postnet is not None:
1032
+ l1_loss = l1_loss + self.l1_criterion(outputs_after_postnet, spectrogram_labels)
1033
+ duration_labels = torch.log(duration_labels.float() + self.log_domain_offset)
1034
+ duration_loss = self.duration_criterion(duration_outputs, duration_labels)
1035
+ pitch_loss = self.mse_criterion(pitch_outputs, pitch_labels)
1036
+ energy_loss = self.mse_criterion(energy_outputs, energy_labels)
1037
+
1038
+ # make weighted mask and apply it
1039
+ if self.use_weighted_masking:
1040
+ spectrogram_mask = nn.functional.pad(
1041
+ spectrogram_mask.transpose(1, 2),
1042
+ [0, spectrogram_labels.size(1) - spectrogram_mask.size(1), 0, 0, 0, 0],
1043
+ value=False,
1044
+ ).transpose(1, 2)
1045
+
1046
+ out_weights = spectrogram_mask.float() / spectrogram_mask.sum(dim=1, keepdim=True).float()
1047
+ out_weights /= spectrogram_labels.size(0) * spectrogram_labels.size(2)
1048
+ duration_weights = duration_mask.float() / duration_mask.sum(dim=1, keepdim=True).float()
1049
+ duration_weights /= duration_labels.size(0)
1050
+
1051
+ # apply weight
1052
+ l1_loss = l1_loss.mul(out_weights).masked_select(spectrogram_mask).sum()
1053
+ duration_loss = duration_loss.mul(duration_weights).masked_select(duration_mask).sum()
1054
+ pitch_weights = duration_weights.unsqueeze(-1)
1055
+ pitch_loss = pitch_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
1056
+ energy_loss = energy_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
1057
+
1058
+ return l1_loss + duration_loss + pitch_loss + energy_loss
1059
+
1060
+
1061
+ class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
1062
+ """
1063
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
1064
+ models.
1065
+ """
1066
+
1067
+ config_class = FastSpeech2ConformerConfig
1068
+ base_model_prefix = "fastspeech2_conformer"
1069
+
1070
+ main_input_name = "input_ids"
1071
+
1072
+ def _init_weights(self, module):
1073
+ """Initialize the weights"""
1074
+ if isinstance(module, (nn.LayerNorm)):
1075
+ module.bias.data.zero_()
1076
+ module.weight.data.fill_(1.0)
1077
+ elif isinstance(module, nn.Conv1d):
1078
+ nn.init.kaiming_normal_(module.weight)
1079
+ if module.bias is not None:
1080
+ key = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
1081
+ nn.init.uniform_(module.bias, a=-key, b=key)
1082
+ elif isinstance(module, nn.Embedding):
1083
+ module.weight.data.normal_()
1084
+ if module.padding_idx is not None:
1085
+ module.weight.data[module.padding_idx].zero_()
1086
+ elif isinstance(module, FastSpeech2ConformerAttention):
1087
+ nn.init.xavier_uniform_(module.pos_bias_u)
1088
+ nn.init.xavier_uniform_(module.pos_bias_v)
1089
+
1090
+ def _set_gradient_checkpointing(self, module, value=False):
1091
+ if isinstance(module, FastSpeech2ConformerEncoder):
1092
+ module.gradient_checkpointing = value
1093
+
1094
+
1095
+ @add_start_docstrings(
1096
+ """FastSpeech2Conformer Model.""",
1097
+ FASTSPEECH2_CONFORMER_START_DOCSTRING,
1098
+ )
1099
+ class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
1100
+ """
1101
+ FastSpeech 2 module.
1102
+
1103
+ This is a module of FastSpeech 2 described in 'FastSpeech 2: Fast and High-Quality End-to-End Text to Speech'
1104
+ https://arxiv.org/abs/2006.04558. Instead of quantized pitch and energy, we use the token-averaged values introduced in
1105
+ FastPitch: Parallel Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers instead of regular
1106
+ Transformers.
1107
+ """
1108
+
1109
+ def __init__(self, config: FastSpeech2ConformerConfig):
1110
+ super().__init__(config)
1111
+ self.config = config
1112
+
1113
+ # store hyperparameters
1114
+ self.vocab_size = config.vocab_size
1115
+ self.num_mel_bins = config.num_mel_bins
1116
+ self.hidden_size = config.hidden_size
1117
+ self.reduction_factor = config.reduction_factor
1118
+ self.stop_gradient_from_pitch_predictor = config.stop_gradient_from_pitch_predictor
1119
+ self.stop_gradient_from_energy_predictor = config.stop_gradient_from_energy_predictor
1120
+
1121
+ self.multilingual_model = config.num_languages is not None and config.num_languages > 1
1122
+ if self.multilingual_model:
1123
+ self.language_id_embedding = torch.nn.Embedding(config.num_languages, self.hidden_size)
1124
+
1125
+ self.multispeaker_model = config.num_speakers is not None and config.num_speakers > 1
1126
+ if self.multispeaker_model:
1127
+ self.speaker_id_embedding = torch.nn.Embedding(config.num_speakers, config.hidden_size)
1128
+
1129
+ self.speaker_embed_dim = config.speaker_embed_dim
1130
+ if self.speaker_embed_dim:
1131
+ self.projection = nn.Linear(config.hidden_size + self.speaker_embed_dim, config.hidden_size)
1132
+
1133
+ self.encoder = FastSpeech2ConformerEncoder(config, config.encoder_config, use_encoder_input_layer=True)
1134
+
1135
+ self.duration_predictor = FastSpeech2ConformerDurationPredictor(config)
1136
+
1137
+ self.pitch_predictor = FastSpeech2ConformerVariancePredictor(
1138
+ config,
1139
+ num_layers=config.pitch_predictor_layers,
1140
+ num_chans=config.pitch_predictor_channels,
1141
+ kernel_size=config.pitch_predictor_kernel_size,
1142
+ dropout_rate=config.pitch_predictor_dropout,
1143
+ )
1144
+ # continuous pitch + FastPitch style avg
1145
+ self.pitch_embed = FastSpeech2ConformerVarianceEmbedding(
1146
+ out_channels=self.hidden_size,
1147
+ kernel_size=config.pitch_embed_kernel_size,
1148
+ padding=(config.pitch_embed_kernel_size - 1) // 2,
1149
+ dropout_rate=config.pitch_embed_dropout,
1150
+ )
1151
+
1152
+ self.energy_predictor = FastSpeech2ConformerVariancePredictor(
1153
+ config,
1154
+ num_layers=config.energy_predictor_layers,
1155
+ num_chans=config.energy_predictor_channels,
1156
+ kernel_size=config.energy_predictor_kernel_size,
1157
+ dropout_rate=config.energy_predictor_dropout,
1158
+ )
1159
+ # continuous energy + FastPitch style avg
1160
+ self.energy_embed = FastSpeech2ConformerVarianceEmbedding(
1161
+ out_channels=self.hidden_size,
1162
+ kernel_size=config.energy_embed_kernel_size,
1163
+ padding=(config.energy_embed_kernel_size - 1) // 2,
1164
+ dropout_rate=config.energy_embed_dropout,
1165
+ )
1166
+
1167
+ # The decoder is an encoder
1168
+ self.decoder = FastSpeech2ConformerEncoder(config, config.decoder_config, use_encoder_input_layer=False)
1169
+
1170
+ self.speech_decoder_postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)
1171
+
1172
+ self.criterion = FastSpeech2ConformerLoss(config)
1173
+
1174
+ self.post_init()
1175
+
1176
+ @replace_return_docstrings(output_type=FastSpeech2ConformerModelOutput, config_class=_CONFIG_FOR_DOC)
1177
+ def forward(
1178
+ self,
1179
+ input_ids: torch.LongTensor,
1180
+ attention_mask: Optional[torch.LongTensor] = None,
1181
+ spectrogram_labels: Optional[torch.FloatTensor] = None,
1182
+ duration_labels: Optional[torch.LongTensor] = None,
1183
+ pitch_labels: Optional[torch.FloatTensor] = None,
1184
+ energy_labels: Optional[torch.FloatTensor] = None,
1185
+ speaker_ids: Optional[torch.LongTensor] = None,
1186
+ lang_ids: Optional[torch.LongTensor] = None,
1187
+ speaker_embedding: Optional[torch.FloatTensor] = None,
1188
+ return_dict: Optional[bool] = None,
1189
+ output_attentions: Optional[bool] = None,
1190
+ output_hidden_states: Optional[bool] = None,
1191
+ ) -> Union[Tuple, FastSpeech2ConformerModelOutput]:
1192
+ """
1193
+ Args:
1194
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1195
+ Input sequence of text vectors.
1196
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to `None`):
1197
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
1198
+ `[0, 1]`: 0 for tokens that are **masked**, 1 for tokens that are **not masked**.
1199
+ spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
1200
+ Batch of padded target features.
1201
+ duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
1202
+ Batch of padded durations.
1203
+ pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
1204
+ Batch of padded token-averaged pitch.
1205
+ energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
1206
+ Batch of padded token-averaged energy.
1207
+ speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
1208
+ Speaker ids used to condition features of speech output by the model.
1209
+ lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
1210
+ Language ids used to condition features of speech output by the model.
1211
+ speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
1212
+ Embedding containing conditioning signals for the features of the speech.
1213
+ return_dict (`bool`, *optional*, defaults to `None`):
1214
+ Whether or not to return a [`FastSpeech2ConformerModelOutput`] instead of a plain tuple.
1215
+ output_attentions (`bool`, *optional*, defaults to `None`):
1216
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1217
+ returned tensors for more detail.
1218
+ output_hidden_states (`bool`, *optional*, defaults to `None`):
1219
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1220
+ for more detail.
1221
+
1222
+ Returns:
1223
+
1224
+ Example:
1225
+
1226
+ ```python
1227
+ >>> from transformers import (
1228
+ ... FastSpeech2ConformerTokenizer,
1229
+ ... FastSpeech2ConformerModel,
1230
+ ... FastSpeech2ConformerHifiGan,
1231
+ ... )
1232
+
1233
+ >>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
1234
+ >>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
1235
+ >>> input_ids = inputs["input_ids"]
1236
+
1237
+ >>> model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
1238
+ >>> output_dict = model(input_ids, return_dict=True)
1239
+ >>> spectrogram = output_dict["spectrogram"]
1240
+
1241
+ >>> vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan")
1242
+ >>> waveform = vocoder(spectrogram)
1243
+ >>> print(waveform.shape)
1244
+ torch.Size([1, 49664])
1245
+ ```
1246
+ """
1247
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1248
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1249
+ output_hidden_states = (
1250
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1251
+ )
1252
+
1253
+ if attention_mask is None:
1254
+ attention_mask = torch.ones(input_ids.shape, device=input_ids.device)
1255
+
1256
+ has_missing_labels = (
1257
+ spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
1258
+ )
1259
+ if self.training and has_missing_labels:
1260
+ raise ValueError("All labels must be provided to run in training mode.")
1261
+
1262
+ # forward encoder
1263
+ text_masks = attention_mask.unsqueeze(-2)
1264
+
1265
+ encoder_outputs = self.encoder(
1266
+ input_ids,
1267
+ text_masks,
1268
+ output_hidden_states=output_hidden_states,
1269
+ output_attentions=output_attentions,
1270
+ return_dict=return_dict,
1271
+ )
1272
+ hidden_states = encoder_outputs[0]
1273
+
1274
+ # Integrate with language id, speaker id, and speaker embedding
1275
+ if self.multispeaker_model and speaker_ids is not None:
1276
+ speaker_id_embeddings = self.speaker_id_embedding(speaker_ids.view(-1))
1277
+ hidden_states = hidden_states + speaker_id_embeddings.unsqueeze(1)
1278
+
1279
+ if self.multilingual_model and lang_ids is not None:
1280
+ language_id_embeddings = self.language_id_embedding(lang_ids.view(-1))
1281
+ hidden_states = hidden_states + language_id_embeddings.unsqueeze(1)
1282
+
1283
+ if self.speaker_embed_dim is not None and speaker_embedding is not None:
1284
+ embeddings_expanded = (
1285
+ nn.functional.normalize(speaker_embedding).unsqueeze(1).expand(-1, hidden_states.size(1), -1)
1286
+ )
1287
+ hidden_states = self.projection(torch.cat([hidden_states, embeddings_expanded], dim=-1))
1288
+
1289
+ # forward duration predictor and variance predictors
1290
+ duration_mask = ~attention_mask.bool()
1291
+
1292
+ if self.stop_gradient_from_pitch_predictor:
1293
+ pitch_predictions = self.pitch_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
1294
+ else:
1295
+ pitch_predictions = self.pitch_predictor(hidden_states, duration_mask.unsqueeze(-1))
1296
+
1297
+ if self.stop_gradient_from_energy_predictor:
1298
+ energy_predictions = self.energy_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
1299
+ else:
1300
+ energy_predictions = self.energy_predictor(hidden_states, duration_mask.unsqueeze(-1))
1301
+
1302
+ duration_predictions = self.duration_predictor(hidden_states)
1303
+ duration_predictions = duration_predictions.masked_fill(duration_mask, 0.0)
1304
+
1305
+ if not self.training:
1306
+ # use prediction in inference
1307
+ embedded_pitch_curve = self.pitch_embed(pitch_predictions)
1308
+ embedded_energy_curve = self.energy_embed(energy_predictions)
1309
+ hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
1310
+ hidden_states = length_regulator(hidden_states, duration_predictions, self.config.speaking_speed)
1311
+ else:
1312
+ # use groundtruth in training
1313
+ embedded_pitch_curve = self.pitch_embed(pitch_labels)
1314
+ embedded_energy_curve = self.energy_embed(energy_labels)
1315
+ hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
1316
+ hidden_states = length_regulator(hidden_states, duration_labels)
1317
+
1318
+ # forward decoder
1319
+ if not self.training:
1320
+ hidden_mask = None
1321
+ else:
1322
+ spectrogram_mask = (spectrogram_labels != -100).any(dim=-1)
1323
+ spectrogram_mask = spectrogram_mask.int()
1324
+ if self.reduction_factor > 1:
1325
+ length_dim = spectrogram_mask.shape[1] - spectrogram_mask.shape[1] % self.reduction_factor
1326
+ spectrogram_mask = spectrogram_mask[:, :length_dim]  # mask is (batch, time) at this point
1327
+ hidden_mask = spectrogram_mask.unsqueeze(-2)
1328
+
1329
+ decoder_outputs = self.decoder(
1330
+ hidden_states,
1331
+ hidden_mask,
1332
+ output_hidden_states=output_hidden_states,
1333
+ output_attentions=output_attentions,
1334
+ return_dict=return_dict,
1335
+ )
1336
+
1337
+ outputs_before_postnet, outputs_after_postnet = self.speech_decoder_postnet(decoder_outputs[0])
1338
+
1339
+ loss = None
1340
+ if self.training:
1341
+ # calculate loss
1342
+ loss_duration_mask = ~duration_mask
1343
+ loss_spectrogram_mask = spectrogram_mask.unsqueeze(-1).bool()
1344
+ loss = self.criterion(
1345
+ outputs_after_postnet=outputs_after_postnet,
1346
+ outputs_before_postnet=outputs_before_postnet,
1347
+ duration_outputs=duration_predictions,
1348
+ pitch_outputs=pitch_predictions,
1349
+ energy_outputs=energy_predictions,
1350
+ spectrogram_labels=spectrogram_labels,
1351
+ duration_labels=duration_labels,
1352
+ pitch_labels=pitch_labels,
1353
+ energy_labels=energy_labels,
1354
+ duration_mask=loss_duration_mask,
1355
+ spectrogram_mask=loss_spectrogram_mask,
1356
+ )
1357
+
1358
+ if not return_dict:
1359
+ postnet_outputs = (outputs_after_postnet,)
1360
+ audio_feature_predictions = (
1361
+ duration_predictions,
1362
+ pitch_predictions,
1363
+ energy_predictions,
1364
+ )
1365
+ outputs = postnet_outputs + encoder_outputs + decoder_outputs[1:] + audio_feature_predictions
1366
+ return ((loss,) + outputs) if loss is not None else outputs
1367
+
1368
+ return FastSpeech2ConformerModelOutput(
1369
+ loss=loss,
1370
+ spectrogram=outputs_after_postnet,
1371
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1372
+ encoder_hidden_states=encoder_outputs.hidden_states,
1373
+ encoder_attentions=encoder_outputs.attentions,
1374
+ decoder_hidden_states=decoder_outputs.hidden_states,
1375
+ decoder_attentions=decoder_outputs.attentions,
1376
+ duration_outputs=duration_predictions,
1377
+ pitch_outputs=pitch_predictions,
1378
+ energy_outputs=energy_predictions,
1379
+ )
1380
+
1381
+
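The `length_regulator` call used in the forward pass above (defined earlier in this file) upsamples token-level hidden states to frame level using the predicted or ground-truth durations. The sketch below only illustrates the idea with `repeat_interleave` on a single un-batched example; it is not the library implementation.

```python
import torch

# Each token's hidden vector is repeated `duration` times so the decoder
# operates at frame resolution.
hidden_states = torch.randn(3, 8)        # (tokens, hidden_size)
durations = torch.tensor([2, 1, 3])      # predicted frames per token
frame_level = torch.repeat_interleave(hidden_states, durations, dim=0)
print(frame_level.shape)                 # torch.Size([6, 8]) — sum of durations frames
```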
1382
+ # Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
1383
+ class HifiGanResidualBlock(nn.Module):
1384
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
1385
+ super().__init__()
1386
+ self.leaky_relu_slope = leaky_relu_slope
1387
+
1388
+ self.convs1 = nn.ModuleList(
1389
+ [
1390
+ nn.Conv1d(
1391
+ channels,
1392
+ channels,
1393
+ kernel_size,
1394
+ stride=1,
1395
+ dilation=dilation[i],
1396
+ padding=self.get_padding(kernel_size, dilation[i]),
1397
+ )
1398
+ for i in range(len(dilation))
1399
+ ]
1400
+ )
1401
+ self.convs2 = nn.ModuleList(
1402
+ [
1403
+ nn.Conv1d(
1404
+ channels,
1405
+ channels,
1406
+ kernel_size,
1407
+ stride=1,
1408
+ dilation=1,
1409
+ padding=self.get_padding(kernel_size, 1),
1410
+ )
1411
+ for _ in range(len(dilation))
1412
+ ]
1413
+ )
1414
+
1415
+ def get_padding(self, kernel_size, dilation=1):
1416
+ return (kernel_size * dilation - dilation) // 2
1417
+
1418
+ def apply_weight_norm(self):
1419
+ weight_norm = nn.utils.weight_norm
1420
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
1421
+ weight_norm = nn.utils.parametrizations.weight_norm
1422
+
1423
+ for layer in self.convs1:
1424
+ weight_norm(layer)
1425
+ for layer in self.convs2:
1426
+ weight_norm(layer)
1427
+
1428
+ def remove_weight_norm(self):
1429
+ for layer in self.convs1:
1430
+ nn.utils.remove_weight_norm(layer)
1431
+ for layer in self.convs2:
1432
+ nn.utils.remove_weight_norm(layer)
1433
+
1434
+ def forward(self, hidden_states):
1435
+ for conv1, conv2 in zip(self.convs1, self.convs2):
1436
+ residual = hidden_states
1437
+ hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
1438
+ hidden_states = conv1(hidden_states)
1439
+ hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
1440
+ hidden_states = conv2(hidden_states)
1441
+ hidden_states = hidden_states + residual
1442
+ return hidden_states
1443
+
1444
+
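A quick numeric check of the `get_padding` rule in the residual block above: for odd kernel sizes it keeps the output length equal to the input length by splitting the dilated receptive field evenly on both sides. Plain Python, no tensors required.

```python
# padding = (kernel_size * dilation - dilation) // 2 gives "same" length for
# odd kernels: twice the padding equals the dilated receptive field minus one.
for kernel_size, dilation in [(3, 1), (3, 3), (5, 5), (11, 1)]:
    padding = (kernel_size * dilation - dilation) // 2
    receptive_field = dilation * (kernel_size - 1) + 1
    assert 2 * padding == receptive_field - 1
    print(kernel_size, dilation, padding)
```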
1445
+ @add_start_docstrings(
1446
+ """HiFi-GAN vocoder.""",
1447
+ HIFIGAN_START_DOCSTRING,
1448
+ )
1449
+ # Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer
1450
+ class FastSpeech2ConformerHifiGan(PreTrainedModel):
1451
+ config_class = FastSpeech2ConformerHifiGanConfig
1452
+ main_input_name = "spectrogram"
1453
+
1454
+ def __init__(self, config: FastSpeech2ConformerHifiGanConfig):
1455
+ super().__init__(config)
1456
+ self.num_kernels = len(config.resblock_kernel_sizes)
1457
+ self.num_upsamples = len(config.upsample_rates)
1458
+ self.conv_pre = nn.Conv1d(
1459
+ config.model_in_dim,
1460
+ config.upsample_initial_channel,
1461
+ kernel_size=7,
1462
+ stride=1,
1463
+ padding=3,
1464
+ )
1465
+
1466
+ self.upsampler = nn.ModuleList()
1467
+ for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
1468
+ self.upsampler.append(
1469
+ nn.ConvTranspose1d(
1470
+ config.upsample_initial_channel // (2**i),
1471
+ config.upsample_initial_channel // (2 ** (i + 1)),
1472
+ kernel_size=kernel_size,
1473
+ stride=upsample_rate,
1474
+ padding=(kernel_size - upsample_rate) // 2,
1475
+ )
1476
+ )
1477
+
1478
+ self.resblocks = nn.ModuleList()
1479
+ for i in range(len(self.upsampler)):
1480
+ channels = config.upsample_initial_channel // (2 ** (i + 1))
1481
+ for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
1482
+ self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
1483
+
1484
+ self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
1485
+
1486
+ self.register_buffer("mean", torch.zeros(config.model_in_dim))
1487
+ self.register_buffer("scale", torch.ones(config.model_in_dim))
1488
+
1489
+ # Initialize weights and apply final processing
1490
+ self.post_init()
1491
+
1492
+ def _init_weights(self, module):
1493
+ """Initialize the weights."""
1494
+ if isinstance(module, (nn.Linear, nn.Conv1d)):
1495
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1496
+ if module.bias is not None:
1497
+ module.bias.data.zero_()
1498
+
1499
+ def apply_weight_norm(self):
1500
+ weight_norm = nn.utils.weight_norm
1501
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
1502
+ weight_norm = nn.utils.parametrizations.weight_norm
1503
+
1504
+ weight_norm(self.conv_pre)
1505
+ for layer in self.upsampler:
1506
+ weight_norm(layer)
1507
+ for layer in self.resblocks:
1508
+ layer.apply_weight_norm()
1509
+ weight_norm(self.conv_post)
1510
+
1511
+ def remove_weight_norm(self):
1512
+ nn.utils.remove_weight_norm(self.conv_pre)
1513
+ for layer in self.upsampler:
1514
+ nn.utils.remove_weight_norm(layer)
1515
+ for layer in self.resblocks:
1516
+ layer.remove_weight_norm()
1517
+ nn.utils.remove_weight_norm(self.conv_post)
1518
+
1519
+ def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
1520
+ r"""
1521
+ Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
1522
+ of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
1523
+ waveform.
1524
+
1525
+ Args:
1526
+ spectrogram (`torch.FloatTensor`):
1527
+ Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
1528
+ config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
1529
+
1530
+ Returns:
1531
+ `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
1532
+ shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
1533
+ """
1534
+ if self.config.normalize_before:
1535
+ spectrogram = (spectrogram - self.mean) / self.scale
1536
+
1537
+ is_batched = spectrogram.dim() == 3
1538
+ if not is_batched:
1539
+ spectrogram = spectrogram.unsqueeze(0)
1540
+
1541
+ hidden_states = spectrogram.transpose(2, 1)
1542
+
1543
+ hidden_states = self.conv_pre(hidden_states)
1544
+ for i in range(self.num_upsamples):
1545
+ hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
1546
+ hidden_states = self.upsampler[i](hidden_states)
1547
+
1548
+ res_state = self.resblocks[i * self.num_kernels](hidden_states)
1549
+ for j in range(1, self.num_kernels):
1550
+ res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
1551
+ hidden_states = res_state / self.num_kernels
1552
+
1553
+ hidden_states = nn.functional.leaky_relu(hidden_states)
1554
+ hidden_states = self.conv_post(hidden_states)
1555
+ hidden_states = torch.tanh(hidden_states)
1556
+
1557
+ if not is_batched:
1558
+ # remove batch dim and collapse tensor to 1-d audio waveform
1559
+ waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
1560
+ else:
1561
+ # remove seq-len dim since this collapses to 1
1562
+ waveform = hidden_states.squeeze(1)
1563
+
1564
+ return waveform
1565
+
1566
+
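Each spectrogram frame is expanded into `prod(upsample_rates)` waveform samples by the stack of transposed convolutions above. The rates below are illustrative HiFi-GAN-style values, not numbers read from a checkpoint; the real ones live in `FastSpeech2ConformerHifiGanConfig`.

```python
from math import prod

upsample_rates = [8, 8, 2, 2]             # assumed example configuration
samples_per_frame = prod(upsample_rates)  # 256 audio samples per mel frame
num_frames = 194
print(num_frames * samples_per_frame)     # 49664, consistent with the docstring example
```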
1567
+ @add_start_docstrings(
1568
+ "The FastSpeech2ConformerModel with a FastSpeech2ConformerHifiGan vocoder head that performs text-to-speech (waveform).",
1569
+ FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING,
1570
+ )
1571
+ class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
1572
+ config_class = FastSpeech2ConformerWithHifiGanConfig
1573
+
1574
+ def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig):
1575
+ super().__init__(config)
1576
+
1577
+ self.model = FastSpeech2ConformerModel(config.model_config)
1578
+ self.vocoder = FastSpeech2ConformerHifiGan(config.vocoder_config)
1579
+
1580
+ self.config = config
1581
+
1582
+ @replace_return_docstrings(
1583
+ output_type=FastSpeech2ConformerWithHifiGanOutput, config_class=FastSpeech2ConformerWithHifiGanConfig
1584
+ )
1585
+ def forward(
1586
+ self,
1587
+ input_ids: torch.LongTensor,
1588
+ attention_mask: Optional[torch.LongTensor] = None,
1589
+ spectrogram_labels: Optional[torch.FloatTensor] = None,
1590
+ duration_labels: Optional[torch.LongTensor] = None,
1591
+ pitch_labels: Optional[torch.FloatTensor] = None,
1592
+ energy_labels: Optional[torch.FloatTensor] = None,
1593
+ speaker_ids: Optional[torch.LongTensor] = None,
1594
+ lang_ids: Optional[torch.LongTensor] = None,
1595
+ speaker_embedding: Optional[torch.FloatTensor] = None,
1596
+ return_dict: Optional[bool] = None,
1597
+ output_attentions: Optional[bool] = None,
1598
+ output_hidden_states: Optional[bool] = None,
1599
+ ) -> Union[Tuple, FastSpeech2ConformerWithHifiGanOutput]:
1600
+ """
1601
+ Args:
1602
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1603
+ Input sequence of text vectors.
1604
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to `None`):
1605
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
1606
+ `[0, 1]`: 0 for tokens that are **masked**, 1 for tokens that are **not masked**.
1607
+ spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
1608
+ Batch of padded target features.
1609
+ duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
1610
+ Batch of padded durations.
1611
+ pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
1612
+ Batch of padded token-averaged pitch.
1613
+ energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
1614
+ Batch of padded token-averaged energy.
1615
+ speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
1616
+ Speaker ids used to condition features of speech output by the model.
1617
+ lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
1618
+ Language ids used to condition features of speech output by the model.
1619
+ speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
1620
+ Embedding containing conditioning signals for the features of the speech.
1621
+ return_dict (`bool`, *optional*, defaults to `None`):
1622
+ Whether or not to return a [`FastSpeech2ConformerModelOutput`] instead of a plain tuple.
1623
+ output_attentions (`bool`, *optional*, defaults to `None`):
1624
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1625
+ returned tensors for more detail.
1626
+ output_hidden_states (`bool`, *optional*, defaults to `None`):
1627
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1628
+ for more detail.
1629
+
1630
+ Returns:
1631
+
1632
+ Example:
1633
+
1634
+ ```python
1635
+ >>> from transformers import (
1636
+ ... FastSpeech2ConformerTokenizer,
1637
+ ... FastSpeech2ConformerWithHifiGan,
1638
+ ... )
1639
+
1640
+ >>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
1641
+ >>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
1642
+ >>> input_ids = inputs["input_ids"]
1643
+
1644
+ >>> model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
1645
+ >>> output_dict = model(input_ids, return_dict=True)
1646
+ >>> waveform = output_dict["waveform"]
1647
+ >>> print(waveform.shape)
1648
+ torch.Size([1, 49664])
1649
+ ```
1650
+ """
1651
+ return_dict = return_dict if return_dict is not None else self.config.model_config.use_return_dict
1652
+ output_attentions = (
1653
+ output_attentions if output_attentions is not None else self.config.model_config.output_attentions
1654
+ )
1655
+ output_hidden_states = (
1656
+ output_hidden_states if output_hidden_states is not None else self.config.model_config.output_hidden_states
1657
+ )
1658
+
1659
+ model_outputs = self.model(
1660
+ input_ids,
1661
+ attention_mask,
1662
+ spectrogram_labels=spectrogram_labels,
1663
+ duration_labels=duration_labels,
1664
+ pitch_labels=pitch_labels,
1665
+ energy_labels=energy_labels,
1666
+ speaker_ids=speaker_ids,
1667
+ lang_ids=lang_ids,
1668
+ speaker_embedding=speaker_embedding,
1669
+ return_dict=return_dict,
1670
+ output_attentions=output_attentions,
1671
+ output_hidden_states=output_hidden_states,
1672
+ )
1673
+
1674
+ if not return_dict:
1675
+ has_missing_labels = (
1676
+ spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
1677
+ )
1678
+ if has_missing_labels:
1679
+ spectrogram = model_outputs[0]
1680
+ else:
1681
+ spectrogram = model_outputs[1]
1682
+ else:
1683
+ spectrogram = model_outputs["spectrogram"]
1684
+ waveform = self.vocoder(spectrogram)
1685
+
1686
+ if not return_dict:
1687
+ return model_outputs + (waveform,)
1688
+
1689
+ return FastSpeech2ConformerWithHifiGanOutput(waveform=waveform, **model_outputs)
1690
+
1691
+
1692
+ __all__ = [
1693
+ "FastSpeech2ConformerWithHifiGan",
1694
+ "FastSpeech2ConformerHifiGan",
1695
+ "FastSpeech2ConformerModel",
1696
+ "FastSpeech2ConformerPreTrainedModel",
1697
+ ]
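An end-to-end sketch combining the classes exported above with the checkpoints already referenced in the docstrings. It assumes the optional `g2p-en` and `soundfile` dependencies are installed; the 22.05 kHz sampling rate is an assumption to verify against the model card.

```python
import soundfile as sf
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan

tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")

inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
outputs = model(inputs["input_ids"], return_dict=True)

# Assumed sampling rate of the released checkpoint — check the model card.
sf.write("speech.wav", outputs.waveform.squeeze().detach().numpy(), samplerate=22050)
```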
docs/transformers/build/lib/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py ADDED
@@ -0,0 +1,188 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for FastSpeech2Conformer."""
16
+
17
+ import json
18
+ import os
19
+ from typing import Optional, Tuple
20
+
21
+ import regex
22
+
23
+ from ...tokenization_utils import PreTrainedTokenizer
24
+ from ...utils import logging, requires_backends
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
30
+
31
+
32
+ class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
33
+ """
34
+ Construct a FastSpeech2Conformer tokenizer.
35
+
36
+ Args:
37
+ vocab_file (`str`):
38
+ Path to the vocabulary file.
39
+ bos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
40
+ The begin of sequence token. Note that for FastSpeech2, it is the same as the `eos_token`.
41
+ eos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
42
+ The end of sequence token. Note that for FastSpeech2, it is the same as the `bos_token`.
43
+ pad_token (`str`, *optional*, defaults to `"<blank>"`):
44
+ The token used for padding, for example when batching sequences of different lengths.
45
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
46
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
47
+ token instead.
48
+ should_strip_spaces (`bool`, *optional*, defaults to `False`):
49
+ Whether or not to strip the spaces from the list of tokens.
50
+ """
51
+
52
+ vocab_files_names = VOCAB_FILES_NAMES
53
+ model_input_names = ["input_ids", "attention_mask"]
54
+
55
+ def __init__(
56
+ self,
57
+ vocab_file,
58
+ bos_token="<sos/eos>",
59
+ eos_token="<sos/eos>",
60
+ pad_token="<blank>",
61
+ unk_token="<unk>",
62
+ should_strip_spaces=False,
63
+ **kwargs,
64
+ ):
65
+ requires_backends(self, "g2p_en")
66
+
67
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
68
+ self.encoder = json.load(vocab_handle)
69
+
70
+ import g2p_en
71
+
72
+ self.g2p = g2p_en.G2p()
73
+
74
+ self.decoder = {v: k for k, v in self.encoder.items()}
75
+
76
+ super().__init__(
77
+ bos_token=bos_token,
78
+ eos_token=eos_token,
79
+ unk_token=unk_token,
80
+ pad_token=pad_token,
81
+ should_strip_spaces=should_strip_spaces,
82
+ **kwargs,
83
+ )
84
+
85
+ self.should_strip_spaces = should_strip_spaces
86
+
87
+ @property
88
+ def vocab_size(self):
89
+ return len(self.decoder)
90
+
91
+ def get_vocab(self):
92
+ "Returns vocab as a dict"
93
+ return dict(self.encoder, **self.added_tokens_encoder)
94
+
95
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
96
+ # expand symbols
97
+ text = regex.sub(";", ",", text)
98
+ text = regex.sub(":", ",", text)
99
+ text = regex.sub("-", " ", text)
100
+ text = regex.sub("&", "and", text)
101
+
102
+ # strip unnecessary symbols
103
+ text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
104
+
105
+ # strip whitespaces
106
+ text = regex.sub(r"\s+", " ", text)
107
+
108
+ text = text.upper()
109
+
110
+ return text, kwargs
111
+
112
+ def _tokenize(self, text):
113
+ """Returns a tokenized string."""
114
+ # phonemize
115
+ tokens = self.g2p(text)
116
+
117
+ if self.should_strip_spaces:
118
+ tokens = list(filter(lambda s: s != " ", tokens))
119
+
120
+ tokens.append(self.eos_token)
121
+
122
+ return tokens
123
+
124
+ def _convert_token_to_id(self, token):
125
+ """Converts a token (str) in an id using the vocab."""
126
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
127
+
128
+ def _convert_id_to_token(self, index):
129
+ """Converts an index (integer) in a token (str) using the vocab."""
130
+ return self.decoder.get(index, self.unk_token)
131
+
132
+ # Override since phonemes cannot be converted back to strings
133
+ def decode(self, token_ids, **kwargs):
134
+ logger.warning(
135
+ "Phonemes cannot be reliably converted to a string due to the one-to-many mapping, converting to tokens instead."
136
+ )
137
+ return self.convert_ids_to_tokens(token_ids)
138
+
139
+ # Override since phonemes cannot be converted back to strings
140
+ def convert_tokens_to_string(self, tokens, **kwargs):
141
+ logger.warning(
142
+ "Phonemes cannot be reliably converted to a string due to the one-to-many mapping, returning the tokens."
143
+ )
144
+ return tokens
145
+
146
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
147
+ """
148
+ Save the vocabulary and special tokens file to a directory.
149
+
150
+ Args:
151
+ save_directory (`str`):
152
+ The directory in which to save the vocabulary.
153
+
154
+ Returns:
155
+ `Tuple(str)`: Paths to the files saved.
156
+ """
157
+ if not os.path.isdir(save_directory):
158
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
159
+ return
160
+ vocab_file = os.path.join(
161
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
162
+ )
163
+
164
+ with open(vocab_file, "w", encoding="utf-8") as f:
165
+ f.write(json.dumps(self.get_vocab(), ensure_ascii=False))
166
+
167
+ return (vocab_file,)
168
+
169
+ def __getstate__(self):
170
+ state = self.__dict__.copy()
171
+ state["g2p"] = None
172
+ return state
173
+
174
+ def __setstate__(self, d):
175
+ self.__dict__ = d
176
+
177
+ try:
178
+ import g2p_en
179
+
180
+ self.g2p = g2p_en.G2p()
181
+ except ImportError:
182
+ raise ImportError(
183
+ "You need to install g2p-en to use FastSpeech2ConformerTokenizer. "
184
+ "See https://pypi.org/project/g2p-en/ for installation."
185
+ )
186
+
187
+
188
+ __all__ = ["FastSpeech2ConformerTokenizer"]
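A short illustration of the phoneme-level behaviour documented above: `decode` intentionally returns phoneme tokens rather than reconstructing the input string. It requires the optional `g2p-en` dependency, and the printed tokens are indicative only.

```python
from transformers import FastSpeech2ConformerTokenizer

tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
ids = tokenizer("Hello world")["input_ids"]

# Returns phoneme tokens such as ['HH', 'AH0', 'L', 'OW1', ...] plus <sos/eos>,
# not the original text.
print(tokenizer.decode(ids))
```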
docs/transformers/build/lib/transformers/models/flaubert/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_flaubert import *
22
+ from .modeling_flaubert import *
23
+ from .modeling_tf_flaubert import *
24
+ from .tokenization_flaubert import *
25
+ else:
26
+ import sys
27
+
28
+ _file = globals()["__file__"]
29
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/flaubert/configuration_flaubert.py ADDED
@@ -0,0 +1,235 @@
1
+ # coding=utf-8
2
+ # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Flaubert configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from ...configuration_utils import PretrainedConfig
21
+ from ...onnx import OnnxConfig
22
+ from ...utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ class FlaubertConfig(PretrainedConfig):
29
+ """
30
+ This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is
31
+ used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture.
32
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT
33
+ [flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture.
34
+
35
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36
+ documentation from [`PretrainedConfig`] for more information.
37
+
38
+ Args:
39
+ pre_norm (`bool`, *optional*, defaults to `False`):
40
+ Whether to apply the layer normalization before or after the feed forward layer following the attention in
41
+ each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
42
+ layerdrop (`float`, *optional*, defaults to 0.0):
43
+ Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with
44
+ Structured Dropout. ICLR 2020)
45
+ vocab_size (`int`, *optional*, defaults to 30145):
46
+ Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by
47
+ the `inputs_ids` passed when calling [`FlaubertModel`] or [`TFFlaubertModel`].
48
+ emb_dim (`int`, *optional*, defaults to 2048):
49
+ Dimensionality of the encoder layers and the pooler layer.
50
+ n_layer (`int`, *optional*, defaults to 12):
51
+ Number of hidden layers in the Transformer encoder.
52
+ n_head (`int`, *optional*, defaults to 16):
53
+ Number of attention heads for each attention layer in the Transformer encoder.
54
+ dropout (`float`, *optional*, defaults to 0.1):
55
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
56
+ attention_dropout (`float`, *optional*, defaults to 0.1):
57
+ The dropout probability for the attention mechanism
58
+ gelu_activation (`bool`, *optional*, defaults to `True`):
59
+ Whether or not to use a *gelu* activation instead of *relu*.
60
+ sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
61
+ Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
62
+ causal (`bool`, *optional*, defaults to `False`):
63
+ Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
64
+ order to only attend to the left-side context instead if a bidirectional context.
65
+ asm (`bool`, *optional*, defaults to `False`):
66
+ Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
67
+ layer.
68
+ n_langs (`int`, *optional*, defaults to 1):
69
+ The number of languages the model handles. Set to 1 for monolingual models.
70
+ use_lang_emb (`bool`, *optional*, defaults to `True`)
71
+ Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
72
+ models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for information
73
+ on how to use them.
74
+ max_position_embeddings (`int`, *optional*, defaults to 512):
75
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
76
+ just in case (e.g., 512 or 1024 or 2048).
77
+ embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
78
+ The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
79
+ init_std (`int`, *optional*, defaults to 50257):
80
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
81
+ embedding matrices.
82
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
83
+ The epsilon used by the layer normalization layers.
84
+ bos_index (`int`, *optional*, defaults to 0):
85
+ The index of the beginning of sentence token in the vocabulary.
86
+ eos_index (`int`, *optional*, defaults to 1):
87
+ The index of the end of sentence token in the vocabulary.
88
+ pad_index (`int`, *optional*, defaults to 2):
89
+ The index of the padding token in the vocabulary.
90
+ unk_index (`int`, *optional*, defaults to 3):
91
+ The index of the unknown token in the vocabulary.
92
+ mask_index (`int`, *optional*, defaults to 5):
93
+ The index of the masking token in the vocabulary.
94
+ is_encoder(`bool`, *optional*, defaults to `True`):
95
+ Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
96
+ summary_type (`string`, *optional*, defaults to "first"):
97
+ Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
98
+
99
+ Has to be one of the following options:
100
+
101
+ - `"last"`: Take the last token hidden state (like XLNet).
102
+ - `"first"`: Take the first token hidden state (like BERT).
103
+ - `"mean"`: Take the mean of all tokens hidden states.
104
+ - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
105
+ - `"attn"`: Not implemented now, use multi-head attention.
106
+ summary_use_proj (`bool`, *optional*, defaults to `True`):
107
+ Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
108
+
109
+ Whether or not to add a projection after the vector extraction.
110
+ summary_activation (`str`, *optional*):
111
+ Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
112
+
113
+ Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
114
+ summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
115
+ Used in the sequence classification and multiple choice models.
116
+
117
+ Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
118
+ summary_first_dropout (`float`, *optional*, defaults to 0.1):
119
+ Used in the sequence classification and multiple choice models.
120
+
121
+ The dropout ratio to be used after the projection and activation.
122
+ start_n_top (`int`, *optional*, defaults to 5):
123
+ Used in the SQuAD evaluation script.
124
+ end_n_top (`int`, *optional*, defaults to 5):
125
+ Used in the SQuAD evaluation script.
126
+ mask_token_id (`int`, *optional*, defaults to 0):
127
+ Model agnostic parameter to identify masked tokens when generating text in an MLM context.
128
+ lang_id (`int`, *optional*, defaults to 1):
129
+ The ID of the language used by the model. This parameter is used when generating text in a given language.
130
+ """
131
+
132
+ model_type = "flaubert"
133
+ attribute_map = {
134
+ "hidden_size": "emb_dim",
135
+ "num_attention_heads": "n_heads",
136
+ "num_hidden_layers": "n_layers",
137
+ "n_words": "vocab_size", # For backward compatibility
138
+ }
139
+
140
+ def __init__(
141
+ self,
142
+ pre_norm=False,
143
+ layerdrop=0.0,
144
+ vocab_size=30145,
145
+ emb_dim=2048,
146
+ n_layers=12,
147
+ n_heads=16,
148
+ dropout=0.1,
149
+ attention_dropout=0.1,
150
+ gelu_activation=True,
151
+ sinusoidal_embeddings=False,
152
+ causal=False,
153
+ asm=False,
154
+ n_langs=1,
155
+ use_lang_emb=True,
156
+ max_position_embeddings=512,
157
+ embed_init_std=2048**-0.5,
158
+ layer_norm_eps=1e-12,
159
+ init_std=0.02,
160
+ bos_index=0,
161
+ eos_index=1,
162
+ pad_index=2,
163
+ unk_index=3,
164
+ mask_index=5,
165
+ is_encoder=True,
166
+ summary_type="first",
167
+ summary_use_proj=True,
168
+ summary_activation=None,
169
+ summary_proj_to_labels=True,
170
+ summary_first_dropout=0.1,
171
+ start_n_top=5,
172
+ end_n_top=5,
173
+ mask_token_id=0,
174
+ lang_id=0,
175
+ pad_token_id=2,
176
+ bos_token_id=0,
177
+ **kwargs,
178
+ ):
179
+ """Constructs FlaubertConfig."""
180
+ self.pre_norm = pre_norm
181
+ self.layerdrop = layerdrop
182
+ self.vocab_size = vocab_size
183
+ self.emb_dim = emb_dim
184
+ self.n_layers = n_layers
185
+ self.n_heads = n_heads
186
+ self.dropout = dropout
187
+ self.attention_dropout = attention_dropout
188
+ self.gelu_activation = gelu_activation
189
+ self.sinusoidal_embeddings = sinusoidal_embeddings
190
+ self.causal = causal
191
+ self.asm = asm
192
+ self.n_langs = n_langs
193
+ self.use_lang_emb = use_lang_emb
194
+ self.layer_norm_eps = layer_norm_eps
195
+ self.bos_index = bos_index
196
+ self.eos_index = eos_index
197
+ self.pad_index = pad_index
198
+ self.unk_index = unk_index
199
+ self.mask_index = mask_index
200
+ self.is_encoder = is_encoder
201
+ self.max_position_embeddings = max_position_embeddings
202
+ self.embed_init_std = embed_init_std
203
+ self.init_std = init_std
204
+ self.summary_type = summary_type
205
+ self.summary_use_proj = summary_use_proj
206
+ self.summary_activation = summary_activation
207
+ self.summary_proj_to_labels = summary_proj_to_labels
208
+ self.summary_first_dropout = summary_first_dropout
209
+ self.start_n_top = start_n_top
210
+ self.end_n_top = end_n_top
211
+ self.mask_token_id = mask_token_id
212
+ self.lang_id = lang_id
213
+
214
+ if "n_words" in kwargs:
215
+ self.n_words = kwargs["n_words"]
216
+
217
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
218
+
219
+
220
+ class FlaubertOnnxConfig(OnnxConfig):
221
+ @property
222
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
223
+ if self.task == "multiple-choice":
224
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
225
+ else:
226
+ dynamic_axis = {0: "batch", 1: "sequence"}
227
+ return OrderedDict(
228
+ [
229
+ ("input_ids", dynamic_axis),
230
+ ("attention_mask", dynamic_axis),
231
+ ]
232
+ )
233
+
234
+
235
+ __all__ = ["FlaubertConfig", "FlaubertOnnxConfig"]
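A minimal sketch of instantiating a randomly initialised FlauBERT model from this configuration class (no pretrained weights are downloaded); the chosen sizes are arbitrary.

```python
from transformers import FlaubertConfig, FlaubertModel

config = FlaubertConfig(emb_dim=512, n_layers=6, n_heads=8)
model = FlaubertModel(config)

# hidden_size is an alias for emb_dim via the attribute_map defined above.
print(model.config.hidden_size)  # 512
```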
docs/transformers/build/lib/transformers/models/flaubert/modeling_flaubert.py ADDED
@@ -0,0 +1,1739 @@
1
+ # coding=utf-8
2
+ # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch Flaubert model, based on XLM."""
16
+
17
+ import itertools
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import Callable, Dict, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ from torch import nn
25
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
26
+
27
+ from ...activations import gelu, get_activation
28
+ from ...generation import GenerationMixin
29
+ from ...modeling_outputs import (
30
+ BaseModelOutput,
31
+ MaskedLMOutput,
32
+ MultipleChoiceModelOutput,
33
+ QuestionAnsweringModelOutput,
34
+ SequenceClassifierOutput,
35
+ TokenClassifierOutput,
36
+ )
37
+ from ...modeling_utils import PreTrainedModel
38
+ from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
39
+ from ...utils import (
40
+ ModelOutput,
41
+ add_code_sample_docstrings,
42
+ add_start_docstrings,
43
+ add_start_docstrings_to_model_forward,
44
+ logging,
45
+ replace_return_docstrings,
46
+ )
47
+ from .configuration_flaubert import FlaubertConfig
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ _CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
53
+ _CONFIG_FOR_DOC = "FlaubertConfig"
54
+
55
+
56
+ # Copied from transformers.models.xlm.modeling_xlm.create_sinusoidal_embeddings
57
+ def create_sinusoidal_embeddings(n_pos, dim, out):
58
+ position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
59
+ out.requires_grad = False
60
+ out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
61
+ out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
62
+ out.detach_()
63
+
64
+
65
+ # Copied from transformers.models.xlm.modeling_xlm.get_masks
66
+ def get_masks(slen, lengths, causal, padding_mask=None):
67
+ """
68
+ Generate hidden states mask, and optionally an attention mask.
69
+ """
70
+ alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
71
+ if padding_mask is not None:
72
+ mask = padding_mask
73
+ else:
74
+ assert lengths.max().item() <= slen
75
+ mask = alen < lengths[:, None]
76
+
77
+ # attention mask is the same as mask, or triangular inferior attention (causal)
78
+ bs = lengths.size(0)
79
+ if causal:
80
+ attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
81
+ else:
82
+ attn_mask = mask
83
+
84
+ # sanity check
85
+ assert mask.size() == (bs, slen)
86
+ assert causal is False or attn_mask.size() == (bs, slen, slen)
87
+
88
+ return mask, attn_mask
89
+
90
+
91
+ # Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention
92
+ class MultiHeadAttention(nn.Module):
93
+ NEW_ID = itertools.count()
94
+
95
+ def __init__(self, n_heads, dim, config):
96
+ super().__init__()
97
+ self.layer_id = next(MultiHeadAttention.NEW_ID)
98
+ self.dim = dim
99
+ self.n_heads = n_heads
100
+ self.dropout = config.attention_dropout
101
+ assert self.dim % self.n_heads == 0
102
+
103
+ self.q_lin = nn.Linear(dim, dim)
104
+ self.k_lin = nn.Linear(dim, dim)
105
+ self.v_lin = nn.Linear(dim, dim)
106
+ self.out_lin = nn.Linear(dim, dim)
107
+ self.pruned_heads = set()
108
+
109
+ def prune_heads(self, heads):
110
+ attention_head_size = self.dim // self.n_heads
111
+ if len(heads) == 0:
112
+ return
113
+ heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
114
+ # Prune linear layers
115
+ self.q_lin = prune_linear_layer(self.q_lin, index)
116
+ self.k_lin = prune_linear_layer(self.k_lin, index)
117
+ self.v_lin = prune_linear_layer(self.v_lin, index)
118
+ self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
119
+ # Update hyper params
120
+ self.n_heads = self.n_heads - len(heads)
121
+ self.dim = attention_head_size * self.n_heads
122
+ self.pruned_heads = self.pruned_heads.union(heads)
123
+
124
+ def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
125
+ """
126
+ Self-attention (if kv is None) or attention over source sentence (provided by kv).
127
+ """
128
+ # Input is (bs, qlen, dim)
129
+ # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
130
+ bs, qlen, dim = input.size()
131
+ if kv is None:
132
+ klen = qlen if cache is None else cache["slen"] + qlen
133
+ else:
134
+ klen = kv.size(1)
135
+ # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
136
+ n_heads = self.n_heads
137
+ dim_per_head = self.dim // n_heads
138
+ mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
139
+
140
+ def shape(x):
141
+ """projection"""
142
+ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
143
+
144
+ def unshape(x):
145
+ """compute context"""
146
+ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
147
+
148
+ q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head)
149
+ if kv is None:
150
+ k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head)
151
+ v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head)
152
+ elif cache is None or self.layer_id not in cache:
153
+ k = v = kv
154
+ k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head)
155
+ v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head)
156
+
157
+ if cache is not None:
158
+ if self.layer_id in cache:
159
+ if kv is None:
160
+ k_, v_ = cache[self.layer_id]
161
+ k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head)
162
+ v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head)
163
+ else:
164
+ k, v = cache[self.layer_id]
165
+ cache[self.layer_id] = (k, v)
166
+
167
+ q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
168
+ scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen)
169
+ mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
170
+ scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)
171
+
172
+ weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen)
173
+ weights = nn.functional.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen)
174
+
175
+ # Mask heads if we want to
176
+ if head_mask is not None:
177
+ weights = weights * head_mask
178
+
179
+ context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head)
180
+ context = unshape(context) # (bs, qlen, dim)
181
+
182
+ outputs = (self.out_lin(context),)
183
+ if output_attentions:
184
+ outputs = outputs + (weights,)
185
+ return outputs
186
+
187
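+ def _example_multi_head_attention():
+     # Illustrative sketch, not part of the original file: exercising the attention block
+     # above on dummy tensors. SimpleNamespace stands in for a FlaubertConfig; only
+     # `attention_dropout` is read by MultiHeadAttention.
+     from types import SimpleNamespace
+     attn = MultiHeadAttention(n_heads=4, dim=32, config=SimpleNamespace(attention_dropout=0.0))
+     x = torch.randn(2, 5, 32)                  # (bs, qlen, dim)
+     mask = torch.ones(2, 5, dtype=torch.long)  # (bs, klen), 1 = attend, 0 = ignore
+     (context,) = attn(x, mask)
+     return context.shape                       # expected: torch.Size([2, 5, 32])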
+
188
+ # Copied from transformers.models.xlm.modeling_xlm.TransformerFFN
189
+ class TransformerFFN(nn.Module):
190
+ def __init__(self, in_dim, dim_hidden, out_dim, config):
191
+ super().__init__()
192
+ self.dropout = config.dropout
193
+ self.lin1 = nn.Linear(in_dim, dim_hidden)
194
+ self.lin2 = nn.Linear(dim_hidden, out_dim)
195
+ self.act = gelu if config.gelu_activation else nn.functional.relu
196
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
197
+ self.seq_len_dim = 1
198
+
199
+ def forward(self, input):
200
+ return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
201
+
202
+ def ff_chunk(self, input):
203
+ x = self.lin1(input)
204
+ x = self.act(x)
205
+ x = self.lin2(x)
206
+ x = nn.functional.dropout(x, p=self.dropout, training=self.training)
207
+ return x
208
+
209
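+ def _example_transformer_ffn():
+     # Illustrative sketch, not part of the original file: the FFN above runs through
+     # `apply_chunking_to_forward`, so with a chunk size set the sequence dimension is
+     # processed in slices while the result matches a single full pass.
+     from types import SimpleNamespace
+     cfg = SimpleNamespace(dropout=0.0, gelu_activation=True, chunk_size_feed_forward=2)
+     ffn = TransformerFFN(in_dim=32, dim_hidden=64, out_dim=32, config=cfg).eval()
+     x = torch.randn(1, 6, 32)
+     return torch.allclose(ffn(x), ffn.ff_chunk(x))  # expected: True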
+
210
+ FLAUBERT_START_DOCSTRING = r"""
211
+
212
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
213
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
214
+ etc.)
215
+
216
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
217
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
218
+ and behavior.
219
+
220
+ Parameters:
221
+ config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
222
+ Initializing with a config file does not load the weights associated with the model, only the
223
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
224
+ """
225
+
226
+ FLAUBERT_INPUTS_DOCSTRING = r"""
227
+ Args:
228
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
229
+ Indices of input sequence tokens in the vocabulary.
230
+
231
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
232
+ [`PreTrainedTokenizer.__call__`] for details.
233
+
234
+ [What are input IDs?](../glossary#input-ids)
235
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
236
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
237
+
238
+ - 1 for tokens that are **not masked**,
239
+ - 0 for tokens that are **masked**.
240
+
241
+ [What are attention masks?](../glossary#attention-mask)
242
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
243
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
244
+ 1]`:
245
+
246
+ - 0 corresponds to a *sentence A* token,
247
+ - 1 corresponds to a *sentence B* token.
248
+
249
+ [What are token type IDs?](../glossary#token-type-ids)
250
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
251
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
252
+ config.max_position_embeddings - 1]`.
253
+
254
+ [What are position IDs?](../glossary#position-ids)
255
+ lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
256
+ Length of each sentence that can be used to avoid performing attention on padding token indices. You can
257
+ also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
258
+ `[0, ..., input_ids.size(-1)]`.
259
+ cache (`Dict[str, torch.FloatTensor]`, *optional*):
260
+ Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the
261
+ attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
262
+ decoding. The dictionary object will be modified in-place during the forward pass to add newly computed
263
+ hidden-states.
264
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
265
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
266
+
267
+ - 1 indicates the head is **not masked**,
268
+ - 0 indicates the head is **masked**.
269
+
270
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
271
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
272
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
273
+ model's internal embedding lookup matrix.
274
+ output_attentions (`bool`, *optional*):
275
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
276
+ tensors for more detail.
277
+ output_hidden_states (`bool`, *optional*):
278
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
279
+ more detail.
280
+ return_dict (`bool`, *optional*):
281
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
282
+ """
283
+
284
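+ def _example_inputs():
+     # Illustrative sketch, not part of the original file: the relationship between
+     # `attention_mask` and `lengths` described in the docstring above; either one can be
+     # used to tell the model where padding starts (0 plays the pad index here).
+     input_ids = torch.tensor([[7, 6, 5, 0, 0], [1, 3, 4, 6, 0]])
+     attention_mask = (input_ids != 0).long()
+     lengths = attention_mask.sum(dim=1)
+     return attention_mask, lengths  # lengths expected: tensor([3, 4])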
+
285
289
+ # Copied from transformers.models.xlm.modeling_xlm.XLMPredLayer with XLM->Flaubert
290
+ class FlaubertPredLayer(nn.Module):
291
+ """
292
+ Prediction layer (cross_entropy or adaptive_softmax).
293
+ """
294
+
295
+ def __init__(self, config):
296
+ super().__init__()
297
+ self.asm = config.asm
298
+ self.n_words = config.n_words
299
+ self.pad_index = config.pad_index
300
+ dim = config.emb_dim
301
+
302
+ if config.asm is False:
303
+ self.proj = nn.Linear(dim, config.n_words, bias=True)
304
+ else:
305
+ self.proj = nn.AdaptiveLogSoftmaxWithLoss(
306
+ in_features=dim,
307
+ n_classes=config.n_words,
308
+ cutoffs=config.asm_cutoffs,
309
+ div_value=config.asm_div_value,
310
+ head_bias=True, # default is False
311
+ )
312
+
313
+ def forward(self, x, y=None):
314
+ """Compute the loss, and optionally the scores."""
315
+ outputs = ()
316
+ if self.asm is False:
317
+ scores = self.proj(x)
318
+ outputs = (scores,) + outputs
319
+ if y is not None:
320
+ loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
321
+ outputs = (loss,) + outputs
322
+ else:
323
+ scores = self.proj.log_prob(x)
324
+ outputs = (scores,) + outputs
325
+ if y is not None:
326
+ _, loss = self.proj(x, y)
327
+ outputs = (loss,) + outputs
328
+
329
+ return outputs
330
+
331
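+ def _example_pred_layer():
+     # Illustrative sketch, not part of the original file: with the adaptive softmax
+     # (`asm`) path disabled, the layer above returns `(scores,)` without labels and
+     # `(loss, scores)` with labels. SimpleNamespace stands in for a FlaubertConfig.
+     from types import SimpleNamespace
+     cfg = SimpleNamespace(asm=False, n_words=11, pad_index=0, emb_dim=8)
+     layer = FlaubertPredLayer(cfg)
+     hidden = torch.randn(2, 3, 8)
+     (scores,) = layer(hidden)
+     loss, scores = layer(hidden, torch.randint(0, 11, (2, 3)))
+     return scores.shape, loss.item()  # scores expected: torch.Size([2, 3, 11])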
+
332
+ @dataclass
333
+ # Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert
334
+ class FlaubertSquadHeadOutput(ModelOutput):
335
+ """
336
+ Base class for outputs of question answering models using a [`FlaubertSQuADHead`].
337
+
338
+ Args:
339
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
340
+ Classification loss as the sum of start token, end token (and is_impossible if provided) classification
341
+ losses.
342
+ start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
343
+ Log probabilities for the top config.start_n_top start token possibilities (beam-search).
344
+ start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
345
+ Indices for the top config.start_n_top start token possibilities (beam-search).
346
+ end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
347
+ Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
348
+ (beam-search).
349
+ end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
350
+ Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
351
+ cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
352
+ Log probabilities for the `is_impossible` label of the answers.
353
+
354
+ """
355
+
356
+ loss: Optional[torch.FloatTensor] = None
357
+ start_top_log_probs: Optional[torch.FloatTensor] = None
358
+ start_top_index: Optional[torch.LongTensor] = None
359
+ end_top_log_probs: Optional[torch.FloatTensor] = None
360
+ end_top_index: Optional[torch.LongTensor] = None
361
+ cls_logits: Optional[torch.FloatTensor] = None
362
+
363
+
364
+ # Copied from transformers.models.xlm.modeling_xlm.XLMPoolerStartLogits with XLM->Flaubert
365
+ class FlaubertPoolerStartLogits(nn.Module):
366
+ """
367
+ Compute SQuAD start logits from sequence hidden states.
368
+
369
+ Args:
370
+ config ([`FlaubertConfig`]):
371
+ The config used by the model, will be used to grab the `hidden_size` of the model.
372
+ """
373
+
374
+ def __init__(self, config: FlaubertConfig):
375
+ super().__init__()
376
+ self.dense = nn.Linear(config.hidden_size, 1)
377
+
378
+ def forward(
379
+ self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
380
+ ) -> torch.FloatTensor:
381
+ """
382
+ Args:
383
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
384
+ The final hidden states of the model.
385
+ p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
386
+ Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the token
387
+ should be masked.
388
+
389
+ Returns:
390
+ `torch.FloatTensor`: The start logits for SQuAD.
391
+ """
392
+ x = self.dense(hidden_states).squeeze(-1)
393
+
394
+ if p_mask is not None:
395
+ if p_mask.dtype == torch.float16:
396
+ x = x * (1 - p_mask) - 65500 * p_mask
397
+ else:
398
+ x = x * (1 - p_mask) - 1e30 * p_mask
399
+
400
+ return x
401
+
402
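+ def _example_pooler_start_logits():
+     # Illustrative sketch, not part of the original file: masked positions receive a very
+     # large negative logit (about -1e30, or -65500 in float16) so they vanish after the
+     # softmax. SimpleNamespace stands in for a FlaubertConfig.
+     from types import SimpleNamespace
+     pooler = FlaubertPoolerStartLogits(SimpleNamespace(hidden_size=8))
+     hidden = torch.randn(2, 4, 8)
+     p_mask = torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0]])
+     logits = pooler(hidden, p_mask=p_mask)
+     return bool(logits[0, 2] < -1e29)  # expected: True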
+
403
+ # Copied from transformers.models.xlm.modeling_xlm.XLMPoolerEndLogits with XLM->Flaubert
404
+ class FlaubertPoolerEndLogits(nn.Module):
405
+ """
406
+ Compute SQuAD end logits from sequence hidden states.
407
+
408
+ Args:
409
+ config ([`FlaubertConfig`]):
410
+ The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
411
+ to use.
412
+ """
413
+
414
+ def __init__(self, config: FlaubertConfig):
415
+ super().__init__()
416
+ self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
417
+ self.activation = nn.Tanh()
418
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
419
+ self.dense_1 = nn.Linear(config.hidden_size, 1)
420
+
421
+ def forward(
422
+ self,
423
+ hidden_states: torch.FloatTensor,
424
+ start_states: Optional[torch.FloatTensor] = None,
425
+ start_positions: Optional[torch.LongTensor] = None,
426
+ p_mask: Optional[torch.FloatTensor] = None,
427
+ ) -> torch.FloatTensor:
428
+ """
429
+ Args:
430
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
431
+ The final hidden states of the model.
432
+ start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
433
+ The hidden states of the first tokens for the labeled span.
434
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
435
+ The position of the first token for the labeled span.
436
+ p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
437
+ Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
438
+ should be masked.
439
+
440
+ <Tip>
441
+
442
+ One of `start_states` or `start_positions` should not be `None`. If both are set, `start_positions` overrides
443
+ `start_states`.
444
+
445
+ </Tip>
446
+
447
+ Returns:
448
+ `torch.FloatTensor`: The end logits for SQuAD.
449
+ """
450
+ assert start_states is not None or start_positions is not None, (
451
+ "One of start_states, start_positions should be not None"
452
+ )
453
+ if start_positions is not None:
454
+ slen, hsz = hidden_states.shape[-2:]
455
+ start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
456
+ start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
457
+ start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
458
+
459
+ x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
460
+ x = self.activation(x)
461
+ x = self.LayerNorm(x)
462
+ x = self.dense_1(x).squeeze(-1)
463
+
464
+ if p_mask is not None:
465
+ if p_mask.dtype == torch.float16:
466
+ x = x * (1 - p_mask) - 65500 * p_mask
467
+ else:
468
+ x = x * (1 - p_mask) - 1e30 * p_mask
469
+
470
+ return x
471
+
472
+
473
+ # Copied from transformers.models.xlm.modeling_xlm.XLMPoolerAnswerClass with XLM->Flaubert
474
+ class FlaubertPoolerAnswerClass(nn.Module):
475
+ """
476
+ Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
477
+
478
+ Args:
479
+ config ([`FlaubertConfig`]):
480
+ The config used by the model, will be used to grab the `hidden_size` of the model.
481
+ """
482
+
483
+ def __init__(self, config: FlaubertConfig):
484
+ super().__init__()
485
+ self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
486
+ self.activation = nn.Tanh()
487
+ self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
488
+
489
+ def forward(
490
+ self,
491
+ hidden_states: torch.FloatTensor,
492
+ start_states: Optional[torch.FloatTensor] = None,
493
+ start_positions: Optional[torch.LongTensor] = None,
494
+ cls_index: Optional[torch.LongTensor] = None,
495
+ ) -> torch.FloatTensor:
496
+ """
497
+ Args:
498
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
499
+ The final hidden states of the model.
500
+ start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
501
+ The hidden states of the first tokens for the labeled span.
502
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
503
+ The position of the first token for the labeled span.
504
+ cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
505
+ Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
506
+
507
+ <Tip>
508
+
509
+ One of `start_states` or `start_positions` should not be `None`. If both are set, `start_positions` overrides
510
+ `start_states`.
511
+
512
+ </Tip>
513
+
514
+ Returns:
515
+ `torch.FloatTensor`: The SQuAD 2.0 answer class.
516
+ """
517
+ # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
518
+ hsz = hidden_states.shape[-1]
519
+ assert start_states is not None or start_positions is not None, (
520
+ "One of start_states, start_positions should be not None"
521
+ )
522
+ if start_positions is not None:
523
+ start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
524
+ start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
525
+
526
+ if cls_index is not None:
527
+ cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
528
+ cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
529
+ else:
530
+ cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
531
+
532
+ x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
533
+ x = self.activation(x)
534
+ x = self.dense_1(x).squeeze(-1)
535
+
536
+ return x
537
+
538
+
539
+ # Copied from transformers.models.xlm.modeling_xlm.XLMSQuADHead with XLM->Flaubert
540
+ class FlaubertSQuADHead(nn.Module):
541
+ r"""
542
+ A SQuAD head inspired by XLNet.
543
+
544
+ Args:
545
+ config ([`FlaubertConfig`]):
546
+ The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
547
+ to use.
548
+ """
549
+
550
+ def __init__(self, config: FlaubertConfig):
551
+ super().__init__()
552
+ self.start_n_top = config.start_n_top
553
+ self.end_n_top = config.end_n_top
554
+
555
+ self.start_logits = FlaubertPoolerStartLogits(config)
556
+ self.end_logits = FlaubertPoolerEndLogits(config)
557
+ self.answer_class = FlaubertPoolerAnswerClass(config)
558
+
559
+ @replace_return_docstrings(output_type=FlaubertSquadHeadOutput, config_class=FlaubertConfig)
560
+ def forward(
561
+ self,
562
+ hidden_states: torch.FloatTensor,
563
+ start_positions: Optional[torch.LongTensor] = None,
564
+ end_positions: Optional[torch.LongTensor] = None,
565
+ cls_index: Optional[torch.LongTensor] = None,
566
+ is_impossible: Optional[torch.LongTensor] = None,
567
+ p_mask: Optional[torch.FloatTensor] = None,
568
+ return_dict: bool = False,
569
+ ) -> Union[FlaubertSquadHeadOutput, Tuple[torch.FloatTensor]]:
570
+ """
571
+ Args:
572
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
573
+ Final hidden states of the model on the sequence tokens.
574
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
575
+ Positions of the first token for the labeled span.
576
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
577
+ Positions of the last token for the labeled span.
578
+ cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
579
+ Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
580
+ is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
581
+ Whether the question has a possible answer in the paragraph or not.
582
+ p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
583
+ Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
584
+ should be masked.
585
+ return_dict (`bool`, *optional*, defaults to `False`):
586
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
587
+
588
+ Returns:
589
+ """
590
+ start_logits = self.start_logits(hidden_states, p_mask=p_mask)
591
+
592
+ if start_positions is not None and end_positions is not None:
593
+ # If we are on multi-GPU, let's remove the dimension added by batch splitting
594
+ for x in (start_positions, end_positions, cls_index, is_impossible):
595
+ if x is not None and x.dim() > 1:
596
+ x.squeeze_(-1)
597
+
598
+ # during training, compute the end logits based on the ground truth of the start position
599
+ end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
600
+
601
+ loss_fct = CrossEntropyLoss()
602
+ start_loss = loss_fct(start_logits, start_positions)
603
+ end_loss = loss_fct(end_logits, end_positions)
604
+ total_loss = (start_loss + end_loss) / 2
605
+
606
+ if cls_index is not None and is_impossible is not None:
607
+ # Predict answerability from the representation of CLS and START
608
+ cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
609
+ loss_fct_cls = nn.BCEWithLogitsLoss()
610
+ cls_loss = loss_fct_cls(cls_logits, is_impossible)
611
+
612
+ # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
613
+ total_loss += cls_loss * 0.5
614
+
615
+ return FlaubertSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
616
+
617
+ else:
618
+ # during inference, compute the end logits based on beam search
619
+ bsz, slen, hsz = hidden_states.size()
620
+ start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen)
621
+
622
+ start_top_log_probs, start_top_index = torch.topk(
623
+ start_log_probs, self.start_n_top, dim=-1
624
+ ) # shape (bsz, start_n_top)
625
+ start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
626
+ start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
627
+ start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
628
+
629
+ hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
630
+ start_states
631
+ ) # shape (bsz, slen, start_n_top, hsz)
632
+ p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
633
+ end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
634
+ end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
635
+
636
+ end_top_log_probs, end_top_index = torch.topk(
637
+ end_log_probs, self.end_n_top, dim=1
638
+ ) # shape (bsz, end_n_top, start_n_top)
639
+ end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
640
+ end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
641
+
642
+ start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
643
+ cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
644
+
645
+ if not return_dict:
646
+ return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
647
+ else:
648
+ return FlaubertSquadHeadOutput(
649
+ start_top_log_probs=start_top_log_probs,
650
+ start_top_index=start_top_index,
651
+ end_top_log_probs=end_top_log_probs,
652
+ end_top_index=end_top_index,
653
+ cls_logits=cls_logits,
654
+ )
655
+
656
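+ def _example_squad_head():
+     # Illustrative sketch, not part of the original file: the head above returns a single
+     # averaged loss during training and beam-search candidates (top start/end indices with
+     # their log-probabilities) at inference. SimpleNamespace stands in for a FlaubertConfig.
+     from types import SimpleNamespace
+     cfg = SimpleNamespace(hidden_size=8, layer_norm_eps=1e-12, start_n_top=2, end_n_top=2)
+     head = FlaubertSQuADHead(cfg)
+     hidden = torch.randn(2, 6, 8)
+     (loss,) = head(hidden, start_positions=torch.tensor([1, 2]), end_positions=torch.tensor([3, 4]))
+     start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = head(hidden)
+     return loss.item(), start_top_index.shape  # start_top_index expected: torch.Size([2, 2])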
+
657
+ # Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Flaubert
658
+ class FlaubertSequenceSummary(nn.Module):
659
+ r"""
660
+ Compute a single vector summary of a sequence hidden states.
661
+
662
+ Args:
663
+ config ([`FlaubertConfig`]):
664
+ The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
665
+ config class of your model for the default values it uses):
666
+
667
+ - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
668
+
669
+ - `"last"` -- Take the last token hidden state (like XLNet)
670
+ - `"first"` -- Take the first token hidden state (like Bert)
671
+ - `"mean"` -- Take the mean of all tokens hidden states
672
+ - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
673
+ - `"attn"` -- Not implemented now, use multi-head attention
674
+
675
+ - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
676
+ - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
677
+ (otherwise to `config.hidden_size`).
678
+ - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
679
+ another string or `None` will add no activation.
680
+ - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
681
+ - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and activation.
682
+ """
683
+
684
+ def __init__(self, config: FlaubertConfig):
685
+ super().__init__()
686
+
687
+ self.summary_type = getattr(config, "summary_type", "last")
688
+ if self.summary_type == "attn":
689
+ # We should use a standard multi-head attention module with absolute positional embedding for that.
690
+ # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
691
+ # We can probably just use the multi-head attention module of PyTorch >=1.1.0
692
+ raise NotImplementedError
693
+
694
+ self.summary = nn.Identity()
695
+ if hasattr(config, "summary_use_proj") and config.summary_use_proj:
696
+ if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
697
+ num_classes = config.num_labels
698
+ else:
699
+ num_classes = config.hidden_size
700
+ self.summary = nn.Linear(config.hidden_size, num_classes)
701
+
702
+ activation_string = getattr(config, "summary_activation", None)
703
+ self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()
704
+
705
+ self.first_dropout = nn.Identity()
706
+ if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
707
+ self.first_dropout = nn.Dropout(config.summary_first_dropout)
708
+
709
+ self.last_dropout = nn.Identity()
710
+ if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
711
+ self.last_dropout = nn.Dropout(config.summary_last_dropout)
712
+
713
+ def forward(
714
+ self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
715
+ ) -> torch.FloatTensor:
716
+ """
717
+ Compute a single vector summary of a sequence hidden states.
718
+
719
+ Args:
720
+ hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
721
+ The hidden states of the last layer.
722
+ cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
723
+ Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
724
+
725
+ Returns:
726
+ `torch.FloatTensor`: The summary of the sequence hidden states.
727
+ """
728
+ if self.summary_type == "last":
729
+ output = hidden_states[:, -1]
730
+ elif self.summary_type == "first":
731
+ output = hidden_states[:, 0]
732
+ elif self.summary_type == "mean":
733
+ output = hidden_states.mean(dim=1)
734
+ elif self.summary_type == "cls_index":
735
+ if cls_index is None:
736
+ cls_index = torch.full_like(
737
+ hidden_states[..., :1, :],
738
+ hidden_states.shape[-2] - 1,
739
+ dtype=torch.long,
740
+ )
741
+ else:
742
+ cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
743
+ cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
744
+ # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
745
+ output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
746
+ elif self.summary_type == "attn":
747
+ raise NotImplementedError
748
+
749
+ output = self.first_dropout(output)
750
+ output = self.summary(output)
751
+ output = self.activation(output)
752
+ output = self.last_dropout(output)
753
+
754
+ return output
755
+
756
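+ def _example_sequence_summary():
+     # Illustrative sketch, not part of the original file: with `summary_type="first"` the
+     # summary above is the first-token hidden state, optionally projected. SimpleNamespace
+     # stands in for a FlaubertConfig; only the summary_* fields below are read.
+     from types import SimpleNamespace
+     cfg = SimpleNamespace(
+         summary_type="first",
+         summary_use_proj=True,
+         summary_proj_to_labels=False,
+         num_labels=0,
+         hidden_size=8,
+         summary_activation=None,
+         summary_first_dropout=0.0,
+         summary_last_dropout=0.0,
+     )
+     summary = FlaubertSequenceSummary(cfg)
+     return summary(torch.randn(2, 5, 8)).shape  # expected: torch.Size([2, 8])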
+
757
+ # Copied from transformers.models.xlm.modeling_xlm.XLMPreTrainedModel with XLM->Flaubert
758
+ class FlaubertPreTrainedModel(PreTrainedModel):
759
+ """
760
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
761
+ models.
762
+ """
763
+
764
+ config_class = FlaubertConfig
765
+ load_tf_weights = None
766
+ base_model_prefix = "transformer"
767
+
768
+ def __init__(self, *inputs, **kwargs):
769
+ super().__init__(*inputs, **kwargs)
770
+
771
+ @property
772
+ def dummy_inputs(self):
773
+ inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
774
+ attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
775
+ if self.config.use_lang_emb and self.config.n_langs > 1:
776
+ langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
777
+ else:
778
+ langs_list = None
779
+ return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
780
+
781
+ def _init_weights(self, module):
782
+ """Initialize the weights."""
783
+ if isinstance(module, nn.Embedding):
784
+ if self.config is not None and self.config.embed_init_std is not None:
785
+ nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
786
+ if module.padding_idx is not None:
787
+ module.weight.data[module.padding_idx].zero_()
788
+ if isinstance(module, nn.Linear):
789
+ if self.config is not None and self.config.init_std is not None:
790
+ nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
791
+ if module.bias is not None:
792
+ nn.init.constant_(module.bias, 0.0)
793
+ if isinstance(module, nn.LayerNorm):
794
+ module.bias.data.zero_()
795
+ module.weight.data.fill_(1.0)
796
+ if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings:
797
+ create_sinusoidal_embeddings(
798
+ self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight
799
+ )
800
+
801
+
802
+ @add_start_docstrings(
+ "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
+ FLAUBERT_START_DOCSTRING,
+ )
+ class FlaubertModel(FlaubertPreTrainedModel):
803
+ def __init__(self, config): # , dico, is_encoder, with_output):
804
+ super().__init__(config)
805
+
806
+ # encoder / decoder, output layer
807
+ self.is_encoder = config.is_encoder
808
+ self.is_decoder = not config.is_encoder
809
+ if self.is_decoder:
810
+ raise NotImplementedError("Currently Flaubert can only be used as an encoder")
811
+ # self.with_output = with_output
812
+ self.causal = config.causal
813
+
814
+ # dictionary / languages
815
+ self.n_langs = config.n_langs
816
+ self.use_lang_emb = config.use_lang_emb
817
+ self.n_words = config.n_words
818
+ self.eos_index = config.eos_index
819
+ self.pad_index = config.pad_index
820
+ # self.dico = dico
821
+ # self.id2lang = config.id2lang
822
+ # self.lang2id = config.lang2id
823
+ # assert len(self.dico) == self.n_words
824
+ # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
825
+
826
+ # model parameters
827
+ self.dim = config.emb_dim # 512 by default
828
+ self.hidden_dim = self.dim * 4 # 2048 by default
829
+ self.n_heads = config.n_heads # 8 by default
830
+ self.n_layers = config.n_layers
831
+ self.dropout = config.dropout
832
+ self.attention_dropout = config.attention_dropout
833
+ assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
834
+
835
+ # embeddings
836
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
837
+ if config.n_langs > 1 and config.use_lang_emb:
838
+ self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
839
+ self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
840
+ self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
841
+
842
+ # transformer layers
843
+ self.attentions = nn.ModuleList()
844
+ self.layer_norm1 = nn.ModuleList()
845
+ self.ffns = nn.ModuleList()
846
+ self.layer_norm2 = nn.ModuleList()
847
+ # if self.is_decoder:
848
+ # self.layer_norm15 = nn.ModuleList()
849
+ # self.encoder_attn = nn.ModuleList()
850
+
851
+ for _ in range(self.n_layers):
852
+ self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
853
+ self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
854
+ # if self.is_decoder:
855
+ # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
856
+ # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
857
+ self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
858
+ self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
859
+
860
+ if hasattr(config, "pruned_heads"):
861
+ pruned_heads = config.pruned_heads.copy().items()
862
+ config.pruned_heads = {}
863
+ for layer, heads in pruned_heads:
864
+ if self.attentions[int(layer)].n_heads == config.n_heads:
865
+ self.prune_heads({int(layer): list(map(int, heads))})
866
+
867
+ # Initialize weights and apply final processing
868
+ self.post_init()
869
+
870
+ self.layerdrop = getattr(config, "layerdrop", 0.0)
871
+ self.pre_norm = getattr(config, "pre_norm", False)
872
+ self.register_buffer(
873
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
874
+ )
875
+
876
+ # Copied from transformers.models.xlm.modeling_xlm.XLMModel.get_input_embeddings
877
+ def get_input_embeddings(self):
878
+ return self.embeddings
879
+
880
+ # Copied from transformers.models.xlm.modeling_xlm.XLMModel.set_input_embeddings
881
+ def set_input_embeddings(self, new_embeddings):
882
+ self.embeddings = new_embeddings
883
+
884
+ # Copied from transformers.models.xlm.modeling_xlm.XLMModel._prune_heads
885
+ def _prune_heads(self, heads_to_prune):
886
+ """
887
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
888
+ class PreTrainedModel
889
+ """
890
+ for layer, heads in heads_to_prune.items():
891
+ self.attentions[layer].prune_heads(heads)
892
+
893
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
894
+ @add_code_sample_docstrings(
895
+ checkpoint=_CHECKPOINT_FOR_DOC,
896
+ output_type=BaseModelOutput,
897
+ config_class=_CONFIG_FOR_DOC,
898
+ )
899
+ def forward(
900
+ self,
901
+ input_ids: Optional[torch.LongTensor] = None,
902
+ attention_mask: Optional[torch.FloatTensor] = None,
903
+ langs: Optional[torch.Tensor] = None,
904
+ token_type_ids: Optional[torch.LongTensor] = None,
905
+ position_ids: Optional[torch.LongTensor] = None,
906
+ lengths: Optional[torch.LongTensor] = None,
907
+ cache: Optional[Dict[str, torch.FloatTensor]] = None,
908
+ head_mask: Optional[torch.FloatTensor] = None,
909
+ inputs_embeds: Optional[torch.FloatTensor] = None,
910
+ output_attentions: Optional[bool] = None,
911
+ output_hidden_states: Optional[bool] = None,
912
+ return_dict: Optional[bool] = None,
913
+ ) -> Union[Tuple, BaseModelOutput]:
914
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
915
+ output_hidden_states = (
916
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
917
+ )
918
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
919
+
920
+ # removed: src_enc=None, src_len=None
921
+ if input_ids is not None:
922
+ bs, slen = input_ids.size()
923
+ else:
924
+ bs, slen = inputs_embeds.size()[:-1]
925
+
926
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
927
+
928
+ if lengths is None:
929
+ if input_ids is not None:
930
+ lengths = (input_ids != self.pad_index).sum(dim=1).long()
931
+ else:
932
+ lengths = torch.tensor([slen] * bs, device=device)
933
+ # mask = input_ids != self.pad_index
934
+
935
+ # check inputs
936
+ assert lengths.size(0) == bs
937
+ assert lengths.max().item() <= slen
938
+ # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
939
+ # assert (src_enc is None) == (src_len is None)
940
+ # if src_enc is not None:
941
+ # assert self.is_decoder
942
+ # assert src_enc.size(0) == bs
943
+
944
+ # generate masks
945
+ mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
946
+ # if self.is_decoder and src_enc is not None:
947
+ # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
948
+
949
+ # Setting the position ids to the buffer registered in the constructor helps
950
+ # when tracing the model without passing position ids, and solves
951
+ # issues similar to issue #5664
952
+ if position_ids is None:
953
+ if hasattr(self, "position_ids"):
954
+ position_ids = self.position_ids[:, :slen]
955
+ position_ids = position_ids.expand((bs, slen))
956
+ else:
957
+ position_ids = torch.arange(slen, dtype=torch.long, device=device)
958
+ position_ids = position_ids.unsqueeze(0).expand((bs, slen))
959
+ else:
960
+ assert position_ids.size() == (bs, slen) # (slen, bs)
961
+ # position_ids = position_ids.transpose(0, 1)
962
+
963
+ # langs
964
+ if langs is not None:
965
+ assert langs.size() == (bs, slen) # (slen, bs)
966
+ # langs = langs.transpose(0, 1)
967
+
968
+ # Prepare head mask if needed
969
+ head_mask = self.get_head_mask(head_mask, self.config.n_layers)
970
+
971
+ # do not recompute cached elements
972
+ if cache is not None and input_ids is not None:
973
+ _slen = slen - cache["slen"]
974
+ input_ids = input_ids[:, -_slen:]
975
+ position_ids = position_ids[:, -_slen:]
976
+ if langs is not None:
977
+ langs = langs[:, -_slen:]
978
+ mask = mask[:, -_slen:]
979
+ attn_mask = attn_mask[:, -_slen:]
980
+
981
+ # embeddings
982
+ if inputs_embeds is None:
983
+ inputs_embeds = self.embeddings(input_ids)
984
+
985
+ tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
986
+ if langs is not None and self.use_lang_emb and self.config.n_langs > 1:
987
+ tensor = tensor + self.lang_embeddings(langs)
988
+ if token_type_ids is not None:
989
+ tensor = tensor + self.embeddings(token_type_ids)
990
+ tensor = self.layer_norm_emb(tensor)
991
+ tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
992
+ tensor *= mask.unsqueeze(-1).to(tensor.dtype)
993
+
994
+ # transformer layers
995
+ hidden_states = () if output_hidden_states else None
996
+ attentions = () if output_attentions else None
997
+ for i in range(self.n_layers):
998
+ # LayerDrop
999
+ if self.training:
1000
+ dropout_probability = torch.rand([])
1001
+ if dropout_probability < self.layerdrop:
1002
+ continue
1003
+
1004
+ if output_hidden_states:
1005
+ hidden_states = hidden_states + (tensor,)
1006
+
1007
+ # self attention
1008
+ if not self.pre_norm:
1009
+ attn_outputs = self.attentions[i](
1010
+ tensor,
1011
+ attn_mask,
1012
+ cache=cache,
1013
+ head_mask=head_mask[i],
1014
+ output_attentions=output_attentions,
1015
+ )
1016
+ attn = attn_outputs[0]
1017
+ if output_attentions:
1018
+ attentions = attentions + (attn_outputs[1],)
1019
+ attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
1020
+ tensor = tensor + attn
1021
+ tensor = self.layer_norm1[i](tensor)
1022
+ else:
1023
+ tensor_normalized = self.layer_norm1[i](tensor)
1024
+ attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i])
1025
+ attn = attn_outputs[0]
1026
+ if output_attentions:
1027
+ attentions = attentions + (attn_outputs[1],)
1028
+ attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
1029
+ tensor = tensor + attn
1030
+
1031
+ # encoder attention (for decoder only)
1032
+ # if self.is_decoder and src_enc is not None:
1033
+ # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
1034
+ # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
1035
+ # tensor = tensor + attn
1036
+ # tensor = self.layer_norm15[i](tensor)
1037
+
1038
+ # FFN
1039
+ if not self.pre_norm:
1040
+ tensor = tensor + self.ffns[i](tensor)
1041
+ tensor = self.layer_norm2[i](tensor)
1042
+ else:
1043
+ tensor_normalized = self.layer_norm2[i](tensor)
1044
+ tensor = tensor + self.ffns[i](tensor_normalized)
1045
+
1046
+ tensor *= mask.unsqueeze(-1).to(tensor.dtype)
1047
+
1048
+ # Add last hidden state
1049
+ if output_hidden_states:
1050
+ hidden_states = hidden_states + (tensor,)
1051
+
1052
+ # update cache length
1053
+ if cache is not None:
1054
+ cache["slen"] += tensor.size(1)
1055
+
1056
+ # move back sequence length to dimension 0
1057
+ # tensor = tensor.transpose(0, 1)
1058
+
1059
+ if not return_dict:
1060
+ return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
1061
+
1062
+ return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
1063
+
1064
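+ def _example_flaubert_model():
+     # Illustrative sketch, not part of the original file: a tiny, randomly initialised
+     # FlaubertModel. The FlaubertConfig keyword names used here (vocab_size, emb_dim,
+     # n_layers, n_heads) are assumed from the configuration class.
+     config = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4)
+     model = FlaubertModel(config)
+     input_ids = torch.randint(5, 100, (2, 7))  # avoid the special/pad indices
+     outputs = model(input_ids=input_ids)
+     return outputs.last_hidden_state.shape  # expected: torch.Size([2, 7, 32])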
+
1065
+ @add_start_docstrings(
1066
+ """
1067
+ The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
1068
+ embeddings).
1069
+ """,
1070
+ FLAUBERT_START_DOCSTRING,
1071
+ )
1072
+ # Copied from transformers.models.xlm.modeling_xlm.XLMWithLMHeadModel with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1073
+ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
1074
+ _tied_weights_keys = ["pred_layer.proj.weight"]
1075
+
1076
+ def __init__(self, config):
1077
+ super().__init__(config)
1078
+ self.transformer = FlaubertModel(config)
1079
+ self.pred_layer = FlaubertPredLayer(config)
1080
+
1081
+ # Initialize weights and apply final processing
1082
+ self.post_init()
1083
+
1084
+ def get_output_embeddings(self):
1085
+ return self.pred_layer.proj
1086
+
1087
+ def set_output_embeddings(self, new_embeddings):
1088
+ self.pred_layer.proj = new_embeddings
1089
+
1090
+ def prepare_inputs_for_generation(self, input_ids, **kwargs):
1091
+ # Overwritten -- uses a language id
1092
+
1093
+ mask_token_id = self.config.mask_token_id
1094
+ lang_id = self.config.lang_id
1095
+
1096
+ effective_batch_size = input_ids.shape[0]
1097
+ mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
1098
+ input_ids = torch.cat([input_ids, mask_token], dim=1)
1099
+ if lang_id is not None:
1100
+ langs = torch.full_like(input_ids, lang_id)
1101
+ else:
1102
+ langs = None
1103
+ return {"input_ids": input_ids, "langs": langs}
1104
+
1105
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1106
+ @add_code_sample_docstrings(
1107
+ checkpoint=_CHECKPOINT_FOR_DOC,
1108
+ output_type=MaskedLMOutput,
1109
+ config_class=_CONFIG_FOR_DOC,
1110
+ mask="<special1>",
1111
+ )
1112
+ def forward(
1113
+ self,
1114
+ input_ids: Optional[torch.Tensor] = None,
1115
+ attention_mask: Optional[torch.Tensor] = None,
1116
+ langs: Optional[torch.Tensor] = None,
1117
+ token_type_ids: Optional[torch.Tensor] = None,
1118
+ position_ids: Optional[torch.Tensor] = None,
1119
+ lengths: Optional[torch.Tensor] = None,
1120
+ cache: Optional[Dict[str, torch.Tensor]] = None,
1121
+ head_mask: Optional[torch.Tensor] = None,
1122
+ inputs_embeds: Optional[torch.Tensor] = None,
1123
+ labels: Optional[torch.Tensor] = None,
1124
+ output_attentions: Optional[bool] = None,
1125
+ output_hidden_states: Optional[bool] = None,
1126
+ return_dict: Optional[bool] = None,
1127
+ ) -> Union[Tuple, MaskedLMOutput]:
1128
+ r"""
1129
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1130
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
1131
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
1132
+ are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
1133
+ """
1134
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1135
+
1136
+ transformer_outputs = self.transformer(
1137
+ input_ids,
1138
+ attention_mask=attention_mask,
1139
+ langs=langs,
1140
+ token_type_ids=token_type_ids,
1141
+ position_ids=position_ids,
1142
+ lengths=lengths,
1143
+ cache=cache,
1144
+ head_mask=head_mask,
1145
+ inputs_embeds=inputs_embeds,
1146
+ output_attentions=output_attentions,
1147
+ output_hidden_states=output_hidden_states,
1148
+ return_dict=return_dict,
1149
+ )
1150
+
1151
+ output = transformer_outputs[0]
1152
+ outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided.
1153
+
1154
+ if not return_dict:
1155
+ return outputs + transformer_outputs[1:]
1156
+
1157
+ return MaskedLMOutput(
1158
+ loss=outputs[0] if labels is not None else None,
1159
+ logits=outputs[0] if labels is None else outputs[1],
1160
+ hidden_states=transformer_outputs.hidden_states,
1161
+ attentions=transformer_outputs.attentions,
1162
+ )
1163
+
1164
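+ def _example_lm_head_generation_inputs():
+     # Illustrative sketch, not part of the original file: `prepare_inputs_for_generation`
+     # above appends one mask token per sequence and, when the config defines `lang_id`,
+     # a matching `langs` tensor. FlaubertConfig keyword names and the `mask_token_id` /
+     # `lang_id` defaults are assumptions here.
+     config = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4)
+     model = FlaubertWithLMHeadModel(config)
+     prepared = model.prepare_inputs_for_generation(torch.randint(5, 100, (2, 3)))
+     return prepared["input_ids"].shape  # expected: torch.Size([2, 4])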
+
1165
+ @add_start_docstrings(
1166
+ """
1167
+ Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
1168
+ e.g. for GLUE tasks.
1169
+ """,
1170
+ FLAUBERT_START_DOCSTRING,
1171
+ )
1172
+ # Copied from transformers.models.xlm.modeling_xlm.XLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1173
+ class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
1174
+ def __init__(self, config):
1175
+ super().__init__(config)
1176
+ self.num_labels = config.num_labels
1177
+ self.config = config
1178
+
1179
+ self.transformer = FlaubertModel(config)
1180
+ self.sequence_summary = FlaubertSequenceSummary(config)
1181
+
1182
+ # Initialize weights and apply final processing
1183
+ self.post_init()
1184
+
1185
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1186
+ @add_code_sample_docstrings(
1187
+ checkpoint=_CHECKPOINT_FOR_DOC,
1188
+ output_type=SequenceClassifierOutput,
1189
+ config_class=_CONFIG_FOR_DOC,
1190
+ )
1191
+ def forward(
1192
+ self,
1193
+ input_ids: Optional[torch.Tensor] = None,
1194
+ attention_mask: Optional[torch.Tensor] = None,
1195
+ langs: Optional[torch.Tensor] = None,
1196
+ token_type_ids: Optional[torch.Tensor] = None,
1197
+ position_ids: Optional[torch.Tensor] = None,
1198
+ lengths: Optional[torch.Tensor] = None,
1199
+ cache: Optional[Dict[str, torch.Tensor]] = None,
1200
+ head_mask: Optional[torch.Tensor] = None,
1201
+ inputs_embeds: Optional[torch.Tensor] = None,
1202
+ labels: Optional[torch.Tensor] = None,
1203
+ output_attentions: Optional[bool] = None,
1204
+ output_hidden_states: Optional[bool] = None,
1205
+ return_dict: Optional[bool] = None,
1206
+ ) -> Union[Tuple, SequenceClassifierOutput]:
1207
+ r"""
1208
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1209
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1210
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1211
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1212
+ """
1213
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1214
+
1215
+ transformer_outputs = self.transformer(
1216
+ input_ids,
1217
+ attention_mask=attention_mask,
1218
+ langs=langs,
1219
+ token_type_ids=token_type_ids,
1220
+ position_ids=position_ids,
1221
+ lengths=lengths,
1222
+ cache=cache,
1223
+ head_mask=head_mask,
1224
+ inputs_embeds=inputs_embeds,
1225
+ output_attentions=output_attentions,
1226
+ output_hidden_states=output_hidden_states,
1227
+ return_dict=return_dict,
1228
+ )
1229
+
1230
+ output = transformer_outputs[0]
1231
+ logits = self.sequence_summary(output)
1232
+
1233
+ loss = None
1234
+ if labels is not None:
1235
+ if self.config.problem_type is None:
1236
+ if self.num_labels == 1:
1237
+ self.config.problem_type = "regression"
1238
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1239
+ self.config.problem_type = "single_label_classification"
1240
+ else:
1241
+ self.config.problem_type = "multi_label_classification"
1242
+
1243
+ if self.config.problem_type == "regression":
1244
+ loss_fct = MSELoss()
1245
+ if self.num_labels == 1:
1246
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1247
+ else:
1248
+ loss = loss_fct(logits, labels)
1249
+ elif self.config.problem_type == "single_label_classification":
1250
+ loss_fct = CrossEntropyLoss()
1251
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1252
+ elif self.config.problem_type == "multi_label_classification":
1253
+ loss_fct = BCEWithLogitsLoss()
1254
+ loss = loss_fct(logits, labels)
1255
+
1256
+ if not return_dict:
1257
+ output = (logits,) + transformer_outputs[1:]
1258
+ return ((loss,) + output) if loss is not None else output
1259
+
1260
+ return SequenceClassifierOutput(
1261
+ loss=loss,
1262
+ logits=logits,
1263
+ hidden_states=transformer_outputs.hidden_states,
1264
+ attentions=transformer_outputs.attentions,
1265
+ )
1266
+
1267
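+ def _example_sequence_classification():
+     # Illustrative sketch, not part of the original file: with integer labels and
+     # `num_labels > 1` the head above falls into single-label classification and uses a
+     # cross-entropy loss. FlaubertConfig keyword names are assumed; the model is tiny and
+     # randomly initialised.
+     config = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4, num_labels=3)
+     model = FlaubertForSequenceClassification(config)
+     out = model(input_ids=torch.randint(5, 100, (2, 6)), labels=torch.tensor([0, 2]))
+     return out.loss.item(), out.logits.shape  # logits expected: torch.Size([2, 3])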
+
1268
+ @add_start_docstrings(
1269
+ """
1270
+ Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1271
+ Named-Entity-Recognition (NER) tasks.
1272
+ """,
1273
+ FLAUBERT_START_DOCSTRING,
1274
+ )
1275
+ # Copied from transformers.models.xlm.modeling_xlm.XLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1276
+ class FlaubertForTokenClassification(FlaubertPreTrainedModel):
1277
+ def __init__(self, config):
1278
+ super().__init__(config)
1279
+ self.num_labels = config.num_labels
1280
+
1281
+ self.transformer = FlaubertModel(config)
1282
+ self.dropout = nn.Dropout(config.dropout)
1283
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1284
+
1285
+ # Initialize weights and apply final processing
1286
+ self.post_init()
1287
+
1288
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1289
+ @add_code_sample_docstrings(
1290
+ checkpoint=_CHECKPOINT_FOR_DOC,
1291
+ output_type=TokenClassifierOutput,
1292
+ config_class=_CONFIG_FOR_DOC,
1293
+ )
1294
+ def forward(
1295
+ self,
1296
+ input_ids: Optional[torch.Tensor] = None,
1297
+ attention_mask: Optional[torch.Tensor] = None,
1298
+ langs: Optional[torch.Tensor] = None,
1299
+ token_type_ids: Optional[torch.Tensor] = None,
1300
+ position_ids: Optional[torch.Tensor] = None,
1301
+ lengths: Optional[torch.Tensor] = None,
1302
+ cache: Optional[Dict[str, torch.Tensor]] = None,
1303
+ head_mask: Optional[torch.Tensor] = None,
1304
+ inputs_embeds: Optional[torch.Tensor] = None,
1305
+ labels: Optional[torch.Tensor] = None,
1306
+ output_attentions: Optional[bool] = None,
1307
+ output_hidden_states: Optional[bool] = None,
1308
+ return_dict: Optional[bool] = None,
1309
+ ) -> Union[Tuple, TokenClassifierOutput]:
1310
+ r"""
1311
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1312
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1313
+ """
1314
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1315
+
1316
+ outputs = self.transformer(
1317
+ input_ids,
1318
+ attention_mask=attention_mask,
1319
+ langs=langs,
1320
+ token_type_ids=token_type_ids,
1321
+ position_ids=position_ids,
1322
+ lengths=lengths,
1323
+ cache=cache,
1324
+ head_mask=head_mask,
1325
+ inputs_embeds=inputs_embeds,
1326
+ output_attentions=output_attentions,
1327
+ output_hidden_states=output_hidden_states,
1328
+ return_dict=return_dict,
1329
+ )
1330
+
1331
+ sequence_output = outputs[0]
1332
+
1333
+ sequence_output = self.dropout(sequence_output)
1334
+ logits = self.classifier(sequence_output)
1335
+
1336
+ loss = None
1337
+ if labels is not None:
1338
+ loss_fct = CrossEntropyLoss()
1339
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1340
+
1341
+ if not return_dict:
1342
+ output = (logits,) + outputs[1:]
1343
+ return ((loss,) + output) if loss is not None else output
1344
+
1345
+ return TokenClassifierOutput(
1346
+ loss=loss,
1347
+ logits=logits,
1348
+ hidden_states=outputs.hidden_states,
1349
+ attentions=outputs.attentions,
1350
+ )
1351
+
1352
+
1353
+ @add_start_docstrings(
1354
+ """
1355
+ Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
1356
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
1357
+ """,
1358
+ FLAUBERT_START_DOCSTRING,
1359
+ )
1360
+ # Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1361
+ class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
1362
+ def __init__(self, config):
1363
+ super().__init__(config)
1364
+
1365
+ self.transformer = FlaubertModel(config)
1366
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1367
+
1368
+ # Initialize weights and apply final processing
1369
+ self.post_init()
1370
+
1371
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1372
+ @add_code_sample_docstrings(
1373
+ checkpoint=_CHECKPOINT_FOR_DOC,
1374
+ output_type=QuestionAnsweringModelOutput,
1375
+ config_class=_CONFIG_FOR_DOC,
1376
+ )
1377
+ def forward(
1378
+ self,
1379
+ input_ids: Optional[torch.Tensor] = None,
1380
+ attention_mask: Optional[torch.Tensor] = None,
1381
+ langs: Optional[torch.Tensor] = None,
1382
+ token_type_ids: Optional[torch.Tensor] = None,
1383
+ position_ids: Optional[torch.Tensor] = None,
1384
+ lengths: Optional[torch.Tensor] = None,
1385
+ cache: Optional[Dict[str, torch.Tensor]] = None,
1386
+ head_mask: Optional[torch.Tensor] = None,
1387
+ inputs_embeds: Optional[torch.Tensor] = None,
1388
+ start_positions: Optional[torch.Tensor] = None,
1389
+ end_positions: Optional[torch.Tensor] = None,
1390
+ output_attentions: Optional[bool] = None,
1391
+ output_hidden_states: Optional[bool] = None,
1392
+ return_dict: Optional[bool] = None,
1393
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1394
+ r"""
1395
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1396
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1397
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1398
+ are not taken into account for computing the loss.
1399
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1400
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1401
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1402
+ are not taken into account for computing the loss.
1403
+ """
1404
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1405
+
1406
+ transformer_outputs = self.transformer(
1407
+ input_ids,
1408
+ attention_mask=attention_mask,
1409
+ langs=langs,
1410
+ token_type_ids=token_type_ids,
1411
+ position_ids=position_ids,
1412
+ lengths=lengths,
1413
+ cache=cache,
1414
+ head_mask=head_mask,
1415
+ inputs_embeds=inputs_embeds,
1416
+ output_attentions=output_attentions,
1417
+ output_hidden_states=output_hidden_states,
1418
+ return_dict=return_dict,
1419
+ )
1420
+
1421
+ sequence_output = transformer_outputs[0]
1422
+
1423
+ logits = self.qa_outputs(sequence_output)
1424
+ start_logits, end_logits = logits.split(1, dim=-1)
1425
+ start_logits = start_logits.squeeze(-1).contiguous()
1426
+ end_logits = end_logits.squeeze(-1).contiguous()
1427
+
1428
+ total_loss = None
1429
+ if start_positions is not None and end_positions is not None:
1430
+ # If we are on multi-GPU, batch splitting may add a dimension
1431
+ if len(start_positions.size()) > 1:
1432
+ start_positions = start_positions.squeeze(-1)
1433
+ if len(end_positions.size()) > 1:
1434
+ end_positions = end_positions.squeeze(-1)
1435
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1436
+ ignored_index = start_logits.size(1)
1437
+ start_positions = start_positions.clamp(0, ignored_index)
1438
+ end_positions = end_positions.clamp(0, ignored_index)
1439
+
1440
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1441
+ start_loss = loss_fct(start_logits, start_positions)
1442
+ end_loss = loss_fct(end_logits, end_positions)
1443
+ total_loss = (start_loss + end_loss) / 2
1444
+
1445
+ if not return_dict:
1446
+ output = (start_logits, end_logits) + transformer_outputs[1:]
1447
+ return ((total_loss,) + output) if total_loss is not None else output
1448
+
1449
+ return QuestionAnsweringModelOutput(
1450
+ loss=total_loss,
1451
+ start_logits=start_logits,
1452
+ end_logits=end_logits,
1453
+ hidden_states=transformer_outputs.hidden_states,
1454
+ attentions=transformer_outputs.attentions,
1455
+ )
1456
+
1457
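+ def _example_question_answering_simple():
+     # Illustrative sketch, not part of the original file: the simple QA head above yields
+     # per-token start/end logits and, when positions are supplied, an averaged
+     # cross-entropy loss. FlaubertConfig keyword names are assumed; tiny random weights.
+     config = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4, num_labels=2)
+     model = FlaubertForQuestionAnsweringSimple(config)
+     out = model(
+         input_ids=torch.randint(5, 100, (1, 8)),
+         start_positions=torch.tensor([2]),
+         end_positions=torch.tensor([5]),
+     )
+     return out.start_logits.shape, out.loss.item()  # start_logits expected: torch.Size([1, 8])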
+
1458
+ @add_start_docstrings(
1459
+ """
1460
+ Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like
1461
+ SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
1462
+ """,
1463
+ FLAUBERT_START_DOCSTRING,
1464
+ )
1465
+ @dataclass
1466
+ # Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert
1467
+ class FlaubertForQuestionAnsweringOutput(ModelOutput):
1468
+ """
1469
+ Base class for outputs of question answering models using a `SquadHead`.
1470
+
1471
+ Args:
1472
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
1473
+ Classification loss as the sum of start token, end token (and is_impossible if provided) classification
1474
+ losses.
1475
+ start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
1476
+ Log probabilities for the top config.start_n_top start token possibilities (beam-search).
1477
+ start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
1478
+ Indices for the top config.start_n_top start token possibilities (beam-search).
1479
+ end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
1480
+ Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
1481
+ (beam-search).
1482
+ end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
1483
+ Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
1484
+ cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
1485
+ Log probabilities for the `is_impossible` label of the answers.
1486
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
1487
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
1488
+ shape `(batch_size, sequence_length, hidden_size)`.
1489
+
1490
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1491
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
1492
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
1493
+ sequence_length)`.
1494
+
1495
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1496
+ heads.
1497
+ """
1498
+
1499
+ loss: Optional[torch.FloatTensor] = None
1500
+ start_top_log_probs: Optional[torch.FloatTensor] = None
1501
+ start_top_index: Optional[torch.LongTensor] = None
1502
+ end_top_log_probs: Optional[torch.FloatTensor] = None
1503
+ end_top_index: Optional[torch.LongTensor] = None
1504
+ cls_logits: Optional[torch.FloatTensor] = None
1505
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
1506
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
1507
+
1508
+
1509
+ # Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnswering with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1510
+ class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
1511
+ def __init__(self, config):
1512
+ super().__init__(config)
1513
+
1514
+ self.transformer = FlaubertModel(config)
1515
+ self.qa_outputs = FlaubertSQuADHead(config)
1516
+
1517
+ # Initialize weights and apply final processing
1518
+ self.post_init()
1519
+
1520
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1521
+ @replace_return_docstrings(output_type=FlaubertForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC)
1522
+ def forward(
1523
+ self,
1524
+ input_ids: Optional[torch.Tensor] = None,
1525
+ attention_mask: Optional[torch.Tensor] = None,
1526
+ langs: Optional[torch.Tensor] = None,
1527
+ token_type_ids: Optional[torch.Tensor] = None,
1528
+ position_ids: Optional[torch.Tensor] = None,
1529
+ lengths: Optional[torch.Tensor] = None,
1530
+ cache: Optional[Dict[str, torch.Tensor]] = None,
1531
+ head_mask: Optional[torch.Tensor] = None,
1532
+ inputs_embeds: Optional[torch.Tensor] = None,
1533
+ start_positions: Optional[torch.Tensor] = None,
1534
+ end_positions: Optional[torch.Tensor] = None,
1535
+ is_impossible: Optional[torch.Tensor] = None,
1536
+ cls_index: Optional[torch.Tensor] = None,
1537
+ p_mask: Optional[torch.Tensor] = None,
1538
+ output_attentions: Optional[bool] = None,
1539
+ output_hidden_states: Optional[bool] = None,
1540
+ return_dict: Optional[bool] = None,
1541
+ ) -> Union[Tuple, FlaubertForQuestionAnsweringOutput]:
1542
+ r"""
1543
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1544
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1545
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1546
+ are not taken into account for computing the loss.
1547
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1548
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1549
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1550
+ are not taken into account for computing the loss.
1551
+ is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1552
+ Labels whether a question has an answer or no answer (SQuAD 2.0).
1553
+ cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1554
+ Labels for position (index) of the classification token to use as input for computing plausibility of the
1555
+ answer.
1556
+ p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1557
+ Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be
1558
+ masked. 0.0 means token is not masked.
1559
+
1560
+ Returns:
1561
+
1562
+ Example:
1563
+
1564
+ ```python
1565
+ >>> from transformers import AutoTokenizer, FlaubertForQuestionAnswering
1566
+ >>> import torch
1567
+
1568
+ >>> tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
1569
+ >>> model = FlaubertForQuestionAnswering.from_pretrained("flaubert/flaubert_base_cased")
1570
+
1571
+ >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
1572
+ ... 0
1573
+ ... ) # Batch size 1
1574
+ >>> start_positions = torch.tensor([1])
1575
+ >>> end_positions = torch.tensor([3])
1576
+
1577
+ >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
1578
+ >>> loss = outputs.loss
1579
+ ```"""
1580
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1581
+
1582
+ transformer_outputs = self.transformer(
1583
+ input_ids,
1584
+ attention_mask=attention_mask,
1585
+ langs=langs,
1586
+ token_type_ids=token_type_ids,
1587
+ position_ids=position_ids,
1588
+ lengths=lengths,
1589
+ cache=cache,
1590
+ head_mask=head_mask,
1591
+ inputs_embeds=inputs_embeds,
1592
+ output_attentions=output_attentions,
1593
+ output_hidden_states=output_hidden_states,
1594
+ return_dict=return_dict,
1595
+ )
1596
+
1597
+ output = transformer_outputs[0]
1598
+
1599
+ outputs = self.qa_outputs(
1600
+ output,
1601
+ start_positions=start_positions,
1602
+ end_positions=end_positions,
1603
+ cls_index=cls_index,
1604
+ is_impossible=is_impossible,
1605
+ p_mask=p_mask,
1606
+ return_dict=return_dict,
1607
+ )
1608
+
1609
+ if not return_dict:
1610
+ return outputs + transformer_outputs[1:]
1611
+
1612
+ return FlaubertForQuestionAnsweringOutput(
1613
+ loss=outputs.loss,
1614
+ start_top_log_probs=outputs.start_top_log_probs,
1615
+ start_top_index=outputs.start_top_index,
1616
+ end_top_log_probs=outputs.end_top_log_probs,
1617
+ end_top_index=outputs.end_top_index,
1618
+ cls_logits=outputs.cls_logits,
1619
+ hidden_states=transformer_outputs.hidden_states,
1620
+ attentions=transformer_outputs.attentions,
1621
+ )
1622
+
1623
+
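Unlike the simple head above, the beam-search head returns the `start_n_top` best start positions and, for each of them, `end_n_top` candidate end positions flattened into blocks of `end_n_top`. Assuming that layout, the best span can be scored as follows; the tensors are made up and this is only a sketch, not an API of this file:

```python
import torch

# Hypothetical outputs for batch_size=1, start_n_top=2, end_n_top=2.
start_top_log_probs = torch.tensor([[-0.1, -1.2]])
start_top_index = torch.tensor([[4, 9]])
end_top_log_probs = torch.tensor([[-0.2, -2.0, -0.5, -1.0]])  # (1, start_n_top * end_n_top)
end_top_index = torch.tensor([[6, 8, 11, 12]])

start_n_top, end_n_top = 2, 2
best_score, best_span = float("-inf"), None
for i in range(start_n_top):
    for j in range(end_n_top):
        # End candidates for start candidate i live in the block [i * end_n_top, (i + 1) * end_n_top).
        score = (start_top_log_probs[0, i] + end_top_log_probs[0, i * end_n_top + j]).item()
        if score > best_score:
            best_score = score
            best_span = (start_top_index[0, i].item(), end_top_index[0, i * end_n_top + j].item())
print(best_span, best_score)  # (4, 6) with the values above
```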
1624
+ @add_start_docstrings(
1625
+ """
1626
+ Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
1627
+ softmax) e.g. for RocStories/SWAG tasks.
1628
+ """,
1629
+ FLAUBERT_START_DOCSTRING,
1630
+ )
1631
+ # Copied from transformers.models.xlm.modeling_xlm.XLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1632
+ class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
1633
+ def __init__(self, config, *inputs, **kwargs):
1634
+ super().__init__(config, *inputs, **kwargs)
1635
+
1636
+ self.transformer = FlaubertModel(config)
1637
+ self.sequence_summary = FlaubertSequenceSummary(config)
1638
+ self.logits_proj = nn.Linear(config.num_labels, 1)
1639
+
1640
+ # Initialize weights and apply final processing
1641
+ self.post_init()
1642
+
1643
+ @add_start_docstrings_to_model_forward(
1644
+ FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
1645
+ )
1646
+ @add_code_sample_docstrings(
1647
+ checkpoint=_CHECKPOINT_FOR_DOC,
1648
+ output_type=MultipleChoiceModelOutput,
1649
+ config_class=_CONFIG_FOR_DOC,
1650
+ )
1651
+ def forward(
1652
+ self,
1653
+ input_ids: Optional[torch.Tensor] = None,
1654
+ attention_mask: Optional[torch.Tensor] = None,
1655
+ langs: Optional[torch.Tensor] = None,
1656
+ token_type_ids: Optional[torch.Tensor] = None,
1657
+ position_ids: Optional[torch.Tensor] = None,
1658
+ lengths: Optional[torch.Tensor] = None,
1659
+ cache: Optional[Dict[str, torch.Tensor]] = None,
1660
+ head_mask: Optional[torch.Tensor] = None,
1661
+ inputs_embeds: Optional[torch.Tensor] = None,
1662
+ labels: Optional[torch.Tensor] = None,
1663
+ output_attentions: Optional[bool] = None,
1664
+ output_hidden_states: Optional[bool] = None,
1665
+ return_dict: Optional[bool] = None,
1666
+ ) -> Union[Tuple, MultipleChoiceModelOutput]:
1667
+ r"""
1668
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1669
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1670
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
1671
+ `input_ids` above)
1672
+ """
1673
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1674
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1675
+
1676
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1677
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1678
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1679
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1680
+ langs = langs.view(-1, langs.size(-1)) if langs is not None else None
1681
+ inputs_embeds = (
1682
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1683
+ if inputs_embeds is not None
1684
+ else None
1685
+ )
1686
+
1687
+ if lengths is not None:
1688
+ logger.warning(
1689
+ "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
1690
+ "attention mask instead."
1691
+ )
1692
+ lengths = None
1693
+
1694
+ transformer_outputs = self.transformer(
1695
+ input_ids=input_ids,
1696
+ attention_mask=attention_mask,
1697
+ langs=langs,
1698
+ token_type_ids=token_type_ids,
1699
+ position_ids=position_ids,
1700
+ lengths=lengths,
1701
+ cache=cache,
1702
+ head_mask=head_mask,
1703
+ inputs_embeds=inputs_embeds,
1704
+ output_attentions=output_attentions,
1705
+ output_hidden_states=output_hidden_states,
1706
+ return_dict=return_dict,
1707
+ )
1708
+ output = transformer_outputs[0]
1709
+ logits = self.sequence_summary(output)
1710
+ logits = self.logits_proj(logits)
1711
+ reshaped_logits = logits.view(-1, num_choices)
1712
+
1713
+ loss = None
1714
+ if labels is not None:
1715
+ loss_fct = CrossEntropyLoss()
1716
+ loss = loss_fct(reshaped_logits, labels)
1717
+
1718
+ if not return_dict:
1719
+ output = (reshaped_logits,) + transformer_outputs[1:]
1720
+ return ((loss,) + output) if loss is not None else output
1721
+
1722
+ return MultipleChoiceModelOutput(
1723
+ loss=loss,
1724
+ logits=reshaped_logits,
1725
+ hidden_states=transformer_outputs.hidden_states,
1726
+ attentions=transformer_outputs.attentions,
1727
+ )
1728
+
1729
+
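The multiple-choice head folds the choice dimension into the batch before calling the transformer and unfolds it again on the logits, which is what the `view` calls above do. A shape walk-through with made-up sizes (sketch only):

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 8
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))     # (8, 8): choices become batch rows
per_choice_logits = torch.randn(flat_input_ids.size(0), 1)  # stand-in for the logits_proj output
reshaped_logits = per_choice_logits.view(-1, num_choices)   # (2, 4): one score per choice per example
print(flat_input_ids.shape, reshaped_logits.shape)
```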
1730
+ __all__ = [
1731
+ "FlaubertForMultipleChoice",
1732
+ "FlaubertForQuestionAnswering",
1733
+ "FlaubertForQuestionAnsweringSimple",
1734
+ "FlaubertForSequenceClassification",
1735
+ "FlaubertForTokenClassification",
1736
+ "FlaubertModel",
1737
+ "FlaubertWithLMHeadModel",
1738
+ "FlaubertPreTrainedModel",
1739
+ ]
docs/transformers/build/lib/transformers/models/flaubert/modeling_tf_flaubert.py ADDED
@@ -0,0 +1,1344 @@
1
+ # coding=utf-8
2
+ # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ TF 2.0 Flaubert model.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import itertools
22
+ import random
23
+ import warnings
24
+ from dataclasses import dataclass
25
+ from typing import Dict, Optional, Tuple, Union
26
+
27
+ import numpy as np
28
+ import tensorflow as tf
29
+
30
+ from ...activations_tf import get_tf_activation
31
+ from ...modeling_tf_outputs import (
32
+ TFBaseModelOutput,
33
+ TFMultipleChoiceModelOutput,
34
+ TFQuestionAnsweringModelOutput,
35
+ TFSequenceClassifierOutput,
36
+ TFTokenClassifierOutput,
37
+ )
38
+ from ...modeling_tf_utils import (
39
+ TFModelInputType,
40
+ TFMultipleChoiceLoss,
41
+ TFPreTrainedModel,
42
+ TFQuestionAnsweringLoss,
43
+ TFSequenceClassificationLoss,
44
+ TFSequenceSummary,
45
+ TFSharedEmbeddings,
46
+ TFTokenClassificationLoss,
47
+ get_initializer,
48
+ keras,
49
+ keras_serializable,
50
+ unpack_inputs,
51
+ )
52
+ from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
53
+ from ...utils import (
54
+ MULTIPLE_CHOICE_DUMMY_INPUTS,
55
+ ModelOutput,
56
+ add_code_sample_docstrings,
57
+ add_start_docstrings,
58
+ add_start_docstrings_to_model_forward,
59
+ logging,
60
+ )
61
+ from .configuration_flaubert import FlaubertConfig
62
+
63
+
64
+ logger = logging.get_logger(__name__)
65
+
66
+ _CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
67
+ _CONFIG_FOR_DOC = "FlaubertConfig"
68
+
69
+
70
+ FLAUBERT_START_DOCSTRING = r"""
71
+
72
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
73
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
74
+ etc.)
75
+
76
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
77
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
78
+ behavior.
79
+
80
+ <Tip>
81
+
82
+ TensorFlow models and layers in `transformers` accept two formats as input:
83
+
84
+ - having all inputs as keyword arguments (like PyTorch models), or
85
+ - having all inputs as a list, tuple or dict in the first positional argument.
86
+
87
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
88
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
89
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
90
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
91
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
92
+ positional argument:
93
+
94
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
95
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
96
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
97
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
98
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
99
+
100
+ Note that when creating models and layers with
101
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
102
+ about any of this, as you can just pass inputs like you would to any other Python function!
103
+
104
+ </Tip>
105
+
106
+ Parameters:
107
+ config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
108
+ Initializing with a config file does not load the weights associated with the model, only the
109
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
110
+ """
111
+
112
+ FLAUBERT_INPUTS_DOCSTRING = r"""
113
+ Args:
114
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
115
+ Indices of input sequence tokens in the vocabulary.
116
+
117
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
118
+ [`PreTrainedTokenizer.encode`] for details.
119
+
120
+ [What are input IDs?](../glossary#input-ids)
121
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
122
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
123
+
124
+ - `1` for tokens that are **not masked**,
125
+ - `0` for tokens that are **masked**.
126
+
127
+ [What are attention masks?](../glossary#attention-mask)
128
+ langs (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
129
+ A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
130
+ languages ids which can be obtained from the language names by using two conversion mappings provided in
131
+ the configuration of the model (only provided for multilingual models). More precisely, the *language name
132
+ to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
133
+ *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
134
+
135
+ See usage examples detailed in the [multilingual documentation](../multilingual).
136
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
137
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
138
+ 1]`:
139
+
140
+ - `0` corresponds to a *sentence A* token,
141
+ - `1` corresponds to a *sentence B* token.
142
+
143
+ [What are token type IDs?](../glossary#token-type-ids)
144
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
145
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
146
+ config.max_position_embeddings - 1]`.
147
+
148
+ [What are position IDs?](../glossary#position-ids)
149
+ lengths (`tf.Tensor` or `Numpy array` of shape `(batch_size,)`, *optional*):
150
+ Length of each sentence that can be used to avoid performing attention on padding token indices. You can
151
+ also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
152
+ `[0, ..., input_ids.size(-1)]`:
153
+ cache (`Dict[str, tf.Tensor]`, *optional*):
154
+ Dictionary string to `tf.FloatTensor` that contains precomputed hidden states (key and values in the
155
+ attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
156
+ decoding.
157
+
158
+ The dictionary object will be modified in-place during the forward pass to add newly computed
159
+ hidden-states.
160
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
161
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
162
+
163
+ - `1` indicates the head is **not masked**,
164
+ - `0` indicates the head is **masked**.
165
+
166
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
167
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
168
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
169
+ model's internal embedding lookup matrix.
170
+ output_attentions (`bool`, *optional*):
171
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
172
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
173
+ config will be used instead.
174
+ output_hidden_states (`bool`, *optional*):
175
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
176
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
177
+ used instead.
178
+ return_dict (`bool`, *optional*):
179
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
180
+ eager mode, in graph mode the value will always be set to True.
181
+ training (`bool`, *optional*, defaults to `False`):
182
+ Whether or not to use the model in training mode (some modules like dropout modules have different
183
+ behaviors between training and evaluation).
184
+ """
185
+
186
+
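For multilingual checkpoints, the `langs` argument described above is just a tensor of language ids with the same shape as `input_ids`. A sketch of building it by hand (the mapping below is hypothetical; a real checkpoint exposes it as `model.config.lang2id`):

```python
import tensorflow as tf

lang2id = {"en": 0, "fr": 1}  # hypothetical mapping
input_ids = tf.constant([[5, 17, 31, 2]])

# One language id per token, same shape as input_ids.
langs = tf.fill(tf.shape(input_ids), lang2id["fr"])
print(langs.numpy())  # [[1 1 1 1]]
```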
187
+ def get_masks(slen, lengths, causal, padding_mask=None):
188
+ """
189
+ Generate hidden states mask, and optionally an attention mask.
190
+ """
191
+ bs = shape_list(lengths)[0]
192
+ if padding_mask is not None:
193
+ mask = padding_mask
194
+ else:
195
+ # assert lengths.max().item() <= slen
196
+ alen = tf.range(slen, dtype=lengths.dtype)
197
+ mask = alen < tf.expand_dims(lengths, axis=1)
198
+
199
+ # attention mask is the same as mask, or triangular inferior attention (causal)
200
+ if causal:
201
+ attn_mask = tf.less_equal(
202
+ tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
203
+ )
204
+ else:
205
+ attn_mask = mask
206
+
207
+ # sanity check
208
+ # assert shape_list(mask) == [bs, slen]
209
+ tf.debugging.assert_equal(shape_list(mask), [bs, slen])
210
+ if causal:
211
+ tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])
212
+
213
+ return mask, attn_mask
214
+
215
+
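`get_masks` returns a `(bs, slen)` padding mask and, when `causal=True`, a lower-triangular `(bs, slen, slen)` attention mask. Re-deriving both on toy lengths (standalone sketch, independent of the helper above):

```python
import tensorflow as tf

slen = 5
bs = 2
lengths = tf.constant([3, 5])
alen = tf.range(slen, dtype=lengths.dtype)

mask = alen < tf.expand_dims(lengths, axis=1)  # (2, 5): True on real tokens, False on padding
causal_attn = tf.less_equal(
    tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
)  # (2, 5, 5): each query position may only attend to keys at or before it
print(mask.numpy())
print(causal_attn.numpy()[0].astype(int))
```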
216
+ class TFFlaubertPreTrainedModel(TFPreTrainedModel):
217
+ """
218
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
219
+ models.
220
+ """
221
+
222
+ config_class = FlaubertConfig
223
+ base_model_prefix = "transformer"
224
+
225
+ @property
226
+ def dummy_inputs(self):
227
+ # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed
228
+ inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
229
+ attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
230
+ if self.config.use_lang_emb and self.config.n_langs > 1:
231
+ return {
232
+ "input_ids": inputs_list,
233
+ "attention_mask": attns_list,
234
+ "langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32),
235
+ }
236
+ else:
237
+ return {"input_ids": inputs_list, "attention_mask": attns_list}
238
+
239
+
240
+ @add_start_docstrings(
241
+ "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
242
+ FLAUBERT_START_DOCSTRING,
243
+ )
244
+ class TFFlaubertModel(TFFlaubertPreTrainedModel):
245
+ def __init__(self, config, *inputs, **kwargs):
246
+ super().__init__(config, *inputs, **kwargs)
247
+ self.transformer = TFFlaubertMainLayer(config, name="transformer")
248
+
249
+ @unpack_inputs
250
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
251
+ @add_code_sample_docstrings(
252
+ checkpoint=_CHECKPOINT_FOR_DOC,
253
+ output_type=TFBaseModelOutput,
254
+ config_class=_CONFIG_FOR_DOC,
255
+ )
256
+ def call(
257
+ self,
258
+ input_ids: np.ndarray | tf.Tensor | None = None,
259
+ attention_mask: np.ndarray | tf.Tensor | None = None,
260
+ langs: np.ndarray | tf.Tensor | None = None,
261
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
262
+ position_ids: np.ndarray | tf.Tensor | None = None,
263
+ lengths: np.ndarray | tf.Tensor | None = None,
264
+ cache: Optional[Dict[str, tf.Tensor]] = None,
265
+ head_mask: np.ndarray | tf.Tensor | None = None,
266
+ inputs_embeds: tf.Tensor | None = None,
267
+ output_attentions: Optional[bool] = None,
268
+ output_hidden_states: Optional[bool] = None,
269
+ return_dict: Optional[bool] = None,
270
+ training: Optional[bool] = False,
271
+ ) -> Union[Tuple, TFBaseModelOutput]:
272
+ outputs = self.transformer(
273
+ input_ids=input_ids,
274
+ attention_mask=attention_mask,
275
+ langs=langs,
276
+ token_type_ids=token_type_ids,
277
+ position_ids=position_ids,
278
+ lengths=lengths,
279
+ cache=cache,
280
+ head_mask=head_mask,
281
+ inputs_embeds=inputs_embeds,
282
+ output_attentions=output_attentions,
283
+ output_hidden_states=output_hidden_states,
284
+ return_dict=return_dict,
285
+ training=training,
286
+ )
287
+
288
+ return outputs
289
+
290
+ def build(self, input_shape=None):
291
+ if self.built:
292
+ return
293
+ self.built = True
294
+ if getattr(self, "transformer", None) is not None:
295
+ with tf.name_scope(self.transformer.name):
296
+ self.transformer.build(None)
297
+
298
+
299
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert
300
+ class TFFlaubertMultiHeadAttention(keras.layers.Layer):
301
+ NEW_ID = itertools.count()
302
+
303
+ def __init__(self, n_heads, dim, config, **kwargs):
304
+ super().__init__(**kwargs)
305
+ self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID)
306
+ self.dim = dim
307
+ self.n_heads = n_heads
308
+ self.output_attentions = config.output_attentions
309
+ assert self.dim % self.n_heads == 0
310
+
311
+ self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
312
+ self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
313
+ self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
314
+ self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
315
+ self.dropout = keras.layers.Dropout(config.attention_dropout)
316
+ self.pruned_heads = set()
317
+ self.dim = dim
318
+
319
+ def prune_heads(self, heads):
320
+ raise NotImplementedError
321
+
322
+ def call(self, input, mask, kv, cache, head_mask, output_attentions, training=False):
323
+ """
324
+ Self-attention (if kv is None) or attention over source sentence (provided by kv).
325
+ """
326
+ # Input is (bs, qlen, dim)
327
+ # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
328
+ bs, qlen, dim = shape_list(input)
329
+
330
+ if kv is None:
331
+ klen = qlen if cache is None else cache["slen"] + qlen
332
+ else:
333
+ klen = shape_list(kv)[1]
334
+
335
+ # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
336
+ dim_per_head = self.dim // self.n_heads
337
+ mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
338
+
339
+ def shape(x):
340
+ """projection"""
341
+ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
342
+
343
+ def unshape(x):
344
+ """compute context"""
345
+ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
346
+
347
+ q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head)
348
+
349
+ if kv is None:
350
+ k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head)
351
+ v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head)
352
+ elif cache is None or self.layer_id not in cache:
353
+ k = v = kv
354
+ k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head)
355
+ v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head)
356
+
357
+ if cache is not None:
358
+ if self.layer_id in cache:
359
+ if kv is None:
360
+ k_, v_ = cache[self.layer_id]
361
+ k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head)
362
+ v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head)
363
+ else:
364
+ k, v = cache[self.layer_id]
365
+
366
+ cache[self.layer_id] = (k, v)
367
+
368
+ f_dim_per_head = tf.cast(dim_per_head, dtype=q.dtype)
369
+ q = tf.multiply(q, tf.math.rsqrt(f_dim_per_head)) # (bs, n_heads, qlen, dim_per_head)
370
+ k = tf.cast(k, dtype=q.dtype)
371
+ scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen)
372
+ mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen)
373
+ # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen)
374
+ mask = tf.cast(mask, dtype=scores.dtype)
375
+ scores = scores - 1e30 * (1.0 - mask)
376
+ weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen)
377
+ weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen)
378
+
379
+ # Mask heads if we want to
380
+ if head_mask is not None:
381
+ weights = weights * head_mask
382
+
383
+ context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head)
384
+ context = unshape(context) # (bs, qlen, dim)
385
+ outputs = (self.out_lin(context),)
386
+
387
+ if output_attentions:
388
+ outputs = outputs + (weights,)
389
+
390
+ return outputs
391
+
392
+ def build(self, input_shape=None):
393
+ if self.built:
394
+ return
395
+ self.built = True
396
+ if getattr(self, "q_lin", None) is not None:
397
+ with tf.name_scope(self.q_lin.name):
398
+ self.q_lin.build([None, None, self.dim])
399
+ if getattr(self, "k_lin", None) is not None:
400
+ with tf.name_scope(self.k_lin.name):
401
+ self.k_lin.build([None, None, self.dim])
402
+ if getattr(self, "v_lin", None) is not None:
403
+ with tf.name_scope(self.v_lin.name):
404
+ self.v_lin.build([None, None, self.dim])
405
+ if getattr(self, "out_lin", None) is not None:
406
+ with tf.name_scope(self.out_lin.name):
407
+ self.out_lin.build([None, None, self.dim])
408
+
409
+
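Ignoring masking and caching, the attention layer above is a standard multi-head scaled dot-product: project, split into heads, scale queries by `1/sqrt(dim_per_head)`, softmax over keys, then merge the heads back. The shape flow on random tensors (sketch only; the real layer also adds the `-1e30 * (1 - mask)` term before the softmax):

```python
import tensorflow as tf

bs, qlen, n_heads, dim_per_head = 2, 4, 2, 3
dim = n_heads * dim_per_head
x = tf.random.normal((bs, qlen, dim))

def split_heads(t):
    return tf.transpose(tf.reshape(t, (bs, -1, n_heads, dim_per_head)), perm=(0, 2, 1, 3))

q, k, v = split_heads(x), split_heads(x), split_heads(x)        # (bs, n_heads, qlen, dim_per_head)
q = q * tf.math.rsqrt(tf.cast(dim_per_head, q.dtype))
scores = tf.matmul(q, k, transpose_b=True)                      # (bs, n_heads, qlen, qlen)
weights = tf.nn.softmax(scores, axis=-1)
context = tf.matmul(weights, v)                                 # (bs, n_heads, qlen, dim_per_head)
context = tf.reshape(tf.transpose(context, perm=(0, 2, 1, 3)), (bs, -1, dim))
print(context.shape)  # (2, 4, 6)
```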
410
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMTransformerFFN
411
+ class TFFlaubertTransformerFFN(keras.layers.Layer):
412
+ def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
413
+ super().__init__(**kwargs)
414
+
415
+ self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
416
+ self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
417
+ self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
418
+ self.dropout = keras.layers.Dropout(config.dropout)
419
+ self.in_dim = in_dim
420
+ self.dim_hidden = dim_hidden
421
+
422
+ def call(self, input, training=False):
423
+ x = self.lin1(input)
424
+ x = self.act(x)
425
+ x = self.lin2(x)
426
+ x = self.dropout(x, training=training)
427
+
428
+ return x
429
+
430
+ def build(self, input_shape=None):
431
+ if self.built:
432
+ return
433
+ self.built = True
434
+ if getattr(self, "lin1", None) is not None:
435
+ with tf.name_scope(self.lin1.name):
436
+ self.lin1.build([None, None, self.in_dim])
437
+ if getattr(self, "lin2", None) is not None:
438
+ with tf.name_scope(self.lin2.name):
439
+ self.lin2.build([None, None, self.dim_hidden])
440
+
441
+
442
+ @keras_serializable
443
+ class TFFlaubertMainLayer(keras.layers.Layer):
444
+ config_class = FlaubertConfig
445
+
446
+ def __init__(self, config, **kwargs):
447
+ super().__init__(**kwargs)
448
+
449
+ self.config = config
450
+ self.n_heads = config.n_heads
451
+ self.n_langs = config.n_langs
452
+ self.dim = config.emb_dim
453
+ self.hidden_dim = self.dim * 4
454
+ self.n_words = config.n_words
455
+ self.pad_index = config.pad_index
456
+ self.causal = config.causal
457
+ self.n_layers = config.n_layers
458
+ self.use_lang_emb = config.use_lang_emb
459
+ self.layerdrop = getattr(config, "layerdrop", 0.0)
460
+ self.pre_norm = getattr(config, "pre_norm", False)
461
+ self.output_attentions = config.output_attentions
462
+ self.output_hidden_states = config.output_hidden_states
463
+ self.return_dict = config.use_return_dict
464
+ self.max_position_embeddings = config.max_position_embeddings
465
+ self.embed_init_std = config.embed_init_std
466
+ self.dropout = keras.layers.Dropout(config.dropout)
467
+ self.embeddings = TFSharedEmbeddings(
468
+ self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
469
+ )
470
+ self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
471
+ self.attentions = []
472
+ self.layer_norm1 = []
473
+ self.ffns = []
474
+ self.layer_norm2 = []
475
+
476
+ for i in range(self.n_layers):
477
+ self.attentions.append(
478
+ TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}")
479
+ )
480
+ self.layer_norm1.append(
481
+ keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}")
482
+ )
483
+ # if self.is_decoder:
484
+ # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
485
+ # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
486
+ self.ffns.append(
487
+ TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}")
488
+ )
489
+ self.layer_norm2.append(
490
+ keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}")
491
+ )
492
+
493
+ def build(self, input_shape=None):
494
+ with tf.name_scope("position_embeddings"):
495
+ self.position_embeddings = self.add_weight(
496
+ name="embeddings",
497
+ shape=[self.max_position_embeddings, self.dim],
498
+ initializer=get_initializer(self.embed_init_std),
499
+ )
500
+
501
+ if self.n_langs > 1 and self.use_lang_emb:
502
+ with tf.name_scope("lang_embeddings"):
503
+ self.lang_embeddings = self.add_weight(
504
+ name="embeddings",
505
+ shape=[self.n_langs, self.dim],
506
+ initializer=get_initializer(self.embed_init_std),
507
+ )
508
+
509
+ if self.built:
510
+ return
511
+ self.built = True
512
+ if getattr(self, "embeddings", None) is not None:
513
+ with tf.name_scope(self.embeddings.name):
514
+ self.embeddings.build(None)
515
+ if getattr(self, "layer_norm_emb", None) is not None:
516
+ with tf.name_scope(self.layer_norm_emb.name):
517
+ self.layer_norm_emb.build([None, None, self.dim])
518
+ for layer in self.attentions:
519
+ with tf.name_scope(layer.name):
520
+ layer.build(None)
521
+ for layer in self.layer_norm1:
522
+ with tf.name_scope(layer.name):
523
+ layer.build([None, None, self.dim])
524
+ for layer in self.ffns:
525
+ with tf.name_scope(layer.name):
526
+ layer.build(None)
527
+ for layer in self.layer_norm2:
528
+ with tf.name_scope(layer.name):
529
+ layer.build([None, None, self.dim])
530
+
531
+ def get_input_embeddings(self):
532
+ return self.embeddings
533
+
534
+ def set_input_embeddings(self, value):
535
+ self.embeddings.weight = value
536
+ self.embeddings.vocab_size = shape_list(value)[0]
537
+
538
+ @unpack_inputs
539
+ def call(
540
+ self,
541
+ input_ids: np.ndarray | tf.Tensor | None = None,
542
+ attention_mask: np.ndarray | tf.Tensor | None = None,
543
+ langs: np.ndarray | tf.Tensor | None = None,
544
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
545
+ position_ids: np.ndarray | tf.Tensor | None = None,
546
+ lengths: np.ndarray | tf.Tensor | None = None,
547
+ cache: Optional[Dict[str, tf.Tensor]] = None,
548
+ head_mask: np.ndarray | tf.Tensor | None = None,
549
+ inputs_embeds: tf.Tensor | None = None,
550
+ output_attentions: Optional[bool] = None,
551
+ output_hidden_states: Optional[bool] = None,
552
+ return_dict: Optional[bool] = None,
553
+ training: Optional[bool] = False,
554
+ ) -> Union[Tuple, TFBaseModelOutput]:
555
+ # removed: src_enc=None, src_len=None
556
+
557
+ if input_ids is not None and inputs_embeds is not None:
558
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
559
+ elif input_ids is not None:
560
+ bs, slen = shape_list(input_ids)
561
+ elif inputs_embeds is not None:
562
+ bs, slen = shape_list(inputs_embeds)[:2]
563
+ else:
564
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
565
+
566
+ if lengths is None:
567
+ if input_ids is not None:
568
+ lengths = tf.reduce_sum(
569
+ tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=input_ids.dtype), axis=1
570
+ )
571
+ else:
572
+ lengths = tf.convert_to_tensor([slen] * bs)
573
+ # mask = input_ids != self.pad_index
574
+
575
+ # check inputs
576
+ # assert shape_list(lengths)[0] == bs
577
+ (
578
+ tf.debugging.assert_equal(shape_list(lengths)[0], bs),
579
+ f"Expected batch size {shape_list(lengths)[0]} and received batch size {bs} mismatched",
580
+ )
581
+ # assert lengths.max().item() <= slen
582
+ # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
583
+ # assert (src_enc is None) == (src_len is None)
584
+ # if src_enc is not None:
585
+ # assert self.is_decoder
586
+ # assert src_enc.size(0) == bs
587
+
588
+ # generate masks
589
+ mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
590
+ # if self.is_decoder and src_enc is not None:
591
+ # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
592
+
593
+ # position_ids
594
+ if position_ids is None:
595
+ position_ids = tf.expand_dims(tf.range(slen), axis=0)
596
+ position_ids = tf.tile(position_ids, (bs, 1))
597
+
598
+ # assert shape_list(position_ids) == [bs, slen] # (slen, bs)
599
+ (
600
+ tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]),
601
+ f"Position id shape {shape_list(position_ids)} and input shape {[bs, slen]} mismatched",
602
+ )
603
+ # position_ids = position_ids.transpose(0, 1)
604
+
605
+ # langs
606
+ if langs is not None:
607
+ # assert shape_list(langs) == [bs, slen] # (slen, bs)
608
+ (
609
+ tf.debugging.assert_equal(shape_list(langs), [bs, slen]),
610
+ f"Lang shape {shape_list(langs)} and input shape {[bs, slen]} mismatched",
611
+ )
612
+ # langs = langs.transpose(0, 1)
613
+
614
+ # Prepare head mask if needed
615
+ # 1.0 in head_mask indicate we keep the head
616
+ # attention_probs has shape bsz x n_heads x N x N
617
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
618
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
619
+ if head_mask is not None:
620
+ raise NotImplementedError
621
+ else:
622
+ head_mask = [None] * self.n_layers
623
+
624
+ # do not recompute cached elements
625
+ if cache is not None and input_ids is not None:
626
+ _slen = slen - cache["slen"]
627
+ input_ids = input_ids[:, -_slen:]
628
+ position_ids = position_ids[:, -_slen:]
629
+ if langs is not None:
630
+ langs = langs[:, -_slen:]
631
+ mask = mask[:, -_slen:]
632
+ attn_mask = attn_mask[:, -_slen:]
633
+
634
+ # embeddings
635
+ if inputs_embeds is None:
636
+ check_embeddings_within_bounds(input_ids, self.embeddings.vocab_size)
637
+ inputs_embeds = self.embeddings(input_ids)
638
+
639
+ tensor = inputs_embeds + tf.gather(self.position_embeddings, position_ids)
640
+
641
+ if langs is not None and self.use_lang_emb:
642
+ tensor = tensor + tf.gather(self.lang_embeddings, langs)
643
+ if token_type_ids is not None:
644
+ tensor = tensor + self.embeddings(token_type_ids)
645
+
646
+ tensor = self.layer_norm_emb(tensor)
647
+ tensor = self.dropout(tensor, training=training)
648
+ mask = tf.cast(mask, dtype=tensor.dtype)
649
+ tensor = tensor * tf.expand_dims(mask, axis=-1)
650
+
651
+ # hidden_states and attentions cannot be None in graph mode.
652
+ hidden_states = () if output_hidden_states else None
653
+ attentions = () if output_attentions else None
654
+
655
+ # transformer layers
656
+ for i in range(self.n_layers):
657
+ # LayerDrop
658
+ dropout_probability = random.uniform(0, 1)
659
+
660
+ if training and (dropout_probability < self.layerdrop):
661
+ continue
662
+
663
+ if output_hidden_states:
664
+ hidden_states = hidden_states + (tensor,)
665
+
666
+ # self attention
667
+ if not self.pre_norm:
668
+ attn_outputs = self.attentions[i](
669
+ tensor,
670
+ attn_mask,
671
+ None,
672
+ cache,
673
+ head_mask[i],
674
+ output_attentions,
675
+ training=training,
676
+ )
677
+ attn = attn_outputs[0]
678
+
679
+ if output_attentions:
680
+ attentions = attentions + (attn_outputs[1],)
681
+
682
+ attn = self.dropout(attn, training=training)
683
+ tensor = tensor + attn
684
+ tensor = self.layer_norm1[i](tensor)
685
+ else:
686
+ tensor_normalized = self.layer_norm1[i](tensor)
687
+ attn_outputs = self.attentions[i](
688
+ tensor_normalized,
689
+ attn_mask,
690
+ None,
691
+ cache,
692
+ head_mask[i],
693
+ output_attentions,
694
+ training=training,
695
+ )
696
+ attn = attn_outputs[0]
697
+
698
+ if output_attentions:
699
+ attentions = attentions + (attn_outputs[1],)
700
+
701
+ attn = self.dropout(attn, training=training)
702
+ tensor = tensor + attn
703
+
704
+ # encoder attention (for decoder only)
705
+ # if self.is_decoder and src_enc is not None:
706
+ # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
707
+ # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
708
+ # tensor = tensor + attn
709
+ # tensor = self.layer_norm15[i](tensor)
710
+
711
+ # FFN
712
+ if not self.pre_norm:
713
+ tensor = tensor + self.ffns[i](tensor)
714
+ tensor = self.layer_norm2[i](tensor)
715
+ else:
716
+ tensor_normalized = self.layer_norm2[i](tensor)
717
+ tensor = tensor + self.ffns[i](tensor_normalized)
718
+
719
+ tensor = tensor * tf.expand_dims(mask, axis=-1)
720
+
721
+ # Add last hidden state
722
+ if output_hidden_states:
723
+ hidden_states = hidden_states + (tensor,)
724
+
725
+ # update cache length
726
+ if cache is not None:
727
+ cache["slen"] += shape_list(tensor)[1]  # tf.Tensor has no .size(); use shape_list for the added length
728
+
729
+ # move back sequence length to dimension 0
730
+ # tensor = tensor.transpose(0, 1)
731
+
732
+ if not return_dict:
733
+ return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
734
+
735
+ return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
736
+
737
+
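Two details in the main layer above are easy to miss: each block uses either post-norm (`x = LayerNorm(x + sublayer(x))`) or pre-norm (`x = x + sublayer(LayerNorm(x))`) depending on `config.pre_norm`, and during training whole blocks are skipped with probability `layerdrop`. A toy sketch of the LayerDrop loop, with stand-in layers that only record their index:

```python
import random

layerdrop, training = 0.5, True
layers = [lambda x, i=i: x + [i] for i in range(6)]  # stand-ins for attention + FFN blocks

random.seed(0)
tensor = []
for layer in layers:
    if training and random.uniform(0, 1) < layerdrop:
        continue  # the whole block is dropped for this forward pass
    tensor = layer(tensor)
print(tensor)  # indices of the blocks that actually ran
```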
738
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMPredLayer
739
+ class TFFlaubertPredLayer(keras.layers.Layer):
740
+ """
741
+ Prediction layer (cross_entropy or adaptive_softmax).
742
+ """
743
+
744
+ def __init__(self, config, input_embeddings, **kwargs):
745
+ super().__init__(**kwargs)
746
+
747
+ self.asm = config.asm
748
+ self.n_words = config.n_words
749
+ self.pad_index = config.pad_index
750
+
751
+ if config.asm is False:
752
+ self.input_embeddings = input_embeddings
753
+ else:
754
+ raise NotImplementedError
755
+ # self.proj = nn.AdaptiveLogSoftmaxWithLoss(
756
+ # in_features=dim,
757
+ # n_classes=config.n_words,
758
+ # cutoffs=config.asm_cutoffs,
759
+ # div_value=config.asm_div_value,
760
+ # head_bias=True, # default is False
761
+ # )
762
+
763
+ def build(self, input_shape):
764
+ # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
765
+ self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
766
+
767
+ super().build(input_shape)
768
+
769
+ def get_output_embeddings(self):
770
+ return self.input_embeddings
771
+
772
+ def set_output_embeddings(self, value):
773
+ self.input_embeddings.weight = value
774
+ self.input_embeddings.vocab_size = shape_list(value)[0]
775
+
776
+ def get_bias(self):
777
+ return {"bias": self.bias}
778
+
779
+ def set_bias(self, value):
780
+ self.bias = value["bias"]
781
+ self.vocab_size = shape_list(value["bias"])[0]
782
+
783
+ def call(self, hidden_states):
784
+ hidden_states = self.input_embeddings(hidden_states, mode="linear")
785
+ hidden_states = hidden_states + self.bias
786
+
787
+ return hidden_states
788
+
789
+
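In `mode="linear"`, `TFSharedEmbeddings` multiplies by the transpose of the embedding matrix, so the prediction layer above shares its weights with the input embeddings and only adds a per-token bias. Equivalently, on random tensors (sketch only):

```python
import tensorflow as tf

n_words, dim = 10, 4
embedding_matrix = tf.random.normal((n_words, dim))  # stands in for the shared embedding weights
bias = tf.zeros((n_words,))

hidden_states = tf.random.normal((2, 3, dim))        # (bs, slen, dim)
logits = tf.matmul(hidden_states, embedding_matrix, transpose_b=True) + bias
print(logits.shape)  # (2, 3, 10): one score per vocabulary word
```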
790
+ @dataclass
791
+ class TFFlaubertWithLMHeadModelOutput(ModelOutput):
792
+ """
793
+ Base class for [`TFFlaubertWithLMHeadModel`] outputs.
794
+
795
+ Args:
796
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
797
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
798
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
799
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
800
+ `(batch_size, sequence_length, hidden_size)`.
801
+
802
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
803
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
804
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
805
+ sequence_length)`.
806
+
807
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
808
+ heads.
809
+ """
810
+
811
+ logits: Optional[tf.Tensor] = None
812
+ hidden_states: Tuple[tf.Tensor] | None = None
813
+ attentions: Tuple[tf.Tensor] | None = None
814
+
815
+
816
+ @add_start_docstrings(
817
+ """
818
+ The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
819
+ embeddings).
820
+ """,
821
+ FLAUBERT_START_DOCSTRING,
822
+ )
823
+ class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
824
+ def __init__(self, config, *inputs, **kwargs):
825
+ super().__init__(config, *inputs, **kwargs)
826
+ self.transformer = TFFlaubertMainLayer(config, name="transformer")
827
+ self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
828
+ # Flaubert does not have past caching features
829
+ self.supports_xla_generation = False
830
+
831
+ def get_lm_head(self):
832
+ return self.pred_layer
833
+
834
+ def get_prefix_bias_name(self):
835
+ warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
836
+ return self.name + "/" + self.pred_layer.name
837
+
838
+ def prepare_inputs_for_generation(self, inputs, **kwargs):
839
+ mask_token_id = self.config.mask_token_id
840
+ lang_id = self.config.lang_id
841
+
842
+ effective_batch_size = inputs.shape[0]
843
+ mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
844
+ inputs = tf.concat([inputs, mask_token], axis=1)
845
+
846
+ if lang_id is not None:
847
+ langs = tf.ones_like(inputs) * lang_id
848
+ else:
849
+ langs = None
850
+ return {"input_ids": inputs, "langs": langs}
851
+
852
+ @unpack_inputs
853
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
854
+ @add_code_sample_docstrings(
855
+ checkpoint=_CHECKPOINT_FOR_DOC,
856
+ output_type=TFFlaubertWithLMHeadModelOutput,
857
+ config_class=_CONFIG_FOR_DOC,
858
+ )
859
+ def call(
860
+ self,
861
+ input_ids: np.ndarray | tf.Tensor | None = None,
862
+ attention_mask: np.ndarray | tf.Tensor | None = None,
863
+ langs: np.ndarray | tf.Tensor | None = None,
864
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
865
+ position_ids: np.ndarray | tf.Tensor | None = None,
866
+ lengths: np.ndarray | tf.Tensor | None = None,
867
+ cache: Optional[Dict[str, tf.Tensor]] = None,
868
+ head_mask: np.ndarray | tf.Tensor | None = None,
869
+ inputs_embeds: tf.Tensor | None = None,
870
+ output_attentions: Optional[bool] = None,
871
+ output_hidden_states: Optional[bool] = None,
872
+ return_dict: Optional[bool] = None,
873
+ training: Optional[bool] = False,
874
+ ) -> Union[Tuple, TFFlaubertWithLMHeadModelOutput]:
875
+ transformer_outputs = self.transformer(
876
+ input_ids=input_ids,
877
+ attention_mask=attention_mask,
878
+ langs=langs,
879
+ token_type_ids=token_type_ids,
880
+ position_ids=position_ids,
881
+ lengths=lengths,
882
+ cache=cache,
883
+ head_mask=head_mask,
884
+ inputs_embeds=inputs_embeds,
885
+ output_attentions=output_attentions,
886
+ output_hidden_states=output_hidden_states,
887
+ return_dict=return_dict,
888
+ training=training,
889
+ )
890
+ output = transformer_outputs[0]
891
+ outputs = self.pred_layer(output)
892
+
893
+ if not return_dict:
894
+ return (outputs,) + transformer_outputs[1:]
895
+
896
+ return TFFlaubertWithLMHeadModelOutput(
897
+ logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
898
+ )
899
+
900
+ def build(self, input_shape=None):
901
+ if self.built:
902
+ return
903
+ self.built = True
904
+ if getattr(self, "transformer", None) is not None:
905
+ with tf.name_scope(self.transformer.name):
906
+ self.transformer.build(None)
907
+ if getattr(self, "pred_layer", None) is not None:
908
+ with tf.name_scope(self.pred_layer.name):
909
+ self.pred_layer.build(None)
910
+
911
+
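Because the LM head is masked-LM style, `prepare_inputs_for_generation` above appends one mask token per row and tags every position with the target language id. In isolation, with hypothetical token and language ids:

```python
import tensorflow as tf

mask_token_id, lang_id = 6, 3  # hypothetical ids
inputs = tf.constant([[14, 27, 9], [5, 11, 2]])

effective_batch_size = inputs.shape[0]
mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
inputs = tf.concat([inputs, mask_token], axis=1)
langs = tf.ones_like(inputs) * lang_id
print(inputs.numpy())  # each row now ends with the mask token id
print(langs.numpy())
```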
912
+ @add_start_docstrings(
913
+ """
914
+ Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
915
+ e.g. for GLUE tasks.
916
+ """,
917
+ FLAUBERT_START_DOCSTRING,
918
+ )
919
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
920
+ class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss):
921
+ def __init__(self, config, *inputs, **kwargs):
922
+ super().__init__(config, *inputs, **kwargs)
923
+ self.num_labels = config.num_labels
924
+
925
+ self.transformer = TFFlaubertMainLayer(config, name="transformer")
926
+ self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
927
+
928
+ @unpack_inputs
929
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
930
+ @add_code_sample_docstrings(
931
+ checkpoint=_CHECKPOINT_FOR_DOC,
932
+ output_type=TFSequenceClassifierOutput,
933
+ config_class=_CONFIG_FOR_DOC,
934
+ )
935
+ def call(
936
+ self,
937
+ input_ids: TFModelInputType | None = None,
938
+ attention_mask: np.ndarray | tf.Tensor | None = None,
939
+ langs: np.ndarray | tf.Tensor | None = None,
940
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
941
+ position_ids: np.ndarray | tf.Tensor | None = None,
942
+ lengths: np.ndarray | tf.Tensor | None = None,
943
+ cache: Optional[Dict[str, tf.Tensor]] = None,
944
+ head_mask: np.ndarray | tf.Tensor | None = None,
945
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
946
+ output_attentions: Optional[bool] = None,
947
+ output_hidden_states: Optional[bool] = None,
948
+ return_dict: Optional[bool] = None,
949
+ labels: np.ndarray | tf.Tensor | None = None,
950
+ training: bool = False,
951
+ ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
952
+ r"""
953
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
954
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
955
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
956
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
957
+ """
958
+ transformer_outputs = self.transformer(
959
+ input_ids=input_ids,
960
+ attention_mask=attention_mask,
961
+ langs=langs,
962
+ token_type_ids=token_type_ids,
963
+ position_ids=position_ids,
964
+ lengths=lengths,
965
+ cache=cache,
966
+ head_mask=head_mask,
967
+ inputs_embeds=inputs_embeds,
968
+ output_attentions=output_attentions,
969
+ output_hidden_states=output_hidden_states,
970
+ return_dict=return_dict,
971
+ training=training,
972
+ )
973
+ output = transformer_outputs[0]
974
+
975
+ logits = self.sequence_summary(output)
976
+
977
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
978
+
979
+ if not return_dict:
980
+ output = (logits,) + transformer_outputs[1:]
981
+ return ((loss,) + output) if loss is not None else output
982
+
983
+ return TFSequenceClassifierOutput(
984
+ loss=loss,
985
+ logits=logits,
986
+ hidden_states=transformer_outputs.hidden_states,
987
+ attentions=transformer_outputs.attentions,
988
+ )
989
+
990
+ def build(self, input_shape=None):
991
+ if self.built:
992
+ return
993
+ self.built = True
994
+ if getattr(self, "transformer", None) is not None:
995
+ with tf.name_scope(self.transformer.name):
996
+ self.transformer.build(None)
997
+ if getattr(self, "sequence_summary", None) is not None:
998
+ with tf.name_scope(self.sequence_summary.name):
999
+ self.sequence_summary.build(None)
1000
+
1001
+
1002
+ @add_start_docstrings(
1003
+ """
1004
+ Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1005
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1006
+ """,
1007
+ FLAUBERT_START_DOCSTRING,
1008
+ )
1009
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1010
+ class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss):
1011
+ def __init__(self, config, *inputs, **kwargs):
1012
+ super().__init__(config, *inputs, **kwargs)
1013
+ self.transformer = TFFlaubertMainLayer(config, name="transformer")
1014
+ self.qa_outputs = keras.layers.Dense(
1015
+ config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
1016
+ )
1017
+ self.config = config
1018
+
1019
+ @unpack_inputs
1020
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1021
+ @add_code_sample_docstrings(
1022
+ checkpoint=_CHECKPOINT_FOR_DOC,
1023
+ output_type=TFQuestionAnsweringModelOutput,
1024
+ config_class=_CONFIG_FOR_DOC,
1025
+ )
1026
+ def call(
1027
+ self,
1028
+ input_ids: TFModelInputType | None = None,
1029
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1030
+ langs: np.ndarray | tf.Tensor | None = None,
1031
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1032
+ position_ids: np.ndarray | tf.Tensor | None = None,
1033
+ lengths: np.ndarray | tf.Tensor | None = None,
1034
+ cache: Optional[Dict[str, tf.Tensor]] = None,
1035
+ head_mask: np.ndarray | tf.Tensor | None = None,
1036
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1037
+ output_attentions: Optional[bool] = None,
1038
+ output_hidden_states: Optional[bool] = None,
1039
+ return_dict: Optional[bool] = None,
1040
+ start_positions: np.ndarray | tf.Tensor | None = None,
1041
+ end_positions: np.ndarray | tf.Tensor | None = None,
1042
+ training: bool = False,
1043
+ ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
1044
+ r"""
1045
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
1046
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1047
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1048
+ are not taken into account for computing the loss.
1049
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
1050
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1051
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1052
+ are not taken into account for computing the loss.
1053
+ """
1054
+ transformer_outputs = self.transformer(
1055
+ input_ids=input_ids,
1056
+ attention_mask=attention_mask,
1057
+ langs=langs,
1058
+ token_type_ids=token_type_ids,
1059
+ position_ids=position_ids,
1060
+ lengths=lengths,
1061
+ cache=cache,
1062
+ head_mask=head_mask,
1063
+ inputs_embeds=inputs_embeds,
1064
+ output_attentions=output_attentions,
1065
+ output_hidden_states=output_hidden_states,
1066
+ return_dict=return_dict,
1067
+ training=training,
1068
+ )
1069
+ sequence_output = transformer_outputs[0]
1070
+
1071
+ logits = self.qa_outputs(sequence_output)
1072
+ start_logits, end_logits = tf.split(logits, 2, axis=-1)
1073
+ start_logits = tf.squeeze(start_logits, axis=-1)
1074
+ end_logits = tf.squeeze(end_logits, axis=-1)
1075
+
1076
+ loss = None
1077
+ if start_positions is not None and end_positions is not None:
1078
+ labels = {"start_position": start_positions}
1079
+ labels["end_position"] = end_positions
1080
+ loss = self.hf_compute_loss(labels, (start_logits, end_logits))
1081
+
1082
+ if not return_dict:
1083
+ output = (start_logits, end_logits) + transformer_outputs[1:]
1084
+ return ((loss,) + output) if loss is not None else output
1085
+
1086
+ return TFQuestionAnsweringModelOutput(
1087
+ loss=loss,
1088
+ start_logits=start_logits,
1089
+ end_logits=end_logits,
1090
+ hidden_states=transformer_outputs.hidden_states,
1091
+ attentions=transformer_outputs.attentions,
1092
+ )
1093
+
1094
+ def build(self, input_shape=None):
1095
+ if self.built:
1096
+ return
1097
+ self.built = True
1098
+ if getattr(self, "transformer", None) is not None:
1099
+ with tf.name_scope(self.transformer.name):
1100
+ self.transformer.build(None)
1101
+ if getattr(self, "qa_outputs", None) is not None:
1102
+ with tf.name_scope(self.qa_outputs.name):
1103
+ self.qa_outputs.build([None, None, self.config.hidden_size])
1104
+
1105
+
1106
+ @add_start_docstrings(
1107
+ """
1108
+ Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1109
+ Named-Entity-Recognition (NER) tasks.
1110
+ """,
1111
+ FLAUBERT_START_DOCSTRING,
1112
+ )
1113
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1114
+ class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss):
1115
+ def __init__(self, config, *inputs, **kwargs):
1116
+ super().__init__(config, *inputs, **kwargs)
1117
+ self.num_labels = config.num_labels
1118
+
1119
+ self.transformer = TFFlaubertMainLayer(config, name="transformer")
1120
+ self.dropout = keras.layers.Dropout(config.dropout)
1121
+ self.classifier = keras.layers.Dense(
1122
+ config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
1123
+ )
1124
+ self.config = config
1125
+
1126
+ @unpack_inputs
1127
+ @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1128
+ @add_code_sample_docstrings(
1129
+ checkpoint=_CHECKPOINT_FOR_DOC,
1130
+ output_type=TFTokenClassifierOutput,
1131
+ config_class=_CONFIG_FOR_DOC,
1132
+ )
1133
+ def call(
1134
+ self,
1135
+ input_ids: TFModelInputType | None = None,
1136
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1137
+ langs: np.ndarray | tf.Tensor | None = None,
1138
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1139
+ position_ids: np.ndarray | tf.Tensor | None = None,
1140
+ lengths: np.ndarray | tf.Tensor | None = None,
1141
+ cache: Optional[Dict[str, tf.Tensor]] = None,
1142
+ head_mask: np.ndarray | tf.Tensor | None = None,
1143
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1144
+ output_attentions: Optional[bool] = None,
1145
+ output_hidden_states: Optional[bool] = None,
1146
+ return_dict: Optional[bool] = None,
1147
+ labels: np.ndarray | tf.Tensor | None = None,
1148
+ training: bool = False,
1149
+ ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
1150
+ r"""
1151
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1152
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1153
+ """
1154
+ transformer_outputs = self.transformer(
1155
+ input_ids=input_ids,
1156
+ attention_mask=attention_mask,
1157
+ langs=langs,
1158
+ token_type_ids=token_type_ids,
1159
+ position_ids=position_ids,
1160
+ lengths=lengths,
1161
+ cache=cache,
1162
+ head_mask=head_mask,
1163
+ inputs_embeds=inputs_embeds,
1164
+ output_attentions=output_attentions,
1165
+ output_hidden_states=output_hidden_states,
1166
+ return_dict=return_dict,
1167
+ training=training,
1168
+ )
1169
+ sequence_output = transformer_outputs[0]
1170
+
1171
+ sequence_output = self.dropout(sequence_output, training=training)
1172
+ logits = self.classifier(sequence_output)
1173
+
1174
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
1175
+
1176
+ if not return_dict:
1177
+ output = (logits,) + transformer_outputs[1:]
1178
+ return ((loss,) + output) if loss is not None else output
1179
+
1180
+ return TFTokenClassifierOutput(
1181
+ loss=loss,
1182
+ logits=logits,
1183
+ hidden_states=transformer_outputs.hidden_states,
1184
+ attentions=transformer_outputs.attentions,
1185
+ )
1186
+
1187
+ def build(self, input_shape=None):
1188
+ if self.built:
1189
+ return
1190
+ self.built = True
1191
+ if getattr(self, "transformer", None) is not None:
1192
+ with tf.name_scope(self.transformer.name):
1193
+ self.transformer.build(None)
1194
+ if getattr(self, "classifier", None) is not None:
1195
+ with tf.name_scope(self.classifier.name):
1196
+ self.classifier.build([None, None, self.config.hidden_size])
1197
+
1198
+
1199
+ @add_start_docstrings(
1200
+ """
1201
+ Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
1202
+ softmax) e.g. for RocStories/SWAG tasks.
1203
+ """,
1204
+ FLAUBERT_START_DOCSTRING,
1205
+ )
1206
+ # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
1207
+ class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss):
1208
+ def __init__(self, config, *inputs, **kwargs):
1209
+ super().__init__(config, *inputs, **kwargs)
1210
+
1211
+ self.transformer = TFFlaubertMainLayer(config, name="transformer")
1212
+ self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
1213
+ self.logits_proj = keras.layers.Dense(
1214
+ 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
1215
+ )
1216
+ self.config = config
1217
+
1218
+ @property
1219
+ def dummy_inputs(self):
1220
+ """
1221
+ Dummy inputs to build the network.
1222
+
1223
+ Returns:
1224
+ Dict[str, tf.Tensor] with dummy inputs
1225
+ """
1226
+ # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed
1227
+ if self.config.use_lang_emb and self.config.n_langs > 1:
1228
+ return {
1229
+ "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
1230
+ "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
1231
+ }
1232
+ else:
1233
+ return {
1234
+ "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
1235
+ }
1236
+
1237
+ @unpack_inputs
1238
+ @add_start_docstrings_to_model_forward(
1239
+ FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
1240
+ )
1241
+ @add_code_sample_docstrings(
1242
+ checkpoint=_CHECKPOINT_FOR_DOC,
1243
+ output_type=TFMultipleChoiceModelOutput,
1244
+ config_class=_CONFIG_FOR_DOC,
1245
+ )
1246
+ def call(
1247
+ self,
1248
+ input_ids: TFModelInputType | None = None,
1249
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1250
+ langs: np.ndarray | tf.Tensor | None = None,
1251
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1252
+ position_ids: np.ndarray | tf.Tensor | None = None,
1253
+ lengths: np.ndarray | tf.Tensor | None = None,
1254
+ cache: Optional[Dict[str, tf.Tensor]] = None,
1255
+ head_mask: np.ndarray | tf.Tensor | None = None,
1256
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1257
+ output_attentions: Optional[bool] = None,
1258
+ output_hidden_states: Optional[bool] = None,
1259
+ return_dict: Optional[bool] = None,
1260
+ labels: np.ndarray | tf.Tensor | None = None,
1261
+ training: bool = False,
1262
+ ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
1263
+ if input_ids is not None:
1264
+ num_choices = shape_list(input_ids)[1]
1265
+ seq_length = shape_list(input_ids)[2]
1266
+ else:
1267
+ num_choices = shape_list(inputs_embeds)[1]
1268
+ seq_length = shape_list(inputs_embeds)[2]
1269
+
1270
+ flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
1271
+ flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
1272
+ flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
1273
+ flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
1274
+ flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
1275
+ flat_inputs_embeds = (
1276
+ tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
1277
+ if inputs_embeds is not None
1278
+ else None
1279
+ )
1280
+
1281
+ if lengths is not None:
1282
+ logger.warning(
1283
+ "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
1284
+ "attention mask instead.",
1285
+ )
1286
+ lengths = None
1287
+
1288
+ transformer_outputs = self.transformer(
1289
+ flat_input_ids,
1290
+ flat_attention_mask,
1291
+ flat_langs,
1292
+ flat_token_type_ids,
1293
+ flat_position_ids,
1294
+ lengths,
1295
+ cache,
1296
+ head_mask,
1297
+ flat_inputs_embeds,
1298
+ output_attentions,
1299
+ output_hidden_states,
1300
+ return_dict=return_dict,
1301
+ training=training,
1302
+ )
1303
+ output = transformer_outputs[0]
1304
+ logits = self.sequence_summary(output)
1305
+ logits = self.logits_proj(logits)
1306
+ reshaped_logits = tf.reshape(logits, (-1, num_choices))
1307
+
1308
+ loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
1309
+
1310
+ if not return_dict:
1311
+ output = (reshaped_logits,) + transformer_outputs[1:]
1312
+ return ((loss,) + output) if loss is not None else output
1313
+
1314
+ return TFMultipleChoiceModelOutput(
1315
+ loss=loss,
1316
+ logits=reshaped_logits,
1317
+ hidden_states=transformer_outputs.hidden_states,
1318
+ attentions=transformer_outputs.attentions,
1319
+ )
1320
+
1321
+ def build(self, input_shape=None):
1322
+ if self.built:
1323
+ return
1324
+ self.built = True
1325
+ if getattr(self, "transformer", None) is not None:
1326
+ with tf.name_scope(self.transformer.name):
1327
+ self.transformer.build(None)
1328
+ if getattr(self, "sequence_summary", None) is not None:
1329
+ with tf.name_scope(self.sequence_summary.name):
1330
+ self.sequence_summary.build(None)
1331
+ if getattr(self, "logits_proj", None) is not None:
1332
+ with tf.name_scope(self.logits_proj.name):
1333
+ self.logits_proj.build([None, None, self.config.num_labels])
1334
+
1335
+
1336
+ __all__ = [
1337
+ "TFFlaubertForMultipleChoice",
1338
+ "TFFlaubertForQuestionAnsweringSimple",
1339
+ "TFFlaubertForSequenceClassification",
1340
+ "TFFlaubertForTokenClassification",
1341
+ "TFFlaubertModel",
1342
+ "TFFlaubertPreTrainedModel",
1343
+ "TFFlaubertWithLMHeadModel",
1344
+ ]
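As a quick, hedged usage sketch of the task heads defined above (the `flaubert/flaubert_base_cased` checkpoint name is an assumption; any compatible Flaubert checkpoint should behave the same way), the sequence classification head can be exercised like this:

```python
# Rough usage sketch for the TF Flaubert heads above; the checkpoint name is an assumption.
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForSequenceClassification

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForSequenceClassification.from_pretrained(
    "flaubert/flaubert_base_cased", num_labels=2  # classification head is freshly initialized
)

inputs = tokenizer("Le camembert est délicieux.", return_tensors="tf")
outputs = model(**inputs)                      # TFSequenceClassifierOutput when return_dict=True
predicted_class = int(tf.argmax(outputs.logits, axis=-1)[0])
```

The same call signature (optional `langs`, `lengths`, `cache`, ...) applies to the question answering, token classification, and multiple choice heads, since all of them forward their inputs to the shared `TFFlaubertMainLayer`.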
docs/transformers/build/lib/transformers/models/flaubert/tokenization_flaubert.py ADDED
@@ -0,0 +1,568 @@
1
+ # coding=utf-8
2
+ # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Flaubert."""
16
+
17
+ import json
18
+ import os
19
+ import re
20
+ import unicodedata
21
+ from typing import List, Optional, Tuple
22
+
23
+ from ...tokenization_utils import PreTrainedTokenizer
24
+ from ...utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {
30
+ "vocab_file": "vocab.json",
31
+ "merges_file": "merges.txt",
32
+ }
33
+
34
+
35
+ def convert_to_unicode(text):
36
+ """
37
+ Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
38
+ """
39
+
40
+ def ensure_text(s, encoding="utf-8", errors="strict"):
41
+ if isinstance(s, bytes):
42
+ return s.decode(encoding, errors)
43
+ elif isinstance(s, str):
44
+ return s
45
+ else:
46
+ raise TypeError(f"not expecting type '{type(s)}'")
47
+
48
+ return ensure_text(text, encoding="utf-8", errors="ignore")
49
+
50
+
51
+ # Copied from transformers.models.xlm.tokenization_xlm.get_pairs
52
+ def get_pairs(word):
53
+ """
54
+ Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
55
+ strings)
56
+ """
57
+ pairs = set()
58
+ prev_char = word[0]
59
+ for char in word[1:]:
60
+ pairs.add((prev_char, char))
61
+ prev_char = char
62
+ return pairs
63
+
64
+
65
+ # Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct
66
+ def replace_unicode_punct(text):
67
+ """
68
+ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
69
+ """
70
+ text = text.replace(",", ",")
71
+ text = re.sub(r"。\s*", ". ", text)
72
+ text = text.replace("、", ",")
73
+ text = text.replace("”", '"')
74
+ text = text.replace("“", '"')
75
+ text = text.replace("∶", ":")
76
+ text = text.replace(":", ":")
77
+ text = text.replace("?", "?")
78
+ text = text.replace("《", '"')
79
+ text = text.replace("》", '"')
80
+ text = text.replace(")", ")")
81
+ text = text.replace("!", "!")
82
+ text = text.replace("(", "(")
83
+ text = text.replace(";", ";")
84
+ text = text.replace("1", "1")
85
+ text = text.replace("」", '"')
86
+ text = text.replace("「", '"')
87
+ text = text.replace("0", "0")
88
+ text = text.replace("3", "3")
89
+ text = text.replace("2", "2")
90
+ text = text.replace("5", "5")
91
+ text = text.replace("6", "6")
92
+ text = text.replace("9", "9")
93
+ text = text.replace("7", "7")
94
+ text = text.replace("8", "8")
95
+ text = text.replace("4", "4")
96
+ text = re.sub(r"．\s*", ". ", text)
97
+ text = text.replace("~", "~")
98
+ text = text.replace("’", "'")
99
+ text = text.replace("…", "...")
100
+ text = text.replace("━", "-")
101
+ text = text.replace("〈", "<")
102
+ text = text.replace("〉", ">")
103
+ text = text.replace("【", "[")
104
+ text = text.replace("】", "]")
105
+ text = text.replace("%", "%")
106
+ return text
107
+
108
+
109
+ # Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char
110
+ def remove_non_printing_char(text):
111
+ """
112
+ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
113
+ """
114
+ output = []
115
+ for char in text:
116
+ cat = unicodedata.category(char)
117
+ if cat.startswith("C"):
118
+ continue
119
+ output.append(char)
120
+ return "".join(output)
121
+
122
+
123
+ class FlaubertTokenizer(PreTrainedTokenizer):
124
+ """
125
+ Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
126
+
127
+ - Moses preprocessing and tokenization.
128
+ - Normalizing all input text.
129
+ - The argument `special_tokens` and the function `set_special_tokens` can be used to add additional symbols (like
130
+ "__classify__") to a vocabulary.
131
+ - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
132
+
133
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
134
+ this superclass for more information regarding those methods.
135
+
136
+ Args:
137
+ vocab_file (`str`):
138
+ Vocabulary file.
139
+ merges_file (`str`):
140
+ Merges file.
141
+ do_lowercase (`bool`, *optional*, defaults to `False`):
142
+ Controls lower casing.
143
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
144
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
145
+ token instead.
146
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
147
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
148
+
149
+ <Tip>
150
+
151
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
152
+ sequence. The token used is the `cls_token`.
153
+
154
+ </Tip>
155
+
156
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
157
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
158
+ sequence classification or for a text and a question for question answering. It is also used as the last
159
+ token of a sequence built with special tokens.
160
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
161
+ The token used for padding, for example when batching sequences of different lengths.
162
+ cls_token (`str`, *optional*, defaults to `"</s>"`):
163
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
164
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
165
+ mask_token (`str`, *optional*, defaults to `"<special1>"`):
166
+ The token used for masking values. This is the token used when training this model with masked language
167
+ modeling. This is the token which the model will try to predict.
168
+ additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
169
+ List of additional special tokens.
170
+ lang2id (`Dict[str, int]`, *optional*):
171
+ Dictionary mapping languages string identifiers to their IDs.
172
+ id2lang (`Dict[int, str]`, *optional*):
173
+ Dictionary mapping language IDs to their string identifiers.
174
+ """
175
+
176
+ vocab_files_names = VOCAB_FILES_NAMES
177
+
178
+ def __init__(
179
+ self,
180
+ vocab_file,
181
+ merges_file,
182
+ do_lowercase=False,
183
+ unk_token="<unk>",
184
+ bos_token="<s>",
185
+ sep_token="</s>",
186
+ pad_token="<pad>",
187
+ cls_token="</s>",
188
+ mask_token="<special1>",
189
+ additional_special_tokens=[
190
+ "<special0>",
191
+ "<special1>",
192
+ "<special2>",
193
+ "<special3>",
194
+ "<special4>",
195
+ "<special5>",
196
+ "<special6>",
197
+ "<special7>",
198
+ "<special8>",
199
+ "<special9>",
200
+ ],
201
+ lang2id=None,
202
+ id2lang=None,
203
+ **kwargs,
204
+ ):
205
+ do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
206
+ if do_lowercase_and_remove_accent is not None:
207
+ logger.warning(
208
+ "`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
209
+ " `FlaubertTokenizer` will always set it to `False`."
210
+ )
211
+ # always `False`
212
+ self.do_lowercase_and_remove_accent = False
213
+
214
+ self.do_lowercase = do_lowercase
215
+
216
+ try:
217
+ import sacremoses
218
+ except ImportError:
219
+ raise ImportError(
220
+ "You need to install sacremoses to use FlaubertTokenizer. "
221
+ "See https://pypi.org/project/sacremoses/ for installation."
222
+ )
223
+
224
+ self.sm = sacremoses
225
+
226
+ # cache of sm.MosesPunctNormalizer instance
227
+ self.cache_moses_punct_normalizer = {}
228
+ # cache of sm.MosesTokenizer instance
229
+ self.cache_moses_tokenizer = {}
230
+ self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
231
+ self.lang2id = lang2id
232
+ self.id2lang = id2lang
233
+ if lang2id is not None and id2lang is not None:
234
+ assert len(lang2id) == len(id2lang)
235
+
236
+ self.ja_word_tokenizer = None
237
+ self.zh_word_tokenizer = None
238
+
239
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
240
+ self.encoder = json.load(vocab_handle)
241
+ self.decoder = {v: k for k, v in self.encoder.items()}
242
+ with open(merges_file, encoding="utf-8") as merges_handle:
243
+ merges = merges_handle.read().split("\n")[:-1]
244
+ merges = [tuple(merge.split()[:2]) for merge in merges]
245
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
246
+ self.cache = {}
247
+
248
+ super().__init__(
249
+ do_lowercase=do_lowercase,
250
+ unk_token=unk_token,
251
+ bos_token=bos_token,
252
+ sep_token=sep_token,
253
+ pad_token=pad_token,
254
+ cls_token=cls_token,
255
+ mask_token=mask_token,
256
+ additional_special_tokens=additional_special_tokens,
257
+ lang2id=lang2id,
258
+ id2lang=id2lang,
259
+ **kwargs,
260
+ )
261
+
262
+ @property
263
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
264
+ def do_lower_case(self):
265
+ return self.do_lowercase_and_remove_accent
266
+
267
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm
268
+ def moses_punct_norm(self, text, lang):
269
+ if lang not in self.cache_moses_punct_normalizer:
270
+ punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
271
+ self.cache_moses_punct_normalizer[lang] = punct_normalizer
272
+ else:
273
+ punct_normalizer = self.cache_moses_punct_normalizer[lang]
274
+ return punct_normalizer.normalize(text)
275
+
276
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize
277
+ def moses_tokenize(self, text, lang):
278
+ if lang not in self.cache_moses_tokenizer:
279
+ moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
280
+ self.cache_moses_tokenizer[lang] = moses_tokenizer
281
+ else:
282
+ moses_tokenizer = self.cache_moses_tokenizer[lang]
283
+ return moses_tokenizer.tokenize(text, return_str=False, escape=False)
284
+
285
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline
286
+ def moses_pipeline(self, text, lang):
287
+ text = replace_unicode_punct(text)
288
+ text = self.moses_punct_norm(text, lang)
289
+ text = remove_non_printing_char(text)
290
+ return text
291
+
292
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize
293
+ def ja_tokenize(self, text):
294
+ if self.ja_word_tokenizer is None:
295
+ try:
296
+ import Mykytea
297
+
298
+ self.ja_word_tokenizer = Mykytea.Mykytea(
299
+ f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
300
+ )
301
+ except (AttributeError, ImportError):
302
+ logger.error(
303
+ "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
304
+ " (https://github.com/chezou/Mykytea-python) with the following steps"
305
+ )
306
+ logger.error("1. git clone [email protected]:neubig/kytea.git && cd kytea")
307
+ logger.error("2. autoreconf -i")
308
+ logger.error("3. ./configure --prefix=$HOME/local")
309
+ logger.error("4. make && make install")
310
+ logger.error("5. pip install kytea")
311
+ raise
312
+ return list(self.ja_word_tokenizer.getWS(text))
313
+
314
+ @property
315
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size
316
+ def vocab_size(self):
317
+ return len(self.encoder)
318
+
319
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab
320
+ def get_vocab(self):
321
+ return dict(self.encoder, **self.added_tokens_encoder)
322
+
323
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe
324
+ def bpe(self, token):
325
+ word = tuple(token[:-1]) + (token[-1] + "</w>",)
326
+ if token in self.cache:
327
+ return self.cache[token]
328
+ pairs = get_pairs(word)
329
+
330
+ if not pairs:
331
+ return token + "</w>"
332
+
333
+ while True:
334
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
335
+ if bigram not in self.bpe_ranks:
336
+ break
337
+ first, second = bigram
338
+ new_word = []
339
+ i = 0
340
+ while i < len(word):
341
+ try:
342
+ j = word.index(first, i)
343
+ except ValueError:
344
+ new_word.extend(word[i:])
345
+ break
346
+ else:
347
+ new_word.extend(word[i:j])
348
+ i = j
349
+
350
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
351
+ new_word.append(first + second)
352
+ i += 2
353
+ else:
354
+ new_word.append(word[i])
355
+ i += 1
356
+ new_word = tuple(new_word)
357
+ word = new_word
358
+ if len(word) == 1:
359
+ break
360
+ else:
361
+ pairs = get_pairs(word)
362
+ word = " ".join(word)
363
+ if word == "\n </w>":
364
+ word = "\n</w>"
365
+ self.cache[token] = word
366
+ return word
367
+
368
+ def preprocess_text(self, text):
369
+ text = text.replace("``", '"').replace("''", '"')
370
+ text = convert_to_unicode(text)
371
+ text = unicodedata.normalize("NFC", text)
372
+
373
+ if self.do_lowercase:
374
+ text = text.lower()
375
+
376
+ return text
377
+
378
+ def _tokenize(self, text, bypass_tokenizer=False):
379
+ """
380
+ Tokenize a string given language code using Moses.
381
+
382
+ Details of tokenization:
383
+
384
+ - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
385
+ - Install with `pip install sacremoses`
386
+
387
+ Args:
388
+ - bypass_tokenizer (`bool`, *optional*, defaults to `False`): Allow users to preprocess and tokenize the
389
+ sentences externally. If `True`, only BPE is applied.
390
+
391
+ Returns:
392
+ List of tokens.
393
+ """
394
+ lang = "fr"
395
+ if lang and self.lang2id and lang not in self.lang2id:
396
+ logger.error(
397
+ "Supplied language code not found in lang2id mapping. Please check that your language is supported by"
398
+ " the loaded pretrained model."
399
+ )
400
+
401
+ if bypass_tokenizer:
402
+ text = text.split()
403
+ else:
404
+ text = self.preprocess_text(text)
405
+ text = self.moses_pipeline(text, lang=lang)
406
+ text = self.moses_tokenize(text, lang=lang)
407
+
408
+ split_tokens = []
409
+ for token in text:
410
+ if token:
411
+ split_tokens.extend(list(self.bpe(token).split(" ")))
412
+
413
+ return split_tokens
414
+
415
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id
416
+ def _convert_token_to_id(self, token):
417
+ """Converts a token (str) in an id using the vocab."""
418
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
419
+
420
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token
421
+ def _convert_id_to_token(self, index):
422
+ """Converts an index (integer) in a token (str) using the vocab."""
423
+ return self.decoder.get(index, self.unk_token)
424
+
425
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string
426
+ def convert_tokens_to_string(self, tokens):
427
+ """Converts a sequence of tokens (string) in a single string."""
428
+ out_string = "".join(tokens).replace("</w>", " ").strip()
429
+ return out_string
430
+
431
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens
432
+ def build_inputs_with_special_tokens(
433
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
434
+ ) -> List[int]:
435
+ """
436
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
437
+ adding special tokens. An XLM sequence has the following format:
438
+
439
+ - single sequence: `<s> X </s>`
440
+ - pair of sequences: `<s> A </s> B </s>`
441
+
442
+ Args:
443
+ token_ids_0 (`List[int]`):
444
+ List of IDs to which the special tokens will be added.
445
+ token_ids_1 (`List[int]`, *optional*):
446
+ Optional second list of IDs for sequence pairs.
447
+
448
+ Returns:
449
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
450
+
451
+ """
452
+ bos = [self.bos_token_id]
453
+ sep = [self.sep_token_id]
454
+
455
+ if token_ids_1 is None:
456
+ return bos + token_ids_0 + sep
457
+ return bos + token_ids_0 + sep + token_ids_1 + sep
458
+
459
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask
460
+ def get_special_tokens_mask(
461
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
462
+ ) -> List[int]:
463
+ """
464
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
465
+ special tokens using the tokenizer `prepare_for_model` method.
466
+
467
+ Args:
468
+ token_ids_0 (`List[int]`):
469
+ List of IDs.
470
+ token_ids_1 (`List[int]`, *optional*):
471
+ Optional second list of IDs for sequence pairs.
472
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
473
+ Whether or not the token list is already formatted with special tokens for the model.
474
+
475
+ Returns:
476
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
477
+ """
478
+
479
+ if already_has_special_tokens:
480
+ return super().get_special_tokens_mask(
481
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
482
+ )
483
+
484
+ if token_ids_1 is not None:
485
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
486
+ return [1] + ([0] * len(token_ids_0)) + [1]
487
+
488
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.create_token_type_ids_from_sequences
489
+ def create_token_type_ids_from_sequences(
490
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
491
+ ) -> List[int]:
492
+ """
493
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
494
+ pair mask has the following format:
495
+
496
+ ```
497
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
498
+ | first sequence | second sequence |
499
+ ```
500
+
501
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
502
+
503
+ Args:
504
+ token_ids_0 (`List[int]`):
505
+ List of IDs.
506
+ token_ids_1 (`List[int]`, *optional*):
507
+ Optional second list of IDs for sequence pairs.
508
+
509
+ Returns:
510
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
511
+ """
512
+ sep = [self.sep_token_id]
513
+ cls = [self.cls_token_id]
514
+ if token_ids_1 is None:
515
+ return len(cls + token_ids_0 + sep) * [0]
516
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
517
+
518
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
519
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
520
+ if not os.path.isdir(save_directory):
521
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
522
+ return
523
+ vocab_file = os.path.join(
524
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
525
+ )
526
+ merge_file = os.path.join(
527
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
528
+ )
529
+
530
+ with open(vocab_file, "w", encoding="utf-8") as f:
531
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
532
+
533
+ index = 0
534
+ with open(merge_file, "w", encoding="utf-8") as writer:
535
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
536
+ if index != token_index:
537
+ logger.warning(
538
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
539
+ " Please check that the tokenizer is not corrupted!"
540
+ )
541
+ index = token_index
542
+ writer.write(" ".join(bpe_tokens) + "\n")
543
+ index += 1
544
+
545
+ return vocab_file, merge_file
546
+
547
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__
548
+ def __getstate__(self):
549
+ state = self.__dict__.copy()
550
+ state["sm"] = None
551
+ return state
552
+
553
+ # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__
554
+ def __setstate__(self, d):
555
+ self.__dict__ = d
556
+
557
+ try:
558
+ import sacremoses
559
+ except ImportError:
560
+ raise ImportError(
561
+ "You need to install sacremoses to use XLMTokenizer. "
562
+ "See https://pypi.org/project/sacremoses/ for installation."
563
+ )
564
+
565
+ self.sm = sacremoses
566
+
567
+
568
+ __all__ = ["FlaubertTokenizer"]
docs/transformers/build/lib/transformers/models/flava/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_flava import *
22
+ from .feature_extraction_flava import *
23
+ from .image_processing_flava import *
24
+ from .image_processing_flava_fast import *
25
+ from .modeling_flava import *
26
+ from .processing_flava import *
27
+ else:
28
+ import sys
29
+
30
+ _file = globals()["__file__"]
31
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
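A rough sketch of what the `_LazyModule` indirection above buys: submodules such as `configuration_flava` are only imported when one of their exported names is first accessed (the class names come from the `TYPE_CHECKING` block):

```python
# Rough sketch: attribute access on the lazy package triggers the real submodule import.
import sys
import transformers.models.flava as flava_pkg

loaded_before = "transformers.models.flava.configuration_flava" in sys.modules
config_cls = flava_pkg.FlavaConfig      # first access imports configuration_flava
loaded_after = "transformers.models.flava.configuration_flava" in sys.modules
print(loaded_before, loaded_after, config_cls.__module__)
```

This keeps `import transformers` cheap even though the FLAVA package pulls in sizeable vision and multimodal modeling code.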
docs/transformers/build/lib/transformers/models/flava/configuration_flava.py ADDED
@@ -0,0 +1,701 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """FLAVA model configurations"""
16
+
17
+ from typing import Any, Dict
18
+
19
+ from ...configuration_utils import PretrainedConfig
20
+ from ...utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ class FlavaImageConfig(PretrainedConfig):
27
+ r"""
28
+ This is the configuration class to store the configuration of a [`FlavaImageModel`]. It is used to instantiate an
29
+ FLAVA model according to the specified arguments, defining the model architecture.
30
+
31
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
32
+ [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+
38
+ Args:
39
+ hidden_size (`int`, *optional*, defaults to 768):
40
+ Dimensionality of the encoder layers and the pooler layer.
41
+ num_hidden_layers (`int`, *optional*, defaults to 12):
42
+ Number of hidden layers in the Transformer encoder.
43
+ num_attention_heads (`int`, *optional*, defaults to 12):
44
+ Number of attention heads for each attention layer in the Transformer encoder.
45
+ intermediate_size (`int`, *optional*, defaults to 3072):
46
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
47
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
48
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
49
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
50
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
51
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
52
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
53
+ The dropout ratio for the attention probabilities.
54
+ initializer_range (`float`, *optional*, defaults to 0.02):
55
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
56
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
57
+ The epsilon used by the layer normalization layers.
58
+ image_size (`int`, *optional*, defaults to 224):
59
+ The size (resolution) of each image.
60
+ patch_size (`int`, *optional*, defaults to 16):
61
+ The size (resolution) of each patch.
62
+ num_channels (`int`, *optional*, defaults to 3):
63
+ The number of input channels.
64
+ qkv_bias (`bool`, *optional*, defaults to `True`):
65
+ Whether to add a bias to the queries, keys and values.
66
+ mask_token (`bool`, *optional*, defaults to `True`):
67
+ Whether to use a mask token or not. Used in MIM (Masked Image Modeling) loss for FLAVA.
68
+ vocab_size (`int`, *optional*, defaults to 8192):
69
+ Vocabulary size of the [`FlavaImageCodebook`] used in conjunction with [`FlavaImageModel`] for MIM (Masked
70
+ Image Modeling) loss for FLAVA.
71
+
72
+ Example:
73
+
74
+ ```python
75
+ >>> from transformers import FlavaImageConfig, FlavaImageModel
76
+
77
+ >>> # Initializing a FlavaImageModel with style configuration
78
+ >>> configuration = FlavaImageConfig()
79
+
80
+ >>> # Initializing a FlavaImageModel model (with random weights) from the style configuration
81
+ >>> model = FlavaImageModel(configuration)
82
+
83
+ >>> # Accessing the model configuration
84
+ >>> configuration = model.config
85
+ ```"""
86
+
87
+ model_type = "flava_image_model"
88
+ base_config_key = "image_config"
89
+
90
+ def __init__(
91
+ self,
92
+ hidden_size: int = 768,
93
+ num_hidden_layers: int = 12,
94
+ num_attention_heads: int = 12,
95
+ intermediate_size: int = 3072,
96
+ hidden_act: int = "gelu",
97
+ hidden_dropout_prob: float = 0.0,
98
+ attention_probs_dropout_prob: float = 0.0,
99
+ initializer_range: float = 0.02,
100
+ layer_norm_eps: float = 1e-12,
101
+ image_size: int = 224,
102
+ patch_size: int = 16,
103
+ num_channels: int = 3,
104
+ qkv_bias: bool = True,
105
+ mask_token: bool = True,
106
+ vocab_size: int = 8192,
107
+ **kwargs,
108
+ ):
109
+ super().__init__(**kwargs)
110
+
111
+ self.hidden_size = hidden_size
112
+ self.num_hidden_layers = num_hidden_layers
113
+ self.num_attention_heads = num_attention_heads
114
+ self.intermediate_size = intermediate_size
115
+ self.hidden_act = hidden_act
116
+ self.hidden_dropout_prob = hidden_dropout_prob
117
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
118
+ self.initializer_range = initializer_range
119
+ self.layer_norm_eps = layer_norm_eps
120
+ self.image_size = image_size
121
+ self.patch_size = patch_size
122
+ self.num_channels = num_channels
123
+ self.qkv_bias = qkv_bias
124
+ self.mask_token = mask_token
125
+ self.vocab_size = vocab_size
126
+
127
+
128
+ class FlavaTextConfig(PretrainedConfig):
129
+ r"""
130
+ This is the configuration class to store the configuration of a [`FlavaTextModel`]. It is used to instantiate an
131
+ FLAVA model according to the specified arguments, defining the model architecture.
132
+
133
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
134
+ [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
135
+
136
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
137
+ documentation from [`PretrainedConfig`] for more information.
138
+
139
+
140
+ Args:
141
+ vocab_size (`int`, *optional*, defaults to 30522):
142
+ Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
143
+ `inputs_ids` passed when calling [`FlavaTextModel`].
144
+ type_vocab_size (`int`, *optional*, defaults to 2):
145
+ The vocabulary size of the `token_type_ids` passed when calling [`FlavaTextModel`]. Note that even though
146
+ the text encoder allows 2 token types, for text-only pretraining and fine-tuning only 1 is
147
+ used, similar to RoBERTa.
148
+ max_position_embeddings (`int`, *optional*, defaults to 512):
149
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
150
+ just in case (e.g., 512 or 1024 or 2048). For VL, max_length passed to model is 77.
151
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
152
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
153
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
154
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
155
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
156
+ with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
157
+ hidden_size (`int`, *optional*, defaults to 768):
158
+ Dimensionality of the encoder layers and the pooler layer.
159
+ num_hidden_layers (`int`, *optional*, defaults to 12):
160
+ Number of hidden layers in the Transformer encoder.
161
+ num_attention_heads (`int`, *optional*, defaults to 12):
162
+ Number of attention heads for each attention layer in the Transformer encoder.
163
+ intermediate_size (`int`, *optional*, defaults to 3072):
164
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
165
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
166
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
167
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
168
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
169
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
170
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
171
+ The dropout ratio for the attention probabilities.
172
+ initializer_range (`float`, *optional*, defaults to 0.02):
173
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
174
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
175
+ The epsilon used by the layer normalization layers.
176
+ image_size (`int`, *optional*, defaults to 224):
177
+ The size (resolution) of each image.
178
+ patch_size (`int`, *optional*, defaults to 16):
179
+ The size (resolution) of each patch.
180
+ num_channels (`int`, *optional*, defaults to 3):
181
+ The number of input channels.
182
+ qkv_bias (`bool`, *optional*, defaults to `True`):
183
+ Whether to add a bias to the queries, keys and values.
184
+
185
+ Example:
186
+
187
+ ```python
188
+ >>> from transformers import FlavaTextConfig, FlavaTextModel
189
+
190
+ >>> # Initializing a FlavaTextModel with style configuration
191
+ >>> configuration = FlavaTextConfig()
192
+
193
+ >>> # Initializing a FlavaTextModel model (with random weights) from the style configuration
194
+ >>> model = FlavaTextModel(configuration)
195
+
196
+ >>> # Accessing the model configuration
197
+ >>> configuration = model.config
198
+ ```"""
199
+
200
+ model_type = "flava_text_model"
201
+ base_config_key = "text_config"
202
+
203
+ def __init__(
204
+ self,
205
+ vocab_size: int = 30522,
206
+ type_vocab_size: int = 2,
207
+ max_position_embeddings: int = 512,
208
+ position_embedding_type: str = "absolute",
209
+ hidden_size: int = 768,
210
+ num_hidden_layers: int = 12,
211
+ num_attention_heads: int = 12,
212
+ intermediate_size: int = 3072,
213
+ hidden_act: str = "gelu",
214
+ hidden_dropout_prob: float = 0.0,
215
+ attention_probs_dropout_prob: float = 0.0,
216
+ initializer_range: float = 0.02,
217
+ layer_norm_eps: float = 1e-12,
218
+ pad_token_id: int = 0,
219
+ qkv_bias: bool = True,
220
+ **kwargs,
221
+ ):
222
+ super().__init__(**kwargs)
223
+
224
+ self.vocab_size = vocab_size
225
+ self.type_vocab_size = type_vocab_size
226
+ self.max_position_embeddings = max_position_embeddings
227
+ self.position_embedding_type = position_embedding_type
228
+ self.hidden_size = hidden_size
229
+ self.num_hidden_layers = num_hidden_layers
230
+ self.num_attention_heads = num_attention_heads
231
+ self.intermediate_size = intermediate_size
232
+ self.hidden_act = hidden_act
233
+ self.hidden_dropout_prob = hidden_dropout_prob
234
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
235
+ self.initializer_range = initializer_range
236
+ self.layer_norm_eps = layer_norm_eps
237
+ self.qkv_bias = qkv_bias
238
+ self.pad_token_id = pad_token_id
239
+
240
+
241
+ class FlavaMultimodalConfig(PretrainedConfig):
242
+ r"""
243
+ This is the configuration class to store the configuration of a [`FlavaMultimodalModel`]. It is used to instantiate
244
+ an FLAVA model according to the specified arguments, defining the model architecture.
245
+
246
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
247
+ [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
248
+
249
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
250
+ documentation from [`PretrainedConfig`] for more information.
251
+
252
+
253
+ Args:
254
+ hidden_size (`int`, *optional*, defaults to 768):
255
+ Dimensionality of the encoder layers and the pooler layer.
256
+ num_hidden_layers (`int`, *optional*, defaults to 6):
257
+ Number of hidden layers in the Transformer encoder.
258
+ num_attention_heads (`int`, *optional*, defaults to 12):
259
+ Number of attention heads for each attention layer in the Transformer encoder.
260
+ intermediate_size (`int`, *optional*, defaults to 3072):
261
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
262
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
263
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
264
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
265
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
266
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
267
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
268
+ The dropout ratio for the attention probabilities.
269
+ initializer_range (`float`, *optional*, defaults to 0.02):
270
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
271
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
272
+ The epsilon used by the layer normalization layers.
273
+ qkv_bias (`bool`, *optional*, defaults to `True`):
274
+ Whether to add a bias to the queries, keys and values.
275
+ use_cls_token (`bool`, *optional*, defaults to `True`):
276
+ Whether to use an extra CLS token for multimodal settings. Usually needed by the FLAVA model.
277
+
278
+
279
+ Example:
280
+
281
+ ```python
282
+ >>> from transformers import FlavaMultimodalConfig, FlavaMultimodalModel
283
+
284
+ >>> # Initializing a FlavaMultimodalModel with style configuration
285
+ >>> configuration = FlavaMultimodalConfig()
286
+
287
+ >>> # Initializing a FlavaMultimodalModel model (with random weights) from the style configuration
288
+ >>> model = FlavaMultimodalModel(configuration)
289
+
290
+ >>> # Accessing the model configuration
291
+ >>> configuration = model.config
292
+ ```"""
293
+
294
+ model_type = "flava_multimodal_model"
295
+ base_config_key = "multimodal_config"
296
+
297
+ def __init__(
298
+ self,
299
+ hidden_size: int = 768,
300
+ num_hidden_layers: int = 6,
301
+ num_attention_heads: int = 12,
302
+ intermediate_size: int = 3072,
303
+ hidden_act: int = "gelu",
304
+ hidden_dropout_prob: float = 0.0,
305
+ attention_probs_dropout_prob: float = 0.0,
306
+ initializer_range: float = 0.02,
307
+ layer_norm_eps: float = 1e-12,
308
+ qkv_bias: bool = True,
309
+ use_cls_token: bool = True,
310
+ **kwargs,
311
+ ):
312
+ super().__init__(**kwargs)
313
+
314
+ self.hidden_size = hidden_size
315
+ self.num_hidden_layers = num_hidden_layers
316
+ self.num_attention_heads = num_attention_heads
317
+ self.intermediate_size = intermediate_size
318
+ self.hidden_act = hidden_act
319
+ self.hidden_dropout_prob = hidden_dropout_prob
320
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
321
+ self.initializer_range = initializer_range
322
+ self.layer_norm_eps = layer_norm_eps
323
+ self.qkv_bias = qkv_bias
324
+ self.use_cls_token = use_cls_token
325
+
326
+
327
+ class FlavaImageCodebookConfig(PretrainedConfig):
328
+ model_type = "flava_image_codebook"
329
+ base_config_key = "image_codebook_config"
330
+
331
+ r"""
332
+ [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
333
+ is used to instantiate an FLAVA model according to the specified arguments, defining the model architecture.
334
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
335
+ [facebook/flava-image-codebook](https://huggingface.co/facebook/flava-image-codebook) architecture.
336
+
337
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
338
+ documentation from [`PretrainedConfig`] for more information.
339
+
340
+ Args:
341
+ num_groups (`int`, *optional*, defaults to 4):
342
+ Number of groups to be created. This parameter currently does not affect the model and is only used for some
343
+ internal calculations and estimates.
344
+ input_channels (`int`, *optional*, defaults to 3):
345
+ Number of channels in the image to be passed.
346
+ num_blocks_per_group (`int`, *optional*, defaults to 2):
347
+ Number of conv-based blocks per group.
348
+ hidden_size (`int`, *optional*, defaults to 256):
349
+ Size of hidden dim for the blocks.
350
+ vocab_size (`int`, *optional*, defaults to 8192):
351
+ Size of the output vocabulary for the codebook.
352
+ freeze (`bool`, defaults to `True`):
353
+ Whether to freeze the weights of the model.
354
+ initializer_range (`float`, *optional*, defaults to 0.02):
355
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
356
+ kwargs (*optional*):
357
+ Dictionary of keyword arguments.
358
+
359
+ Example:
360
+
361
+ ```python
362
+ >>> from transformers import FlavaImageCodebookConfig, FlavaImageCodebook
363
+
364
+ >>> # Initializing a FlavaImageCodebook with style configuration
365
+ >>> configuration = FlavaImageCodebookConfig()
366
+
367
+ >>> # Initializing a FlavaImageCodebook model (with random weights) from the style configuration
368
+ >>> model = FlavaImageCodebook(configuration)
369
+ >>> # Accessing the model configuration
370
+ >>> configuration = model.config
371
+ ```
372
+ """
373
+
374
+ def __init__(
375
+ self,
376
+ num_groups: int = 4,
377
+ input_channels: int = 3,
378
+ num_blocks_per_group: int = 2,
379
+ hidden_size: int = 256,
380
+ vocab_size: int = 8192,
381
+ freeze: bool = True,
382
+ initializer_range: float = 0.02,
383
+ **kwargs,
384
+ ):
385
+ super().__init__(**kwargs)
386
+ self.num_groups = num_groups
387
+ self.input_channels = input_channels
388
+ self.num_blocks_per_group = num_blocks_per_group
389
+ self.hidden_size = hidden_size
390
+ self.vocab_size = vocab_size
391
+ self.freeze = freeze
392
+ self.initializer_range = initializer_range
393
+
394
+
395
+ class FlavaConfig(PretrainedConfig):
396
+ r"""
397
+ [`FlavaConfig`] is the configuration class to store the configuration of a [`FlavaModel`]. It is used to
398
+ instantiate a FLAVA model according to the specified arguments, defining the text model, image model, image codebook
399
+ and multimodal model configs. Instantiating a configuration with the defaults will yield a similar configuration to
400
+ that of the FLAVA [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
401
+
402
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
403
+ documentation from [`PretrainedConfig`] for more information.
404
+
405
+ Args:
406
+ text_config (`dict`, *optional*):
407
+ Dictionary of configuration options used to initialize [`FlavaTextConfig`].
408
+ image_config (`dict`, *optional*):
409
+ Dictionary of configuration options used to initialize [`FlavaImageConfig`].
410
+ multimodal_config (`dict`, *optional*):
411
+ Dictionary of configuration options used to initialize [`FlavaMultimodalConfig`].
+ image_codebook_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`FlavaImageCodebookConfig`].
412
+ hidden_size (`int`, *optional*, defaults to 768):
413
+ Dimensionality of the encoder layers and the pooler layer.
414
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
415
+ The epsilon used by the layer normalization layers.
416
+ projection_dim (`int`, *optional*, defaults to 768):
417
+ Dimensionality of text and image projection layers.
418
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
419
+ The initial value of the *logit_scale* parameter. Default is used as per the original FLAVA/CLIP
420
+ implementation.
421
+ initializer_range (`float`, *optional*, defaults to 0.02):
422
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
423
+ ce_ignore_index (`int`, *optional*, defaults to -100):
424
+ Cross entropy index to ignore.
425
+ mim_weight (`float`, *optional*, defaults to 1.0):
426
+ Weight to be assigned to MIM (Masked Image Modeling) unimodal loss.
427
+ mlm_weight (`float`, *optional*, defaults to 1.0):
428
+ Weight to be assigned to MLM (Masked Language Modeling) unimodal loss.
429
+ global_contrastive_weight (`float`, *optional*, defaults to 1.0):
430
+ Weight to be assigned to global contrastive cross-alignment loss.
431
+ itm_weight (`float`, *optional*, defaults to 1.0):
432
+ Weight to be assigned to image-text matching multimodal loss.
433
+ mmm_image_weight (`float`, *optional*, defaults to 1.0):
434
+ Weight to be assigned to MMM loss's image part.
435
+ mmm_text_weight (`float`, *optional*, defaults to 1.0):
436
+ Weight to be assigned to MMM loss's text part.
437
+ global_backprop_contrastive (`bool`, *optional*, defaults to `True`):
438
+ Whether to use global backpropagation through all workers in contrastive loss.
439
+ skip_unmasked_multimodal_encoder (`bool`, *optional*, defaults to `True`):
440
+ Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
441
+ return_loss (`bool`, *optional*, defaults to `True`):
442
+ Whether to return the loss or not.
443
+
444
+ kwargs (*optional*):
445
+ Dictionary of keyword arguments.
446
+
447
+ Example:
448
+
449
+ ```python
450
+ >>> from transformers import FlavaConfig, FlavaModel, FlavaForPreTraining
451
+
452
+ >>> # Initializing a FlavaConfig with style configuration
453
+ >>> configuration = FlavaConfig()
454
+
455
+ >>> # Initializing a FlavaModel and FlavaForPreTraining model (with random weights) from the style configuration
456
+ >>> model = FlavaModel(configuration)
457
+ >>> model_pre = FlavaForPreTraining(configuration)
458
+
459
+ >>> # Accessing the model configuration
460
+ >>> configuration = model.config
461
+ >>> configuration_pre = model_pre.config
462
+ ```
463
+ """
464
+
465
+ model_type = "flava"
466
+ sub_configs = {
467
+ "text_config": FlavaTextConfig,
468
+ "image_config": FlavaImageConfig,
469
+ "multimodal_config": FlavaMultimodalConfig,
470
+ "image_codebook_config": FlavaImageCodebookConfig,
471
+ }
472
+
473
+ def __init__(
474
+ self,
475
+ image_config: Dict[str, Any] = None,
476
+ text_config: Dict[str, Any] = None,
477
+ multimodal_config: Dict[str, Any] = None,
478
+ image_codebook_config: Dict[str, Any] = None,
479
+ hidden_size: int = 768,
480
+ layer_norm_eps: float = 1e-12,
481
+ projection_dim: int = 768,
482
+ init_codebook: bool = True,
483
+ logit_scale_init_value: float = 2.6592,
484
+ initializer_range: float = 0.02,
485
+ ce_ignore_index: int = -100,
486
+ mim_weight: float = 1.0,
487
+ mlm_weight: float = 1.0,
488
+ global_contrastive_weight: float = 1.0,
489
+ itm_weight: float = 1.0,
490
+ mmm_image_weight: float = 1.0,
491
+ mmm_text_weight: float = 1.0,
492
+ global_backprop_contrastive: bool = True,
493
+ skip_unmasked_multimodal_encoder: bool = True,
494
+ return_loss: bool = True,
495
+ **kwargs,
496
+ ):
497
+ # If `_config_dict` exist, we use them for the backward compatibility.
498
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
499
+ # of confusion!).
500
+ text_config_dict = kwargs.pop("text_config_dict", None)
501
+ image_config_dict = kwargs.pop("image_config_dict", None)
502
+ multimodal_config_dict = kwargs.pop("multimodal_config_dict", None)
503
+ image_codebook_config_dict = kwargs.pop("image_codebook_config_dict", None)
504
+
505
+ super().__init__(**kwargs)
506
+
507
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
508
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
509
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
510
+ if text_config_dict is not None:
511
+ if text_config is None:
512
+ text_config = {}
513
+
514
+ # This is the complete result when using `text_config_dict`.
515
+ _text_config_dict = FlavaTextConfig(**text_config_dict).to_dict()
516
+
517
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
518
+ for key, value in _text_config_dict.items():
519
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
520
+ # If specified in `text_config_dict`
521
+ if key in text_config_dict:
522
+ message = (
523
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
524
+ f'The value `text_config_dict["{key}"]` will be used instead.'
525
+ )
526
+ # If inferred from default argument values (just to be super careful)
527
+ else:
528
+ message = (
529
+ f"`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The "
530
+ f'value `text_config["{key}"]` will be overridden.'
531
+ )
532
+ logger.info(message)
533
+
534
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
535
+ text_config.update(_text_config_dict)
536
+
537
+ if image_config_dict is not None:
538
+ if image_config is None:
539
+ image_config = {}
540
+
541
+ # This is the complete result when using `image_config_dict`.
542
+ _image_config_dict = FlavaImageConfig(**image_config_dict).to_dict()
543
+ # convert keys to string instead of integer
544
+ if "id2label" in _image_config_dict:
545
+ _image_config_dict["id2label"] = {
546
+ str(key): value for key, value in _image_config_dict["id2label"].items()
547
+ }
548
+
549
+ # Give a warning if the values exist in both `_image_config_dict` and `image_config` but being different.
550
+ for key, value in _image_config_dict.items():
551
+ if key in image_config and value != image_config[key] and key not in ["transformers_version"]:
552
+ # If specified in `image_config_dict`
553
+ if key in image_config_dict:
554
+ message = (
555
+ f"`{key}` is found in both `image_config_dict` and `image_config` but with different "
556
+ f'values. The value `image_config_dict["{key}"]` will be used instead.'
557
+ )
558
+ # If inferred from default argument values (just to be super careful)
559
+ else:
560
+ message = (
561
+ f"`image_config_dict` is provided which will be used to initialize `FlavaImageConfig`. "
562
+ f'The value `image_config["{key}"]` will be overridden.'
563
+ )
564
+ logger.info(message)
565
+
566
+ # Update all values in `image_config` with the ones in `_image_config_dict`.
567
+ image_config.update(_image_config_dict)
568
+
569
+ if multimodal_config_dict is not None:
570
+ if multimodal_config is None:
571
+ multimodal_config = {}
572
+
573
+ # This is the complete result when using `multimodal_config_dict`.
574
+ _multimodal_config_dict = FlavaMultimodalConfig(**multimodal_config_dict).to_dict()
575
+
576
+ # Give a warning if the values exist in both `_multimodal_config_dict` and `multimodal_config` but being
577
+ # different.
578
+ for key, value in _multimodal_config_dict.items():
579
+ if (
580
+ key in multimodal_config
581
+ and value != multimodal_config[key]
582
+ and key not in ["transformers_version"]
583
+ ):
584
+ # If specified in `multimodal_config_dict`
585
+ if key in multimodal_config_dict:
586
+ message = (
587
+ f"`{key}` is found in both `multimodal_config_dict` and `multimodal_config` but with "
588
+ f'different values. The value `multimodal_config_dict["{key}"]` will be used instead.'
589
+ )
590
+ # If inferred from default argument values (just to be super careful)
591
+ else:
592
+ message = (
593
+ f"`multimodal_config_dict` is provided which will be used to initialize "
594
+ f'`FlavaMultimodalConfig`. The value `multimodal_config["{key}"]` will be overridden.'
595
+ )
596
+ logger.info(message)
597
+
598
+ # Update all values in `multimodal_config` with the ones in `_multimodal_config_dict`.
599
+ multimodal_config.update(_multimodal_config_dict)
600
+
601
+ if image_codebook_config_dict is not None:
602
+ if image_codebook_config is None:
603
+ image_codebook_config = {}
604
+
605
+ # This is the complete result when using `image_codebook_config_dict`.
606
+ _image_codebook_config_dict = FlavaImageCodebookConfig(**image_codebook_config_dict).to_dict()
607
+
608
+ # Give a warning if the values exist in both `_image_codebook_config_dict` and `image_codebook_config` but
609
+ # being different.
610
+ for key, value in _image_codebook_config_dict.items():
611
+ if (
612
+ key in image_codebook_config
613
+ and value != image_codebook_config[key]
614
+ and key not in ["transformers_version"]
615
+ ):
616
+ # If specified in `image_codebook_config_dict`
617
+ if key in image_codebook_config_dict:
618
+ message = (
619
+ f"`{key}` is found in both `image_codebook_config_dict` and `image_codebook_config` but "
620
+ f'with different values. The value `image_codebook_config_dict["{key}"]` will be used '
621
+ "instead."
622
+ )
623
+ # If inferred from default argument values (just to be super careful)
624
+ else:
625
+ message = (
626
+ f"`image_codebook_config_dict` is provided which will be used to initialize "
627
+ f'`FlavaImageCodebookConfig`. The value `image_codebook_config["{key}"]` will be overridden.'
628
+ )
629
+ logger.info(message)
630
+
631
+ # Update all values in `image_codebook_config` with the ones in `_image_codebook_config_dict`.
632
+ image_codebook_config.update(_image_codebook_config_dict)
633
+
634
+ if image_config is None:
635
+ image_config = {}
636
+ logger.info("`image_config` is `None`. Initializing the `FlavaImageConfig` with default values.")
637
+
638
+ if text_config is None:
639
+ text_config = {}
640
+ logger.info("`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.")
641
+
642
+ if multimodal_config is None:
643
+ multimodal_config = {}
644
+ logger.info("`multimodal_config` is `None`. Initializing the `FlavaMultimodalConfig` with default values.")
645
+
646
+ if image_codebook_config is None:
647
+ image_codebook_config = {}
648
+ logger.info(
649
+ "`image_codebook_config` is `None`. Initializing the `FlavaImageCodebookConfig` with default values."
650
+ )
651
+
652
+ self.image_config = FlavaImageConfig(**image_config)
653
+ self.text_config = FlavaTextConfig(**text_config)
654
+ self.multimodal_config = FlavaMultimodalConfig(**multimodal_config)
655
+ self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config)
656
+ self.projection_dim = projection_dim
657
+ self.init_codebook = init_codebook
658
+
659
+ self.hidden_size = hidden_size
660
+ self.layer_norm_eps = layer_norm_eps
661
+ self.initializer_range = initializer_range
662
+ self.logit_scale_init_value = logit_scale_init_value
663
+ self.initializer_factor = 1.0
664
+ self.ce_ignore_index = ce_ignore_index
665
+ self.mim_weight = mim_weight
666
+ self.mlm_weight = mlm_weight
667
+ self.global_contrastive_weight = global_contrastive_weight
668
+ self.itm_weight = itm_weight
669
+ self.mmm_image_weight = mmm_image_weight
670
+ self.mmm_text_weight = mmm_text_weight
671
+ self.global_backprop_contrastive = global_backprop_contrastive
672
+ self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder
673
+ self.return_loss = return_loss
674
+
675
+ @classmethod
676
+ def from_configs(
677
+ cls,
678
+ image_config: FlavaImageConfig,
679
+ text_config: FlavaTextConfig,
680
+ multimodal_config: FlavaMultimodalConfig,
681
+ image_codebook_config: FlavaImageCodebookConfig,
682
+ **kwargs,
683
+ ):
684
+ r"""
685
+ Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model
686
+ configuration, flava multimodal model and flava codebook model configuration.
687
+
688
+ Returns:
689
+ [`FlavaConfig`]: An instance of a configuration object
690
+ """
691
+
692
+ return cls(
693
+ image_config=image_config.to_dict(),
694
+ text_config=text_config.to_dict(),
695
+ multimodal_config=multimodal_config.to_dict(),
696
+ image_codebook_config=image_codebook_config.to_dict(),
697
+ **kwargs,
698
+ )
699
+
700
+
701
+ __all__ = ["FlavaConfig", "FlavaImageCodebookConfig", "FlavaImageConfig", "FlavaMultimodalConfig", "FlavaTextConfig"]
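For quick reference, a minimal usage sketch (not part of the file above) of composing a full FLAVA configuration from customized sub-configurations via the `from_configs` classmethod defined at the end of this file; the specific argument values are only illustrative:

```python
from transformers import (
    FlavaConfig,
    FlavaImageCodebookConfig,
    FlavaImageConfig,
    FlavaModel,
    FlavaMultimodalConfig,
    FlavaTextConfig,
)

# Compose a combined FLAVA config from individually customized sub-configs.
config = FlavaConfig.from_configs(
    image_config=FlavaImageConfig(),
    text_config=FlavaTextConfig(vocab_size=30522),
    multimodal_config=FlavaMultimodalConfig(num_hidden_layers=6),
    image_codebook_config=FlavaImageCodebookConfig(),
)

# The resulting config can be used to instantiate a randomly initialized model.
model = FlavaModel(config)
```

Equivalently, plain dictionaries can be passed to `FlavaConfig(image_config=..., text_config=..., ...)`; `from_configs` simply converts each sub-config with `to_dict()` before forwarding it.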
docs/transformers/build/lib/transformers/models/flava/convert_dalle_to_flava_codebook.py ADDED
@@ -0,0 +1,102 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os
18
+
19
+ import torch
20
+
21
+ from transformers import FlavaImageCodebook, FlavaImageCodebookConfig
22
+
23
+
24
+ def rreplace(s, old, new, occurrence):
25
+ li = s.rsplit(old, occurrence)
26
+ return new.join(li)
27
+
28
+
29
+ def count_parameters(state_dict):
30
+ # encoder.embeddings are double copied in original FLAVA
31
+ return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items())
32
+
33
+
34
+ def upgrade_state_dict(state_dict):
35
+ upgrade = {}
36
+
37
+ group_keys = ["group_1", "group_2", "group_3", "group_4"]
38
+ for key, value in state_dict.items():
39
+ for group_key in group_keys:
40
+ if group_key in key:
41
+ key = key.replace(f"{group_key}.", f"{group_key}.group.")
42
+
43
+ if "res_path" in key:
44
+ key = key.replace("res_path.", "res_path.path.")
45
+
46
+ if key.endswith(".w"):
47
+ key = rreplace(key, ".w", ".weight", 1)
48
+ if key.endswith(".b"):
49
+ key = rreplace(key, ".b", ".bias", 1)
50
+
51
+ upgrade[key] = value.float()
52
+
53
+ return upgrade
54
+
55
+
56
+ @torch.no_grad()
57
+ def convert_dalle_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, save_checkpoint=True):
58
+ """
59
+ Copy/paste/tweak model's weights to transformers design.
60
+ """
61
+ from dall_e import Encoder
62
+
63
+ encoder = Encoder()
64
+ if os.path.exists(checkpoint_path):
65
+ ckpt = torch.load(checkpoint_path, weights_only=True)
66
+ else:
67
+ ckpt = torch.hub.load_state_dict_from_url(checkpoint_path)
68
+
69
+ if isinstance(ckpt, Encoder):
70
+ ckpt = ckpt.state_dict()
71
+ encoder.load_state_dict(ckpt)
72
+
73
+ if config_path is not None:
74
+ config = FlavaImageCodebookConfig.from_pretrained(config_path)
75
+ else:
76
+ config = FlavaImageCodebookConfig()
77
+
78
+ hf_model = FlavaImageCodebook(config).eval()
79
+ state_dict = encoder.state_dict()
80
+
81
+ hf_state_dict = upgrade_state_dict(state_dict)
82
+ hf_model.load_state_dict(hf_state_dict)
83
+ hf_state_dict = hf_model.state_dict()
84
+ hf_count = count_parameters(hf_state_dict)
85
+ state_dict_count = count_parameters(state_dict)
86
+
87
+ assert torch.allclose(hf_count, state_dict_count, atol=1e-3)
88
+
89
+ if save_checkpoint:
90
+ hf_model.save_pretrained(pytorch_dump_folder_path)
91
+ else:
92
+ return hf_state_dict
93
+
94
+
95
+ if __name__ == "__main__":
96
+ parser = argparse.ArgumentParser()
97
+ parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
98
+ parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint")
99
+ parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
100
+ args = parser.parse_args()
101
+
102
+ convert_dalle_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
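A hedged usage sketch for a codebook converted with the script above; the local folder path is hypothetical, and the 112x112 input resolution follows the codebook defaults of `FlavaImageProcessor`:

```python
import torch

from transformers import FlavaImageCodebook

# Load the folder written via --pytorch_dump_folder_path (hypothetical local path).
codebook = FlavaImageCodebook.from_pretrained("./flava_codebook").eval()

# The codebook maps (codebook-preprocessed) pixels to discrete token ids, which
# FLAVA uses as targets for masked image modeling.
codebook_pixel_values = torch.rand(1, 3, 112, 112)
with torch.no_grad():
    token_ids = codebook.get_codebook_indices(codebook_pixel_values)
print(token_ids.shape)  # one token id per position of the codebook grid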
docs/transformers/build/lib/transformers/models/flava/convert_flava_original_pytorch_to_hf.py ADDED
@@ -0,0 +1,99 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os
18
+
19
+ import torch
20
+
21
+ from transformers import FlavaConfig, FlavaForPreTraining
22
+ from transformers.models.flava.convert_dalle_to_flava_codebook import convert_dalle_checkpoint
23
+
24
+
25
+ def count_parameters(state_dict):
26
+ # encoder.embeddings are double copied in original FLAVA
27
+ return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items())
28
+
29
+
30
+ def upgrade_state_dict(state_dict, codebook_state_dict):
31
+ upgrade = {}
32
+
33
+ for key, value in state_dict.items():
34
+ if "text_encoder.embeddings" in key or "image_encoder.embeddings" in key:
35
+ continue
36
+
37
+ key = key.replace("heads.cmd.mim_head.cls.predictions", "mmm_image_head")
38
+ key = key.replace("heads.cmd.mlm_head.cls.predictions", "mmm_text_head")
39
+ key = key.replace("heads.cmd.itm_head.cls", "itm_head")
40
+ key = key.replace("heads.cmd.itm_head.pooler", "itm_head.pooler")
41
+ key = key.replace("heads.cmd.clip_head.logit_scale", "flava.logit_scale")
42
+ key = key.replace("heads.fairseq_mlm.cls.predictions", "mlm_head")
43
+ key = key.replace("heads.imagenet.mim_head.cls.predictions", "mim_head")
44
+ key = key.replace("mm_text_projection", "flava.text_to_mm_projection")
45
+ key = key.replace("mm_image_projection", "flava.image_to_mm_projection")
46
+ key = key.replace("image_encoder.module", "flava.image_model")
47
+ key = key.replace("text_encoder.module", "flava.text_model")
48
+ key = key.replace("mm_encoder.module.encoder.cls_token", "flava.multimodal_model.cls_token")
49
+ key = key.replace("mm_encoder.module", "flava.multimodal_model")
50
+ key = key.replace("text_projection", "flava.text_projection")
51
+ key = key.replace("image_projection", "flava.image_projection")
52
+
53
+ upgrade[key] = value.float()
54
+
55
+ for key, value in codebook_state_dict.items():
56
+ upgrade[f"image_codebook.{key}"] = value
57
+
58
+ return upgrade
59
+
60
+
61
+ @torch.no_grad()
62
+ def convert_flava_checkpoint(checkpoint_path, codebook_path, pytorch_dump_folder_path, config_path=None):
63
+ """
64
+ Copy/paste/tweak model's weights to transformers design.
65
+ """
66
+ if config_path is not None:
67
+ config = FlavaConfig.from_pretrained(config_path)
68
+ else:
69
+ config = FlavaConfig()
70
+
71
+ hf_model = FlavaForPreTraining(config).eval()
72
+
73
+ codebook_state_dict = convert_dalle_checkpoint(codebook_path, None, save_checkpoint=False)
74
+
75
+ if os.path.exists(checkpoint_path):
76
+ state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
77
+ else:
78
+ state_dict = torch.hub.load_state_dict_from_url(checkpoint_path, map_location="cpu")
79
+
80
+ hf_state_dict = upgrade_state_dict(state_dict, codebook_state_dict)
81
+ hf_model.load_state_dict(hf_state_dict)
82
+ hf_state_dict = hf_model.state_dict()
83
+ hf_count = count_parameters(hf_state_dict)
84
+ state_dict_count = count_parameters(state_dict) + count_parameters(codebook_state_dict)
85
+
86
+ assert torch.allclose(hf_count, state_dict_count, atol=1e-3)
87
+
88
+ hf_model.save_pretrained(pytorch_dump_folder_path)
89
+
90
+
91
+ if __name__ == "__main__":
92
+ parser = argparse.ArgumentParser()
93
+ parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
94
+ parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint")
95
+ parser.add_argument("--codebook_path", default=None, type=str, help="Path to flava codebook checkpoint")
96
+ parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
97
+ args = parser.parse_args()
98
+
99
+ convert_flava_checkpoint(args.checkpoint_path, args.codebook_path, args.pytorch_dump_folder_path, args.config_path)
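A short inference sketch for a converted checkpoint; `facebook/flava-full` is the published Hub id (a locally converted `--pytorch_dump_folder_path` can be loaded the same way), and the COCO image URL is only an example:

```python
import requests
import torch
from PIL import Image

from transformers import FlavaModel, FlavaProcessor

model = FlavaModel.from_pretrained("facebook/flava-full").eval()
processor = FlavaProcessor.from_pretrained("facebook/flava-full")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of two cats"], images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# Unimodal embeddings are exposed separately from the multimodal ones.
print(outputs.image_embeddings.shape, outputs.text_embeddings.shape)
```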
docs/transformers/build/lib/transformers/models/flava/feature_extraction_flava.py ADDED
@@ -0,0 +1,38 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Feature extractor class for FLAVA."""
16
+
17
+ import warnings
18
+
19
+ from ...utils import logging
20
+ from ...utils.import_utils import requires
21
+ from .image_processing_flava import FlavaImageProcessor
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ @requires(backends=("vision",))
28
+ class FlavaFeatureExtractor(FlavaImageProcessor):
29
+ def __init__(self, *args, **kwargs) -> None:
30
+ warnings.warn(
31
+ "The class FlavaFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
32
+ " use FlavaImageProcessor instead.",
33
+ FutureWarning,
34
+ )
35
+ super().__init__(*args, **kwargs)
36
+
37
+
38
+ __all__ = ["FlavaFeatureExtractor"]
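A minimal sketch of the migration the deprecation warning points to, using `FlavaImageProcessor` as the drop-in replacement; the blank image is only a placeholder input:

```python
import numpy as np
from PIL import Image

from transformers import FlavaImageProcessor

image_processor = FlavaImageProcessor.from_pretrained("facebook/flava-full")

# Any PIL image, numpy array, or torch tensor works; a blank image is used here.
image = Image.fromarray(np.zeros((256, 256, 3), dtype=np.uint8))
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```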
docs/transformers/build/lib/transformers/models/flava/image_processing_flava.py ADDED
@@ -0,0 +1,705 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for Flava."""
16
+
17
+ import math
18
+ import random
19
+ from functools import lru_cache
20
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+
24
+ from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
25
+ from ...image_transforms import resize, to_channel_dimension_format
26
+ from ...image_utils import (
27
+ OPENAI_CLIP_MEAN,
28
+ OPENAI_CLIP_STD,
29
+ ChannelDimension,
30
+ ImageInput,
31
+ PILImageResampling,
32
+ infer_channel_dimension_format,
33
+ is_scaled_image,
34
+ make_list_of_images,
35
+ to_numpy_array,
36
+ valid_images,
37
+ validate_preprocess_arguments,
38
+ )
39
+ from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
40
+ from ...utils.import_utils import requires
41
+
42
+
43
+ if is_vision_available():
44
+ import PIL
45
+
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+
50
+ # These values are taken from CLIP
51
+ FLAVA_IMAGE_MEAN = OPENAI_CLIP_MEAN
52
+ FLAVA_IMAGE_STD = OPENAI_CLIP_STD
53
+ FLAVA_CODEBOOK_MEAN = [0.0, 0.0, 0.0]
54
+ FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
55
+ LOGIT_LAPLACE_EPS: float = 0.1
56
+
57
+
58
+ # Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
59
+ class FlavaMaskingGenerator:
60
+ def __init__(
61
+ self,
62
+ input_size: Union[int, Tuple[int, int]] = 14,
63
+ total_mask_patches: int = 75,
64
+ mask_group_max_patches: Optional[int] = None,
65
+ mask_group_min_patches: int = 16,
66
+ mask_group_min_aspect_ratio: Optional[float] = 0.3,
67
+ mask_group_max_aspect_ratio: Optional[float] = None,
68
+ ):
69
+ if not isinstance(input_size, tuple):
70
+ input_size = (input_size,) * 2
71
+ self.height, self.width = input_size
72
+
73
+ self.num_patches = self.height * self.width
74
+ self.total_mask_patches = total_mask_patches
75
+
76
+ self.mask_group_min_patches = mask_group_min_patches
77
+ self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
78
+
79
+ mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
80
+ self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
81
+
82
+ def __repr__(self):
83
+ repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
84
+ self.height,
85
+ self.width,
86
+ self.mask_group_min_patches,
87
+ self.mask_group_max_patches,
88
+ self.total_mask_patches,
89
+ self.log_aspect_ratio[0],
90
+ self.log_aspect_ratio[1],
91
+ )
92
+ return repr_str
93
+
94
+ def get_shape(self):
95
+ return self.height, self.width
96
+
97
+ def _mask(self, mask, max_mask_patches):
98
+ delta = 0
99
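+ # Rejection-sample one rectangular block of patches: draw an area and an aspect
+ # ratio, and keep the block only if it fits inside the grid and adds at most
+ # `max_mask_patches` newly masked patches (up to 10 attempts).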
+ for _attempt in range(10):
100
+ target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
101
+ aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
102
+ height = int(round(math.sqrt(target_area * aspect_ratio)))
103
+ width = int(round(math.sqrt(target_area / aspect_ratio)))
104
+ if width < self.width and height < self.height:
105
+ top = random.randint(0, self.height - height)
106
+ left = random.randint(0, self.width - width)
107
+
108
+ num_masked = mask[top : top + height, left : left + width].sum()
109
+ # Overlap
110
+ if 0 < height * width - num_masked <= max_mask_patches:
111
+ for i in range(top, top + height):
112
+ for j in range(left, left + width):
113
+ if mask[i, j] == 0:
114
+ mask[i, j] = 1
115
+ delta += 1
116
+
117
+ if delta > 0:
118
+ break
119
+ return delta
120
+
121
+ def __call__(self):
122
+ mask = np.zeros(shape=self.get_shape(), dtype=int)
123
+ mask_count = 0
124
+ while mask_count < self.total_mask_patches:
125
+ max_mask_patches = self.total_mask_patches - mask_count
126
+ max_mask_patches = min(max_mask_patches, self.mask_group_max_patches)
127
+
128
+ delta = self._mask(mask, max_mask_patches)
129
+ if delta == 0:
130
+ break
131
+ else:
132
+ mask_count += delta
133
+
134
+ return mask
135
+
136
+
137
+ @requires(backends=("vision",))
138
+ class FlavaImageProcessor(BaseImageProcessor):
139
+ r"""
140
+ Constructs a Flava image processor.
141
+
142
+ Args:
143
+ do_resize (`bool`, *optional*, defaults to `True`):
144
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
145
+ `do_resize` parameter in `preprocess`.
146
+ size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
147
+ Size of the image after resizing. Can be overridden by the `size` parameter in `preprocess`.
148
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
149
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in
150
+ `preprocess`.
151
+ do_center_crop (`bool`, *optional*, defaults to `True`):
152
+ Whether to center crop the images. Can be overridden by the `do_center_crop` parameter in `preprocess`.
153
+ crop_size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
154
+ Size of image after the center crop `(crop_size["height"], crop_size["width"])`. Can be overridden by the
155
+ `crop_size` parameter in `preprocess`.
156
+ do_rescale (`bool`, *optional*, defaults to `True`):
157
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
158
+ parameter in `preprocess`.
159
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
160
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in
161
+ `preprocess`.
162
+ do_normalize (`bool`, *optional*, defaults to `True`):
163
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in `preprocess`.
164
+ image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
165
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
166
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
167
+ image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
168
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
169
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
170
+ return_image_mask (`bool`, *optional*, defaults to `False`):
171
+ Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
172
+ input_size_patches (`int`, *optional*, defaults to 14):
173
+ Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
174
+ by the `input_size_patches` parameter in `preprocess`.
175
+ total_mask_patches (`int`, *optional*, defaults to 75):
176
+ Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
177
+ `preprocess`.
178
+ mask_group_min_patches (`int`, *optional*, defaults to 16):
179
+ Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
180
+ parameter in `preprocess`.
181
+ mask_group_max_patches (`int`, *optional*):
182
+ Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
183
+ parameter in `preprocess`.
184
+ mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
185
+ Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
186
+ in `preprocess`.
187
+ mask_group_max_aspect_ratio (`float`, *optional*):
188
+ Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
189
+ in `preprocess`.
190
+ codebook_do_resize (`bool`, *optional*, defaults to `True`):
191
+ Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
192
+ `codebook_do_resize` parameter in `preprocess`.
193
+ codebook_size (`Dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
194
+ Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
195
+ `preprocess`.
196
+ codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
197
+ Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
198
+ parameter in `preprocess`.
199
+ codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
200
+ Whether to crop the input for codebook at the center. If the input size is smaller than
201
+ `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
202
+ overridden by the `codebook_do_center_crop` parameter in `preprocess`.
203
+ codebook_crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
204
+ Desired output size for codebook input when applying center-cropping. Can be overridden by the
205
+ `codebook_crop_size` parameter in `preprocess`.
206
+ codebook_do_rescale (`bool`, *optional*, defaults to `True`):
207
+ Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
208
+ overridden by the `codebook_do_rescale` parameter in `preprocess`.
209
+ codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
210
+ Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
211
+ `codebook_rescale_factor` parameter in `preprocess`.
212
+ codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
213
+ Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
214
+ `codebook_do_map_pixels` parameter in `preprocess`.
215
+ codebook_do_normalize (`bool`, *optional*, defaults to `True`):
216
+ Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
217
+ be overridden by the `codebook_do_normalize` parameter in `preprocess`.
218
+ codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
219
+ The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
220
+ by the `codebook_image_mean` parameter in `preprocess`.
221
+ codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[1.0, 1.0, 1.0]`):
222
+ The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
223
+ be overridden by the `codebook_image_std` parameter in `preprocess`.
224
+ """
225
+
226
+ model_input_names = ["pixel_values"]
227
+
228
+ def __init__(
229
+ self,
230
+ do_resize: bool = True,
231
+ size: Dict[str, int] = None,
232
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
233
+ do_center_crop: bool = True,
234
+ crop_size: Dict[str, int] = None,
235
+ do_rescale: bool = True,
236
+ rescale_factor: Union[int, float] = 1 / 255,
237
+ do_normalize: bool = True,
238
+ image_mean: Optional[Union[float, Iterable[float]]] = None,
239
+ image_std: Optional[Union[float, Iterable[float]]] = None,
240
+ # Mask related params
241
+ return_image_mask: bool = False,
242
+ input_size_patches: int = 14,
243
+ total_mask_patches: int = 75,
244
+ mask_group_min_patches: int = 16,
245
+ mask_group_max_patches: Optional[int] = None,
246
+ mask_group_min_aspect_ratio: float = 0.3,
247
+ mask_group_max_aspect_ratio: Optional[float] = None,
248
+ # Codebook related params
249
+ return_codebook_pixels: bool = False,
250
+ codebook_do_resize: bool = True,
251
+ codebook_size: Optional[Dict[str, int]] = None,
252
+ codebook_resample: PILImageResampling = PILImageResampling.LANCZOS,
253
+ codebook_do_center_crop: bool = True,
254
+ codebook_crop_size: Optional[Dict[str, int]] = None,
255
+ codebook_do_rescale: bool = True,
256
+ codebook_rescale_factor: Union[int, float] = 1 / 255,
257
+ codebook_do_map_pixels: bool = True,
258
+ codebook_do_normalize: bool = True,
259
+ codebook_image_mean: Optional[Union[float, Iterable[float]]] = None,
260
+ codebook_image_std: Optional[Union[float, Iterable[float]]] = None,
261
+ **kwargs,
262
+ ) -> None:
263
+ super().__init__(**kwargs)
264
+ size = size if size is not None else {"height": 224, "width": 224}
265
+ size = get_size_dict(size)
266
+ crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
267
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
268
+
269
+ codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112}
270
+ codebook_size = get_size_dict(codebook_size, param_name="codebook_size")
271
+ codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112}
272
+ codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size")
273
+
274
+ self.do_resize = do_resize
275
+ self.size = size
276
+ self.resample = resample
277
+ self.do_rescale = do_rescale
278
+ self.rescale_factor = rescale_factor
279
+ self.do_center_crop = do_center_crop
280
+ self.crop_size = crop_size
281
+ self.do_normalize = do_normalize
282
+ self.image_mean = image_mean if image_mean is not None else FLAVA_IMAGE_MEAN
283
+ self.image_std = image_std if image_std is not None else FLAVA_IMAGE_STD
284
+
285
+ self.return_image_mask = return_image_mask
286
+ self.input_size_patches = input_size_patches
287
+ self.total_mask_patches = total_mask_patches
288
+ self.mask_group_min_patches = mask_group_min_patches
289
+ self.mask_group_max_patches = mask_group_max_patches
290
+ self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio
291
+ self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio
292
+
293
+ self.return_codebook_pixels = return_codebook_pixels
294
+ self.codebook_do_resize = codebook_do_resize
295
+ self.codebook_size = codebook_size
296
+ self.codebook_resample = codebook_resample
297
+ self.codebook_do_center_crop = codebook_do_center_crop
298
+ self.codebook_crop_size = codebook_crop_size
299
+ self.codebook_do_rescale = codebook_do_rescale
300
+ self.codebook_rescale_factor = codebook_rescale_factor
301
+ self.codebook_do_map_pixels = codebook_do_map_pixels
302
+ self.codebook_do_normalize = codebook_do_normalize
303
+ self.codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else FLAVA_CODEBOOK_MEAN
305
+ self.codebook_image_std = codebook_image_std if codebook_image_std is not None else FLAVA_CODEBOOK_STD
306
+
307
+ @classmethod
308
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
309
+ """
310
+ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
311
+ created using from_dict and kwargs e.g. `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)`
312
+ """
313
+ image_processor_dict = image_processor_dict.copy()
314
+ if "codebook_size" in kwargs:
315
+ image_processor_dict["codebook_size"] = kwargs.pop("codebook_size")
316
+ if "codebook_crop_size" in kwargs:
317
+ image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size")
318
+ return super().from_dict(image_processor_dict, **kwargs)
319
+
320
+ @lru_cache()
321
+ def masking_generator(
322
+ self,
323
+ input_size_patches,
324
+ total_mask_patches,
325
+ mask_group_min_patches,
326
+ mask_group_max_patches,
327
+ mask_group_min_aspect_ratio,
328
+ mask_group_max_aspect_ratio,
329
+ ) -> FlavaMaskingGenerator:
330
+ return FlavaMaskingGenerator(
331
+ input_size=input_size_patches,
332
+ total_mask_patches=total_mask_patches,
333
+ mask_group_min_patches=mask_group_min_patches,
334
+ mask_group_max_patches=mask_group_max_patches,
335
+ mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
336
+ mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
337
+ )
338
+
339
+ # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
340
+ def resize(
341
+ self,
342
+ image: np.ndarray,
343
+ size: Dict[str, int],
344
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
345
+ data_format: Optional[Union[str, ChannelDimension]] = None,
346
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
347
+ **kwargs,
348
+ ) -> np.ndarray:
349
+ """
350
+ Resize an image to `(size["height"], size["width"])`.
351
+
352
+ Args:
353
+ image (`np.ndarray`):
354
+ Image to resize.
355
+ size (`Dict[str, int]`):
356
+ Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
357
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
358
+ `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
359
+ data_format (`ChannelDimension` or `str`, *optional*):
360
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
361
+ image is used. Can be one of:
362
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
363
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
364
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
365
+ input_data_format (`ChannelDimension` or `str`, *optional*):
366
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
367
+ from the input image. Can be one of:
368
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
369
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
370
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
371
+
372
+ Returns:
373
+ `np.ndarray`: The resized image.
374
+ """
375
+ size = get_size_dict(size)
376
+ if "height" not in size or "width" not in size:
377
+ raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
378
+ output_size = (size["height"], size["width"])
379
+ return resize(
380
+ image,
381
+ size=output_size,
382
+ resample=resample,
383
+ data_format=data_format,
384
+ input_data_format=input_data_format,
385
+ **kwargs,
386
+ )
387
+
388
+ def map_pixels(self, image: np.ndarray) -> np.ndarray:
389
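+ # Map pixel values from [0, 1] into [eps, 1 - eps] with eps = LOGIT_LAPLACE_EPS,
+ # the logit-Laplace input transform used for the image codebook.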
+ return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS
390
+
391
+ def _preprocess_image(
392
+ self,
393
+ image: ImageInput,
394
+ do_resize: Optional[bool] = None,
395
+ size: Dict[str, int] = None,
396
+ resample: PILImageResampling = None,
397
+ do_center_crop: Optional[bool] = None,
398
+ crop_size: Dict[str, int] = None,
399
+ do_rescale: Optional[bool] = None,
400
+ rescale_factor: Optional[float] = None,
401
+ do_normalize: Optional[bool] = None,
402
+ image_mean: Optional[Union[float, List[float]]] = None,
403
+ image_std: Optional[Union[float, List[float]]] = None,
404
+ do_map_pixels: Optional[bool] = None,
405
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
406
+ input_data_format: Optional[ChannelDimension] = None,
407
+ ) -> np.ndarray:
408
+ """Preprocesses a single image."""
409
+
410
+ validate_preprocess_arguments(
411
+ do_rescale=do_rescale,
412
+ rescale_factor=rescale_factor,
413
+ do_normalize=do_normalize,
414
+ image_mean=image_mean,
415
+ image_std=image_std,
416
+ do_center_crop=do_center_crop,
417
+ crop_size=crop_size,
418
+ do_resize=do_resize,
419
+ size=size,
420
+ resample=resample,
421
+ )
422
+
423
+ # All transformations expect numpy arrays.
424
+ image = to_numpy_array(image)
425
+
426
+ if do_rescale and is_scaled_image(image):
427
+ logger.warning_once(
428
+ "It looks like you are trying to rescale already rescaled images. If the input"
429
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
430
+ )
431
+
432
+ if input_data_format is None:
433
+ # We assume that all images have the same channel dimension format.
434
+ input_data_format = infer_channel_dimension_format(image)
435
+
436
+ if do_resize:
437
+ image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
438
+
439
+ if do_center_crop:
440
+ image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
441
+
442
+ if do_rescale:
443
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
444
+
445
+ if do_normalize:
446
+ image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
447
+
448
+ if do_map_pixels:
449
+ image = self.map_pixels(image)
450
+
451
+ if data_format is not None:
452
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
453
+ return image
454
+
455
+ @filter_out_non_signature_kwargs()
456
+ def preprocess(
457
+ self,
458
+ images: ImageInput,
459
+ do_resize: Optional[bool] = None,
460
+ size: Dict[str, int] = None,
461
+ resample: PILImageResampling = None,
462
+ do_center_crop: Optional[bool] = None,
463
+ crop_size: Optional[Dict[str, int]] = None,
464
+ do_rescale: Optional[bool] = None,
465
+ rescale_factor: Optional[float] = None,
466
+ do_normalize: Optional[bool] = None,
467
+ image_mean: Optional[Union[float, List[float]]] = None,
468
+ image_std: Optional[Union[float, List[float]]] = None,
469
+ # Mask related params
470
+ return_image_mask: Optional[bool] = None,
471
+ input_size_patches: Optional[int] = None,
472
+ total_mask_patches: Optional[int] = None,
473
+ mask_group_min_patches: Optional[int] = None,
474
+ mask_group_max_patches: Optional[int] = None,
475
+ mask_group_min_aspect_ratio: Optional[float] = None,
476
+ mask_group_max_aspect_ratio: Optional[float] = None,
477
+ # Codebook related params
478
+ return_codebook_pixels: Optional[bool] = None,
479
+ codebook_do_resize: Optional[bool] = None,
480
+ codebook_size: Optional[Dict[str, int]] = None,
481
+ codebook_resample: Optional[int] = None,
482
+ codebook_do_center_crop: Optional[bool] = None,
483
+ codebook_crop_size: Optional[Dict[str, int]] = None,
484
+ codebook_do_rescale: Optional[bool] = None,
485
+ codebook_rescale_factor: Optional[float] = None,
486
+ codebook_do_map_pixels: Optional[bool] = None,
487
+ codebook_do_normalize: Optional[bool] = None,
488
+ codebook_image_mean: Optional[Iterable[float]] = None,
489
+ codebook_image_std: Optional[Iterable[float]] = None,
490
+ return_tensors: Optional[Union[str, TensorType]] = None,
491
+ data_format: ChannelDimension = ChannelDimension.FIRST,
492
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
493
+ ) -> BatchFeature:
494
+ """
495
+ Preprocess an image or batch of images.
496
+
497
+ Args:
498
+ images (`ImageInput`):
499
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
500
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
501
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
502
+ Whether to resize the image.
503
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
504
+ Size of the image.
505
+ resample (`int`, *optional*, defaults to `self.resample`):
506
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
507
+ has an effect if `do_resize` is set to `True`.
508
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
509
+ Whether to center crop the image.
510
+ crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
511
+ Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
512
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
513
+ Whether to rescale the image values between [0 - 1].
514
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
515
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
516
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
517
+ Whether to normalize the image.
518
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
519
+ Image mean.
520
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
521
+ Image standard deviation.
522
+ return_image_mask (`bool`, *optional*, defaults to `self.return_image_mask`):
523
+ Whether to return the image mask.
524
+ input_size_patches (`int`, *optional*, defaults to `self.input_size_patches`):
525
+ Number of patches in the image in the height and width direction.
526
+ total_mask_patches (`int`, *optional*, defaults to `self.total_mask_patches`):
527
+ Total number of patches that should be masked.
528
+ mask_group_min_patches (`int`, *optional*, defaults to `self.mask_group_min_patches`):
529
+ Minimum number of patches that should be masked per masking group.
530
+ mask_group_max_patches (`int`, *optional*, defaults to `self.mask_group_max_patches`):
531
+ Maximum number of patches that should be masked per masking group.
532
+ mask_group_min_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_min_aspect_ratio`):
533
+ Minimum aspect ratio of the mask window.
534
+ mask_group_max_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_max_aspect_ratio`):
535
+ Maximum aspect ratio of the mask window.
536
+ return_codebook_pixels (`bool`, *optional*, defaults to `self.return_codebook_pixels`):
537
+ Whether to return the codebook pixels.
538
+ codebook_do_resize (`bool`, *optional*, defaults to `self.codebook_do_resize`):
539
+ Whether to resize the codebook pixels.
540
+ codebook_size (`Dict[str, int]`, *optional*, defaults to `self.codebook_size`):
541
+ Size of the codebook pixels.
542
+ codebook_resample (`int`, *optional*, defaults to `self.codebook_resample`):
543
+ Resampling filter to use if resizing the codebook pixels. This can be one of the enum
544
+ `PILImageResampling`. Only has an effect if `codebook_do_resize` is set to `True`.
545
+ codebook_do_center_crop (`bool`, *optional*, defaults to `self.codebook_do_center_crop`):
546
+ Whether to center crop the codebook pixels.
547
+ codebook_crop_size (`Dict[str, int]`, *optional*, defaults to `self.codebook_crop_size`):
548
+ Size of the center crop of the codebook pixels. Only has an effect if `codebook_do_center_crop` is set
549
+ to `True`.
550
+ codebook_do_rescale (`bool`, *optional*, defaults to `self.codebook_do_rescale`):
551
+ Whether to rescale the codebook pixels values between [0 - 1].
552
+ codebook_rescale_factor (`float`, *optional*, defaults to `self.codebook_rescale_factor`):
553
+ Rescale factor to rescale the codebook pixels by if `codebook_do_rescale` is set to `True`.
554
+ codebook_do_map_pixels (`bool`, *optional*, defaults to `self.codebook_do_map_pixels`):
555
+ Whether to map the codebook pixels values.
556
+ codebook_do_normalize (`bool`, *optional*, defaults to `self.codebook_do_normalize`):
557
+ Whether to normalize the codebook pixels.
558
+ codebook_image_mean (`float` or `List[float]`, *optional*, defaults to `self.codebook_image_mean`):
559
+ Codebook pixels mean to normalize the codebook pixels by if `codebook_do_normalize` is set to `True`.
560
+ codebook_image_std (`float` or `List[float]`, *optional*, defaults to `self.codebook_image_std`):
561
+ Codebook pixels standard deviation to normalize the codebook pixels by if `codebook_do_normalize` is
562
+ set to `True`.
563
+ return_tensors (`str` or `TensorType`, *optional*):
564
+ The type of tensors to return. Can be one of:
565
+ - Unset: Return a list of `np.ndarray`.
566
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
567
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
568
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
569
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
570
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
571
+ The channel dimension format for the output image. Can be one of:
572
+ - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
573
+ - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
574
+ input_data_format (`ChannelDimension` or `str`, *optional*):
575
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
576
+ from the input image. Can be one of:
577
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
578
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
579
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
580
+ """
581
+ do_resize = do_resize if do_resize is not None else self.do_resize
582
+ size = size if size is not None else self.size
583
+ size = get_size_dict(size)
584
+ resample = resample if resample is not None else self.resample
585
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
586
+ crop_size = crop_size if crop_size is not None else self.crop_size
587
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
588
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
589
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
590
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
591
+ image_mean = image_mean if image_mean is not None else self.image_mean
592
+ image_std = image_std if image_std is not None else self.image_std
593
+
594
+ return_image_mask = return_image_mask if return_image_mask is not None else self.return_image_mask
595
+ input_size_patches = input_size_patches if input_size_patches is not None else self.input_size_patches
596
+ total_mask_patches = total_mask_patches if total_mask_patches is not None else self.total_mask_patches
597
+ mask_group_min_patches = (
598
+ mask_group_min_patches if mask_group_min_patches is not None else self.mask_group_min_patches
599
+ )
600
+ mask_group_max_patches = (
601
+ mask_group_max_patches if mask_group_max_patches is not None else self.mask_group_max_patches
602
+ )
603
+ mask_group_min_aspect_ratio = (
604
+ mask_group_min_aspect_ratio
605
+ if mask_group_min_aspect_ratio is not None
606
+ else self.mask_group_min_aspect_ratio
607
+ )
608
+ mask_group_max_aspect_ratio = (
609
+ mask_group_max_aspect_ratio
610
+ if mask_group_max_aspect_ratio is not None
611
+ else self.mask_group_max_aspect_ratio
612
+ )
613
+
614
+ return_codebook_pixels = (
615
+ return_codebook_pixels if return_codebook_pixels is not None else self.return_codebook_pixels
616
+ )
617
+ codebook_do_resize = codebook_do_resize if codebook_do_resize is not None else self.codebook_do_resize
618
+ codebook_size = codebook_size if codebook_size is not None else self.codebook_size
619
+ codebook_size = get_size_dict(codebook_size, param_name="codebook_size")
620
+ codebook_resample = codebook_resample if codebook_resample is not None else self.codebook_resample
621
+ codebook_do_rescale = codebook_do_rescale if codebook_do_rescale is not None else self.codebook_do_rescale
622
+ codebook_rescale_factor = (
623
+ codebook_rescale_factor if codebook_rescale_factor is not None else self.codebook_rescale_factor
624
+ )
625
+ codebook_do_center_crop = (
626
+ codebook_do_center_crop if codebook_do_center_crop is not None else self.codebook_do_center_crop
627
+ )
628
+ codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else self.codebook_crop_size
629
+ codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size")
630
+ codebook_do_map_pixels = (
631
+ codebook_do_map_pixels if codebook_do_map_pixels is not None else self.codebook_do_map_pixels
632
+ )
633
+ codebook_do_normalize = (
634
+ codebook_do_normalize if codebook_do_normalize is not None else self.codebook_do_normalize
635
+ )
636
+ codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else self.codebook_image_mean
637
+ codebook_image_std = codebook_image_std if codebook_image_std is not None else self.codebook_image_std
638
+
639
+ images = make_list_of_images(images)
640
+
641
+ if not valid_images(images):
642
+ raise ValueError(
643
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
644
+ "torch.Tensor, tf.Tensor or jax.ndarray."
645
+ )
646
+
647
+ processed_images = [
648
+ self._preprocess_image(
649
+ image=img,
650
+ do_resize=do_resize,
651
+ size=size,
652
+ resample=resample,
653
+ do_center_crop=do_center_crop,
654
+ crop_size=crop_size,
655
+ do_rescale=do_rescale,
656
+ rescale_factor=rescale_factor,
657
+ do_normalize=do_normalize,
658
+ image_mean=image_mean,
659
+ image_std=image_std,
660
+ do_map_pixels=False,
661
+ data_format=data_format,
662
+ input_data_format=input_data_format,
663
+ )
664
+ for img in images
665
+ ]
666
+ data = {"pixel_values": processed_images}
667
+
668
+ if return_codebook_pixels:
669
+ codebook_images = [
670
+ self._preprocess_image(
671
+ image=img,
672
+ do_resize=codebook_do_resize,
673
+ size=codebook_size,
674
+ resample=codebook_resample,
675
+ do_center_crop=codebook_do_center_crop,
676
+ crop_size=codebook_crop_size,
677
+ do_rescale=codebook_do_rescale,
678
+ rescale_factor=codebook_rescale_factor,
679
+ do_normalize=codebook_do_normalize,
680
+ image_mean=codebook_image_mean,
681
+ image_std=codebook_image_std,
682
+ do_map_pixels=codebook_do_map_pixels,
683
+ data_format=data_format,
684
+ input_data_format=input_data_format,
685
+ )
686
+ for img in images
687
+ ]
688
+ data["codebook_pixel_values"] = codebook_images
689
+
690
+ if return_image_mask:
691
+ mask_generator = self.masking_generator(
692
+ input_size_patches=input_size_patches,
693
+ total_mask_patches=total_mask_patches,
694
+ mask_group_min_patches=mask_group_min_patches,
695
+ mask_group_max_patches=mask_group_max_patches,
696
+ mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
697
+ mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
698
+ )
699
+ masks = [mask_generator() for _ in images]
700
+ data["bool_masked_pos"] = masks
701
+
702
+ return BatchFeature(data=data, tensor_type=return_tensors)
703
+
704
+
705
+ __all__ = ["FlavaImageProcessor"]
docs/transformers/build/lib/transformers/models/flava/image_processing_flava_fast.py ADDED
@@ -0,0 +1,549 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Image processor class for Flava."""
16
+
17
+ import math
18
+ import random
19
+ from functools import lru_cache
20
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
21
+
22
+ from ...image_processing_utils_fast import (
23
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
24
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
25
+ BaseImageProcessorFast,
26
+ BatchFeature,
27
+ DefaultFastImageProcessorKwargs,
28
+ get_size_dict,
29
+ )
30
+ from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
31
+ from ...image_utils import ImageInput, PILImageResampling, SizeDict
32
+ from ...processing_utils import Unpack
33
+ from ...utils import (
34
+ TensorType,
35
+ add_start_docstrings,
36
+ is_torch_available,
37
+ is_torchvision_available,
38
+ is_torchvision_v2_available,
39
+ )
40
+ from .image_processing_flava import (
41
+ FLAVA_CODEBOOK_MEAN,
42
+ FLAVA_CODEBOOK_STD,
43
+ FLAVA_IMAGE_MEAN,
44
+ FLAVA_IMAGE_STD,
45
+ LOGIT_LAPLACE_EPS,
46
+ )
47
+
48
+
49
+ if is_torch_available():
50
+ import torch
51
+
52
+ if is_torchvision_available():
53
+ from ...image_utils import pil_torch_interpolation_mapping
54
+
55
+ if is_torchvision_v2_available():
56
+ from torchvision.transforms.v2 import functional as F
57
+ else:
58
+ from torchvision.transforms import functional as F
59
+
60
+
61
+ class FlavaMaskingGenerator:
62
+ def __init__(
63
+ self,
64
+ input_size: Union[int, Tuple[int, int]] = 14,
65
+ total_mask_patches: int = 75,
66
+ mask_group_max_patches: Optional[int] = None,
67
+ mask_group_min_patches: int = 16,
68
+ mask_group_min_aspect_ratio: Optional[float] = 0.3,
69
+ mask_group_max_aspect_ratio: Optional[float] = None,
70
+ ):
71
+ if not isinstance(input_size, tuple):
72
+ input_size = (input_size,) * 2
73
+ self.height, self.width = input_size
74
+
75
+ self.num_patches = self.height * self.width
76
+ self.total_mask_patches = total_mask_patches
77
+
78
+ self.mask_group_min_patches = mask_group_min_patches
79
+ self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
80
+
81
+ mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
82
+ self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
83
+
84
+ def __repr__(self):
85
+ repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
86
+ self.height,
87
+ self.width,
88
+ self.mask_group_min_patches,
89
+ self.mask_group_max_patches,
90
+ self.total_mask_patches,
91
+ self.log_aspect_ratio[0],
92
+ self.log_aspect_ratio[1],
93
+ )
94
+ return repr_str
95
+
96
+ def get_shape(self):
97
+ return self.height, self.width
98
+
99
+ def _mask(self, mask, max_mask_patches):
100
+ delta = 0
101
+ for _attempt in range(10):
102
+ target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
103
+ aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
104
+ height = int(round(math.sqrt(target_area * aspect_ratio)))
105
+ width = int(round(math.sqrt(target_area / aspect_ratio)))
106
+ if width < self.width and height < self.height:
107
+ top = random.randint(0, self.height - height)
108
+ left = random.randint(0, self.width - width)
109
+
110
+ num_masked = mask[top : top + height, left : left + width].sum()
111
+ # Overlap
112
+ if 0 < height * width - num_masked <= max_mask_patches:
113
+ zeros_pos = mask[top : top + height, left : left + width] == 0
114
+ mask[top : top + height, left : left + width][zeros_pos] = 1
115
+ delta += zeros_pos.sum()
116
+
117
+ if delta > 0:
118
+ break
119
+ return delta
120
+
121
+ def __call__(self):
122
+ mask = torch.zeros(self.get_shape(), dtype=torch.int)
123
+ mask_count = 0
124
+ while mask_count < self.total_mask_patches:
125
+ max_mask_patches = self.total_mask_patches - mask_count
126
+ max_mask_patches = min(max_mask_patches, self.mask_group_max_patches)
127
+
128
+ delta = self._mask(mask, max_mask_patches)
129
+ if delta == 0:
130
+ break
131
+ else:
132
+ mask_count += delta
133
+
134
+ return mask
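
`FlavaMaskingGenerator` implements block-wise masking: it repeatedly samples rectangles with random area and aspect ratio and flips their still-unmasked patches to 1 until at most `total_mask_patches` patches are covered (it can stop early when no fitting rectangle is found). A minimal sketch of its output with the defaults above:

# Sketch only: uses the FlavaMaskingGenerator class defined above; requires torch.
generator = FlavaMaskingGenerator(input_size=14, total_mask_patches=75, mask_group_min_patches=16)
mask = generator()        # (14, 14) tensor of 0/1 patch flags
print(mask.shape)         # torch.Size([14, 14])
print(int(mask.sum()))    # at most 75, typically close to it
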
135
+
136
+
137
+ class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
138
+ # Mask related params
139
+ return_image_mask: Optional[bool]
140
+ input_size_patches: Optional[int]
141
+ total_mask_patches: Optional[int]
142
+ mask_group_min_patches: Optional[int]
143
+ mask_group_max_patches: Optional[int]
144
+ mask_group_min_aspect_ratio: Optional[float]
145
+ mask_group_max_aspect_ratio: Optional[float]
146
+ # Codebook related params
147
+ return_codebook_pixels: Optional[bool]
148
+ codebook_do_resize: Optional[bool]
149
+ codebook_size: Optional[Dict[str, int]]
150
+ codebook_resample: Optional[int]
151
+ codebook_do_center_crop: Optional[bool]
152
+ codebook_crop_size: Optional[Dict[str, int]]
153
+ codebook_do_rescale: Optional[bool]
154
+ codebook_rescale_factor: Optional[Union[int, float]]
155
+ codebook_do_map_pixels: Optional[bool]
156
+ codebook_do_normalize: Optional[bool]
157
+ codebook_image_mean: Optional[Union[float, Iterable[float]]]
158
+ codebook_image_std: Optional[Union[float, Iterable[float]]]
159
+
160
+
161
+ @add_start_docstrings(
162
+ "Constructs a fast Flava image processor.",
163
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
164
+ """
165
+ return_image_mask (`bool`, *optional*, defaults to `False`):
166
+ Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
167
+ input_size_patches (`int`, *optional*, defaults to 14):
168
+ Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
169
+ by the `input_size_patches` parameter in `preprocess`.
170
+ total_mask_patches (`int`, *optional*, defaults to 75):
171
+ Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
172
+ `preprocess`.
173
+ mask_group_min_patches (`int`, *optional*, defaults to 16):
174
+ Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
175
+ parameter in `preprocess`.
176
+ mask_group_max_patches (`int`, *optional*):
177
+ Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
178
+ parameter in `preprocess`.
179
+ mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
180
+ Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
181
+ in `preprocess`.
182
+ mask_group_max_aspect_ratio (`float`, *optional*):
183
+ Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
184
+ in `preprocess`.
185
+ codebook_do_resize (`bool`, *optional*, defaults to `True`):
186
+ Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
187
+ `codebook_do_resize` parameter in `preprocess`.
188
+ codebook_size (`Dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
189
+ Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
190
+ `preprocess`.
191
+ codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
192
+ Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
193
+ parameter in `preprocess`.
194
+ codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
195
+ Whether to crop the input for codebook at the center. If the input size is smaller than
196
+ `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
197
+ overridden by the `codebook_do_center_crop` parameter in `preprocess`.
198
+ codebook_crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
199
+ Desired output size for codebook input when applying center-cropping. Can be overridden by the
200
+ `codebook_crop_size` parameter in `preprocess`.
201
+ codebook_do_rescale (`bool`, *optional*, defaults to `True`):
202
+ Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
203
+ overridden by the `codebook_do_rescale` parameter in `preprocess`.
204
+ codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
205
+ Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
206
+ `codebook_rescale_factor` parameter in `preprocess`.
207
+ codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
208
+ Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
209
+ `codebook_do_map_pixels` parameter in `preprocess`.
210
+ codebook_do_normalize (`bool`, *optional*, defaults to `True`):
211
+ Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
212
+ be overridden by the `codebook_do_normalize` parameter in `preprocess`.
213
+ codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
214
+ The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
215
+ by the `codebook_image_mean` parameter in `preprocess`.
216
+ codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
217
+ The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
218
+ be overridden by the `codebook_image_std` parameter in `preprocess`.
219
+ """,
220
+ )
221
+ class FlavaImageProcessorFast(BaseImageProcessorFast):
222
+ resample = PILImageResampling.BICUBIC
223
+ image_mean = FLAVA_IMAGE_MEAN
224
+ image_std = FLAVA_IMAGE_STD
225
+ size = {"height": 224, "width": 224}
226
+ crop_size = {"height": 224, "width": 224}
227
+ do_resize = True
228
+ do_center_crop = True
229
+ do_rescale = True
230
+ do_normalize = True
231
+
232
+ # Mask related params
233
+ return_image_mask = False
234
+ input_size_patches = 14
235
+ total_mask_patches = 75
236
+ mask_group_min_patches = 16
237
+ mask_group_max_patches = None
238
+ mask_group_min_aspect_ratio = 0.3
239
+ mask_group_max_aspect_ratio = None
240
+ # Codebook related params
241
+ return_codebook_pixels = False
242
+ codebook_do_resize = True
243
+ codebook_size = {"height": 112, "width": 112}
244
+ # LANCZOS resample does not support torch Tensor. Use BICUBIC as closest alternative
245
+ codebook_resample = PILImageResampling.BICUBIC
246
+ codebook_do_center_crop = True
247
+ codebook_crop_size = {"height": 112, "width": 112}
248
+ codebook_do_rescale = True
249
+ codebook_rescale_factor = 1 / 255
250
+ codebook_do_map_pixels = True
251
+ codebook_do_normalize = True
252
+ codebook_image_mean = FLAVA_CODEBOOK_MEAN
253
+ codebook_image_std = FLAVA_CODEBOOK_STD
254
+ valid_kwargs = FlavaFastImageProcessorKwargs
255
+
256
+ def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]):
257
+ super().__init__(**kwargs)
258
+
259
+ @add_start_docstrings(
260
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
261
+ """
262
+ return_image_mask (`bool`, *optional*, defaults to `False`):
263
+ Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
264
+ input_size_patches (`int`, *optional*, defaults to 14):
265
+ Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
266
+ by the `input_size_patches` parameter in `preprocess`.
267
+ total_mask_patches (`int`, *optional*, defaults to 75):
268
+ Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
269
+ `preprocess`.
270
+ mask_group_min_patches (`int`, *optional*, defaults to 16):
271
+ Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
272
+ parameter in `preprocess`.
273
+ mask_group_max_patches (`int`, *optional*):
274
+ Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
275
+ parameter in `preprocess`.
276
+ mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
277
+ Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
278
+ in `preprocess`.
279
+ mask_group_max_aspect_ratio (`float`, *optional*):
280
+ Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
281
+ in `preprocess`.
282
+ codebook_do_resize (`bool`, *optional*, defaults to `True`):
283
+ Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
284
+ `codebook_do_resize` parameter in `preprocess`.
285
+ codebook_size (`Dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
286
+ Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
287
+ `preprocess`.
288
+ codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
289
+ Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
290
+ parameter in `preprocess`.
291
+ codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
292
+ Whether to crop the input for codebook at the center. If the input size is smaller than
293
+ `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
294
+ overridden by the `codebook_do_center_crop` parameter in `preprocess`.
295
+ codebook_crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
296
+ Desired output size for codebook input when applying center-cropping. Can be overridden by the
297
+ `codebook_crop_size` parameter in `preprocess`.
298
+ codebook_do_rescale (`bool`, *optional*, defaults to `True`):
299
+ Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
300
+ overridden by the `codebook_do_rescale` parameter in `preprocess`.
301
+ codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
302
+ Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
303
+ `codebook_rescale_factor` parameter in `preprocess`.
304
+ codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
305
+ Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
306
+ `codebook_do_map_pixels` parameter in `preprocess`.
307
+ codebook_do_normalize (`bool`, *optional*, defaults to `True`):
308
+ Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
309
+ be overridden by the `codebook_do_normalize` parameter in `preprocess`.
310
+ codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
311
+ The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
312
+ by the `codebook_image_mean` parameter in `preprocess`.
313
+ codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
314
+ The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
315
+ be overridden by the `codebook_image_std` parameter in `preprocess`.
316
+ """,
317
+ )
318
+ def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
319
+ return super().preprocess(images, **kwargs)
320
+
321
+ @classmethod
322
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
323
+ """
324
+ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
325
+ created using from_dict and kwargs e.g. `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)`
326
+ """
327
+ image_processor_dict = image_processor_dict.copy()
328
+ if "codebook_size" in kwargs:
329
+ image_processor_dict["codebook_size"] = kwargs.pop("codebook_size")
330
+ if "codebook_crop_size" in kwargs:
331
+ image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size")
332
+ return super().from_dict(image_processor_dict, **kwargs)
333
+
334
+ @lru_cache()
335
+ def masking_generator(
336
+ self,
337
+ input_size_patches,
338
+ total_mask_patches,
339
+ mask_group_min_patches,
340
+ mask_group_max_patches,
341
+ mask_group_min_aspect_ratio,
342
+ mask_group_max_aspect_ratio,
343
+ ) -> FlavaMaskingGenerator:
344
+ return FlavaMaskingGenerator(
345
+ input_size=input_size_patches,
346
+ total_mask_patches=total_mask_patches,
347
+ mask_group_min_patches=mask_group_min_patches,
348
+ mask_group_max_patches=mask_group_max_patches,
349
+ mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
350
+ mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
351
+ )
352
+
353
+ def map_pixels(self, image: "torch.Tensor") -> "torch.Tensor":
354
+ return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS
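
`map_pixels` is the affine transform referred to in the codebook docstrings: it maps rescaled pixel values from [0, 1] into [eps, 1 - eps] with eps = `LOGIT_LAPLACE_EPS`, so the image codebook never receives exact 0s or 1s. A quick endpoint check (sketch only):

import torch

eps = LOGIT_LAPLACE_EPS            # imported at the top of this file
x = torch.tensor([0.0, 0.5, 1.0])
print((1 - 2 * eps) * x + eps)     # tensor([eps, 0.5000, 1 - eps])
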
355
+
356
+ def _further_process_kwargs(
357
+ self,
358
+ size: Optional[SizeDict] = None,
359
+ crop_size: Optional[SizeDict] = None,
360
+ default_to_square: Optional[bool] = None,
361
+ image_mean: Optional[Union[float, list[float]]] = None,
362
+ image_std: Optional[Union[float, list[float]]] = None,
363
+ codebook_size: Optional[SizeDict] = None,
364
+ codebook_crop_size: Optional[SizeDict] = None,
365
+ codebook_image_mean: Optional[Union[float, list[float]]] = None,
366
+ codebook_image_std: Optional[Union[float, list[float]]] = None,
367
+ codebook_resample: Optional[PILImageResampling] = None,
368
+ data_format: Optional[ChannelDimension] = None,
369
+ **kwargs,
370
+ ) -> dict:
371
+ """
372
+ Update kwargs that need further processing before being validated
373
+ Can be overridden by subclasses to customize the processing of kwargs.
374
+ """
375
+ if kwargs is None:
376
+ kwargs = {}
377
+ if size is not None:
378
+ size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
379
+ if crop_size is not None:
380
+ crop_size = SizeDict(**get_size_dict(crop_size, param_name="crop_size"))
381
+ if isinstance(image_mean, list):
382
+ image_mean = tuple(image_mean)
383
+ if isinstance(image_std, list):
384
+ image_std = tuple(image_std)
385
+ if data_format is None:
386
+ data_format = ChannelDimension.FIRST
387
+ if codebook_size is not None:
388
+ codebook_size = SizeDict(**get_size_dict(size=codebook_size, default_to_square=default_to_square))
389
+ if codebook_crop_size is not None:
390
+ codebook_crop_size = SizeDict(**get_size_dict(codebook_crop_size, param_name="codebook_crop_size"))
391
+ if isinstance(codebook_image_mean, list):
392
+ codebook_image_mean = tuple(codebook_image_mean)
393
+ if isinstance(codebook_image_std, list):
394
+ codebook_image_std = tuple(codebook_image_std)
395
+
396
+ kwargs["size"] = size
397
+ kwargs["crop_size"] = crop_size
398
+ kwargs["default_to_square"] = default_to_square
399
+ kwargs["image_mean"] = image_mean
400
+ kwargs["image_std"] = image_std
401
+ kwargs["codebook_size"] = codebook_size
402
+ kwargs["codebook_crop_size"] = codebook_crop_size
403
+ kwargs["codebook_image_mean"] = codebook_image_mean
404
+ kwargs["codebook_image_std"] = codebook_image_std
405
+ kwargs["data_format"] = data_format
406
+ kwargs["codebook_interpolation"] = (
407
+ pil_torch_interpolation_mapping[codebook_resample]
408
+ if isinstance(codebook_resample, (PILImageResampling, int))
409
+ else codebook_resample
410
+ )
411
+
412
+ return kwargs
413
+
414
+ def _preprocess_image(
415
+ self,
416
+ images: list["torch.Tensor"],
417
+ do_resize: bool,
418
+ size: SizeDict,
419
+ interpolation: Optional["F.InterpolationMode"],
420
+ do_center_crop: bool,
421
+ crop_size: SizeDict,
422
+ do_rescale: bool,
423
+ rescale_factor: float,
424
+ do_normalize: bool,
425
+ do_map_pixels: bool,
426
+ image_mean: Optional[Union[float, list[float]]],
427
+ image_std: Optional[Union[float, list[float]]],
428
+ return_tensors: Optional[Union[str, TensorType]],
429
+ ) -> "torch.Tensor":
430
+ # Group images by size for batched resizing
431
+ grouped_images, grouped_images_index = group_images_by_shape(images)
432
+ resized_images_grouped = {}
433
+ for shape, stacked_images in grouped_images.items():
434
+ if do_resize:
435
+ stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
436
+ resized_images_grouped[shape] = stacked_images
437
+ resized_images = reorder_images(resized_images_grouped, grouped_images_index)
438
+
439
+ # Group images by size for further processing
440
+ # Needed in case do_resize is False, or resize returns images with different sizes
441
+ grouped_images, grouped_images_index = group_images_by_shape(resized_images)
442
+ processed_images_grouped = {}
443
+ for shape, stacked_images in grouped_images.items():
444
+ if do_center_crop:
445
+ stacked_images = self.center_crop(stacked_images, crop_size)
446
+ # Fused rescale and normalize
447
+ stacked_images = self.rescale_and_normalize(
448
+ stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
449
+ )
450
+ if do_map_pixels:
451
+ stacked_images = self.map_pixels(image=stacked_images)
452
+ processed_images_grouped[shape] = stacked_images
453
+
454
+ processed_images = reorder_images(processed_images_grouped, grouped_images_index)
455
+ processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
456
+
457
+ return processed_images
458
+
459
+ def _preprocess(
460
+ self,
461
+ images: list["torch.Tensor"],
462
+ do_resize: bool,
463
+ size: SizeDict,
464
+ interpolation: Optional["F.InterpolationMode"],
465
+ do_center_crop: bool,
466
+ crop_size: SizeDict,
467
+ do_rescale: bool,
468
+ rescale_factor: float,
469
+ do_normalize: bool,
470
+ image_mean: Optional[Union[float, list[float]]],
471
+ image_std: Optional[Union[float, list[float]]],
472
+ # Mask related params
473
+ return_image_mask: Optional[bool],
474
+ input_size_patches: Optional[int],
475
+ total_mask_patches: Optional[int],
476
+ mask_group_min_patches: Optional[int],
477
+ mask_group_max_patches: Optional[int],
478
+ mask_group_min_aspect_ratio: Optional[float],
479
+ mask_group_max_aspect_ratio: Optional[float],
480
+ # Codebook related params
481
+ return_codebook_pixels: Optional[bool],
482
+ codebook_do_resize: Optional[bool],
483
+ codebook_size: Optional[SizeDict],
484
+ codebook_interpolation: Optional["F.InterpolationMode"],
485
+ codebook_do_center_crop: Optional[bool],
486
+ codebook_crop_size: Optional[SizeDict],
487
+ codebook_do_rescale: Optional[bool],
488
+ codebook_rescale_factor: Optional[float],
489
+ codebook_do_map_pixels: Optional[bool],
490
+ codebook_do_normalize: Optional[bool],
491
+ codebook_image_mean: Optional[Union[float, list[float]]],
492
+ codebook_image_std: Optional[Union[float, list[float]]],
493
+ return_tensors: Optional[Union[str, TensorType]],
494
+ **kwargs,
495
+ ) -> BatchFeature:
496
+ processed_images = self._preprocess_image(
497
+ images=images,
498
+ do_resize=do_resize,
499
+ size=size,
500
+ interpolation=interpolation,
501
+ do_center_crop=do_center_crop,
502
+ crop_size=crop_size,
503
+ do_rescale=do_rescale,
504
+ rescale_factor=rescale_factor,
505
+ do_normalize=do_normalize,
506
+ do_map_pixels=False,
507
+ image_mean=image_mean,
508
+ image_std=image_std,
509
+ return_tensors=return_tensors,
510
+ )
511
+ data = {
512
+ "pixel_values": processed_images,
513
+ }
514
+
515
+ if return_codebook_pixels:
516
+ codebook_processed_images = self._preprocess_image(
517
+ images=images,
518
+ do_resize=codebook_do_resize,
519
+ size=codebook_size,
520
+ interpolation=codebook_interpolation,
521
+ do_center_crop=codebook_do_center_crop,
522
+ crop_size=codebook_crop_size,
523
+ do_rescale=codebook_do_rescale,
524
+ rescale_factor=codebook_rescale_factor,
525
+ do_normalize=codebook_do_normalize,
526
+ do_map_pixels=codebook_do_map_pixels,
527
+ image_mean=codebook_image_mean,
528
+ image_std=codebook_image_std,
529
+ return_tensors=return_tensors,
530
+ )
531
+ data["codebook_pixel_values"] = codebook_processed_images
532
+
533
+ if return_image_mask:
534
+ mask_generator = self.masking_generator(
535
+ input_size_patches=input_size_patches,
536
+ total_mask_patches=total_mask_patches,
537
+ mask_group_min_patches=mask_group_min_patches,
538
+ mask_group_max_patches=mask_group_max_patches,
539
+ mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
540
+ mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
541
+ )
542
+ masks = [mask_generator() for _ in range(len(images))]
543
+ masks = torch.stack(masks, dim=0) if return_tensors else masks
544
+ data["bool_masked_pos"] = masks
545
+
546
+ return BatchFeature(data=data, tensor_type=return_tensors)
547
+
548
+
549
+ __all__ = ["FlavaImageProcessorFast"]
docs/transformers/build/lib/transformers/models/flava/modeling_flava.py ADDED
@@ -0,0 +1,2127 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch FLAVA model."""
16
+
17
+ import collections
18
+ import math
19
+ from collections import OrderedDict
20
+ from dataclasses import dataclass
21
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
22
+
23
+ import torch
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+
27
+ from ...activations import ACT2FN
28
+ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
29
+ from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
30
+ from ...utils import (
31
+ ModelOutput,
32
+ add_code_sample_docstrings,
33
+ add_start_docstrings,
34
+ add_start_docstrings_to_model_forward,
35
+ logging,
36
+ replace_return_docstrings,
37
+ torch_int,
38
+ )
39
+ from .configuration_flava import (
40
+ FlavaConfig,
41
+ FlavaImageCodebookConfig,
42
+ FlavaImageConfig,
43
+ FlavaMultimodalConfig,
44
+ FlavaTextConfig,
45
+ )
46
+
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+ _CHECKPOINT_FOR_DOC = "facebook/flava-full"
51
+
52
+ # Codebook docstring
53
+ _CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"
54
+ _CONFIG_CLASS_FOR_IMAGE_MODEL_DOC = "FlavaImageConfig"
55
+ _CONFIG_CLASS_FOR_TEXT_MODEL_DOC = "FlavaTextConfig"
56
+ _CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC = "FlavaMultimodalConfig"
57
+ _EXPECTED_IMAGE_OUTPUT_SHAPE = [1, 197, 768]
58
+
59
+
60
+ LOGIT_SCALE_CLAMP_MIN = 0
61
+ LOGIT_SCALE_CLAMP_MAX = 4.6052
62
+
63
+ FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalConfig]
64
+
65
+
66
+ @dataclass
67
+ class FlavaModelOutput(ModelOutput):
68
+ """
69
+ Output from FlavaModel containing embeddings and outputs from individual encoders.
70
+
71
+ Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
72
+ transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
73
+ `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
74
+
75
+ Args:
76
+ image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
77
+ The image embeddings which are basically the pooled output of [`FlavaImageModel`].
78
+ image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
79
+ The output of the [`FlavaImageModel`].
80
+ text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
81
+ The text embeddings which are basically the pooled output of [`FlavaTextModel`].
82
+ text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
83
+ The output of the [`FlavaTextModel`].
84
+ multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
85
+ The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
86
+ multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
87
+ The output of the [`FlavaMultimodalModel`].
88
+ """
89
+
90
+ image_embeddings: Optional[torch.FloatTensor] = None
91
+ image_output: Optional[BaseModelOutputWithPooling] = None
92
+ text_embeddings: Optional[torch.FloatTensor] = None
93
+ text_output: Optional[BaseModelOutputWithPooling] = None
94
+ multimodal_embeddings: Optional[torch.FloatTensor] = None
95
+ multimodal_output: Optional[BaseModelOutputWithPooling] = None
96
+
97
+ def to_tuple(self) -> Tuple[Any]:
98
+ return tuple(
99
+ self[k] if k not in ["text_output", "image_output", "multimodal_output"] else getattr(self, k).to_tuple()
100
+ for k in self.keys()
101
+ )
102
+
103
+
104
+ @dataclass
105
+ class FlavaLosses(ModelOutput):
106
+ """Class representing pretraining losses from FLAVA model
107
+
108
+ Args:
109
+ mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
110
+ Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
111
+ mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
112
+ Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
113
+ itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
114
+ Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
115
+ masked pairs in FLAVA.
116
+ global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
117
+ Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
118
+ data. This is calculated on unmasked images and texts.
119
+ mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
120
+ Masked Multimodal Modeling loss's image component calculated on paired image-text data.
121
+ mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
122
+ Masked Multimodal Modeling loss's text component calculated on paired image-text data.
123
+ """
124
+
125
+ mim: Optional[torch.FloatTensor] = None
126
+ mlm: Optional[torch.FloatTensor] = None
127
+ itm: Optional[torch.FloatTensor] = None
128
+ global_contrastive: Optional[torch.FloatTensor] = None
129
+ mmm_image: Optional[torch.FloatTensor] = None
130
+ mmm_text: Optional[torch.FloatTensor] = None
131
+
132
+ def all_none(self) -> bool:
133
+ all_none = True
134
+ for v in self.values():
135
+ if v is not None:
136
+ all_none = False
137
+ break
138
+ return all_none
139
+
140
+
141
+ @dataclass
142
+ class FlavaForPreTrainingOutput(ModelOutput):
143
+ """
144
+ Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.
145
+
146
+ Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
147
+ transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
148
+ `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
149
+
150
+ Args:
151
+ loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
152
+ Total loss calculated for this model.
153
+ loss_info (`FlavaLosses`):
154
+ Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
155
+ the keys.
156
+ image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
157
+ The image embeddings which are basically the pooled output of [`FlavaImageModel`].
158
+ image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
159
+ The output of the [`FlavaImageModel`].
160
+ text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
161
+ The text embeddings which are basically the pooled output of [`FlavaTextModel`].
162
+ text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
163
+ The output of the [`FlavaTextModel`].
164
+ multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
165
+ The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
166
+ multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
167
+ The output of the [`FlavaMultimodalModel`].
168
+
169
+ image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
170
+ The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
171
+ to create masked images.
172
+ image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
173
+ The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
174
+ text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
175
+ The text embeddings which are basically the pooled output of [`FlavaTextModel`].
176
+ text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
177
+ The output of the [`FlavaTextModel`].
178
+ multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
179
+ The multimodal masked embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
180
+ multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
181
+ The output of the [`FlavaMultimodalModel`].
182
+
183
+ mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
184
+ The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
185
+ returned when `bool_masked_pos` has some of the patches masked.
186
+ mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
187
+ The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
188
+ the tokens masked.
189
+ itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
190
+ The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
191
+ mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
192
+ The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
193
+ output is returned when `bool_masked_pos` has some of the patches masked.
194
+ mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
195
+ The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
196
+ some of the tokens masked.
197
+ contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
198
+ The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
199
+ `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
200
+ scores. This is calculated on unmasked images and texts.
201
+ contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
202
+ The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
203
+ `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
204
+ texts.
205
+ """
206
+
207
+ loss: Optional[torch.FloatTensor] = None
208
+ loss_info: FlavaLosses = None
209
+ image_embeddings: Optional[torch.FloatTensor] = None
210
+ image_output: Optional[BaseModelOutputWithPooling] = None
211
+ text_embeddings: Optional[torch.FloatTensor] = None
212
+ text_output: Optional[BaseModelOutputWithPooling] = None
213
+ multimodal_embeddings: Optional[torch.FloatTensor] = None
214
+ multimodal_output: Optional[BaseModelOutputWithPooling] = None
215
+ image_masked_embeddings: Optional[torch.FloatTensor] = None
216
+ image_masked_output: Optional[BaseModelOutputWithPooling] = None
217
+ text_masked_embeddings: Optional[torch.FloatTensor] = None
218
+ text_masked_output: Optional[BaseModelOutputWithPooling] = None
219
+ multimodal_masked_embeddings: Optional[torch.FloatTensor] = None
220
+ multimodal_masked_output: Optional[BaseModelOutputWithPooling] = None
221
+ mim_logits: Optional[torch.FloatTensor] = None
222
+ mlm_logits: Optional[torch.FloatTensor] = None
223
+ itm_logits: Optional[torch.FloatTensor] = None
224
+ contrastive_logits_per_image: Optional[torch.FloatTensor] = None
225
+ contrastive_logits_per_text: Optional[torch.FloatTensor] = None
226
+ mmm_image_logits: Optional[torch.FloatTensor] = None
227
+ mmm_text_logits: Optional[torch.FloatTensor] = None
228
+
229
+ def to_tuple(self) -> Tuple[Any]:
230
+ transformer_outputs = [
231
+ "text_output",
232
+ "image_output",
233
+ "multimodal_output",
234
+ "text_masked_output",
235
+ "image_masked_output",
236
+ "multimodal_masked_output",
237
+ ]
238
+ return tuple(self[k] if k not in transformer_outputs else getattr(self, k).to_tuple() for k in self.keys())
239
+
240
+
241
+ # Based on timm implementation, which can be found here:
242
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/image_transformer.py
243
+ class FlavaImageEmbeddings(nn.Module):
244
+ """
245
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
246
+ """
247
+
248
+ def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None:
249
+ super().__init__()
250
+
251
+ use_mask_token = use_mask_token or config.mask_token
252
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
253
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
254
+ self.patch_embeddings = PatchEmbeddings(
255
+ image_size=config.image_size,
256
+ patch_size=config.patch_size,
257
+ num_channels=config.num_channels,
258
+ embed_dim=config.hidden_size,
259
+ )
260
+ num_patches = self.patch_embeddings.num_patches
261
+ self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
262
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
263
+ self.patch_size = config.patch_size
264
+ self.config = config
265
+
266
+ # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
267
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
268
+ """
269
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
270
+ images. This method is also adapted to support torch.jit tracing.
271
+
272
+ Adapted from:
273
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
274
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
275
+ """
276
+
277
+ num_patches = embeddings.shape[1] - 1
278
+ num_positions = self.position_embeddings.shape[1] - 1
279
+
280
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
281
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
282
+ return self.position_embeddings
283
+
284
+ class_pos_embed = self.position_embeddings[:, :1]
285
+ patch_pos_embed = self.position_embeddings[:, 1:]
286
+
287
+ dim = embeddings.shape[-1]
288
+
289
+ new_height = height // self.patch_size
290
+ new_width = width // self.patch_size
291
+
292
+ sqrt_num_positions = torch_int(num_positions**0.5)
293
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
294
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
295
+
296
+ patch_pos_embed = nn.functional.interpolate(
297
+ patch_pos_embed,
298
+ size=(new_height, new_width),
299
+ mode="bicubic",
300
+ align_corners=False,
301
+ )
302
+
303
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
304
+
305
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
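
A numeric sketch of the interpolation above for the default 224x224 / 16x16-patch configuration: the stored table holds 1 + 14*14 = 197 positions; a 336x336 input needs a 21x21 patch grid, so the 14x14 block is bicubically resized to 441 positions and the class position is re-attached.

import torch

# Sketch only: standalone rehearsal of the reshape/interpolate steps above.
pos = torch.randn(1, 197, 768)                      # [CLS] + 14*14 learned positions
patch = pos[:, 1:].reshape(1, 14, 14, 768).permute(0, 3, 1, 2)
patch = torch.nn.functional.interpolate(patch, size=(21, 21), mode="bicubic", align_corners=False)
patch = patch.permute(0, 2, 3, 1).view(1, -1, 768)
print(torch.cat((pos[:, :1], patch), dim=1).shape)  # torch.Size([1, 442, 768])
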
306
+
307
+ def forward(
308
+ self,
309
+ pixel_values: torch.Tensor,
310
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
311
+ interpolate_pos_encoding: bool = False,
312
+ ) -> torch.Tensor:
313
+ batch_size, num_channels, height, width = pixel_values.shape
314
+ embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
315
+
316
+ batch_size, seq_len, _ = embeddings.size()
317
+ if bool_masked_pos is not None:
318
+ mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
319
+ # flatten a (batch, height, width) mask to (batch, height * width)
320
+ if bool_masked_pos.dim() == 3:
321
+ bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1)
322
+ # replace the masked visual tokens by mask_tokens
323
+ mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
324
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
325
+
326
+ # add the [CLS] token to the embedded patch tokens
327
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
328
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
329
+
330
+ # add positional encoding to each token
331
+ if interpolate_pos_encoding:
332
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
333
+ else:
334
+ embeddings = embeddings + self.position_embeddings
335
+
336
+ embeddings = self.dropout(embeddings)
337
+
338
+ return embeddings
339
+
340
+
341
+ # Based on timm implementation, which can be found here:
342
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/image_transformer.py
343
+ class PatchEmbeddings(nn.Module):
344
+ """
345
+ Image to Patch Embedding.
346
+ """
347
+
348
+ def __init__(
349
+ self,
350
+ image_size: int = 224,
351
+ patch_size: Union[int, Tuple[int, int]] = 16,
352
+ num_channels: int = 3,
353
+ embed_dim: int = 768,
354
+ ):
355
+ super().__init__()
356
+ if not isinstance(image_size, collections.abc.Iterable):
357
+ image_size = (image_size, image_size)
358
+ if not isinstance(patch_size, collections.abc.Iterable):
359
+ patch_size = (patch_size, patch_size)
360
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
361
+ self.image_size = image_size
362
+ self.patch_size = patch_size
363
+ self.num_patches = num_patches
364
+
365
+ self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
366
+
367
+ def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
368
+ batch_size, num_channels, height, width = pixel_values.shape
369
+ if not interpolate_pos_encoding:
370
+ if height != self.image_size[0] or width != self.image_size[1]:
371
+ raise ValueError(
372
+ f"Input image size ({height}*{width}) doesn't match model"
373
+ f" ({self.image_size[0]}*{self.image_size[1]})."
374
+ )
375
+ x = self.projection(pixel_values).flatten(2).transpose(1, 2)
376
+ return x
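
With the defaults (224x224 images, 16x16 patches, hidden size 768) the projection above yields 14*14 = 196 patch tokens per image. A standalone shape check (sketch only):

import torch
from torch import nn

projection = nn.Conv2d(3, 768, kernel_size=16, stride=16)   # mirrors PatchEmbeddings.projection
x = projection(torch.randn(2, 3, 224, 224)).flatten(2).transpose(1, 2)
print(x.shape)                                              # torch.Size([2, 196, 768])
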
377
+
378
+
379
+ class FlavaTextEmbeddings(nn.Module):
380
+ """Construct the embeddings from word, position and token_type embeddings."""
381
+
382
+ def __init__(self, config):
383
+ super().__init__()
384
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
385
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
386
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
387
+
388
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
389
+ # any TensorFlow checkpoint file
390
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
391
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
392
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
393
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
394
+ self.register_buffer(
395
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
396
+ )
397
+ self.register_buffer(
398
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
399
+ )
400
+
401
+ def forward(
402
+ self,
403
+ input_ids: Optional[torch.Tensor] = None,
404
+ token_type_ids: Optional[torch.Tensor] = None,
405
+ position_ids: Optional[torch.Tensor] = None,
406
+ ):
407
+ input_shape = input_ids.size()
408
+ seq_length = input_shape[1]
409
+
410
+ if position_ids is None:
411
+ position_ids = self.position_ids[:, :seq_length]
412
+
413
+ # Set token_type_ids to the registered buffer (all zeros) from the constructor, which is the usual case
414
+ # when it is auto-generated; the registered buffer lets users trace the model without passing token_type_ids and
415
+ # solves issue #5664
416
+ if token_type_ids is None:
417
+ if hasattr(self, "token_type_ids"):
418
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
419
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
420
+ token_type_ids = buffered_token_type_ids_expanded
421
+ else:
422
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
423
+
424
+ inputs_embeds = self.word_embeddings(input_ids)
425
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
426
+
427
+ embeddings = inputs_embeds + token_type_embeddings
428
+ if self.position_embedding_type == "absolute":
429
+ position_embeddings = self.position_embeddings(position_ids)
430
+ embeddings += position_embeddings
431
+ embeddings = self.LayerNorm(embeddings)
432
+ embeddings = self.dropout(embeddings)
433
+ return embeddings
434
+
435
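For reference, a compact sketch of the BERT-style embedding sum computed by `FlavaTextEmbeddings` above; the vocabulary and sequence values are illustrative and dropout is omitted.

```python
import torch
from torch import nn

vocab_size, hidden_size, max_positions = 30522, 768, 512
word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
position_embeddings = nn.Embedding(max_positions, hidden_size)
token_type_embeddings = nn.Embedding(2, hidden_size)
layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)

input_ids = torch.tensor([[101, 2023, 2003, 102]])
position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)
token_type_ids = torch.zeros_like(input_ids)

embeddings = layer_norm(
    word_embeddings(input_ids) + token_type_embeddings(token_type_ids) + position_embeddings(position_ids)
)
print(embeddings.shape)  # torch.Size([1, 4, 768])
```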
+
436
+ class FlavaSelfAttention(nn.Module):
437
+ def __init__(self, config: FlavaPossibleConfigs) -> None:
438
+ super().__init__()
439
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
440
+ raise ValueError(
441
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
442
+ f"heads {config.num_attention_heads}."
443
+ )
444
+
445
+ self.num_attention_heads = config.num_attention_heads
446
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
447
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
448
+
449
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
450
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
451
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
452
+
453
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
454
+
455
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
456
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
457
+ x = x.view(*new_x_shape)
458
+ return x.permute(0, 2, 1, 3)
459
+
460
+ def forward(
461
+ self,
462
+ hidden_states: torch.Tensor,
463
+ attention_mask: Optional[torch.Tensor] = None,
464
+ head_mask: Optional[torch.Tensor] = None,
465
+ output_attentions: bool = False,
466
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
467
+ mixed_query_layer = self.query(hidden_states)
468
+
469
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
470
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
471
+ query_layer = self.transpose_for_scores(mixed_query_layer)
472
+
473
+ # Take the dot product between "query" and "key" to get the raw attention scores.
474
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
475
+
476
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
477
+ if attention_mask is not None:
478
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
479
+ attention_scores = attention_scores + attention_mask
480
+
481
+ # Normalize the attention scores to probabilities.
482
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
483
+
484
+ # This is actually dropping out entire tokens to attend to, which might
485
+ # seem a bit unusual, but is taken from the original Transformer paper.
486
+ attention_probs = self.dropout(attention_probs)
487
+
488
+ # Mask heads if we want to
489
+ if head_mask is not None:
490
+ attention_probs = attention_probs * head_mask
491
+
492
+ context_layer = torch.matmul(attention_probs, value_layer)
493
+
494
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
495
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
496
+ context_layer = context_layer.view(*new_context_layer_shape)
497
+
498
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
499
+
500
+ return outputs
501
+
502
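A shape walk-through of the scaled dot-product attention implemented in `FlavaSelfAttention.forward` above, with illustrative sizes:

```python
import math
import torch

batch, seq_len, num_heads, head_dim = 2, 5, 12, 64
query = torch.randn(batch, num_heads, seq_len, head_dim)   # layout produced by transpose_for_scores
key = torch.randn(batch, num_heads, seq_len, head_dim)
value = torch.randn(batch, num_heads, seq_len, head_dim)

scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_dim)  # (2, 12, 5, 5)
probs = scores.softmax(dim=-1)
context = torch.matmul(probs, value)                                       # (2, 12, 5, 64)
context = context.permute(0, 2, 1, 3).reshape(batch, seq_len, num_heads * head_dim)
print(context.shape)  # torch.Size([2, 5, 768])
```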
+
503
+ class FlavaSelfOutput(nn.Module):
504
+ """
505
+ The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
506
+ models), due to the layernorm applied before each block.
507
+ """
508
+
509
+ def __init__(self, config: FlavaPossibleConfigs) -> None:
510
+ super().__init__()
511
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
512
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
513
+
514
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
515
+ hidden_states = self.dense(hidden_states)
516
+ hidden_states = self.dropout(hidden_states)
517
+
518
+ return hidden_states
519
+
520
+
521
+ class FlavaAttention(nn.Module):
522
+ def __init__(self, config: FlavaPossibleConfigs) -> None:
523
+ super().__init__()
524
+ self.attention = FlavaSelfAttention(config)
525
+ self.output = FlavaSelfOutput(config)
526
+ self.pruned_heads = set()
527
+
528
+ def prune_heads(self, heads: Set[int]) -> None:
529
+ if len(heads) == 0:
530
+ return
531
+ heads, index = find_pruneable_heads_and_indices(
532
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
533
+ )
534
+
535
+ # Prune linear layers
536
+ self.attention.query = prune_linear_layer(self.attention.query, index)
537
+ self.attention.key = prune_linear_layer(self.attention.key, index)
538
+ self.attention.value = prune_linear_layer(self.attention.value, index)
539
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
540
+
541
+ # Update hyper params and store pruned heads
542
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
543
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
544
+ self.pruned_heads = self.pruned_heads.union(heads)
545
+
546
+ def forward(
547
+ self,
548
+ hidden_states: torch.Tensor,
549
+ attention_mask: Optional[torch.Tensor] = None,
550
+ head_mask: Optional[torch.Tensor] = None,
551
+ output_attentions: bool = False,
552
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
553
+ self_outputs = self.attention(
554
+ hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions
555
+ )
556
+
557
+ attention_output = self.output(self_outputs[0], hidden_states)
558
+
559
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
560
+ return outputs
561
+
562
+
563
+ class FlavaIntermediate(nn.Module):
564
+ def __init__(self, config: FlavaPossibleConfigs) -> None:
565
+ super().__init__()
566
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
567
+ if isinstance(config.hidden_act, str):
568
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
569
+ else:
570
+ self.intermediate_act_fn = config.hidden_act
571
+
572
+ # Copied from transformers.models.vit.modeling_vit.ViTIntermediate.forward
573
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
574
+ hidden_states = self.dense(hidden_states)
575
+ hidden_states = self.intermediate_act_fn(hidden_states)
576
+
577
+ return hidden_states
578
+
579
+
580
+ class FlavaOutput(nn.Module):
581
+ def __init__(self, config: FlavaPossibleConfigs) -> None:
582
+ super().__init__()
583
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
584
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
585
+
586
+ # Copied from transformers.models.vit.modeling_vit.ViTOutput.forward
587
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
588
+ hidden_states = self.dense(hidden_states)
589
+ hidden_states = self.dropout(hidden_states)
590
+
591
+ hidden_states = hidden_states + input_tensor
592
+
593
+ return hidden_states
594
+
595
+
596
+ class FlavaLayer(nn.Module):
597
+ """This corresponds to the Block class in the timm implementation."""
598
+
599
+ def __init__(self, config: FlavaPossibleConfigs) -> None:
600
+ super().__init__()
601
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
602
+ self.seq_len_dim = 1
603
+ self.attention = FlavaAttention(config)
604
+ self.intermediate = FlavaIntermediate(config)
605
+ self.output = FlavaOutput(config)
606
+
607
+ # TODO: Check fp32 layer norm possibility
608
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
609
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
610
+
611
+ def forward(
612
+ self,
613
+ hidden_states: torch.Tensor,
614
+ attention_mask: Optional[torch.Tensor] = None,
615
+ head_mask: Optional[torch.Tensor] = None,
616
+ output_attentions: bool = False,
617
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
618
+ self_attention_outputs = self.attention(
619
+ self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention
620
+ attention_mask=attention_mask,
621
+ head_mask=head_mask,
622
+ output_attentions=output_attentions,
623
+ )
624
+ attention_output = self_attention_outputs[0]
625
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
626
+
627
+ # first residual connection
628
+ hidden_states = attention_output + hidden_states
629
+
630
+ # in ViT, layernorm is also applied after self-attention
631
+ layer_output = self.layernorm_after(hidden_states)
632
+ layer_output = self.intermediate(layer_output)
633
+
634
+ # second residual connection is done here
635
+ layer_output = self.output(layer_output, hidden_states)
636
+
637
+ outputs = (layer_output,) + outputs
638
+
639
+ return outputs
640
+
641
+
642
+ class FlavaEncoder(nn.Module):
643
+ def __init__(self, config: FlavaConfig) -> None:
644
+ super().__init__()
645
+ self.config = config
646
+ self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)])
647
+ self.gradient_checkpointing = False
648
+
649
+ def forward(
650
+ self,
651
+ hidden_states: torch.Tensor,
652
+ attention_mask: Optional[torch.Tensor] = None,
653
+ head_mask: Optional[torch.Tensor] = None,
654
+ output_attentions: bool = False,
655
+ output_hidden_states: bool = False,
656
+ return_dict: bool = True,
657
+ ) -> Union[tuple, BaseModelOutput]:
658
+ all_hidden_states = () if output_hidden_states else None
659
+ all_self_attentions = () if output_attentions else None
660
+
661
+ for i, layer_module in enumerate(self.layer):
662
+ if output_hidden_states:
663
+ all_hidden_states = all_hidden_states + (hidden_states,)
664
+
665
+ layer_head_mask = head_mask[i] if head_mask is not None else None
666
+
667
+ if self.gradient_checkpointing and self.training:
668
+ layer_outputs = self._gradient_checkpointing_func(
669
+ layer_module.__call__,
670
+ hidden_states,
671
+ attention_mask,
672
+ layer_head_mask,
673
+ output_attentions,
674
+ )
675
+ else:
676
+ layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
677
+
678
+ hidden_states = layer_outputs[0]
679
+
680
+ if output_attentions:
681
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
682
+
683
+ if output_hidden_states:
684
+ all_hidden_states = all_hidden_states + (hidden_states,)
685
+
686
+ if not return_dict:
687
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
688
+ return BaseModelOutput(
689
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
690
+ )
691
+
692
+
693
+ class FlavaPooler(nn.Module):
694
+ def __init__(self, config: FlavaPossibleConfigs):
695
+ super().__init__()
696
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
697
+ self.activation = nn.Tanh()
698
+
699
+ def forward(self, hidden_states: torch.Tensor):
700
+ # We "pool" the model by simply taking the hidden state corresponding
701
+ # to the first token.
702
+ first_token_tensor = hidden_states[:, 0]
703
+ pooled_output = self.dense(first_token_tensor)
704
+ pooled_output = self.activation(pooled_output)
705
+ return pooled_output
706
+
707
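`FlavaPooler` above reduces the whole sequence to the first ([CLS]) token; a tiny sketch with illustrative sizes:

```python
import torch
from torch import nn

hidden_states = torch.randn(2, 197, 768)        # (batch, 1 + num_patches, hidden_size)
dense, activation = nn.Linear(768, 768), nn.Tanh()
pooled_output = activation(dense(hidden_states[:, 0]))
print(pooled_output.shape)                      # torch.Size([2, 768])
```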
+
708
+ FLAVA_START_DOCSTRING = r"""
709
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
710
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
711
+ behavior.
712
+
713
+ Parameters:
714
+ config ([`{config}`]): Model configuration class with all the parameters of the model.
715
+ Initializing with a config file does not load the weights associated with the model, only the
716
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
717
+ """
718
+
719
+ FLAVA_INPUTS_DOCSTRING_COMMON = r"""
720
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
721
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
722
+ - 1 for tokens that are **not masked**,
723
+ - 0 for tokens that are **masked**.
724
+ [What are attention masks?](../glossary#attention-mask)
725
+
726
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
727
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
728
+
729
+ - 1 indicates the head is **not masked**,
730
+ - 0 indicates the head is **masked**.
731
+
732
+ output_attentions (`bool`, *optional*):
733
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
734
+ tensors for more detail.
735
+ output_hidden_states (`bool`, *optional*):
736
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
737
+ more detail.
738
+
739
+ return_dict (`bool`, *optional*):
740
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
741
+ """
742
+
743
+ FLAVA_IMAGE_INPUTS_DOCSTRING_BASE = r"""
744
+ Args:
745
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
746
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
747
+ [`FlavaImageProcessor.__call__`] for details.
748
+
749
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
750
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
751
+
752
+ interpolate_pos_encoding (`bool`, *optional*):
753
+ Whether to interpolate the pre-trained position encodings.
754
+ """
755
+
756
+ FLAVA_IMAGE_INPUTS_DOCSTRING = FLAVA_IMAGE_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON
757
+
758
+ FLAVA_TEXT_INPUTS_DOCSTRING_BASE = r"""
759
+ Args:
760
+ input_ids (`torch.LongTensor` of shape `({0})`):
761
+ Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
762
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
763
+ IDs?](../glossary#input-ids)
764
+
765
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
766
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
767
+ 1]`:
768
+ - 0 corresponds to a *sentence A* token,
769
+ - 1 corresponds to a *sentence B* token.
770
+ [What are token type IDs?](../glossary#token-type-ids)
771
+ """
772
+
773
+ FLAVA_TEXT_INPUTS_DOCSTRING = FLAVA_TEXT_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON
774
+
775
+ FLAVA_MULTIMODAL_INPUTS_DOCSTRING = (
776
+ r"""
777
+ Args:
778
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
779
+ The concatenated hidden states of unimodal encoders.
780
+ """
781
+ + FLAVA_INPUTS_DOCSTRING_COMMON
782
+ )
783
+
784
+ FLAVA_MODEL_INPUTS_DOCSTRING_BASE = r"""
785
+ Args:
786
+ skip_multimodal_encoder (*bool*, *optional*):
787
+ Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
788
+ """
789
+
790
+ FLAVA_MODEL_INPUTS_DOCSTRING = (
791
+ FLAVA_IMAGE_INPUTS_DOCSTRING_BASE
792
+ + FLAVA_TEXT_INPUTS_DOCSTRING_BASE
793
+ + FLAVA_INPUTS_DOCSTRING_COMMON
794
+ + FLAVA_MODEL_INPUTS_DOCSTRING_BASE
795
+ )
796
+
797
+
798
+ FLAVA_PRETRAINING_INPUTS_DOCSTRING = (
799
+ r"""
800
+ Args:
801
+ input_ids_masked (`torch.LongTensor` of shape `({0})`):
802
+ Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
803
+ to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
804
+ [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
805
+ [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
806
+
807
+ """
808
+ + FLAVA_TEXT_INPUTS_DOCSTRING_BASE
809
+ + FLAVA_IMAGE_INPUTS_DOCSTRING_BASE
810
+ + r"""
811
+ image_attention_mask (`torch.FloatTensor` of shape `({1})`, *optional*):
812
+ Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
813
+ in `[0, 1]`:
814
+ - 1 for tokens that are **not masked**,
815
+ - 0 for tokens that are **masked**.
816
+ [What are attention masks?](../glossary#attention-mask)
817
+
818
+ skip_unmasked_multimodal_encoder (*bool*, *optional*):
819
+ Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
820
+ multimodal embeddings or outputs as of now.
821
+
822
+ mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
823
+ Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
824
+ Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
825
+ indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
826
+ ..., text_config.vocab_size - 1]`.
827
+
828
+ mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
829
+ Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
830
+ image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
831
+ computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
832
+ generated automatically using the image codebook assigned to the model. By default, it uses
833
+ [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.
834
+
835
+ itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
836
+ Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
837
+ The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
838
+
839
+ return_loss (`bool`, *optional*, defaults to `None`):
840
+ Whether to return calculated loss or not.
841
+ """
842
+ + FLAVA_INPUTS_DOCSTRING_COMMON
843
+ )
844
+
845
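The `-100` convention described for `mlm_labels` and `mim_labels` above relies on the standard `ignore_index` of `torch.nn.CrossEntropyLoss`; a minimal illustration (the vocabulary size and labels are made up):

```python
import torch
from torch import nn

logits = torch.randn(1, 4, 30522)              # (batch_size, text_seq_len, vocab_size)
labels = torch.tensor([[-100, 17, -100, 9]])   # only positions 1 and 3 contribute to the loss
loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
print(loss.item())
```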
+ FLAVA_PRETRAINING_START_DOCSTRING_EXTRA = r"""
846
+ Parameters:
847
+ image_codebook ([`nn.Module`]): If passed, the image codebook will be set to this. Otherwise, it will
848
+ be initialized using the image_codebook_config defined in the config.
849
+ """
850
+
851
+
852
+ class FlavaPreTrainedModel(PreTrainedModel):
853
+ """
854
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
855
+ models.
856
+ """
857
+
858
+ config_class = FlavaConfig
859
+ base_model_prefix = "flava"
860
+ supports_gradient_checkpointing = True
861
+
862
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
863
+ """Initialize the weights"""
864
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
865
+ # Slightly different from the TF version which uses truncated_normal for initialization
866
+ # cf https://github.com/pytorch/pytorch/pull/5617
867
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
868
+ if module.bias is not None:
869
+ module.bias.data.zero_()
870
+ elif isinstance(module, nn.Embedding):
871
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
872
+ if module.padding_idx is not None:
873
+ module.weight.data[module.padding_idx].zero_()
874
+ elif isinstance(module, nn.LayerNorm):
875
+ module.bias.data.zero_()
876
+ module.weight.data.fill_(1.0)
877
+ elif isinstance(module, FlavaMaskedPredictionHead):
878
+ module.bias.data.zero_()
879
+ elif isinstance(module, FlavaImageEmbeddings):
880
+ module.cls_token.data.zero_()
881
+ module.position_embeddings.data.zero_()
882
+ if module.mask_token is not None:
883
+ module.mask_token.data.zero_()
884
+ elif isinstance(module, FlavaMultimodalModel):
885
+ if module.use_cls_token:
886
+ module.cls_token.data.zero_()
887
+ elif isinstance(module, FlavaModel):
888
+ module.logit_scale.data.fill_(self.config.logit_scale_init_value)
889
+
890
+
891
+ @add_start_docstrings(
892
+ "The bare FLAVA Image Model transformer outputting raw hidden-states without any specific head on top.",
893
+ FLAVA_START_DOCSTRING.format(config="FlavaImageConfig"),
894
+ )
895
+ class FlavaImageModel(FlavaPreTrainedModel):
896
+ config_class = FlavaImageConfig
897
+ # This override allows us to load FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints.
898
+ base_model_prefix = "flava.image_model"
899
+ main_input_name = "pixel_values"
900
+
901
+ def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True):
902
+ super().__init__(config)
903
+
904
+ self.config = config
905
+
906
+ self.embeddings = FlavaImageEmbeddings(config)
907
+ self.encoder = FlavaEncoder(config)
908
+
909
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
910
+ self.pooler = FlavaPooler(config) if add_pooling_layer else None
911
+
912
+ self.post_init()
913
+
914
+ def get_input_embeddings(self) -> nn.Module:
915
+ return self.embeddings.patch_embeddings
916
+
917
+ def set_input_embeddings(self, value: nn.Module):
918
+ self.embeddings.patch_embeddings = value
919
+
920
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
921
+ """
922
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
923
+ class PreTrainedModel
924
+ """
925
+ for layer, heads in heads_to_prune.items():
926
+ self.encoder.layer[layer].attention.prune_heads(heads)
927
+
928
+ @add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
929
+ @add_code_sample_docstrings(
930
+ checkpoint=_CHECKPOINT_FOR_DOC,
931
+ output_type=BaseModelOutputWithPooling,
932
+ config_class=_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC,
933
+ modality="vision",
934
+ expected_output=_EXPECTED_IMAGE_OUTPUT_SHAPE,
935
+ )
936
+ def forward(
937
+ self,
938
+ pixel_values: Optional[torch.Tensor] = None,
939
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
940
+ interpolate_pos_encoding: Optional[bool] = None,
941
+ attention_mask: Optional[torch.Tensor] = None,
942
+ head_mask: Optional[torch.Tensor] = None,
943
+ output_attentions: Optional[bool] = None,
944
+ output_hidden_states: Optional[bool] = None,
945
+ return_dict: Optional[bool] = None,
946
+ ) -> Union[tuple, BaseModelOutputWithPooling]:
947
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
948
+ output_hidden_states = (
949
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
950
+ )
951
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
952
+
953
+ if pixel_values is None:
954
+ raise ValueError("You have to specify pixel_values")
955
+
956
+ # Prepare head mask if needed
957
+ # 1.0 in head_mask indicate we keep the head
958
+ # attention_probs has shape bsz x n_heads x N x N
959
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
960
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
961
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
962
+
963
+ embedding_output = self.embeddings(
964
+ pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
965
+ )
966
+
967
+ encoder_outputs = self.encoder(
968
+ embedding_output,
969
+ attention_mask=attention_mask,
970
+ head_mask=head_mask,
971
+ output_attentions=output_attentions,
972
+ output_hidden_states=output_hidden_states,
973
+ return_dict=return_dict,
974
+ )
975
+ sequence_output = encoder_outputs[0]
976
+ sequence_output = self.layernorm(sequence_output)
977
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
978
+
979
+ if not return_dict:
980
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
981
+
982
+ return BaseModelOutputWithPooling(
983
+ last_hidden_state=sequence_output,
984
+ pooler_output=pooled_output,
985
+ hidden_states=encoder_outputs.hidden_states,
986
+ attentions=encoder_outputs.attentions,
987
+ )
988
+
989
+
990
+ @add_start_docstrings(
991
+ "The bare FLAVA Text Model transformer outputting raw hidden-states without any specific head on top.",
992
+ FLAVA_START_DOCSTRING.format(config="FlavaTextConfig"),
993
+ )
994
+ class FlavaTextModel(FlavaPreTrainedModel):
995
+ config_class = FlavaTextConfig
996
+ # This override allows us to load FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints.
997
+ base_model_prefix = "flava.text_model"
998
+
999
+ def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True):
1000
+ super().__init__(config)
1001
+ self.config = config
1002
+
1003
+ self.embeddings = FlavaTextEmbeddings(config)
1004
+ self.encoder = FlavaEncoder(config)
1005
+
1006
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1007
+ self.pooler = FlavaPooler(config) if add_pooling_layer else None
1008
+
1009
+ self.post_init()
1010
+
1011
+ def get_input_embeddings(self) -> nn.Module:
1012
+ return self.embeddings.word_embeddings
1013
+
1014
+ def set_input_embeddings(self, value: nn.Module):
1015
+ self.embeddings.word_embeddings = value
1016
+
1017
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
1018
+ """
1019
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
1020
+ class PreTrainedModel
1021
+ """
1022
+ for layer, heads in heads_to_prune.items():
1023
+ self.encoder.layer[layer].attention.prune_heads(heads)
1024
+
1025
+ @add_start_docstrings_to_model_forward(FLAVA_TEXT_INPUTS_DOCSTRING.format("batch_size, text_seq_length"))
1026
+ @add_code_sample_docstrings(
1027
+ checkpoint=_CHECKPOINT_FOR_DOC,
1028
+ output_type=BaseModelOutputWithPooling,
1029
+ config_class=_CONFIG_CLASS_FOR_TEXT_MODEL_DOC,
1030
+ )
1031
+ def forward(
1032
+ self,
1033
+ input_ids: Optional[torch.Tensor] = None,
1034
+ attention_mask: Optional[torch.Tensor] = None,
1035
+ token_type_ids: Optional[torch.Tensor] = None,
1036
+ position_ids: Optional[torch.Tensor] = None,
1037
+ head_mask: Optional[torch.Tensor] = None,
1038
+ output_attentions: Optional[bool] = None,
1039
+ output_hidden_states: Optional[bool] = None,
1040
+ return_dict: Optional[bool] = None,
1041
+ ) -> Union[tuple, BaseModelOutputWithPooling]:
1042
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1043
+ output_hidden_states = (
1044
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1045
+ )
1046
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1047
+
1048
+ if input_ids is None:
1049
+ raise ValueError("You have to specify input_ids")
1050
+
1051
+ input_shape = input_ids.size()
1052
+
1053
+ if attention_mask is None:
1054
+ attention_mask = torch.ones(input_shape, device=input_ids.device)
1055
+
1056
+ # Prepare head mask if needed
1057
+ # 1.0 in head_mask indicate we keep the head
1058
+ # attention_probs has shape bsz x n_heads x N x N
1059
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1060
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1061
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1062
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
1063
+ attention_mask, input_shape, input_ids.device
1064
+ )
1065
+
1066
+ embedding_output = self.embeddings(
1067
+ input_ids=input_ids,
1068
+ token_type_ids=token_type_ids,
1069
+ position_ids=position_ids,
1070
+ )
1071
+
1072
+ encoder_outputs = self.encoder(
1073
+ embedding_output,
1074
+ attention_mask=extended_attention_mask,
1075
+ head_mask=head_mask,
1076
+ output_attentions=output_attentions,
1077
+ output_hidden_states=output_hidden_states,
1078
+ return_dict=return_dict,
1079
+ )
1080
+ sequence_output = encoder_outputs[0]
1081
+ sequence_output = self.layernorm(sequence_output)
1082
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1083
+
1084
+ if not return_dict:
1085
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1086
+
1087
+ return BaseModelOutputWithPooling(
1088
+ last_hidden_state=sequence_output,
1089
+ pooler_output=pooled_output,
1090
+ hidden_states=encoder_outputs.hidden_states,
1091
+ attentions=encoder_outputs.attentions,
1092
+ )
1093
+
1094
+
1095
+ @add_start_docstrings(
1096
+ "The bare FLAVA Multimodal Model transformer outputting raw hidden-states without any specific head on top.",
1097
+ FLAVA_START_DOCSTRING.format(config="FlavaMultimodalConfig"),
1098
+ )
1099
+ class FlavaMultimodalModel(FlavaPreTrainedModel):
1100
+ config_class = FlavaMultimodalConfig
1101
+ # This override allows us to load FlavaMultimodalModel from FlavaModel/FlavaForPreTraining checkpoints.
1102
+ base_model_prefix = "flava.multimodal_model"
1103
+ main_input_name = "hidden_states"
1104
+
1105
+ def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer=True):
1106
+ super().__init__(config)
1107
+ self.config = config
1108
+ self.use_cls_token = self.config.use_cls_token
1109
+ if self.use_cls_token:
1110
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
1111
+
1112
+ self.encoder = FlavaEncoder(config)
1113
+
1114
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1115
+ self.pooler = FlavaPooler(config) if add_pooling_layer else None
1116
+
1117
+ self.post_init()
1118
+
1119
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
1120
+ """
1121
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
1122
+ class PreTrainedModel
1123
+ """
1124
+ for layer, heads in heads_to_prune.items():
1125
+ self.encoder.layer[layer].attention.prune_heads(heads)
1126
+
1127
+ @add_start_docstrings_to_model_forward(
1128
+ FLAVA_MULTIMODAL_INPUTS_DOCSTRING.format("batch_size, image_num_patches + text_seq_len")
1129
+ )
1130
+ @add_code_sample_docstrings(
1131
+ checkpoint=_CHECKPOINT_FOR_DOC,
1132
+ output_type=BaseModelOutputWithPooling,
1133
+ config_class=_CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC,
1134
+ )
1135
+ def forward(
1136
+ self,
1137
+ hidden_states: torch.Tensor,
1138
+ attention_mask: Optional[torch.Tensor] = None,
1139
+ head_mask: Optional[torch.Tensor] = None,
1140
+ output_attentions: Optional[bool] = None,
1141
+ output_hidden_states: Optional[bool] = None,
1142
+ return_dict: Optional[bool] = None,
1143
+ ) -> Union[tuple, BaseModelOutputWithPooling]:
1144
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1145
+ output_hidden_states = (
1146
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1147
+ )
1148
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1149
+
1150
+ batch_size, seq_length, _ = hidden_states.size()
1151
+
1152
+ if self.use_cls_token:
1153
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
1154
+ hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
1155
+ seq_length += 1
1156
+
1157
+ if attention_mask is None:
1158
+ attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device)
1159
+
1160
+ # Prepare head mask if needed
1161
+ # 1.0 in head_mask indicate we keep the head
1162
+ # attention_probs has shape bsz x n_heads x N x N
1163
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1164
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1165
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1166
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
1167
+ attention_mask, (batch_size, seq_length), hidden_states.device
1168
+ )
1169
+
1170
+ encoder_outputs = self.encoder(
1171
+ hidden_states,
1172
+ attention_mask=extended_attention_mask,
1173
+ head_mask=head_mask,
1174
+ output_attentions=output_attentions,
1175
+ output_hidden_states=output_hidden_states,
1176
+ return_dict=return_dict,
1177
+ )
1178
+ sequence_output = encoder_outputs[0]
1179
+ sequence_output = self.layernorm(sequence_output)
1180
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1181
+
1182
+ if not return_dict:
1183
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1184
+
1185
+ return BaseModelOutputWithPooling(
1186
+ last_hidden_state=sequence_output,
1187
+ pooler_output=pooled_output,
1188
+ hidden_states=encoder_outputs.hidden_states,
1189
+ attentions=encoder_outputs.attentions,
1190
+ )
1191
+
1192
+
1193
+ @add_start_docstrings(
1194
+ "The bare FLAVA Model transformer outputting raw hidden-states without any specific head on top.",
1195
+ FLAVA_START_DOCSTRING.format(config="FlavaConfig"),
1196
+ )
1197
+ class FlavaModel(FlavaPreTrainedModel):
1198
+ config_class = FlavaConfig
1199
+
1200
+ def __init__(self, config: FlavaConfig):
1201
+ super().__init__(config)
1202
+
1203
+ if not isinstance(config.text_config, FlavaTextConfig):
1204
+ raise TypeError(
1205
+ "config.text_config is expected to be of type FlavaTextConfig but is of type"
1206
+ f" {type(config.text_config)}."
1207
+ )
1208
+
1209
+ if not isinstance(config.image_config, FlavaImageConfig):
1210
+ raise TypeError(
1211
+ "config.image_config is expected to be of type FlavaImageConfig but is of type"
1212
+ f" {type(config.image_config)}."
1213
+ )
1214
+
1215
+ if not isinstance(config.multimodal_config, FlavaMultimodalConfig):
1216
+ raise TypeError(
1217
+ "config.multimodal_config is expected to be of type FlavaMultimodalConfig but "
1218
+ + f"is of type {type(config.multimodal_config)}."
1219
+ )
1220
+
1221
+ text_config = config.text_config
1222
+ image_config = config.image_config
1223
+ multimodal_config = config.multimodal_config
1224
+
1225
+ self.projection_dim = config.projection_dim
1226
+ self.text_hidden_size = text_config.hidden_size
1227
+ self.image_hidden_size = image_config.hidden_size
1228
+ self.mm_hidden_size = multimodal_config.hidden_size
1229
+
1230
+ self.text_model = FlavaTextModel(text_config)
1231
+ self.image_model = FlavaImageModel(image_config)
1232
+ self.multimodal_model = FlavaMultimodalModel(multimodal_config)
1233
+
1234
+ self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
1235
+ self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
1236
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
1237
+
1238
+ self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
1239
+ self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)
1240
+ # Initialize weights and apply final processing
1241
+ self.post_init()
1242
+
1243
+ @add_start_docstrings_to_model_forward(FLAVA_TEXT_INPUTS_DOCSTRING.format("batch_size, text_seq_length"))
1244
+ def get_text_features(
1245
+ self,
1246
+ input_ids: Optional[torch.Tensor] = None,
1247
+ attention_mask: Optional[torch.Tensor] = None,
1248
+ token_type_ids: Optional[torch.Tensor] = None,
1249
+ position_ids: Optional[torch.Tensor] = None,
1250
+ output_attentions: Optional[bool] = None,
1251
+ output_hidden_states: Optional[bool] = None,
1252
+ return_dict: Optional[bool] = None,
1253
+ ) -> torch.FloatTensor:
1254
+ r"""
1255
+ Returns:
1256
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1257
+ applying the projection layer to the pooled output of [`FlavaTextModel`].
1258
+
1259
+ Examples:
1260
+
1261
+ ```python
1262
+ >>> from transformers import AutoProcessor, FlavaModel
1263
+
1264
+ >>> model = FlavaModel.from_pretrained("{0}")
1265
+ >>> processor = AutoProcessor.from_pretrained("{0}")
1266
+
1267
+ >>> inputs = processor(
1268
+ ... text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
1269
+ ... )
1270
+ >>> text_features = model.get_text_features(**inputs)
1271
+ ```""".format(_CHECKPOINT_FOR_DOC)
1272
+ text_outputs = self.text_model(
1273
+ input_ids=input_ids,
1274
+ attention_mask=attention_mask,
1275
+ token_type_ids=token_type_ids,
1276
+ position_ids=position_ids,
1277
+ output_attentions=output_attentions,
1278
+ output_hidden_states=output_hidden_states,
1279
+ return_dict=return_dict,
1280
+ )
1281
+
1282
+ pooled_output = text_outputs[0] # last_hidden_state
1283
+ text_features = self.text_projection(pooled_output)
1284
+
1285
+ return text_features
1286
+
1287
+ @add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
1288
+ def get_image_features(
1289
+ self,
1290
+ pixel_values: Optional[torch.Tensor] = None,
1291
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
1292
+ interpolate_pos_encoding: Optional[bool] = None,
1293
+ attention_mask: Optional[torch.Tensor] = None,
1294
+ head_mask: Optional[torch.Tensor] = None,
1295
+ output_attentions: Optional[bool] = None,
1296
+ output_hidden_states: Optional[bool] = None,
1297
+ return_dict: Optional[bool] = None,
1298
+ ) -> torch.FloatTensor:
1299
+ r"""
1300
+ Returns:
1301
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1302
+ applying the projection layer to the pooled output of [`FlavaImageModel`].
1303
+
1304
+ Examples:
1305
+
1306
+ ```python
1307
+ >>> from PIL import Image
1308
+ >>> import requests
1309
+ >>> from transformers import AutoProcessor, FlavaModel
1310
+
1311
+ >>> model = FlavaModel.from_pretrained("{0}")
1312
+ >>> processor = AutoProcessor.from_pretrained("{0}")
1313
+
1314
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1315
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1316
+
1317
+ >>> inputs = processor(images=image, return_tensors="pt")
1318
+
1319
+ >>> image_features = model.get_image_features(**inputs)
1320
+ ```""".format(_CHECKPOINT_FOR_DOC)
1321
+ image_outputs = self.image_model(
1322
+ pixel_values=pixel_values,
1323
+ bool_masked_pos=bool_masked_pos,
1324
+ attention_mask=attention_mask,
1325
+ head_mask=head_mask,
1326
+ output_attentions=output_attentions,
1327
+ output_hidden_states=output_hidden_states,
1328
+ interpolate_pos_encoding=interpolate_pos_encoding,
1329
+ return_dict=return_dict,
1330
+ )
1331
+
1332
+ pooled_output = image_outputs[0] # last_hidden_state
1333
+ image_features = self.image_projection(pooled_output)
1334
+
1335
+ return image_features
1336
+
1337
+ @add_start_docstrings_to_model_forward(
1338
+ FLAVA_MODEL_INPUTS_DOCSTRING.format("batch_size, image_num_patches + text_seq_len")
1339
+ )
1340
+ @replace_return_docstrings(output_type=FlavaModelOutput, config_class=FlavaConfig)
1341
+ def forward(
1342
+ self,
1343
+ input_ids: Optional[torch.LongTensor] = None,
1344
+ pixel_values: Optional[torch.FloatTensor] = None,
1345
+ attention_mask: Optional[torch.Tensor] = None,
1346
+ token_type_ids: Optional[torch.Tensor] = None,
1347
+ bool_masked_pos: Optional[torch.Tensor] = None,
1348
+ position_ids: Optional[torch.LongTensor] = None,
1349
+ image_attention_mask: Optional[torch.Tensor] = None,
1350
+ skip_multimodal_encoder: Optional[bool] = None,
1351
+ output_attentions: Optional[bool] = None,
1352
+ output_hidden_states: bool = True,
1353
+ return_dict: Optional[bool] = None,
1354
+ ) -> Union[Tuple, FlavaOutput]:
1355
+ r"""
1356
+ Returns:
1357
+
1358
+ Examples:
1359
+
1360
+ ```python
1361
+ >>> from PIL import Image
1362
+ >>> import requests
1363
+ >>> from transformers import AutoProcessor, FlavaModel
1364
+
1365
+ >>> model = FlavaModel.from_pretrained("facebook/flava-full")
1366
+ >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")
1367
+
1368
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1369
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1370
+
1371
+ >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)
1372
+
1373
+ >>> outputs = model(**inputs)
1374
+
1375
+ >>> image_embeddings = outputs.image_embeddings
1376
+ >>> text_embeddings = outputs.text_embeddings
1377
+ >>> multimodal_embeddings = outputs.multimodal_embeddings
1378
+
1379
+ >>> outputs.image_embeddings.shape
1380
+ torch.Size([1, 197, 768])
1381
+
1382
+ >>> text_embeddings.shape
1383
+ torch.Size([1, 7, 768])
1384
+
1385
+ >>> multimodal_embeddings.shape
1386
+ torch.Size([1, 205, 768])
1387
+ ```
1388
+ """
1389
+
1390
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1391
+ if not output_hidden_states:
1392
+ raise ValueError("FLAVA model requires hidden states to work. Please set `output_hidden_states=True`")
1393
+ image_embeddings = None
1394
+ image_states = None
1395
+ image_mm_projection = None
1396
+ image_output = None
1397
+ if pixel_values is not None:
1398
+ image_output = self.image_model(
1399
+ pixel_values=pixel_values,
1400
+ bool_masked_pos=bool_masked_pos,
1401
+ attention_mask=image_attention_mask,
1402
+ output_attentions=output_attentions,
1403
+ output_hidden_states=output_hidden_states,
1404
+ return_dict=return_dict,
1405
+ )
1406
+ image_embeddings, image_states = image_output[0], image_output[2]
1407
+ # Note that these states don't use final layernorm in the transformer model
1408
+ image_mm_projection = self.image_to_mm_projection(image_states[-1])
1409
+
1410
+ text_embeddings = None
1411
+ text_states = None
1412
+ text_mm_projection = None
1413
+ text_output = None
1414
+ if input_ids is not None:
1415
+ text_output = self.text_model(
1416
+ input_ids=input_ids,
1417
+ attention_mask=attention_mask,
1418
+ position_ids=position_ids,
1419
+ token_type_ids=token_type_ids,
1420
+ output_attentions=output_attentions,
1421
+ output_hidden_states=output_hidden_states,
1422
+ return_dict=return_dict,
1423
+ )
1424
+
1425
+ text_embeddings, text_states = text_output[0], text_output[2]
1426
+ # Note that these states don't use final layernorm in the transformer model
1427
+ text_mm_projection = self.text_to_mm_projection(text_states[-1])
1428
+
1429
+ multimodal_embeddings = None
1430
+ multimodal_output = None
1431
+ if image_mm_projection is not None and text_mm_projection is not None and not skip_multimodal_encoder:
1432
+ if attention_mask is not None:
1433
+ batch_size, seq_len, _ = image_mm_projection.shape
1434
+ if self.multimodal_model.use_cls_token:
1435
+ seq_len += 1
1436
+ attention_mask_image = torch.ones(batch_size, seq_len, device=image_mm_projection.device)
1437
+ attention_multimodal = torch.cat([attention_mask_image, attention_mask], dim=1)
1438
+ else:
1439
+ attention_multimodal = None
1440
+ multimodal_input = torch.cat([image_mm_projection, text_mm_projection], dim=1)
1441
+ multimodal_output = self.multimodal_model(
1442
+ multimodal_input, attention_mask=attention_multimodal, return_dict=return_dict
1443
+ )
1444
+ multimodal_embeddings = multimodal_output[0]
1445
+
1446
+ if not return_dict:
1447
+ return (
1448
+ image_embeddings,
1449
+ image_output,
1450
+ text_embeddings,
1451
+ text_output,
1452
+ multimodal_embeddings,
1453
+ multimodal_output,
1454
+ )
1455
+
1456
+ return FlavaModelOutput(
1457
+ image_embeddings=image_embeddings,
1458
+ image_output=image_output,
1459
+ text_embeddings=text_embeddings,
1460
+ text_output=text_output,
1461
+ multimodal_embeddings=multimodal_embeddings,
1462
+ multimodal_output=multimodal_output,
1463
+ )
1464
+
1465
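A quick sanity check of the sequence lengths in the `FlavaModel` docstring example above: the multimodal input is the concatenation of the projected image states and projected text states, plus one multimodal [CLS] token when `use_cls_token` is enabled.

```python
image_tokens, text_tokens, multimodal_cls_token = 197, 7, 1   # values from the docstring example
assert image_tokens + text_tokens + multimodal_cls_token == 205
```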
+
1466
+ class FlavaImageCodebookResPath(nn.Module):
1467
+ def __init__(self, in_size: int, out_size: int, **kwargs):
1468
+ super().__init__()
1469
+ hid_size = out_size // 4
1470
+
1471
+ path = OrderedDict()
1472
+ path["relu_1"] = nn.ReLU()
1473
+ path["conv_1"] = nn.Conv2d(in_size, hid_size, kernel_size=3, padding=1)
1474
+ path["relu_2"] = nn.ReLU()
1475
+ path["conv_2"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
1476
+ path["relu_3"] = nn.ReLU()
1477
+ path["conv_3"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
1478
+ path["relu_4"] = nn.ReLU()
1479
+ path["conv_4"] = nn.Conv2d(hid_size, out_size, kernel_size=1, padding=0)
1480
+
1481
+ self.path = nn.Sequential(path)
1482
+
1483
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1484
+ return self.path(x)
1485
+
1486
+
1487
+ class FlavaImageCodebookBlock(nn.Module):
1488
+ def __init__(self, in_size: int, out_size: int, num_layers: int, **kwargs):
1489
+ super().__init__()
1490
+
1491
+ self.post_gain = 1 / (num_layers**2)
1492
+
1493
+ if in_size != out_size:
1494
+ self.id_path = nn.Conv2d(in_size, out_size, kernel_size=1, padding=0)
1495
+ else:
1496
+ self.id_path = nn.Identity()
1497
+
1498
+ self.res_path = FlavaImageCodebookResPath(in_size, out_size)
1499
+
1500
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1501
+ return self.id_path(x) + self.post_gain * self.res_path(x)
1502
+
1503
+
1504
+ class FlavaImageCodebookLayerGroup(nn.Module):
1505
+ def __init__(self, num_blocks: int, num_layers: int, in_size: int, out_size: int, use_pool: bool = True):
1506
+ super().__init__()
1507
+ blocks = OrderedDict()
1508
+ for i in range(num_blocks):
1509
+ if i == 0:
1510
+ blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers)
1511
+ else:
1512
+ blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers)
1513
+
1514
+ if use_pool:
1515
+ blocks["pool"] = nn.MaxPool2d(kernel_size=2)
1516
+
1517
+ self.group = nn.Sequential(blocks)
1518
+
1519
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1520
+ return self.group(x)
1521
+
1522
+
1523
+ # Inspired by DALLE Encoder in https://github.com/openai/DALL-E/blob/5be4b236bc3ade6943662354117a0e83752cc322/dall_e/encoder.py#L42
1524
+ @add_start_docstrings(
1525
+ """
1526
+ The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
1527
+ to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
1528
+ `get_codebook_indices` to get image tokens for an image.
1529
+ """,
1530
+ FLAVA_START_DOCSTRING.format(config="FlavaImageCodebookConfig"),
1531
+ )
1532
+ class FlavaImageCodebook(FlavaPreTrainedModel):
1533
+ base_model_prefix = ""
1534
+ config_class = FlavaImageCodebookConfig
1535
+ main_input_name = "pixel_values"
1536
+ supports_gradient_checkpointing = False
1537
+
1538
+ def __init__(
1539
+ self,
1540
+ config: FlavaImageCodebookConfig,
1541
+ **kwargs: Any,
1542
+ ):
1543
+ super().__init__(config)
1544
+
1545
+ self.config = config
1546
+ self.num_groups = config.num_groups
1547
+ self.input_channels = config.input_channels
1548
+ self.num_blocks_per_group = config.num_blocks_per_group
1549
+ self.hidden_size = config.hidden_size
1550
+ self.vocab_size = config.vocab_size
1551
+
1552
+ num_layers = self.num_groups * self.num_blocks_per_group
1553
+
1554
+ output_blocks = OrderedDict()
1555
+ output_blocks["relu"] = nn.ReLU()
1556
+ output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0)
1557
+
1558
+ blocks = OrderedDict()
1559
+ blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3)
1560
+ blocks["group_1"] = FlavaImageCodebookLayerGroup(
1561
+ self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size
1562
+ )
1563
+ blocks["group_2"] = FlavaImageCodebookLayerGroup(
1564
+ self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size
1565
+ )
1566
+ blocks["group_3"] = FlavaImageCodebookLayerGroup(
1567
+ self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size
1568
+ )
1569
+ blocks["group_4"] = FlavaImageCodebookLayerGroup(
1570
+ self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False
1571
+ )
1572
+ blocks["output"] = nn.Sequential(output_blocks)
1573
+
1574
+ self.blocks = nn.Sequential(blocks)
1575
+
1576
+ self.post_init()
1577
+
1578
+ if self.config.freeze:
1579
+ for param in self.parameters():
1580
+ param.requires_grad = False
1581
+
1582
+ def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
1583
+ """
1584
+ Args:
1585
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
1586
+ Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
1587
+ `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
1588
+
1589
+ Examples:
1590
+ ```python
1591
+ >>> from PIL import Image
1592
+ >>> import requests
1593
+ >>> from transformers import AutoImageProcessor, FlavaImageCodebook
1594
+
1595
+ >>> model = FlavaImageCodebook.from_pretrained("{0}")
1596
+ >>> image_processor = AutoImageProcessor.from_pretrained("{0}")
1597
+
1598
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1599
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1600
+
1601
+ >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
1602
+ >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
1603
+
1604
+ >>> outputs = model.get_codebook_indices(**inputs)
1605
+ ```
1606
+ """.format(_CHECKPOINT_FOR_CODEBOOK_DOC)
1607
+ z_logits = self.blocks(pixel_values)
1608
+ return torch.argmax(z_logits, dim=1)
1609
+
1610
+ def get_codebook_probs(self, pixel_values: torch.Tensor) -> torch.Tensor:
1611
+ z_logits = self.blocks(pixel_values)
1612
+ return nn.Softmax(dim=1)(z_logits)
1613
+
1614
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
1615
+ """
1616
+ Args:
1617
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
1618
+ Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
1619
+ `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
1620
+
1621
+ Examples:
1622
+
1623
+ ```python
1624
+ >>> from PIL import Image
1625
+ >>> import requests
1626
+ >>> from transformers import AutoImageProcessor, FlavaImageCodebook
1627
+
1628
+ >>> model = FlavaImageCodebook.from_pretrained("{0}")
1629
+ >>> image_processor = AutoImageProcessor.from_pretrained("{0}")
1630
+
1631
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1632
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1633
+
1634
+ >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
1635
+ >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
1636
+
1637
+ >>> outputs = model(**inputs)
1638
+ >>> print(outputs.shape)
1639
+ (1, 196)
1640
+ ```
1641
+ """.format(_CHECKPOINT_FOR_CODEBOOK_DOC)
1642
+ if len(pixel_values.shape) != 4:
1643
+ raise ValueError(f"input shape {pixel_values.shape} is not 4d")
1644
+ if pixel_values.shape[1] != self.input_channels:
1645
+ raise ValueError(f"input has {pixel_values.shape[1]} channels but model built for {self.input_channels}")
1646
+ return self.blocks(pixel_values)
1647
+
1648
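A standalone sketch of what `get_codebook_indices` above computes: an argmax over the codebook vocabulary at every spatial position of the encoder output. The sizes below are illustrative, not read from the real config.

```python
import torch

vocab_size, grid = 8192, 14
z_logits = torch.randn(1, vocab_size, grid, grid)   # stands in for self.blocks(pixel_values)
codebook_indices = torch.argmax(z_logits, dim=1)    # (1, 14, 14) discrete image tokens used as MIM labels
print(codebook_indices.shape)
```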
+
1649
+ class FlavaPredictionHeadTransform(nn.Module):
1650
+ def __init__(self, config):
1651
+ super().__init__()
1652
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1653
+ if isinstance(config.hidden_act, str):
1654
+ self.transform_act_fn = ACT2FN[config.hidden_act]
1655
+ else:
1656
+ self.transform_act_fn = config.hidden_act
1657
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1658
+
1659
+ def forward(self, hidden_states):
1660
+ hidden_states = self.dense(hidden_states)
1661
+ hidden_states = self.transform_act_fn(hidden_states)
1662
+ hidden_states = self.LayerNorm(hidden_states)
1663
+ return hidden_states
1664
+
1665
+
1666
+ class FlavaMaskedPredictionHead(nn.Module):
1667
+ def __init__(self, config, weight=None):
1668
+ super().__init__()
1669
+ self.config = config
1670
+ self.transform = FlavaPredictionHeadTransform(config)
1671
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1672
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
1673
+ if weight is not None:
1674
+ self.decoder.weight = weight
1675
+
1676
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
1677
+ self.decoder.bias = self.bias
1678
+
1679
+ def _tie_weights(self):
1680
+ self.decoder.bias = self.bias
1681
+
1682
+ def forward(self, x):
1683
+ x = self.transform(x)
1684
+ x = self.decoder(x)
1685
+ return x
1686
+
1687
+
1688
+ class FlavaITMHead(nn.Module):
1689
+ def __init__(self, config):
1690
+ super().__init__()
1691
+ self.config = config
1692
+ self.pooler = FlavaPooler(config)
1693
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
1694
+
1695
+ def forward(self, x):
1696
+ x = self.pooler(x)
1697
+ x = self.seq_relationship(x)
1698
+ return x
1699
+
1700
+
1701
+ class FlavaGlobalContrastiveHead(nn.Module):
1702
+ def __init__(self, config):
1703
+ super().__init__()
1704
+ self.config = config
1705
+ self.global_backprop_contrastive = config.global_backprop_contrastive
1706
+
1707
+ def forward(self, image_embeddings, text_embeddings, logit_scale):
1708
+ temperature = torch.exp(logit_scale)
1709
+ if not torch.distributed.is_available() or not torch.distributed.is_initialized():
1710
+ labels = torch.arange(image_embeddings.size(0), device=image_embeddings.device)
1711
+ image_embeddings_all = [image_embeddings]
1712
+ text_embeddings_all = [text_embeddings]
1713
+ else:
1714
+ local_batch_size = image_embeddings.size(0)
1715
+ world_size = torch.distributed.get_world_size()
1716
+
1717
+ if self.global_backprop_contrastive:
1718
+ # `torch.distributed.nn.functional.all_gather` does backprop on all active workers
1719
+ # whereas `torch.distributed.all_gather` only backpropagates on the current worker.
1720
+ image_embeddings_all = torch.distributed.nn.functional.all_gather(image_embeddings)
1721
+ text_embeddings_all = torch.distributed.nn.functional.all_gather(text_embeddings)
1722
+ else:
1723
+ image_embeddings_all = [torch.zeros_like(image_embeddings) for _ in range(world_size)]
1724
+ text_embeddings_all = [torch.zeros_like(text_embeddings) for _ in range(world_size)]
1725
+ torch.distributed.all_gather(image_embeddings_all, image_embeddings)
1726
+ torch.distributed.all_gather(text_embeddings_all, text_embeddings)
1727
+
1728
+ labels = local_batch_size * torch.distributed.get_rank() + torch.arange(
1729
+ local_batch_size, device=image_embeddings.device
1730
+ )
1731
+
1732
+ image_embeddings_all = torch.cat(image_embeddings_all)
1733
+ text_embeddings_all = torch.cat(text_embeddings_all)
1734
+
1735
+ logits_per_image = torch.matmul(image_embeddings, text_embeddings_all.transpose(0, 1)) * temperature
1736
+ logits_per_text = torch.matmul(text_embeddings, image_embeddings_all.transpose(0, 1)) * temperature
1737
+
1738
+ return logits_per_image, logits_per_text, labels
1739
+
1740
+
1741
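Before the pretraining model, a hedged single-process sketch of how the contrastive logits and labels produced by the head above feed a symmetric cross-entropy. The batch size, embedding dimension, and logit-scale value below are illustrative assumptions; the distributed branch only changes how embeddings and labels are gathered.

```python
# Hedged single-process sketch of consuming the global contrastive head's outputs.
# Shapes and the logit-scale value are illustrative; without torch.distributed the
# head uses the local batch, so labels are simply arange(batch_size).
import torch
import torch.nn as nn

batch_size, projection_dim = 4, 768
image_emb = nn.functional.normalize(torch.randn(batch_size, projection_dim), dim=-1)
text_emb = nn.functional.normalize(torch.randn(batch_size, projection_dim), dim=-1)
logit_scale = torch.tensor(2.6593)  # illustrative value; the model clamps its own scale

temperature = torch.exp(logit_scale)
logits_per_image = image_emb @ text_emb.t() * temperature
logits_per_text = text_emb @ image_emb.t() * temperature
labels = torch.arange(batch_size)

# Symmetric cross-entropy, matching how the pretraining model combines the two directions.
loss = (
    nn.functional.cross_entropy(logits_per_image, labels)
    + nn.functional.cross_entropy(logits_per_text, labels)
) / 2
```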
+ @add_start_docstrings(
1742
+ """
1743
+ The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
1744
+ """,
1745
+ FLAVA_START_DOCSTRING.format(config="FlavaConfig") + FLAVA_PRETRAINING_START_DOCSTRING_EXTRA,
1746
+ )
1747
+ class FlavaForPreTraining(FlavaPreTrainedModel):
1748
+ # These keys are tied to each head's decoder.bias
1749
+ _tied_weights_keys = [
1750
+ "mmm_text_head.decoder.bias",
1751
+ "mmm_image_head.decoder.bias",
1752
+ "mlm_head.decoder.bias",
1753
+ "mim_head.decoder.bias",
1754
+ ]
1755
+
1756
+ def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
1757
+ super().__init__(config)
1758
+ self.flava = FlavaModel(config)
1759
+
1760
+ self.image_codebook = image_codebook
1761
+ if self.image_codebook is None and config.init_codebook:
1762
+ self.image_codebook = FlavaImageCodebook(config.image_codebook_config)
1763
+
1764
+ # Leverage text and image encoder configs to create the masked
1765
+ # head since it has the right vocab
1766
+ self.mim_head = FlavaMaskedPredictionHead(config.image_config)
1767
+ self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
1768
+ self.itm_head = FlavaITMHead(config)
1769
+ self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
1770
+ self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
1771
+ self.global_contrastive_head = FlavaGlobalContrastiveHead(config)
1772
+
1773
+ self.image_vocab_size = config.image_config.vocab_size
1774
+ self.text_vocab_size = config.text_config.vocab_size
1775
+ self.mlm_weight = config.mlm_weight
1776
+ self.mim_weight = config.mim_weight
1777
+ self.global_contrastive_weight = config.global_contrastive_weight
1778
+ self.ce_ignore_index = config.ce_ignore_index
1779
+ self.itm_weight = config.itm_weight
1780
+ self.mmm_image_weight = config.mmm_image_weight
1781
+ self.mmm_text_weight = config.mmm_text_weight
1782
+ self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder
1783
+
1784
+ self.post_init()
1785
+
1786
+ def _resize_to_2d(self, x: torch.Tensor):
1787
+ if x.dim() > 2:
1788
+ x = x.view(x.size(0), -1)
1789
+ return x
1790
+
1791
+ @add_start_docstrings_to_model_forward(
1792
+ FLAVA_PRETRAINING_INPUTS_DOCSTRING.format("batch_size, text_seq_len", "batch_size, image_num_patches")
1793
+ )
1794
+ @replace_return_docstrings(output_type=FlavaForPreTrainingOutput, config_class=FlavaConfig)
1795
+ def forward(
1796
+ self,
1797
+ input_ids: Optional[torch.LongTensor] = None,
1798
+ input_ids_masked: Optional[torch.LongTensor] = None,
1799
+ pixel_values: Optional[torch.FloatTensor] = None,
1800
+ codebook_pixel_values: Optional[torch.FloatTensor] = None,
1801
+ attention_mask: Optional[torch.Tensor] = None,
1802
+ token_type_ids: Optional[torch.Tensor] = None,
1803
+ bool_masked_pos: Optional[torch.Tensor] = None,
1804
+ position_ids: Optional[torch.LongTensor] = None,
1805
+ image_attention_mask: Optional[torch.Tensor] = None,
1806
+ skip_unmasked_multimodal_encoder: Optional[bool] = None,
1807
+ mlm_labels: Optional[torch.Tensor] = None,
1808
+ mim_labels: Optional[torch.Tensor] = None,
1809
+ itm_labels: Optional[torch.Tensor] = None,
1810
+ output_attentions: Optional[bool] = None,
1811
+ output_hidden_states: bool = True,
1812
+ return_dict: Optional[bool] = None,
1813
+ return_loss: Optional[bool] = None,
1814
+ ) -> Union[Tuple[torch.Tensor], FlavaForPreTrainingOutput]:
1815
+ """
1816
+ Examples:
1817
+ ```python
1818
+ >>> from PIL import Image
1819
+ >>> import requests
1820
+ >>> from transformers import FlavaForPreTraining, AutoProcessor
1821
+
1822
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1823
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1824
+
1825
+ >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
1826
+ >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")
1827
+
1828
+ >>> text = ["a photo of a cat"]
1829
+
1830
+ >>> inputs = processor(
1831
+ ... images=[image],
1832
+ ... text=text,
1833
+ ... return_masks=True,
1834
+ ... return_codebook_pixels=True,
1835
+ ... padding=True,
1836
+ ... max_length=77,
1837
+ ... return_tensors="pt",
1838
+ ... )
1839
+
1840
+
1841
+ >>> output = model(**inputs)
1842
+ ```
1843
+
1844
+ Return:
1845
+
1846
+ """
1847
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1848
+ return_loss = return_loss if return_loss is not None else self.config.return_loss
1849
+
1850
+ skip_unmasked_multimodal_encoder = (
1851
+ skip_unmasked_multimodal_encoder
1852
+ if skip_unmasked_multimodal_encoder is not None
1853
+ else self.skip_unmasked_multimodal_encoder
1854
+ )
1855
+
1856
+ if input_ids_masked is None and input_ids is not None:
1857
+ logger.warning(
1858
+ "`input_ids_masked` isn't passed, which means the MLM loss won't be calculated correctly. Setting it to"
1859
+ " `input_ids` so that the model can work. Please pass it if this is unintentional. This is usually OKAY if"
1860
+ " you are doing inference on unmasked text..."
1861
+ )
1862
+ input_ids_masked = input_ids
1863
+
1864
+ flava_output = self.flava(
1865
+ input_ids=input_ids,
1866
+ pixel_values=pixel_values,
1867
+ attention_mask=attention_mask,
1868
+ token_type_ids=token_type_ids,
1869
+ position_ids=position_ids,
1870
+ image_attention_mask=image_attention_mask,
1871
+ # Don't need unmasked multimodal embedding for anything so skip it
1872
+ # NOTE: ITM uses masked version
1873
+ skip_multimodal_encoder=skip_unmasked_multimodal_encoder,
1874
+ output_attentions=output_attentions,
1875
+ output_hidden_states=output_hidden_states,
1876
+ # Pass true to have deterministic outputs
1877
+ return_dict=True,
1878
+ )
1879
+
1880
+ flava_masked_output = self.flava(
1881
+ input_ids=input_ids_masked,
1882
+ pixel_values=pixel_values,
1883
+ attention_mask=attention_mask,
1884
+ token_type_ids=token_type_ids,
1885
+ image_attention_mask=image_attention_mask,
1886
+ bool_masked_pos=bool_masked_pos,
1887
+ output_attentions=output_attentions,
1888
+ output_hidden_states=output_hidden_states,
1889
+ return_dict=True,
1890
+ )
1891
+
1892
+ pos_mask = None
1893
+
1894
+ image_embeddings = flava_output.image_embeddings
1895
+ text_embeddings = flava_output.text_embeddings
1896
+ image_masked_embeddings = flava_masked_output.image_embeddings
1897
+ text_masked_embeddings = flava_masked_output.text_embeddings
1898
+ multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings
1899
+
1900
+ total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None
1901
+ mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None
1902
+ itm_logits = logits_per_image = logits_per_text = None
1903
+
1904
+ # Calculate mim_labels if necessary from the image_codebook
1905
+ if image_masked_embeddings is not None or multimodal_masked_embeddings is not None:
1906
+ if mim_labels is None and return_loss:
1907
+ if self.image_codebook is None:
1908
+ raise RuntimeError(
1909
+ "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` "
1910
+ " have been passed. Reinstantiate the model with `init_codebook` set to True or "
1911
+ "pass in your custom `mim_labels`"
1912
+ )
1913
+ if codebook_pixel_values is None:
1914
+ raise ValueError(
1915
+ "`codebook_pixel_values` is required to generate `mim_labels` if loss is expected. "
1916
+ "Call `AutoProcessor` with `return_codebook_pixels` set to True"
1917
+ )
1918
+ mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values)
1919
+ # Unimodal MIM Loss
1920
+ # If multimodal embeddings are present, we will calculate MMM loss
1921
+ if self.mim_weight > 0 and image_masked_embeddings is not None and multimodal_masked_embeddings is None:
1922
+ sequence_for_image = image_masked_embeddings
1923
+
1924
+ if mim_labels is not None:
1925
+ mim_labels = self._resize_to_2d(mim_labels)
1926
+ bool_masked_pos = self._resize_to_2d(bool_masked_pos)
1927
+ mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index
1928
+
1929
+ sequence_for_image = sequence_for_image[:, -mim_labels.size(1) :, :]
1930
+ masked_tokens = mim_labels.ne(self.ce_ignore_index)
1931
+ mim_labels_filtered = mim_labels[masked_tokens]
1932
+ sequence_for_image = sequence_for_image[masked_tokens, :]
1933
+ mim_logits = self.mim_head(sequence_for_image)
1934
+ if return_loss:
1935
+ mim_loss = nn.functional.cross_entropy(
1936
+ mim_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
1937
+ )
1938
+ mim_loss *= self.mim_weight
1939
+ else:
1940
+ mim_logits = self.mim_head(sequence_for_image)
1941
+
1942
+ # Unimodal MLM Loss
1943
+ if self.mlm_weight > 0 and text_masked_embeddings is not None and multimodal_masked_embeddings is None:
1944
+ sequence_for_text = text_masked_embeddings
1945
+ if mlm_labels is not None:
1946
+ mlm_labels = self._resize_to_2d(mlm_labels)
1947
+ sequence_for_text = sequence_for_text[:, -mlm_labels.size(1) :, :]
1948
+ masked_tokens = mlm_labels.ne(self.ce_ignore_index)
1949
+ mlm_labels_filtered = mlm_labels[masked_tokens]
1950
+ sequence_for_text = sequence_for_text[masked_tokens, :]
1951
+ mlm_logits = self.mlm_head(sequence_for_text)
1952
+ if return_loss:
1953
+ mlm_loss = nn.functional.cross_entropy(
1954
+ mlm_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
1955
+ )
1956
+ mlm_loss *= self.mlm_weight
1957
+ else:
1958
+ mlm_logits = self.mlm_head(sequence_for_text)
1959
+
1960
+ # ITM Loss
1961
+ if self.itm_weight > 0 and multimodal_masked_embeddings is not None:
1962
+ itm_logits = self.itm_head(multimodal_masked_embeddings)
1963
+
1964
+ if itm_labels is not None:
1965
+ pos_pairs = itm_labels.ne(0)
1966
+ pos_mask = torch.where(pos_pairs.any(), pos_pairs, pos_pairs.new([True]))
1967
+ if return_loss:
1968
+ itm_loss = nn.functional.cross_entropy(itm_logits, itm_labels)
1969
+ itm_loss *= self.itm_weight
1970
+
1971
+ if multimodal_masked_embeddings is not None:
1972
+ multimodal_masked_embeddings = multimodal_masked_embeddings[pos_mask]
1973
+
1974
+ if mlm_labels is not None:
1975
+ mlm_labels = mlm_labels[pos_mask]
1976
+
1977
+ if mim_labels is not None:
1978
+ mim_labels = mim_labels[pos_mask]
1979
+ bool_masked_pos = bool_masked_pos[pos_mask]
1980
+
1981
+ # MMM Image Loss
1982
+ if multimodal_masked_embeddings is not None and self.mmm_image_weight > 0:
1983
+ sequence_for_image = multimodal_masked_embeddings
1984
+ end_index = image_masked_embeddings.size(1) - 1
1985
+ sequence_for_image = sequence_for_image[:, 2 : 2 + end_index, :]
1986
+
1987
+ if mim_labels is not None:
1988
+ mim_labels = self._resize_to_2d(mim_labels)
1989
+ bool_masked_pos = self._resize_to_2d(bool_masked_pos)
1990
+ mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index
1991
+
1992
+ masked_tokens = mim_labels.ne(self.ce_ignore_index)
1993
+ mim_labels_filtered = mim_labels[masked_tokens]
1994
+ sequence_for_image = sequence_for_image[masked_tokens, :]
1995
+ mmm_image_logits = self.mmm_image_head(sequence_for_image)
1996
+ if return_loss:
1997
+ mmm_image_loss = nn.functional.cross_entropy(
1998
+ mmm_image_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
1999
+ )
2000
+ mmm_image_loss *= self.mmm_image_weight
2001
+ else:
2002
+ mmm_image_logits = self.mmm_image_head(sequence_for_image)
2003
+
2004
+ # MMM Text Loss
2005
+ if multimodal_masked_embeddings is not None and self.mmm_text_weight > 0:
2006
+ sequence_for_text = multimodal_masked_embeddings
2007
+ sequence_for_text = sequence_for_text[:, -text_masked_embeddings.size(1) :, :]
2008
+
2009
+ if mlm_labels is not None:
2010
+ mlm_labels = self._resize_to_2d(mlm_labels)
2011
+ masked_tokens = mlm_labels.ne(self.ce_ignore_index)
2012
+ mlm_labels_filtered = mlm_labels[masked_tokens]
2013
+ sequence_for_text = sequence_for_text[masked_tokens, :]
2014
+ mmm_text_logits = self.mmm_text_head(sequence_for_text)
2015
+ if return_loss:
2016
+ mmm_text_loss = nn.functional.cross_entropy(
2017
+ mmm_text_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
2018
+ )
2019
+ mmm_text_loss *= self.mmm_text_weight
2020
+ else:
2021
+ mmm_text_logits = self.mmm_text_head(sequence_for_text)
2022
+
2023
+ # Global Contrastive Loss
2024
+ if image_embeddings is not None and text_embeddings is not None and self.global_contrastive_weight > 0:
2025
+ text_embedding = self.flava.text_projection(text_embeddings[:, 0, :])
2026
+ text_embedding = nn.functional.normalize(text_embedding, dim=-1)
2027
+
2028
+ image_embedding = self.flava.image_projection(image_embeddings[:, 0, :])
2029
+ image_embedding = nn.functional.normalize(image_embedding, dim=-1)
2030
+
2031
+ self.flava.logit_scale.data.clamp_(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX)
2032
+
2033
+ logits_per_image, logits_per_text, gc_labels = self.global_contrastive_head(
2034
+ image_embedding, text_embedding, self.flava.logit_scale
2035
+ )
2036
+
2037
+ # Apply ITM negative mask if any
2038
+ if pos_mask is not None:
2039
+ logits_per_image = logits_per_image[pos_mask]
2040
+ logits_per_text = logits_per_text[pos_mask]
2041
+ gc_labels = gc_labels[pos_mask]
2042
+
2043
+ if return_loss:
2044
+ gc_loss_image = nn.functional.cross_entropy(logits_per_image, gc_labels)
2045
+ gc_loss_text = nn.functional.cross_entropy(logits_per_text, gc_labels)
2046
+ gc_loss = (gc_loss_image + gc_loss_text) / 2
2047
+ gc_loss *= self.global_contrastive_weight
2048
+
2049
+ flava_losses = FlavaLosses(
2050
+ mim=mim_loss,
2051
+ mlm=mlm_loss,
2052
+ itm=itm_loss,
2053
+ global_contrastive=gc_loss,
2054
+ mmm_image=mmm_image_loss,
2055
+ mmm_text=mmm_text_loss,
2056
+ )
2057
+
2058
+ if return_loss and not flava_losses.all_none():
2059
+ total_loss = sum(loss if loss is not None else 0 for loss in flava_losses.values())
2060
+
2061
+ if not return_dict:
2062
+ output = (
2063
+ image_embeddings,
2064
+ flava_output.image_output.to_tuple() if flava_output.image_output is not None else None,
2065
+ text_embeddings,
2066
+ flava_output.text_output.to_tuple() if flava_output.text_output is not None else None,
2067
+ flava_output.multimodal_embeddings,
2068
+ flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None,
2069
+ image_masked_embeddings,
2070
+ flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None,
2071
+ text_masked_embeddings,
2072
+ flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None,
2073
+ multimodal_masked_embeddings,
2074
+ flava_masked_output.multimodal_output.to_tuple()
2075
+ if flava_masked_output.multimodal_output is not None
2076
+ else None,
2077
+ mim_logits,
2078
+ mlm_logits,
2079
+ itm_logits,
2080
+ logits_per_image,
2081
+ logits_per_text,
2082
+ mmm_image_logits,
2083
+ mmm_text_logits,
2084
+ )
2085
+ if return_loss and not flava_losses.all_none():
2086
+ output = (
2087
+ total_loss,
2088
+ flava_losses,
2089
+ ) + output
2090
+
2091
+ # Filter out None as Transformers by default won't handle it
2092
+ return tuple(x for x in output if x is not None)
2093
+
2094
+ return FlavaForPreTrainingOutput(
2095
+ loss=total_loss,
2096
+ loss_info=flava_losses,
2097
+ image_embeddings=image_embeddings,
2098
+ image_output=flava_output.image_output,
2099
+ text_embeddings=text_embeddings,
2100
+ text_output=flava_output.text_output,
2101
+ multimodal_embeddings=flava_output.multimodal_embeddings,
2102
+ multimodal_output=flava_output.multimodal_output,
2103
+ image_masked_embeddings=image_masked_embeddings,
2104
+ image_masked_output=flava_masked_output.image_output,
2105
+ text_masked_embeddings=text_masked_embeddings,
2106
+ text_masked_output=flava_masked_output.text_output,
2107
+ multimodal_masked_embeddings=multimodal_masked_embeddings,
2108
+ multimodal_masked_output=flava_masked_output.multimodal_output,
2109
+ mim_logits=mim_logits,
2110
+ mlm_logits=mlm_logits,
2111
+ itm_logits=itm_logits,
2112
+ contrastive_logits_per_image=logits_per_image,
2113
+ contrastive_logits_per_text=logits_per_text,
2114
+ mmm_image_logits=mmm_image_logits,
2115
+ mmm_text_logits=mmm_text_logits,
2116
+ )
2117
+
2118
+
2119
+ __all__ = [
2120
+ "FlavaForPreTraining",
2121
+ "FlavaImageCodebook",
2122
+ "FlavaImageModel",
2123
+ "FlavaModel",
2124
+ "FlavaMultimodalModel",
2125
+ "FlavaPreTrainedModel",
2126
+ "FlavaTextModel",
2127
+ ]
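A hedged end-to-end sketch of the pretraining forward defined above. Checkpoint and processor names follow the docstring example; `return_image_mask=True` is assumed to emit `bool_masked_pos`, and since no `input_ids_masked`, `mlm_labels`, or `itm_labels` are passed, the model falls back as described in the warning above and only a subset of the losses is populated.

```python
# Hedged end-to-end sketch of the pretraining forward pass.
# Assumptions: `return_image_mask=True` yields `bool_masked_pos`; no explicit
# `input_ids_masked`/`mlm_labels`/`itm_labels` are passed, so only some losses are set.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, FlavaForPreTraining

model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
processor = AutoProcessor.from_pretrained("facebook/flava-full")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
    images=[image],
    text=["a photo of a cat"],
    return_image_mask=True,       # image patch mask used for the MIM/MMM objectives
    return_codebook_pixels=True,  # lets the model derive `mim_labels` via its codebook
    padding=True,
    max_length=77,
    return_tensors="pt",
)

with torch.no_grad():
    outputs = model(**inputs, return_loss=True)

# `loss` is the weighted sum of the individual objectives; `loss_info` holds each term.
print(outputs.loss, outputs.loss_info)
```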
docs/transformers/build/lib/transformers/models/flava/processing_flava.py ADDED
@@ -0,0 +1,168 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Image/Text processor class for FLAVA
17
+ """
18
+
19
+ import warnings
20
+ from typing import List, Optional, Union
21
+
22
+ from ...image_utils import ImageInput
23
+ from ...processing_utils import ProcessorMixin
24
+ from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
25
+ from ...utils import TensorType
26
+
27
+
28
+ class FlavaProcessor(ProcessorMixin):
29
+ r"""
30
+ Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.
31
+
32
+ [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the
33
+ [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
34
+
35
+ Args:
36
+ image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input.
37
+ tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input.
38
+ """
39
+
40
+ attributes = ["image_processor", "tokenizer"]
41
+ image_processor_class = "FlavaImageProcessor"
42
+ tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
43
+
44
+ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
45
+ feature_extractor = None
46
+ if "feature_extractor" in kwargs:
47
+ warnings.warn(
48
+ "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
49
+ " instead.",
50
+ FutureWarning,
51
+ )
52
+ feature_extractor = kwargs.pop("feature_extractor")
53
+
54
+ image_processor = image_processor if image_processor is not None else feature_extractor
55
+ if image_processor is None:
56
+ raise ValueError("You need to specify an `image_processor`.")
57
+ if tokenizer is None:
58
+ raise ValueError("You need to specify a `tokenizer`.")
59
+
60
+ super().__init__(image_processor, tokenizer)
61
+ self.current_processor = self.image_processor
62
+
63
+ def __call__(
64
+ self,
65
+ images: Optional[ImageInput] = None,
66
+ text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
67
+ add_special_tokens: bool = True,
68
+ padding: Union[bool, str, PaddingStrategy] = False,
69
+ truncation: Union[bool, str, TruncationStrategy] = False,
70
+ max_length: Optional[int] = None,
71
+ stride: int = 0,
72
+ pad_to_multiple_of: Optional[int] = None,
73
+ return_image_mask: Optional[bool] = None,
74
+ return_codebook_pixels: Optional[bool] = None,
75
+ return_token_type_ids: Optional[bool] = None,
76
+ return_attention_mask: Optional[bool] = None,
77
+ return_overflowing_tokens: bool = False,
78
+ return_special_tokens_mask: bool = False,
79
+ return_offsets_mapping: bool = False,
80
+ return_length: bool = False,
81
+ verbose: bool = True,
82
+ return_tensors: Optional[Union[str, TensorType]] = None,
83
+ **kwargs,
84
+ ):
85
+ """
86
+ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and
87
+ [`BertTokenizerFast.__call__`] to prepare text for the model.
88
+
89
+ Please refer to the docstring of the above two methods for more information.
90
+ """
91
+
92
+ if text is None and images is None:
93
+ raise ValueError("You have to specify either text or images. They cannot both be None.")
94
+
95
+ if text is not None:
96
+ encoding = self.tokenizer(
97
+ text=text,
98
+ add_special_tokens=add_special_tokens,
99
+ padding=padding,
100
+ truncation=truncation,
101
+ max_length=max_length,
102
+ stride=stride,
103
+ pad_to_multiple_of=pad_to_multiple_of,
104
+ return_token_type_ids=return_token_type_ids,
105
+ return_attention_mask=return_attention_mask,
106
+ return_overflowing_tokens=return_overflowing_tokens,
107
+ return_special_tokens_mask=return_special_tokens_mask,
108
+ return_offsets_mapping=return_offsets_mapping,
109
+ return_length=return_length,
110
+ verbose=verbose,
111
+ return_tensors=return_tensors,
112
+ **kwargs,
113
+ )
114
+ if images is not None:
115
+ image_features = self.image_processor(
116
+ images,
117
+ return_image_mask=return_image_mask,
118
+ return_codebook_pixels=return_codebook_pixels,
119
+ return_tensors=return_tensors,
120
+ **kwargs,
121
+ )
122
+
123
+ if text is not None and images is not None:
124
+ encoding.update(image_features)
125
+ return encoding
126
+ elif text is not None:
127
+ return encoding
128
+ else:
129
+ return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
130
+
131
+ def batch_decode(self, *args, **kwargs):
132
+ """
133
+ This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
134
+ refer to the docstring of this method for more information.
135
+ """
136
+ return self.tokenizer.batch_decode(*args, **kwargs)
137
+
138
+ def decode(self, *args, **kwargs):
139
+ """
140
+ This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
141
+ the docstring of this method for more information.
142
+ """
143
+ return self.tokenizer.decode(*args, **kwargs)
144
+
145
+ @property
146
+ def model_input_names(self):
147
+ tokenizer_input_names = self.tokenizer.model_input_names
148
+ image_processor_input_names = self.image_processor.model_input_names
149
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
150
+
151
+ @property
152
+ def feature_extractor_class(self):
153
+ warnings.warn(
154
+ "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
155
+ FutureWarning,
156
+ )
157
+ return self.image_processor_class
158
+
159
+ @property
160
+ def feature_extractor(self):
161
+ warnings.warn(
162
+ "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
163
+ FutureWarning,
164
+ )
165
+ return self.image_processor
166
+
167
+
168
+ __all__ = ["FlavaProcessor"]
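A hedged usage sketch for the processor above. The checkpoint name follows the docstrings elsewhere in this diff; the exact keys in the returned batch depend on the tokenizer and image processor configuration.

```python
# Hedged usage sketch for FlavaProcessor; the checkpoint name follows the docstrings above.
import requests
from PIL import Image
from transformers import FlavaProcessor

processor = FlavaProcessor.from_pretrained("facebook/flava-full")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Text-only, image-only, or joint calls are all valid; a joint call merges both outputs.
batch = processor(images=[image], text=["a photo of a cat"], padding=True, return_tensors="pt")
print(sorted(batch.keys()))  # tokenizer keys plus image processor keys, e.g. pixel_values
```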