Student0809 committed on
Commit 3b47bbc · verified · 1 Parent(s): 33b613a

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .dev_scripts/ci_container_test.sh +41 -0
  2. 4JOB_TRAIN.jsonl +0 -0
  3. MANIFEST.in +5 -0
  4. Makefile +25 -0
  5. README.md +423 -0
  6. README_CN.md +413 -0
  7. checkMissing.py +86 -0
  8. clean_transcripts.py +95 -0
  9. count_audios.py +69 -0
  10. count_folders-Copy1.py +122 -0
  11. count_folders.py +122 -0
  12. dialogue_length_distribution.png +0 -0
  13. dialogue_length_ranges.png +0 -0
  14. docs/transformers/build/lib/transformers/models/sam/processing_sam.py +311 -0
  15. docs/transformers/build/lib/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py +396 -0
  16. docs/transformers/build/lib/transformers/models/seamless_m4t/modeling_seamless_m4t.py +0 -0
  17. docs/transformers/build/lib/transformers/models/seamless_m4t/processing_seamless_m4t.py +120 -0
  18. docs/transformers/build/lib/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py +450 -0
  19. docs/transformers/build/lib/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py +404 -0
  20. docs/transformers/build/lib/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +0 -0
  21. docs/transformers/build/lib/transformers/models/segformer/__init__.py +30 -0
  22. docs/transformers/build/lib/transformers/models/segformer/configuration_segformer.py +171 -0
  23. docs/transformers/build/lib/transformers/models/segformer/convert_segformer_original_to_pytorch.py +387 -0
  24. docs/transformers/build/lib/transformers/models/segformer/feature_extraction_segformer.py +38 -0
  25. docs/transformers/build/lib/transformers/models/segformer/image_processing_segformer.py +484 -0
  26. docs/transformers/build/lib/transformers/models/segformer/modeling_segformer.py +840 -0
  27. docs/transformers/build/lib/transformers/models/segformer/modeling_tf_segformer.py +1045 -0
  28. docs/transformers/build/lib/transformers/models/seggpt/__init__.py +28 -0
  29. docs/transformers/build/lib/transformers/models/seggpt/configuration_seggpt.py +143 -0
  30. docs/transformers/build/lib/transformers/models/seggpt/convert_seggpt_to_hf.py +221 -0
  31. docs/transformers/build/lib/transformers/models/seggpt/image_processing_seggpt.py +618 -0
  32. docs/transformers/build/lib/transformers/models/seggpt/modeling_seggpt.py +1031 -0
  33. docs/transformers/build/lib/transformers/models/sew/__init__.py +27 -0
  34. docs/transformers/build/lib/transformers/models/sew/configuration_sew.py +256 -0
  35. docs/transformers/build/lib/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py +305 -0
  36. docs/transformers/build/lib/transformers/models/sew/modeling_sew.py +1498 -0
  37. docs/transformers/build/lib/transformers/models/sew_d/__init__.py +27 -0
  38. docs/transformers/build/lib/transformers/models/sew_d/configuration_sew_d.py +291 -0
  39. docs/transformers/build/lib/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py +317 -0
  40. docs/transformers/build/lib/transformers/models/sew_d/modeling_sew_d.py +1748 -0
  41. docs/transformers/build/lib/transformers/models/shieldgemma2/__init__.py +28 -0
  42. docs/transformers/build/lib/transformers/models/shieldgemma2/configuration_shieldgemma2.py +120 -0
  43. docs/transformers/build/lib/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py +470 -0
  44. docs/transformers/build/lib/transformers/models/shieldgemma2/modeling_shieldgemma2.py +220 -0
  45. docs/transformers/build/lib/transformers/models/shieldgemma2/processing_shieldgemma2.py +195 -0
  46. docs/transformers/build/lib/transformers/models/siglip/__init__.py +31 -0
  47. docs/transformers/build/lib/transformers/models/siglip/configuration_siglip.py +269 -0
  48. docs/transformers/build/lib/transformers/models/siglip/convert_siglip_to_hf.py +533 -0
  49. docs/transformers/build/lib/transformers/models/siglip/image_processing_siglip.py +244 -0
  50. docs/transformers/build/lib/transformers/models/siglip/image_processing_siglip_fast.py +41 -0
.dev_scripts/ci_container_test.sh ADDED
@@ -0,0 +1,41 @@
+ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
+     # pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+     pip install -r requirements/tests.txt -i https://mirrors.aliyun.com/pypi/simple/
+     git config --global --add safe.directory /ms-swift
+     git config --global user.email tmp
+     git config --global user.name tmp.com
+
+     # linter test
+     # use the internal project for pre-commit due to the network problem
+     if [ `git remote -v | grep alibaba | wc -l` -gt 1 ]; then
+         pre-commit run -c .pre-commit-config_local.yaml --all-files
+         if [ $? -ne 0 ]; then
+             echo "Linter test failed; please run 'pre-commit run --all-files' to check."
+             echo "From the repository folder:"
+             echo "Run 'pip install -r requirements/tests.txt' to install the test dependencies."
+             echo "Run 'pre-commit install' to install the pre-commit hooks."
+             echo "Finally, run the linter with 'pre-commit run --all-files'."
+             echo "Ensure there is no failure!"
+             exit 1
+         fi
+     fi
+
+     pip install -r requirements/framework.txt -U -i https://mirrors.aliyun.com/pypi/simple/
+     pip install diffusers decord einops -U -i https://mirrors.aliyun.com/pypi/simple/
+     pip install autoawq -U --no-deps
+
+     # test with install
+     pip install .
+     pip install auto_gptq bitsandbytes deepspeed -U -i https://mirrors.aliyun.com/pypi/simple/
+ else
+     echo "Running case in release image, run case directly!"
+ fi
+ # Remove the torch_extensions folder to avoid CI hangs.
+ rm -rf ~/.cache/torch_extensions
+ if [ $# -eq 0 ]; then
+     ci_command="python tests/run.py --subprocess"
+ else
+     ci_command="$@"
+ fi
+ echo "Running case with command: $ci_command"
+ $ci_command
4JOB_TRAIN.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
MANIFEST.in ADDED
@@ -0,0 +1,5 @@
+ recursive-include swift/utils *.py
+ recursive-include swift/llm/dataset/data *.*
+ recursive-include swift/llm/ds_config *.json
+ recursive-include requirements *.txt
+ recursive-include swift/plugin/loss_scale/config *.json
Makefile ADDED
@@ -0,0 +1,25 @@
+ WHL_BUILD_DIR :=package
+ DOC_BUILD_DIR :=docs/build/
+
+ # default rule
+ default: whl docs
+
+ .PHONY: docs
+ docs:
+ 	bash .dev_scripts/build_docs.sh
+
+ .PHONY: linter
+ linter:
+ 	bash .dev_scripts/linter.sh
+
+ .PHONY: test
+ test:
+ 	bash .dev_scripts/citest.sh
+
+ .PHONY: whl
+ whl:
+ 	python setup.py sdist bdist_wheel
+
+ .PHONY: clean
+ clean:
+ 	rm -rf $(WHL_BUILD_DIR) $(DOC_BUILD_DIR)
README.md ADDED
@@ -0,0 +1,423 @@
1
+ # SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
2
+
3
+ <p align="center">
4
+ <br>
5
+ <img src="asset/banner.png"/>
6
+ <br>
7
+ <p>
8
+ <p align="center">
9
+ <a href="https://modelscope.cn/home">ModelScope Community Website</a>
10
+ <br>
11
+ <a href="README_CN.md">中文</a> &nbsp | &nbsp English &nbsp
12
+ </p>
13
+
14
+ <p align="center">
15
+ <img src="https://img.shields.io/badge/python-3.10-5be.svg">
16
+ <img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
17
+ <a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.19-5D91D4.svg"></a>
18
+ <a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
19
+ <a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
20
+ <a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
21
+ <a href="https://github.com/modelscope/swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
22
+ </p>
23
+
24
+ <p align="center">
25
+ <a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
26
+ </p>
27
+
28
+ <p align="center">
29
+ <a href="https://arxiv.org/abs/2408.05517">Paper</a> &nbsp | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a> &nbsp | &nbsp <a href="https://swift.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp
30
+ </p>
31
+
32
+ ## 📖 Table of Contents
33
+ - [Groups](#-Groups)
34
+ - [Introduction](#-introduction)
35
+ - [News](#-news)
36
+ - [Installation](#%EF%B8%8F-installation)
37
+ - [Quick Start](#-quick-Start)
38
+ - [Usage](#-Usage)
39
+ - [License](#-License)
40
+ - [Citation](#-citation)
41
+
42
+
43
+ ## ☎ Groups
44
+
45
+ You can contact us and communicate with us by adding our group:
46
+
47
+
48
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group
49
+ :-------------------------:|:-------------------------:
50
+ <img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
51
+
52
+
53
+ ## 📝 Introduction
54
+ 🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 500+ large models and 200+ multi-modal large models. These large language models (LLMs) include models such as Qwen3, Qwen3-MoE, Qwen2.5, InternLM3, GLM4, Mistral, DeepSeek-R1, Yi1.5, TeleChat2, Baichuan2, and Gemma2. The multi-modal LLMs include models such as Qwen2.5-VL, Qwen2-Audio, Llama4, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2.
55
+
56
+ 🍔 Additionally, ms-swift incorporates the latest training technologies, including lightweight techniques such as LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger, as well as human alignment training methods like DPO, GRPO, RM, PPO, KTO, CPO, SimPO, and ORPO. ms-swift supports acceleration of inference, evaluation, and deployment modules using vLLM and LMDeploy, and it supports model quantization with technologies like GPTQ, AWQ, and BNB. Furthermore, ms-swift offers a Gradio-based Web UI and a wealth of best practices.
57
+
58
+ **Why choose ms-swift?**
59
+
60
+ - 🍎 **Model Types**: Supports 500+ pure text large models, **200+ multi-modal large models**, as well as All-to-All multi-modal models, sequence classification models, and embedding models, **covering the entire process from training to deployment**.
61
+ - **Dataset Types**: Comes with 150+ pre-training, fine-tuning, human alignment, multi-modal datasets, and supports custom datasets.
62
+ - **Hardware Support**: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, MPS, etc.
63
+ - 🍊 **Lightweight Training**: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel.
64
+ - **Distributed Training**: Supports distributed data parallel (DDP), device_map simple model parallelism, DeepSpeed ZeRO2/ZeRO3, FSDP, and other distributed training techniques.
65
+ - **Quantization Training**: Supports training quantized models like BNB, AWQ, GPTQ, AQLM, HQQ, EETQ.
66
+ - **RLHF Training**: Supports human alignment training methods such as DPO, GRPO, RM, PPO, KTO, CPO, SimPO, ORPO for both pure text and multi-modal large models.
67
+ - 🍓 **Multi-Modal Training**: Supports training on different modalities like images, videos, and audio, for tasks like VQA, captioning, OCR, and grounding.
68
+ - **Interface Training**: Provides capabilities for training, inference, evaluation, quantization through an interface, completing the whole large model pipeline.
69
+ - **Plugin and Extension**: Supports custom model and dataset extensions, as well as customization of components like loss, metric, trainer, loss-scale, callback, optimizer.
70
+ - 🍉 **Toolbox Capabilities**: Offers not only training support for large models and multi-modal large models but also covers the entire process of inference, evaluation, quantization, and deployment.
71
+ - **Inference Acceleration**: Supports inference acceleration engines like PyTorch, vLLM, LmDeploy, and provides OpenAI API for accelerating inference, deployment, and evaluation modules.
72
+ - **Model Evaluation**: Uses EvalScope as the evaluation backend and supports evaluation on 100+ datasets for both pure text and multi-modal models.
73
+ - **Model Quantization**: Supports AWQ, GPTQ, and BNB quantized exports, with models that can use vLLM/LmDeploy for inference acceleration and continue training.
74
+
75
+
76
+ ## 🎉 News
77
+ - 🎁 2025.05.11: GRPO now supports custom processing logic for reward models. See the GenRM example [here](./docs/source_en/Instruction/GRPO.md#customized-reward-models) .
78
+ - 🎁 2025.04.15: The ms-swift paper has been accepted by AAAI 2025. You can find the paper at [this link](https://ojs.aaai.org/index.php/AAAI/article/view/35383).
79
+ - 🎁 2025.03.23: Multi-round GRPO is now supported for training multi-turn dialogue scenarios (e.g., agent tool calling). Please refer to the [training script](https://idealab.alibaba-inc.com/examples/train/grpo/internal/train_multi_round.sh).
80
+ - 🎁 2025.03.16: Support for Megatron's parallel training techniques is now available. Please see the [Megatron-SWIFT training documentation](https://swift.readthedocs.io/zh-cn/latest/Instruction/Megatron-SWIFT训练.html).
81
+ - 🎁 2025.03.15: Fine-tuning of embedding models for both pure text and multimodal models is supported. Please check the [training script](https://idealab.alibaba-inc.com/examples/train/embedding).
82
+ - 🎁 2025.03.05: The hybrid mode for GRPO is supported, with a script for training a 72B model on 4 GPUs (4*80G) available [here](https://idealab.alibaba-inc.com/examples/train/grpo/internal/train_72b_4gpu.sh). Tensor parallelism with vllm is also supported, with the training script available [here](https://idealab.alibaba-inc.com/examples/train/grpo/internal/multi_gpu_mp_colocate.sh).
83
+ - 🎁 2025.02.21: The GRPO algorithm now supports LMDeploy, with the training script available [here](https://idealab.alibaba-inc.com/examples/train/grpo/internal/full_lmdeploy.sh). Additionally, the performance of the GRPO algorithm has been tested, achieving a training speed increase of up to 300% using various tricks. Please check the WanDB table [here](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz).
84
+ - 🎁 2025.02.21: The `swift sample` command is now supported. The reinforcement fine-tuning script can be found [here](https://idealab.alibaba-inc.com/docs/source/Instruction/强化微调.md), and the large model API distillation sampling script is available [here](https://idealab.alibaba-inc.com/examples/sampler/distill/distill.sh).
85
+ - 🔥 2025.02.12: Support for the GRPO (Group Relative Policy Optimization) training algorithm has been added. Documentation is available [here](https://idealab.alibaba-inc.com/docs/source/Instruction/GRPO.md).
86
+ - 🎁 2024.12.04: Major update to **ms-swift 3.0**. Please refer to the [release notes and changes](https://swift.readthedocs.io/zh-cn/latest/Instruction/ReleaseNote3.0.html).
87
+ <details><summary>More</summary>
88
+
89
+ - 🎉 2024.08.12: The ms-swift paper has been published on arXiv and can be read [here](https://arxiv.org/abs/2408.05517).
90
+ - 🔥 2024.08.05: Support for using [evalscope](https://github.com/modelscope/evalscope/) as a backend for evaluating large models and multimodal models.
91
+ - 🔥 2024.07.29: Support for using [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) to accelerate inference for large models and multimodal models. When performing infer/deploy/eval, you can specify `--infer_backend vllm/lmdeploy`.
92
+ - 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO.
93
+ - 🔥 2024.02.01: Support for Agent training! The training algorithm is derived from [this paper](https://arxiv.org/pdf/2309.00986.pdf).
94
+ </details>
95
+
96
+ ## 🛠️ Installation
97
+ To install using pip:
98
+ ```shell
99
+ pip install ms-swift -U
100
+ ```
101
+
102
+ To install from source:
103
+ ```shell
104
+ # pip install git+https://github.com/modelscope/ms-swift.git
105
+
106
+ git clone https://github.com/modelscope/ms-swift.git
107
+ cd ms-swift
108
+ pip install -e .
109
+ ```
110
+
111
+ Running Environment:
112
+
113
+ | | Range | Recommended | Notes |
114
+ | ------------ |--------------| ----------- | ----------------------------------------- |
115
+ | python | >=3.9 | 3.10 | |
116
+ | cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
117
+ | torch | >=2.0 | | |
118
+ | transformers | >=4.33 | 4.51 | |
119
+ | modelscope | >=1.23 | | |
120
+ | peft | >=0.11,<0.16 | ||
121
+ | trl | >=0.13,<0.18 | 0.17 |RLHF|
122
+ | deepspeed | >=0.14 | 0.14.5 | Training |
123
+ | vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
124
+ | lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
125
+ | evalscope | >=0.11 | | Evaluation |
126
+
127
+ For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
128
+
129
+
130
+ ## 🚀 Quick Start
131
+
132
+ 10 minutes of self-cognition fine-tuning of Qwen2.5-7B-Instruct on a single 3090 GPU:
133
+
134
+ ### Command Line Interface
135
+
136
+ ```shell
137
+ # 22GB
138
+ CUDA_VISIBLE_DEVICES=0 \
139
+ swift sft \
140
+ --model Qwen/Qwen2.5-7B-Instruct \
141
+ --train_type lora \
142
+ --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
143
+ 'AI-ModelScope/alpaca-gpt4-data-en#500' \
144
+ 'swift/self-cognition#500' \
145
+ --torch_dtype bfloat16 \
146
+ --num_train_epochs 1 \
147
+ --per_device_train_batch_size 1 \
148
+ --per_device_eval_batch_size 1 \
149
+ --learning_rate 1e-4 \
150
+ --lora_rank 8 \
151
+ --lora_alpha 32 \
152
+ --target_modules all-linear \
153
+ --gradient_accumulation_steps 16 \
154
+ --eval_steps 50 \
155
+ --save_steps 50 \
156
+ --save_total_limit 2 \
157
+ --logging_steps 5 \
158
+ --max_length 2048 \
159
+ --output_dir output \
160
+ --system 'You are a helpful assistant.' \
161
+ --warmup_ratio 0.05 \
162
+ --dataloader_num_workers 4 \
163
+ --model_author swift \
164
+ --model_name swift-robot
165
+ ```
166
+
167
+ Tips:
168
+
169
+ - If you want to train with a custom dataset, you can refer to [this guide](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) to organize your dataset format and specify `--dataset <dataset_path>`.
170
+ - The `--model_author` and `--model_name` parameters are only effective when the dataset includes `swift/self-cognition`.
171
+ - To train with a different model, simply modify `--model <model_id/model_path>`.
172
+ - By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
173
+
174
+ After training is complete, use the following command to infer with the trained weights:
175
+
176
+ - Here, `--adapters` should be replaced with the last checkpoint folder generated during training. Since the adapters folder contains the training parameter file `args.json`, there is no need to specify `--model`, `--system` separately; Swift will automatically read these parameters. To disable this behavior, you can set `--load_args false`.
177
+
178
+ ```shell
179
+ # Using an interactive command line for inference.
180
+ CUDA_VISIBLE_DEVICES=0 \
181
+ swift infer \
182
+ --adapters output/vx-xxx/checkpoint-xxx \
183
+ --stream true \
184
+ --temperature 0 \
185
+ --max_new_tokens 2048
186
+
187
+ # merge-lora and use vLLM for inference acceleration
188
+ CUDA_VISIBLE_DEVICES=0 \
189
+ swift infer \
190
+ --adapters output/vx-xxx/checkpoint-xxx \
191
+ --stream true \
192
+ --merge_lora true \
193
+ --infer_backend vllm \
194
+ --max_model_len 8192 \
195
+ --temperature 0 \
196
+ --max_new_tokens 2048
197
+ ```
198
+
199
+ Finally, use the following command to push the model to ModelScope:
200
+
201
+ ```shell
202
+ CUDA_VISIBLE_DEVICES=0 \
203
+ swift export \
204
+ --adapters output/vx-xxx/checkpoint-xxx \
205
+ --push_to_hub true \
206
+ --hub_model_id '<your-model-id>' \
207
+ --hub_token '<your-sdk-token>' \
208
+ --use_hf false
209
+ ```
210
+
211
+
212
+ ### Web-UI
213
+ The Web-UI is a **zero-threshold** training and deployment interface solution based on Gradio interface technology. For more details, you can check [here](https://swift.readthedocs.io/en/latest/GetStarted/Web-UI.html).
214
+
215
+ ```shell
216
+ SWIFT_UI_LANG=en swift web-ui
217
+ ```
218
+
219
+ ![image.png](./docs/resources/web-ui-en.jpg)
220
+
221
+ ### Using Python
222
+
223
+ ms-swift also supports training and inference using Python. Below is pseudocode for training and inference. For more details, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb).
224
+
225
+ Training:
226
+
227
+ ```python
228
+ # Retrieve the model and template, and add a trainable LoRA module
229
+ model, tokenizer = get_model_tokenizer(model_id_or_path, ...)
230
+ template = get_template(model.model_meta.template, tokenizer, ...)
231
+ model = Swift.prepare_model(model, lora_config)
232
+
233
+ # Download and load the dataset, and encode the text into tokens
234
+ train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
235
+ train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
236
+ val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
237
+
238
+ # Train the model
239
+ trainer = Seq2SeqTrainer(
240
+ model=model,
241
+ args=training_args,
242
+ data_collator=template.data_collator,
243
+ train_dataset=train_dataset,
244
+ eval_dataset=val_dataset,
245
+ template=template,
246
+ )
247
+ trainer.train()
248
+ ```
249
+ Inference:
250
+
251
+ ```python
252
+ # Perform inference using the native PyTorch engine
253
+ engine = PtEngine(model_id_or_path, adapters=[lora_checkpoint])
254
+ infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
255
+ request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
256
+
257
+ resp_list = engine.infer([infer_request], request_config)
258
+ print(f'response: {resp_list[0].choices[0].message.content}')
259
+ ```
260
+
261
+ ## ✨ Usage
262
+ Here is a minimal example of training to deployment using ms-swift. For more details, you can check the [examples](https://github.com/modelscope/ms-swift/tree/main/examples).
263
+
264
+ - If you want to use other models or datasets (including multimodal models and datasets), you only need to modify `--model` to specify the corresponding model's ID or path, and modify `--dataset` to specify the corresponding dataset's ID or path.
265
+ - By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
266
+
267
+ | Useful Links |
268
+ | ------ |
269
+ | [🔥Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html) |
270
+ | [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) |
271
+ | [Custom Models](https://swift.readthedocs.io/en/latest/Customization/Custom-model.html), [🔥Custom Datasets](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) |
272
+ | [LLM Tutorial](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
273
+
274
+ ### Training
275
+
276
+ Supported Training Methods:
277
+
278
+ | Method | Full-Parameter | LoRA | QLoRA | Deepspeed | Multi-Node | Multi-Modal |
279
+ |------------------------------------|--------------------------------------------------------------|---------------------------------------------------------------------------------------------|--------------------------------------------------------------|--------------------------------------------------------------|--------------------------------------------------------------|----------------------------------------------------------------------------------------------|
280
+ | Pre-training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain/train.sh) | ✅ | ✅ | ✅ | ✅ | ✅ |
281
+ | Instruction Supervised Fine-tuning | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
282
+ | DPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo.sh) |
283
+ | GRPO Training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/grpo_zero2.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/multi_node) | ✅ |
284
+ | Reward Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ |
285
+ | PPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | ❌ |
286
+ | KTO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
287
+ | CPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ |
288
+ | SimPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ |
289
+ | ORPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ |
290
+ | Classification Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_5/sft.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_vl/sft.sh) |
291
+ | Embedding Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gte.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh) |
292
+
293
+
294
+
295
+ Pre-training:
296
+ ```shell
297
+ # 8*A100
298
+ NPROC_PER_NODE=8 \
299
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
300
+ swift pt \
301
+ --model Qwen/Qwen2.5-7B \
302
+ --dataset swift/chinese-c4 \
303
+ --streaming true \
304
+ --train_type full \
305
+ --deepspeed zero2 \
306
+ --output_dir output \
307
+ --max_steps 10000 \
308
+ ...
309
+ ```
310
+
311
+ Fine-tuning:
312
+ ```shell
313
+ CUDA_VISIBLE_DEVICES=0 swift sft \
314
+ --model Qwen/Qwen2.5-7B-Instruct \
315
+ --dataset AI-ModelScope/alpaca-gpt4-data-en \
316
+ --train_type lora \
317
+ --output_dir output \
318
+ ...
319
+ ```
320
+
321
+ RLHF:
322
+ ```shell
323
+ CUDA_VISIBLE_DEVICES=0 swift rlhf \
324
+ --rlhf_type dpo \
325
+ --model Qwen/Qwen2.5-7B-Instruct \
326
+ --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
327
+ --train_type lora \
328
+ --output_dir output \
329
+ ...
330
+ ```
331
+
332
+
333
+ ### Inference
334
+ ```shell
335
+ CUDA_VISIBLE_DEVICES=0 swift infer \
336
+ --model Qwen/Qwen2.5-7B-Instruct \
337
+ --stream true \
338
+ --infer_backend pt \
339
+ --max_new_tokens 2048
340
+
341
+ # LoRA
342
+ CUDA_VISIBLE_DEVICES=0 swift infer \
343
+ --model Qwen/Qwen2.5-7B-Instruct \
344
+ --adapters swift/test_lora \
345
+ --stream true \
346
+ --infer_backend pt \
347
+ --temperature 0 \
348
+ --max_new_tokens 2048
349
+ ```
350
+
351
+ ### Interface Inference
352
+ ```shell
353
+ CUDA_VISIBLE_DEVICES=0 swift app \
354
+ --model Qwen/Qwen2.5-7B-Instruct \
355
+ --stream true \
356
+ --infer_backend pt \
357
+ --max_new_tokens 2048
358
+ ```
359
+
360
+ ### Deployment
361
+ ```shell
362
+ CUDA_VISIBLE_DEVICES=0 swift deploy \
363
+ --model Qwen/Qwen2.5-7B-Instruct \
364
+ --infer_backend vllm
365
+ ```
366
+
367
+ ### Sampling
368
+ ```shell
369
+ CUDA_VISIBLE_DEVICES=0 swift sample \
370
+ --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
371
+ --sampler_engine pt \
372
+ --num_return_sequences 5 \
373
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh#5
374
+ ```
375
+
376
+ ### Evaluation
377
+ ```shell
378
+ CUDA_VISIBLE_DEVICES=0 swift eval \
379
+ --model Qwen/Qwen2.5-7B-Instruct \
380
+ --infer_backend lmdeploy \
381
+ --eval_backend OpenCompass \
382
+ --eval_dataset ARC_c
383
+ ```
384
+
385
+ ### Quantization
386
+ ```shell
387
+ CUDA_VISIBLE_DEVICES=0 swift export \
388
+ --model Qwen/Qwen2.5-7B-Instruct \
389
+ --quant_bits 4 --quant_method awq \
390
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh \
391
+ --output_dir Qwen2.5-7B-Instruct-AWQ
392
+ ```
393
+
394
+ ### Push Model
395
+ ```shell
396
+ swift export \
397
+ --model <model-path> \
398
+ --push_to_hub true \
399
+ --hub_model_id '<model-id>' \
400
+ --hub_token '<sdk-token>'
401
+ ```
402
+
403
+ ## 🏛 License
404
+
405
+ This framework is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). For models and datasets, please refer to the original resource page and follow the corresponding License.
406
+
407
+ ## 📎 Citation
408
+
409
+ ```bibtex
410
+ @misc{zhao2024swiftascalablelightweightinfrastructure,
411
+ title={SWIFT: A Scalable lightWeight Infrastructure for Fine-Tuning},
412
+ author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
413
+ year={2024},
414
+ eprint={2408.05517},
415
+ archivePrefix={arXiv},
416
+ primaryClass={cs.CL},
417
+ url={https://arxiv.org/abs/2408.05517},
418
+ }
419
+ ```
420
+
421
+ ## Star History
422
+
423
+ [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/swift&type=Date)](https://star-history.com/#modelscope/ms-swift&Date)
README_CN.md ADDED
@@ -0,0 +1,413 @@
1
+ # SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
2
+
3
+ <p align="center">
4
+ <br>
5
+ <img src="asset/banner.png"/>
6
+ <br>
7
+ <p>
8
+ <p align="center">
9
+ <a href="https://modelscope.cn/home">魔搭社区官网</a>
10
+ <br>
11
+ 中文&nbsp | &nbsp<a href="README.md">English</a>&nbsp
12
+ </p>
13
+
14
+
15
+ <p align="center">
16
+ <img src="https://img.shields.io/badge/python-3.10-5be.svg">
17
+ <img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
18
+ <a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.19-5D91D4.svg"></a>
19
+ <a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
20
+ <a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
21
+ <a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
22
+ <a href="https://github.com/modelscope/swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
23
+ </p>
24
+
25
+ <p align="center">
26
+ <a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
27
+ </p>
28
+
29
+ <p align="center">
30
+ <a href="https://arxiv.org/abs/2408.05517">论文</a> &nbsp | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a> &nbsp | &nbsp <a href="https://swift.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp
31
+ </p>
32
+
33
+ ## 📖 目录
34
+ - [用户群](#-用户群)
35
+ - [简介](#-简介)
36
+ - [新闻](#-新闻)
37
+ - [安装](#%EF%B8%8F-安装)
38
+ - [快速开始](#-快速开始)
39
+ - [如何使用](#-如何使用)
40
+ - [License](#-license)
41
+ - [引用](#-引用)
42
+
43
+ ## ☎ 用户群
44
+
45
+ 请扫描下面的二维码来加入我们的交流群:
46
+
47
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | 微信群
48
+ :-------------------------:|:-------------------------:
49
+ <img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
50
+
51
+ ## 📝 简介
52
+ 🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持500+大模型与200+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen3、Qwen3-MoE、Qwen2.5、InternLM3、GLM4、Mistral、DeepSeek-R1、Yi1.5、TeleChat2、Baichuan2、Gemma2等模型,多模态大模型包括:Qwen2.5-VL、Qwen2-Audio、Llama4、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。
53
+
54
+ 🍔 除此之外,ms-swift汇集了最新的训练技术,包括LoRA、QLoRA、Llama-Pro、LongLoRA、GaLore、Q-GaLore、LoRA+、LISA、DoRA、FourierFt、ReFT、UnSloth、和Liger等轻量化训练技术,以及DPO、GRPO、RM、PPO、KTO、CPO、SimPO、ORPO等人类对齐训练方法。ms-swift支持使用vLLM和LMDeploy对推理、评测和部署模块进行加速,并支持使用GPTQ、AWQ、BNB等技术对大模型进行量化。ms-swift还提供了基于Gradio的Web-UI界面及丰富的最佳实践。
55
+
56
+ **为什么选择ms-swift?**
57
+ - 🍎 **模型类型**:支持500+纯文本大模型、**200+多模态大模型**以及All-to-All全模态模型、序列分类模型、Embedding模型**训练到部署全流程**。
58
+ - **数据集类型**:内置150+预训练、微调、人类对齐、多模态等各种类型的数据集,并支持自定义数据集。
59
+ - **硬件支持**:CPU、RTX系列、T4/V100、A10/A100/H100、Ascend NPU、MPS等。
60
+ - 🍊 **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。
61
+ - **分布式训练**:支持分布式数据并行(DDP)、device_map简易模型并行、DeepSpeed ZeRO2 ZeRO3、FSDP等分布式训练技术。
62
+ - **量化训练**:支持对BNB、AWQ、GPTQ、AQLM、HQQ、EETQ量化模型进行训练。
63
+ - **RLHF训练**:支持纯文本大模型和多模态大模型的DPO、GRPO、RM、PPO、KTO、CPO、SimPO、ORPO等人类对齐训练方法。
64
+ - 🍓 **多模态训练**:支持对图像、视频和语音不同模态模型进行训练,支持VQA、Caption、OCR、Grounding任务的训练。
65
+ - **界面训练**:以界面的方式提供训练、推理、评测、量化的能力,完成大模型的全链路。
66
+ - **插件化与拓展**:支持自定义模型和数据集拓展,支持对loss、metric、trainer、loss-scale、callback、optimizer等组件进行自定义。
67
+ - 🍉 **工具箱能力**:不仅提供大模型和多模态大模型的训练支持,还涵盖其推理、评测、量化和部署全流程。
68
+ - **推理加速**:支持PyTorch、vLLM、LmDeploy推理加速引擎,并提供OpenAI接口,为推理、部署和评测模块提供加速。
69
+ - **模型评测**:以EvalScope作为评测后端,支持100+评测数据集对纯文本和多模态模型进行评测。
70
+ - **模型量化**:支持AWQ、GPTQ和BNB的量化导出,导出的模型支持使用vLLM/LmDeploy推理加速,并支持继续训练。
71
+
72
+ ## 🎉 新闻
73
+ - 🎁 2025.05.11: GRPO中的奖励模型支持自定义处理逻辑,GenRM的例子参考[这里](./docs/source/Instruction/GRPO.md#自定义奖励模型)
74
+ - 🎁 2025.04.15: ms-swift论文已经被AAAI 2025接收,论文地址在[这里](https://ojs.aaai.org/index.php/AAAI/article/view/35383)。
75
+ - 🎁 2025.03.23: 支持了多轮GRPO,用于构建多轮对话场景的训练(例如agent tool calling),请查看[训练脚本](examples/train/grpo/internal/train_multi_round.sh)。
76
+ - 🎁 2025.03.16: 支持了Megatron的并行技术进行训练,请查看[Megatron-SWIFT训练文档](https://swift.readthedocs.io/zh-cn/latest/Instruction/Megatron-SWIFT训练.html)。
77
+ - 🎁 2025.03.15: 支持纯文本和多模态模型的embedding模型的微调,请查看[训练脚本](examples/train/embedding)。
78
+ - 🎁 2025.03.05: 支持GRPO的hybrid模式,4GPU(4*80G)训练72B模型的脚本参考[这里](examples/train/grpo/internal/train_72b_4gpu.sh)。同时支持vllm的tensor并行,训练脚本参考[这里](examples/train/grpo/internal/multi_gpu_mp_colocate.sh)。
79
+ - 🎁 2025.02.21: GRPO算法支持使用LMDeploy,训练脚本参考[这里](examples/train/grpo/internal/full_lmdeploy.sh)。此外测试了GRPO算法的性能,使用一些tricks使训练速度提高到300%。WanDB表格请查看[这里](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz)。
80
+ - 🎁 2025.02.21: 支持`swift sample`命令。强化微调脚本参考[这里](docs/source/Instruction/强化微调.md),大模型API蒸馏采样脚本参考[这里](examples/sampler/distill/distill.sh)。
81
+ - 🔥 2025.02.12: 支持GRPO (Group Relative Policy Optimization) 训练算法,文档参考[这里](docs/source/Instruction/GRPO.md)。
82
+ - 🎁 2024.12.04: **ms-swift3.0**大版本更新。请查看[发布说明和更改](https://swift.readthedocs.io/zh-cn/latest/Instruction/ReleaseNote3.0.html)。
83
+ <details><summary>更多</summary>
84
+
85
+ - 🎉 2024.08.12: ms-swift论文已经发布到arXiv上,可以点击[这里](https://arxiv.org/abs/2408.05517)阅读。
86
+ - 🔥 2024.08.05: 支持使用[evalscope](https://github.com/modelscope/evalscope/)作为后端进行大模型和多模态模型的评测。
87
+ - 🔥 2024.07.29: 支持使用[vllm](https://github.com/vllm-project/vllm), [lmdeploy](https://github.com/InternLM/lmdeploy)对大模型和多模态大模型进行推理加速,在infer/deploy/eval时额外指定`--infer_backend vllm/lmdeploy`即可。
88
+ - 🔥 2024.07.24: 支持对多模态大模型进行人类偏好对齐训练,包括DPO/ORPO/SimPO/CPO/KTO/RM/PPO。
89
+ - 🔥 2024.02.01: 支持Agent训练!训练算法源自这篇[论文](https://arxiv.org/pdf/2309.00986.pdf)。
90
+ </details>
91
+
92
+ ## 🛠️ 安装
93
+ 使用pip进行安装:
94
+ ```shell
95
+ pip install ms-swift -U
96
+ ```
97
+
98
+ 从源代码安装:
99
+ ```shell
100
+ # pip install git+https://github.com/modelscope/ms-swift.git
101
+
102
+ git clone https://github.com/modelscope/ms-swift.git
103
+ cd ms-swift
104
+ pip install -e .
105
+ ```
106
+
107
+ 运行环境:
108
+
109
+ | | 范围 | 推荐 | 备注 |
110
+ | ------ |--------------| ---- | --|
111
+ | python | >=3.9 | 3.10 ||
112
+ | cuda | | cuda12 |使用cpu、npu、mps则无需安装|
113
+ | torch | >=2.0 | ||
114
+ | transformers | >=4.33 | 4.51 ||
115
+ | modelscope | >=1.23 | ||
116
+ | peft | >=0.11,<0.16 | ||
117
+ | trl | >=0.13,<0.18 | 0.17 |RLHF|
118
+ | deepspeed | >=0.14 | 0.14.5 |训练|
119
+ | vllm | >=0.5.1 | 0.7.3/0.8 |推理/部署/评测|
120
+ | lmdeploy | >=0.5 | 0.8 |推理/部署/评测|
121
+ | evalscope | >=0.11 | |评测|
122
+
123
+ 更多可选依赖可以参考[这里](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh)。
124
+
125
+
126
+ ## 🚀 快速开始
127
+
128
+ **10分钟**在单卡3090上对Qwen2.5-7B-Instruct进行自我认知微调:
129
+
130
+ ### 命令行
131
+ ```shell
132
+ # 22GB
133
+ CUDA_VISIBLE_DEVICES=0 \
134
+ swift sft \
135
+ --model Qwen/Qwen2.5-7B-Instruct \
136
+ --train_type lora \
137
+ --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
138
+ 'AI-ModelScope/alpaca-gpt4-data-en#500' \
139
+ 'swift/self-cognition#500' \
140
+ --torch_dtype bfloat16 \
141
+ --num_train_epochs 1 \
142
+ --per_device_train_batch_size 1 \
143
+ --per_device_eval_batch_size 1 \
144
+ --learning_rate 1e-4 \
145
+ --lora_rank 8 \
146
+ --lora_alpha 32 \
147
+ --target_modules all-linear \
148
+ --gradient_accumulation_steps 16 \
149
+ --eval_steps 50 \
150
+ --save_steps 50 \
151
+ --save_total_limit 2 \
152
+ --logging_steps 5 \
153
+ --max_length 2048 \
154
+ --output_dir output \
155
+ --system 'You are a helpful assistant.' \
156
+ --warmup_ratio 0.05 \
157
+ --dataloader_num_workers 4 \
158
+ --model_author swift \
159
+ --model_name swift-robot
160
+ ```
161
+
162
+ 小贴士:
163
+ - 如果要使用自定义数据集进行训练,你可以参考[这里](https://swift.readthedocs.io/zh-cn/latest/Customization/%E8%87%AA%E5%AE%9A%E4%B9%89%E6%95%B0%E6%8D%AE%E9%9B%86.html)组织数据集格式,并指定`--dataset <dataset_path>`。
164
+ - `--model_author`和`--model_name`参数只有当数据集中包含`swift/self-cognition`时才生效。
165
+ - 如果要使用其他模型进行训练,你只需要修改`--model <model_id/model_path>`即可。
166
+ - 默认使用ModelScope进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
167
+
168
+ 训练完成后,使用以下命令对训练后的权重进行推理:
169
+ - 这里的`--adapters`需要替换成训练生成的last checkpoint文件夹。由于adapters文件夹中包含了训练的参数文件`args.json`,因此不需要额外指定`--model`,`--system`,swift会自动读取这些参数。如果要关闭此行为,可以设置`--load_args false`。
170
+
171
+ ```shell
172
+ # 使用交互式命令行进行推理
173
+ CUDA_VISIBLE_DEVICES=0 \
174
+ swift infer \
175
+ --adapters output/vx-xxx/checkpoint-xxx \
176
+ --stream true \
177
+ --temperature 0 \
178
+ --max_new_tokens 2048
179
+
180
+ # merge-lora并使用vLLM进行推理加速
181
+ CUDA_VISIBLE_DEVICES=0 \
182
+ swift infer \
183
+ --adapters output/vx-xxx/checkpoint-xxx \
184
+ --stream true \
185
+ --merge_lora true \
186
+ --infer_backend vllm \
187
+ --max_model_len 8192 \
188
+ --temperature 0 \
189
+ --max_new_tokens 2048
190
+ ```
191
+
192
+ 最后,使用以下命令将模型推送到ModelScope:
193
+ ```shell
194
+ CUDA_VISIBLE_DEVICES=0 \
195
+ swift export \
196
+ --adapters output/vx-xxx/checkpoint-xxx \
197
+ --push_to_hub true \
198
+ --hub_model_id '<your-model-id>' \
199
+ --hub_token '<your-sdk-token>' \
200
+ --use_hf false
201
+ ```
202
+
203
+ ### Web-UI
204
+
205
+ Web-UI是基于gradio界面技术的**零门槛**训练、部署界面方案,具体可以查看[这里](https://swift.readthedocs.io/zh-cn/latest/GetStarted/Web-UI.html)。
206
+
207
+ ```shell
208
+ swift web-ui
209
+ ```
210
+ ![image.png](./docs/resources/web-ui.jpg)
211
+
212
+ ### 使用Python
213
+ ms-swift也支持使用python的方式进行训练和推理。下面给出训练和推理的**伪代码**,具体可以查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb)。
214
+
215
+ 训练:
216
+ ```python
217
+ # 获取模型和template,并加入可训练的LoRA模块
218
+ model, tokenizer = get_model_tokenizer(model_id_or_path, ...)
219
+ template = get_template(model.model_meta.template, tokenizer, ...)
220
+ model = Swift.prepare_model(model, lora_config)
221
+
222
+ # 下载并载入数据集,并将文本encode成tokens
223
+ train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
224
+ train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
225
+ val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
226
+
227
+ # 进行训练
228
+ trainer = Seq2SeqTrainer(
229
+ model=model,
230
+ args=training_args,
231
+ data_collator=template.data_collator,
232
+ train_dataset=train_dataset,
233
+ eval_dataset=val_dataset,
234
+ template=template,
235
+ )
236
+ trainer.train()
237
+ ```
238
+
239
+ 推理:
240
+ ```python
241
+ # 使用原生pytorch引擎进行推理
242
+ engine = PtEngine(model_id_or_path, adapters=[lora_checkpoint])
243
+ infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
244
+ request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
245
+
246
+ resp_list = engine.infer([infer_request], request_config)
247
+ print(f'response: {resp_list[0].choices[0].message.content}')
248
+ ```
249
+
250
+ ## ✨ 如何使用
251
+
252
+ 这里给出使用ms-swift进行训练到部署到最简示例,具体可以查看[examples](https://github.com/modelscope/ms-swift/tree/main/examples)。
253
+
254
+ - 若想使用其他模型或者数据集(含多模态模型和数据集),你只需要修改`--model`指定对应模型的id或者path,修改`--dataset`指定对应数据集的id或者path即可。
255
+ - 默认使用ModelScope进行模型和数据集的下载。如果要使用HuggingFace,指定`--use_hf true`即可。
256
+
257
+ | 常用链接 |
258
+ | ------ |
259
+ | [🔥命令行参数](https://swift.readthedocs.io/zh-cn/latest/Instruction/%E5%91%BD%E4%BB%A4%E8%A1%8C%E5%8F%82%E6%95%B0.html) |
260
+ | [支持的模型和数据集](https://swift.readthedocs.io/zh-cn/latest/Instruction/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.html) |
261
+ | [自定义模型](https://swift.readthedocs.io/zh-cn/latest/Customization/%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B.html), [🔥自定义数据集](https://swift.readthedocs.io/zh-cn/latest/Customization/%E8%87%AA%E5%AE%9A%E4%B9%89%E6%95%B0%E6%8D%AE%E9%9B%86.html) |
262
+ | [大模型教程](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
263
+
264
+ ### 训练
265
+ 支持的训练方法:
266
+
267
+ | 方法 | 全参数 | LoRA | QLoRA | Deepspeed | 多机 | 多模态 |
268
+ | ------ | ------ |---------------------------------------------------------------------------------------------| ----- | ------ | ------ |----------------------------------------------------------------------------------------------|
269
+ | 预训练 | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain/train.sh) | ✅ | ✅ | ✅ | ✅ | ✅ |
270
+ | 指令监督微调 | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
271
+ | DPO训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo.sh) |
272
+ | GRPO训练 | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/grpo_zero2.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/multi_node) | ✅ |
273
+ | 奖励模型训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ |
274
+ | PPO训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | ❌ |
275
+ | KTO训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
276
+ | CPO训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ |
277
+ | SimPO训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ |
278
+ | ORPO训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ |
279
+ | 分类模型训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_5/sft.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_vl/sft.sh) |
280
+ | Embedding模型训练 | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gte.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh) |
281
+
282
+
283
+ 预训练:
284
+ ```shell
285
+ # 8*A100
286
+ NPROC_PER_NODE=8 \
287
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
288
+ swift pt \
289
+ --model Qwen/Qwen2.5-7B \
290
+ --dataset swift/chinese-c4 \
291
+ --streaming true \
292
+ --train_type full \
293
+ --deepspeed zero2 \
294
+ --output_dir output \
295
+ --max_steps 10000 \
296
+ ...
297
+ ```
298
+
299
+ 微调:
300
+ ```shell
301
+ CUDA_VISIBLE_DEVICES=0 swift sft \
302
+ --model Qwen/Qwen2.5-7B-Instruct \
303
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh \
304
+ --train_type lora \
305
+ --output_dir output \
306
+ ...
307
+ ```
308
+
309
+ RLHF:
310
+ ```shell
311
+ CUDA_VISIBLE_DEVICES=0 swift rlhf \
312
+ --rlhf_type dpo \
313
+ --model Qwen/Qwen2.5-7B-Instruct \
314
+ --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
315
+ --train_type lora \
316
+ --output_dir output \
317
+ ...
318
+ ```
319
+
320
+
321
+ ### 推理
322
+ ```shell
323
+ CUDA_VISIBLE_DEVICES=0 swift infer \
324
+ --model Qwen/Qwen2.5-7B-Instruct \
325
+ --stream true \
326
+ --infer_backend pt \
327
+ --max_new_tokens 2048
328
+
329
+ # LoRA
330
+ CUDA_VISIBLE_DEVICES=0 swift infer \
331
+ --model Qwen/Qwen2.5-7B-Instruct \
332
+ --adapters swift/test_lora \
333
+ --stream true \
334
+ --infer_backend pt \
335
+ --temperature 0 \
336
+ --max_new_tokens 2048
337
+ ```
338
+
339
+ ### 界面推理
340
+ ```shell
341
+ CUDA_VISIBLE_DEVICES=0 swift app \
342
+ --model Qwen/Qwen2.5-7B-Instruct \
343
+ --stream true \
344
+ --infer_backend pt \
345
+ --max_new_tokens 2048 \
346
+ --lang zh
347
+ ```
348
+
349
+ ### 部署
350
+ ```shell
351
+ CUDA_VISIBLE_DEVICES=0 swift deploy \
352
+ --model Qwen/Qwen2.5-7B-Instruct \
353
+ --infer_backend vllm
354
+ ```
355
+
356
+ ### 采样
357
+ ```shell
358
+ CUDA_VISIBLE_DEVICES=0 swift sample \
359
+ --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
360
+ --sampler_engine pt \
361
+ --num_return_sequences 5 \
362
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh#5
363
+ ```
364
+
365
+ ### 评测
366
+ ```shell
367
+ CUDA_VISIBLE_DEVICES=0 swift eval \
368
+ --model Qwen/Qwen2.5-7B-Instruct \
369
+ --infer_backend lmdeploy \
370
+ --eval_backend OpenCompass \
371
+ --eval_dataset ARC_c
372
+ ```
373
+
374
+ ### 量化
375
+ ```shell
376
+ CUDA_VISIBLE_DEVICES=0 swift export \
377
+ --model Qwen/Qwen2.5-7B-Instruct \
378
+ --quant_bits 4 --quant_method awq \
379
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh \
380
+ --output_dir Qwen2.5-7B-Instruct-AWQ
381
+ ```
382
+
383
+ ### 推送模型
384
+ ```shell
385
+ swift export \
386
+ --model <model-path> \
387
+ --push_to_hub true \
388
+ --hub_model_id '<model-id>' \
389
+ --hub_token '<sdk-token>'
390
+ ```
391
+
392
+
393
+ ## 🏛 License
394
+
395
+ 本框架使用[Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE)进行许可。模型和数据集请查看原资源页面并遵守对应License。
396
+
397
+ ## 📎 引用
398
+
399
+ ```bibtex
400
+ @misc{zhao2024swiftascalablelightweightinfrastructure,
401
+ title={SWIFT: A Scalable lightWeight Infrastructure for Fine-Tuning},
402
+ author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
403
+ year={2024},
404
+ eprint={2408.05517},
405
+ archivePrefix={arXiv},
406
+ primaryClass={cs.CL},
407
+ url={https://arxiv.org/abs/2408.05517},
408
+ }
409
+ ```
410
+
411
+ ## Star History
412
+
413
+ [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/swift&type=Date)](https://star-history.com/#modelscope/ms-swift&Date)
checkMissing.py ADDED
@@ -0,0 +1,86 @@
+ import json
+ import torchaudio
+ from tqdm import tqdm
+ import os
+ import sys
+ from collections import defaultdict
+
+ def validate_jsonl_audios(jsonl_path):
+     """Validate the integrity of every audio file referenced in a JSONL file."""
+     stats = defaultdict(int)
+     error_log = []
+     valid_samples = 0
+
+     # First pass: count the total number of lines (for the progress bar)
+     with open(jsonl_path, 'r') as f:
+         total_lines = sum(1 for _ in f)
+
+     # Second pass: the actual validation
+     with open(jsonl_path, 'r') as f:
+         for line_num, line in enumerate(tqdm(f, total=total_lines, desc="验证进度", unit="line")):
+             try:
+                 data = json.loads(line.strip())
+                 if 'audios' not in data or not data['audios']:
+                     stats['no_audio_field'] += 1
+                     continue
+
+                 for audio_path in data['audios']:
+                     # Check that the file exists
+                     if not os.path.exists(audio_path):
+                         stats['missing'] += 1
+                         error_log.append(f"[行{line_num+1}] 缺失文件: {audio_path}")
+                         continue
+
+                     # Check the file size
+                     if os.path.getsize(audio_path) == 0:
+                         stats['zero_size'] += 1
+                         error_log.append(f"[行{line_num+1}] 空文件: {audio_path}")
+                         continue
+
+                     # Validate the audio content
+                     try:
+                         waveform, sr = torchaudio.load(audio_path)
+                         if waveform.numel() == 0:
+                             stats['empty_audio'] += 1
+                             error_log.append(f"[行{line_num+1}] 空音频: {audio_path}")
+                         elif sr not in [8000, 16000, 22050, 44100, 48000]:
+                             stats['abnormal_sr'] += 1
+                             error_log.append(f"[行{line_num+1}] 异常采样率({sr}Hz): {audio_path}")
+                         else:
+                             stats['valid'] += 1
+                     except Exception as e:
+                         stats['corrupted'] += 1
+                         error_type = str(e).split('(')[0]
+                         error_log.append(f"[行{line_num+1}] 损坏文件({error_type}): {audio_path}")
+
+                 valid_samples += 1
+
+             except json.JSONDecodeError:
+                 stats['invalid_json'] += 1
+                 error_log.append(f"[行{line_num+1}] 无效JSON格式")
+
+     # Print the summary report
+     print("\n===== 验证报告 =====")
+     print(f"总行数: {total_lines}")
+     print(f"有效样本: {valid_samples}")
+     print("--- 问题统计 ---")
+     for k, v in sorted(stats.items()):
+         print(f"{k}: {v}")
+
+     # Save the error log
+     if error_log:
+         log_file = f"{os.path.splitext(jsonl_path)[0]}_audio_errors.log"
+         with open(log_file, 'w') as f:
+             f.write("\n".join(error_log))
+         print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")
+
+ if __name__ == "__main__":
+     if len(sys.argv) != 2:
+         print("使用方法: python validate_audio_jsonl.py <input.jsonl>")
+         sys.exit(1)
+
+     if not os.path.exists(sys.argv[1]):
+         print(f"错误: 文件 {sys.argv[1]} 不存在")
+         sys.exit(1)
+
+     validate_jsonl_audios(sys.argv[1])
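Note: checkMissing.py above (and count_audios.py further down in this commit) reads one JSON object per line and only inspects its `audios` field, a list of audio file paths; other keys are ignored. A minimal sketch of a compatible line, using a hypothetical path, looks like this:

```python
import json

# Minimal sketch of one JSONL line for the audio-checking scripts.
# Only the "audios" list is inspected; the path below is hypothetical.
sample = {"audios": ["/data/wavrewardDataset/output_example/clip_0001.wav"]}
print(json.dumps(sample, ensure_ascii=False))  # append this line to a .jsonl file
```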
clean_transcripts.py ADDED
@@ -0,0 +1,95 @@
+ import json
+ import re
+ from typing import List, Dict, Tuple
+
+ def parse_timestamp(timestamp: str) -> int:
+ """Convert timestamp string like '00:15' to seconds."""
+ minutes, seconds = map(int, timestamp.split(':'))
+ return minutes * 60 + seconds
+
+ def extract_time_and_speaker(line: str) -> Tuple[Tuple[int, int], str]:
+ """Extract time range and speaker from a line."""
+ # Extract time range
+ time_match = re.match(r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\] (Speaker [A-Z]):', line)
+ if not time_match:
+ return None, None
+
+ start_time = parse_timestamp(time_match.group(1))
+ end_time = parse_timestamp(time_match.group(2))
+ speaker = time_match.group(3)
+
+ return (start_time, end_time), speaker
+
+ def has_overlap(range1: Tuple[int, int], range2: Tuple[int, int]) -> bool:
+ """Check if two time ranges overlap."""
+ start1, end1 = range1
+ start2, end2 = range2
+ return not (end1 <= start2 or end2 <= start1)
+
+ def has_same_speaker_overlap(transcript: str) -> bool:
+ """Check if a transcript contains overlapping timestamps for the same speaker."""
+ lines = transcript.split('\n')
+ # Dictionary to store time ranges for each speaker
+ speaker_ranges = {}
+
+ for line in lines:
+ if not line.strip():
+ continue
+
+ time_range, speaker = extract_time_and_speaker(line)
+ if time_range is None or speaker is None:
+ continue
+
+ # Check for overlaps with existing ranges of the same speaker
+ if speaker in speaker_ranges:
+ for existing_range in speaker_ranges[speaker]:
+ if has_overlap(time_range, existing_range):
+ return True
+
+ speaker_ranges[speaker].append(time_range)
+ else:
+ speaker_ranges[speaker] = [time_range]
+
+ return False
+
+ def process_file(input_file: str, output_file: str, delete_file: str):
+ """Process the JSON file and separate entries with same-speaker overlapping timestamps."""
+ with open(input_file, 'r', encoding='utf-8') as f:
+ data = json.load(f)
+
+ if isinstance(data, dict):
+ data = [data]
+
+ cleaned_data = []
+ deleted_data = []
+ removed_count = 0
+
+ for entry in data:
+ if 'model_output' in entry:
+ if not has_same_speaker_overlap(entry['model_output']):
+ cleaned_data.append(entry)
+ else:
+ deleted_data.append(entry)
+ removed_count += 1
+ print(f"Removing entry with key: {entry.get('key', 'unknown')}")
+
+ # Save cleaned data
+ with open(output_file, 'w', encoding='utf-8') as f:
+ json.dump(cleaned_data, f, ensure_ascii=False, indent=2)
+
+ # Save deleted data
+ with open(delete_file, 'w', encoding='utf-8') as f:
+ json.dump(deleted_data, f, ensure_ascii=False, indent=2)
+
+ print(f"\nProcessing Summary:")
+ print(f"Processed {len(data)} entries")
+ print(f"Removed {removed_count} entries with same-speaker overlapping timestamps")
+ print(f"Remaining entries: {len(cleaned_data)}")
+
+ if __name__ == '__main__':
+ input_file = 'silence_overlaps/transcriptions.json'
+ output_file = 'silence_overlaps/cleaned_transcriptions2.json'
+ delete_file = 'silence_overlaps/delete_transcript2.json'
+ process_file(input_file, output_file, delete_file)
+ print(f"\nCleaned transcriptions have been saved to {output_file}")
+ print(f"Deleted entries have been saved to {delete_file}")
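To make the overlap rule above concrete, the following sketch runs `has_same_speaker_overlap` on two hand-written snippets; the transcript text is invented purely for illustration.

```python
# Illustrative transcripts only; the timestamps follow the "[MM:SS - MM:SS] Speaker X:" format
# that extract_time_and_speaker expects.
from clean_transcripts import has_same_speaker_overlap

clean = (
    "[00:00 - 00:05] Speaker A: Hello there.\n"
    "[00:05 - 00:10] Speaker B: Hi, how are you?\n"
    "[00:10 - 00:15] Speaker A: Doing well, thanks."
)
overlapping = (
    "[00:00 - 00:08] Speaker A: Hello there.\n"
    "[00:05 - 00:10] Speaker A: Wait, I was already talking."
)

print(has_same_speaker_overlap(clean))        # False: no speaker overlaps with themselves
print(has_same_speaker_overlap(overlapping))  # True: Speaker A's 0-8s and 5-10s ranges overlap
```

Entries whose `model_output` trips this check are the ones routed to the delete file by `process_file`.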
count_audios.py ADDED
@@ -0,0 +1,69 @@
+ import json
+ import os
+ from collections import Counter
+ from pathlib import Path
+
+ def collect_unique_audio_paths(json_file_path):
+ """
+ Extract all unique audios paths from the JSONL file
+ """
+ audio_set = set()
+ with open(json_file_path, 'r', encoding='utf-8') as f:
+ for line_num, line in enumerate(f, 1):
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ data = json.loads(line)
+ if isinstance(data, dict) and 'audios' in data and data['audios']:
+ for audio_path in data['audios']:
+ audio_set.add(audio_path)
+ except Exception as e:
+ print(f"Error processing line {line_num}: {e}")
+ return audio_set
+
+ def extract_first_subfolder_after_data(audio_path):
+ """
+ Extract the first-level subfolder name that follows 'wavrewardDataset/' in audio_path
+ For example:
+ /.../wavrewardDataset/output_xxx/yyy/file.wav → returns output_xxx
+ """
+ try:
+ path = Path(audio_path)
+ parts = path.parts
+ if "wavrewardDataset" in parts:
+ data_idx = parts.index("wavrewardDataset")
+ if data_idx + 1 < len(parts):
+ return parts[data_idx + 1]
+ return "unknown"
+ except Exception as e:
+ print(f"Path parsing error: {audio_path}, error: {e}")
+ return "error"
+
+ def main():
+ json_file = "all_dataset_train_resampled_16000.jsonl"
+
+ if not os.path.exists(json_file):
+ print(f"File {json_file} does not exist")
+ return
+
+ print(f"Processing file: {json_file}")
+ print("=" * 50)
+
+ # Step 1: collect the deduplicated audio paths
+ unique_audio_paths = collect_unique_audio_paths(json_file)
+ print(f"Number of unique audio files: {len(unique_audio_paths)}")
+
+ # Step 2: count by the first-level subfolder
+ folder_counter = Counter()
+ for audio_path in unique_audio_paths:
+ first_subfolder = extract_first_subfolder_after_data(audio_path)
+ folder_counter[first_subfolder] += 1
+
+ print("\nCounts by first-level subfolder (based on deduplicated paths):")
+ print("-" * 50)
+ for folder, count in sorted(folder_counter.items(), key=lambda x: -x[1]):
+ print(f"{folder}: {count} files")
+
+ if __name__ == "__main__":
+ main()
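For reference, this is what `extract_first_subfolder_after_data` resolves for a couple of sample paths; both paths are invented and only mirror the directory layout the script assumes.

```python
# Invented sample paths that mirror the expected /.../wavrewardDataset/<subfolder>/... layout.
from count_audios import extract_first_subfolder_after_data

inside = "/root/autodl-tmp/wavrewardDataset/output_2000_3000_wrongpause/clip_0001.wav"
outside = "/tmp/other/location/clip_0002.wav"

print(extract_first_subfolder_after_data(inside))   # output_2000_3000_wrongpause
print(extract_first_subfolder_after_data(outside))  # unknown (no "wavrewardDataset" component)
```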
count_folders-Copy1.py ADDED
@@ -0,0 +1,122 @@
+ import json
+ import os
+ from collections import Counter
+ from pathlib import Path
+
+ def count_folder_prefixes(json_file_path):
+ """
+ Count how often each prefix folder of the audio paths in the JSON file occurs
+ Extract up to the /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data level,
+ then classify and count by subfolder
+ Supports JSONL format (one JSON object per line)
+ """
+ folder_counts = Counter()
+
+ # Read the JSON file (JSONL format supported)
+ with open(json_file_path, 'r', encoding='utf-8') as f:
+ for line_num, line in enumerate(f, 1):
+ line = line.strip()
+ if not line: # skip empty lines
+ continue
+
+ try:
+ # Parse the JSON object on each line
+ data = json.loads(line)
+
+ # Process the data
+ if isinstance(data, dict) and 'audios' in data and data['audios']:
+ for audio_path in data['audios']:
+ prefix = extract_folder_prefix(audio_path)
+ if prefix:
+ folder_counts[prefix] += 1
+
+ except json.JSONDecodeError as e:
+ print(f"JSON parse error on line {line_num}: {e}")
+ continue
+ except Exception as e:
+ print(f"Error processing line {line_num}: {e}")
+ continue
+
+ return folder_counts
+
+ def extract_folder_prefix(audio_path):
+ """
+ Extract the prefix of the audio path up to the target level, including the subfolder used for classification
+ For example: /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data/output_2000_3000_wrongpause/xxx.wav
+ is reduced to: /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data/output_2000_3000_wrongpause
+ """
+ try:
+ # Handle the path with a Path object
+ path = Path(audio_path)
+
+ # Locate the target folder within the path
+ parts = path.parts
+ target_folder = "newdataset_10k"
+
+ if target_folder in parts:
+ # Position of the target folder
+ target_index = parts.index(target_folder)
+
+ # Extract up to and including the target folder
+ prefix_parts = parts[:target_index + 1]
+
+ # If there is a subfolder after the target folder, include it as well
+ if target_index + 1 < len(parts):
+ # Include the next subfolder (e.g. output_2000_3000_wrongpause)
+ prefix_parts = parts[:target_index + 2]
+
+ return str(Path(*prefix_parts))
+ else:
+ # If the target folder is not found, return the first few path levels
+ if len(parts) >= 4: # /root/autodl-tmp/xxx/...
+ return str(Path(*parts[:4]))
+ else:
+ return str(path.parent)
+
+ except Exception as e:
+ print(f"Error while processing path: {audio_path}, error: {e}")
+ return None
+
+ def main():
+ # Path to the JSON file
+ json_file = "needtouse/adddata_absolute.jsonl"
+
+ if not os.path.exists(json_file):
+ print(f"File {json_file} does not exist")
+ return
+
+ print(f"Counting file: {json_file}")
+ print("=" * 50)
+
+ # Count folder prefixes
+ folder_counts = count_folder_prefixes(json_file)
+
+ # Output the results
+ if folder_counts:
+ print("Folder prefix counts:")
+ print("-" * 50)
+ for folder, count in sorted(folder_counts.items()):
+ print(f"{folder}: {count} occurrences")
+
+ print(f"\nTotal: {len(folder_counts)} distinct folder prefixes")
+ print(f"Total audio files: {sum(folder_counts.values())}")
+
+ # Count by subfolder type
+ print("\nCounts by subfolder type:")
+ print("-" * 30)
+ subfolder_counts = Counter()
+ for folder in folder_counts.keys():
+ path = Path(folder)
+ if len(path.parts) > 0:
+ # Use the last folder name as the subfolder type
+ subfolder = path.parts[-1]
+ subfolder_counts[subfolder] += folder_counts[folder]
+
+ for subfolder, count in sorted(subfolder_counts.items()):
+ print(f"{subfolder}: {count} occurrences")
+
+ else:
+ print("No audio paths found")
+
+ if __name__ == "__main__":
+ main()
count_folders.py ADDED
@@ -0,0 +1,122 @@
+ import json
+ import os
+ from collections import Counter
+ from pathlib import Path
+
+ def count_folder_prefixes(json_file_path):
+ """
+ Count how often each prefix folder of the audio paths in the JSON file occurs
+ Extract up to the /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data level,
+ then classify and count by subfolder
+ Supports JSONL format (one JSON object per line)
+ """
+ folder_counts = Counter()
+
+ # Read the JSON file (JSONL format supported)
+ with open(json_file_path, 'r', encoding='utf-8') as f:
+ for line_num, line in enumerate(f, 1):
+ line = line.strip()
+ if not line: # skip empty lines
+ continue
+
+ try:
+ # Parse the JSON object on each line
+ data = json.loads(line)
+
+ # Process the data
+ if isinstance(data, dict) and 'audios' in data and data['audios']:
+ for audio_path in data['audios']:
+ prefix = extract_folder_prefix(audio_path)
+ if prefix:
+ folder_counts[prefix] += 1
+
+ except json.JSONDecodeError as e:
+ print(f"JSON parse error on line {line_num}: {e}")
+ continue
+ except Exception as e:
+ print(f"Error processing line {line_num}: {e}")
+ continue
+
+ return folder_counts
+
+ def extract_folder_prefix(audio_path):
+ """
+ Extract the prefix of the audio path up to the target level, including the subfolder used for classification
+ For example: /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data/output_2000_3000_wrongpause/xxx.wav
+ is reduced to: /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data/output_2000_3000_wrongpause
+ """
+ try:
+ # Handle the path with a Path object
+ path = Path(audio_path)
+
+ # Locate the target folder within the path
+ parts = path.parts
+ target_folder = "data"
+
+ if target_folder in parts:
+ # Position of the target folder
+ target_index = parts.index(target_folder)
+
+ # Extract up to and including the target folder
+ prefix_parts = parts[:target_index + 1]
+
+ # If there is a subfolder after the target folder, include it as well
+ if target_index + 1 < len(parts):
+ # Include the next subfolder (e.g. output_2000_3000_wrongpause)
+ prefix_parts = parts[:target_index + 2]
+
+ return str(Path(*prefix_parts))
+ else:
+ # If the target folder is not found, return the first few path levels
+ if len(parts) >= 4: # /root/autodl-tmp/xxx/...
+ return str(Path(*parts[:4]))
+ else:
+ return str(path.parent)
+
+ except Exception as e:
+ print(f"Error while processing path: {audio_path}, error: {e}")
+ return None
+
+ def main():
+ # Path to the JSON file
+ json_file = "dataset_4JOB.jsonl"
+
+ if not os.path.exists(json_file):
+ print(f"File {json_file} does not exist")
+ return
+
+ print(f"Counting file: {json_file}")
+ print("=" * 50)
+
+ # Count folder prefixes
+ folder_counts = count_folder_prefixes(json_file)
+
+ # Output the results
+ if folder_counts:
+ print("Folder prefix counts:")
+ print("-" * 50)
+ for folder, count in sorted(folder_counts.items()):
+ print(f"{folder}: {count} occurrences")
+
+ print(f"\nTotal: {len(folder_counts)} distinct folder prefixes")
+ print(f"Total audio files: {sum(folder_counts.values())}")
+
+ # Count by subfolder type
+ print("\nCounts by subfolder type:")
+ print("-" * 30)
+ subfolder_counts = Counter()
+ for folder in folder_counts.keys():
+ path = Path(folder)
+ if len(path.parts) > 0:
+ # Use the last folder name as the subfolder type
+ subfolder = path.parts[-1]
+ subfolder_counts[subfolder] += folder_counts[folder]
+
+ for subfolder, count in sorted(subfolder_counts.items()):
+ print(f"{subfolder}: {count} occurrences")
+
+ else:
+ print("No audio paths found")
+
+ if __name__ == "__main__":
+ main()
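The docstring of `extract_folder_prefix` already spells out the intended mapping; the sketch below simply replays that example, plus an invented path without a `data` component to show the fallback branch.

```python
# The first path is the docstring example; the second is invented to hit the fallback.
from count_folders import extract_folder_prefix

with_data = "/root/autodl-tmp/wavrewardDataset/ultrachat_200k/data/output_2000_3000_wrongpause/xxx.wav"
without_data = "/root/autodl-tmp/somewhere/else/deep/clip.wav"

print(extract_folder_prefix(with_data))
# /root/autodl-tmp/wavrewardDataset/ultrachat_200k/data/output_2000_3000_wrongpause

print(extract_folder_prefix(without_data))
# /root/autodl-tmp/somewhere  (first four path components, '/' included)
```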
dialogue_length_distribution.png ADDED
dialogue_length_ranges.png ADDED
docs/transformers/build/lib/transformers/models/sam/processing_sam.py ADDED
@@ -0,0 +1,311 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for SAM.
17
+ """
18
+
19
+ from copy import deepcopy
20
+ from typing import List, Optional, Union
21
+
22
+ import numpy as np
23
+
24
+ from ...image_utils import ImageInput, VideoInput
25
+ from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
26
+ from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput
27
+ from ...utils import is_tf_available, is_torch_available
28
+
29
+
30
+ if is_torch_available():
31
+ import torch
32
+
33
+ if is_tf_available():
34
+ import tensorflow as tf
35
+
36
+
37
+ class SamImagesKwargs(ImagesKwargs):
38
+ segmentation_maps: Optional[ImageInput]
39
+ input_points: Optional[List[List[float]]]
40
+ input_labels: Optional[List[List[int]]]
41
+ input_boxes: Optional[List[List[List[float]]]]
42
+ point_pad_value: Optional[int]
43
+
44
+
45
+ class SamProcessorKwargs(ProcessingKwargs, total=False):
46
+ images_kwargs: SamImagesKwargs
47
+ _defaults = {
48
+ "images_kwargs": {
49
+ "point_pad_value": -10,
50
+ }
51
+ }
52
+
53
+
54
+ class SamProcessor(ProcessorMixin):
55
+ r"""
56
+ Constructs a SAM processor which wraps a SAM image processor and a 2D points & bounding boxes processor into a
57
+ single processor.
58
+
59
+ [`SamProcessor`] offers all the functionalities of [`SamImageProcessor`]. See the docstring of
60
+ [`~SamImageProcessor.__call__`] for more information.
61
+
62
+ Args:
63
+ image_processor (`SamImageProcessor`):
64
+ An instance of [`SamImageProcessor`]. The image processor is a required input.
65
+ """
66
+
67
+ attributes = ["image_processor"]
68
+ image_processor_class = "SamImageProcessor"
69
+ # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details.
70
+ optional_call_args = [
71
+ "segmentation_maps",
72
+ "input_points",
73
+ "input_labels",
74
+ "input_boxes",
75
+ ]
76
+
77
+ def __init__(self, image_processor):
78
+ super().__init__(image_processor)
79
+ self.target_size = self.image_processor.size["longest_edge"]
80
+
81
+ def __call__(
82
+ self,
83
+ images: Optional[ImageInput] = None,
84
+ # The following is to capture `segmentation_maps`, `input_points`, `input_labels` and `input_boxes`
85
+ # arguments that may be passed as a positional argument.
86
+ # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details,
87
+ # or this conversation for more context:
88
+ # https://github.com/huggingface/transformers/pull/32544#discussion_r1720208116
89
+ # This behavior is only needed for backward compatibility and will be removed in future versions.
90
+ *args, # to be deprecated
91
+ text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
92
+ audio: Optional[AudioInput] = None,
93
+ video: Optional[VideoInput] = None,
94
+ **kwargs,
95
+ ) -> BatchEncoding:
96
+ """
97
+ This method uses [`SamImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D
98
+ points and bounding boxes for the model if they are provided.
99
+ """
100
+ output_kwargs = self._merge_kwargs(
101
+ SamProcessorKwargs,
102
+ tokenizer_init_kwargs={},
103
+ **kwargs,
104
+ **self.prepare_and_validate_optional_call_args(*args),
105
+ )
106
+ input_points = output_kwargs["images_kwargs"].pop("input_points", None)
107
+ input_labels = output_kwargs["images_kwargs"].pop("input_labels", None)
108
+ input_boxes = output_kwargs["images_kwargs"].pop("input_boxes", None)
109
+ point_pad_value = output_kwargs["images_kwargs"].pop("point_pad_value", None)
110
+
111
+ encoding_image_processor = self.image_processor(
112
+ images,
113
+ **output_kwargs["images_kwargs"],
114
+ )
115
+
116
+ # pop arguments that are not used in the foward but used nevertheless
117
+ original_sizes = encoding_image_processor["original_sizes"]
118
+
119
+ if hasattr(original_sizes, "numpy"): # Checks if Torch or TF tensor
120
+ original_sizes = original_sizes.numpy()
121
+
122
+ input_points, input_labels, input_boxes = self._check_and_preprocess_points(
123
+ input_points=input_points,
124
+ input_labels=input_labels,
125
+ input_boxes=input_boxes,
126
+ )
127
+
128
+ encoding_image_processor = self._normalize_and_convert(
129
+ encoding_image_processor,
130
+ original_sizes,
131
+ input_points=input_points,
132
+ input_labels=input_labels,
133
+ input_boxes=input_boxes,
134
+ return_tensors=output_kwargs["common_kwargs"].get("return_tensors"),
135
+ point_pad_value=point_pad_value,
136
+ )
137
+
138
+ return encoding_image_processor
139
+
140
+ def _normalize_and_convert(
141
+ self,
142
+ encoding_image_processor,
143
+ original_sizes,
144
+ input_points=None,
145
+ input_labels=None,
146
+ input_boxes=None,
147
+ return_tensors="pt",
148
+ point_pad_value=-10,
149
+ ):
150
+ if input_points is not None:
151
+ if len(original_sizes) != len(input_points):
152
+ input_points = [
153
+ self._normalize_coordinates(self.target_size, point, original_sizes[0]) for point in input_points
154
+ ]
155
+ else:
156
+ input_points = [
157
+ self._normalize_coordinates(self.target_size, point, original_size)
158
+ for point, original_size in zip(input_points, original_sizes)
159
+ ]
160
+ # check that all arrays have the same shape
161
+ if not all(point.shape == input_points[0].shape for point in input_points):
162
+ if input_labels is not None:
163
+ input_points, input_labels = self._pad_points_and_labels(
164
+ input_points, input_labels, point_pad_value
165
+ )
166
+
167
+ input_points = np.array(input_points)
168
+
169
+ if input_labels is not None:
170
+ input_labels = np.array(input_labels)
171
+
172
+ if input_boxes is not None:
173
+ if len(original_sizes) != len(input_boxes):
174
+ input_boxes = [
175
+ self._normalize_coordinates(self.target_size, box, original_sizes[0], is_bounding_box=True)
176
+ for box in input_boxes
177
+ ]
178
+ else:
179
+ input_boxes = [
180
+ self._normalize_coordinates(self.target_size, box, original_size, is_bounding_box=True)
181
+ for box, original_size in zip(input_boxes, original_sizes)
182
+ ]
183
+ input_boxes = np.array(input_boxes)
184
+
185
+ if input_boxes is not None:
186
+ if return_tensors == "pt":
187
+ input_boxes = torch.from_numpy(input_boxes)
188
+ # boxes batch size of 1 by default
189
+ input_boxes = input_boxes.unsqueeze(1) if len(input_boxes.shape) != 3 else input_boxes
190
+ elif return_tensors == "tf":
191
+ input_boxes = tf.convert_to_tensor(input_boxes)
192
+ # boxes batch size of 1 by default
193
+ input_boxes = tf.expand_dims(input_boxes, 1) if len(input_boxes.shape) != 3 else input_boxes
194
+ encoding_image_processor.update({"input_boxes": input_boxes})
195
+ if input_points is not None:
196
+ if return_tensors == "pt":
197
+ input_points = torch.from_numpy(input_points)
198
+ # point batch size of 1 by default
199
+ input_points = input_points.unsqueeze(1) if len(input_points.shape) != 4 else input_points
200
+ elif return_tensors == "tf":
201
+ input_points = tf.convert_to_tensor(input_points)
202
+ # point batch size of 1 by default
203
+ input_points = tf.expand_dims(input_points, 1) if len(input_points.shape) != 4 else input_points
204
+ encoding_image_processor.update({"input_points": input_points})
205
+ if input_labels is not None:
206
+ if return_tensors == "pt":
207
+ input_labels = torch.from_numpy(input_labels)
208
+ # point batch size of 1 by default
209
+ input_labels = input_labels.unsqueeze(1) if len(input_labels.shape) != 3 else input_labels
210
+ elif return_tensors == "tf":
211
+ input_labels = tf.convert_to_tensor(input_labels)
212
+ # point batch size of 1 by default
213
+ input_labels = tf.expand_dims(input_labels, 1) if len(input_labels.shape) != 3 else input_labels
214
+ encoding_image_processor.update({"input_labels": input_labels})
215
+
216
+ return encoding_image_processor
217
+
218
+ def _pad_points_and_labels(self, input_points, input_labels, point_pad_value):
219
+ r"""
220
+ The method pads the 2D points and labels to the maximum number of points in the batch.
221
+ """
222
+ expected_nb_points = max([point.shape[0] for point in input_points])
223
+ processed_input_points = []
224
+ for i, point in enumerate(input_points):
225
+ if point.shape[0] != expected_nb_points:
226
+ point = np.concatenate(
227
+ [point, np.zeros((expected_nb_points - point.shape[0], 2)) + point_pad_value], axis=0
228
+ )
229
+ input_labels[i] = np.append(input_labels[i], [point_pad_value])
230
+ processed_input_points.append(point)
231
+ input_points = processed_input_points
232
+ return input_points, input_labels
233
+
234
+ def _normalize_coordinates(
235
+ self, target_size: int, coords: np.ndarray, original_size, is_bounding_box=False
236
+ ) -> np.ndarray:
237
+ """
238
+ Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format.
239
+ """
240
+ old_h, old_w = original_size
241
+ new_h, new_w = self.image_processor._get_preprocess_shape(original_size, longest_edge=target_size)
242
+ coords = deepcopy(coords).astype(float)
243
+
244
+ if is_bounding_box:
245
+ coords = coords.reshape(-1, 2, 2)
246
+
247
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
248
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
249
+
250
+ if is_bounding_box:
251
+ coords = coords.reshape(-1, 4)
252
+
253
+ return coords
254
+
255
+ def _check_and_preprocess_points(
256
+ self,
257
+ input_points=None,
258
+ input_labels=None,
259
+ input_boxes=None,
260
+ ):
261
+ r"""
262
+ Checks and preprocesses the 2D points, labels and bounding boxes. It checks if the input is valid and if they
263
+ are, it converts the coordinates of the points and bounding boxes. If a user passes directly a `torch.Tensor`,
264
+ it is converted to a `numpy.ndarray` and then to a `list`.
265
+ """
266
+ if input_points is not None:
267
+ if hasattr(input_points, "numpy"): # Checks for TF or Torch tensor
268
+ input_points = input_points.numpy().tolist()
269
+
270
+ if not isinstance(input_points, list) or not isinstance(input_points[0], list):
271
+ raise ValueError("Input points must be a list of list of floating points.")
272
+ input_points = [np.array(input_point) for input_point in input_points]
273
+ else:
274
+ input_points = None
275
+
276
+ if input_labels is not None:
277
+ if hasattr(input_labels, "numpy"):
278
+ input_labels = input_labels.numpy().tolist()
279
+
280
+ if not isinstance(input_labels, list) or not isinstance(input_labels[0], list):
281
+ raise ValueError("Input labels must be a list of list of integers.")
282
+ input_labels = [np.array(label) for label in input_labels]
283
+ else:
284
+ input_labels = None
285
+
286
+ if input_boxes is not None:
287
+ if hasattr(input_boxes, "numpy"):
288
+ input_boxes = input_boxes.numpy().tolist()
289
+
290
+ if (
291
+ not isinstance(input_boxes, list)
292
+ or not isinstance(input_boxes[0], list)
293
+ or not isinstance(input_boxes[0][0], list)
294
+ ):
295
+ raise ValueError("Input boxes must be a list of list of list of floating points.")
296
+ input_boxes = [np.array(box).astype(np.float32) for box in input_boxes]
297
+ else:
298
+ input_boxes = None
299
+
300
+ return input_points, input_labels, input_boxes
301
+
302
+ @property
303
+ def model_input_names(self):
304
+ image_processor_input_names = self.image_processor.model_input_names
305
+ return list(dict.fromkeys(image_processor_input_names))
306
+
307
+ def post_process_masks(self, *args, **kwargs):
308
+ return self.image_processor.post_process_masks(*args, **kwargs)
309
+
310
+
311
+ __all__ = ["SamProcessor"]
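A short, hedged usage sketch for the processor defined above: the checkpoint name, image URL and prompt point are placeholders, and running it needs the usual `transformers`/`torch`/`Pillow`/`requests` stack.

```python
# Placeholder checkpoint, image URL and prompt point; input_points follows the
# documented [[[x, y]]] nesting (one point for one image).
import requests
from PIL import Image
from transformers import SamModel, SamProcessor

processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
model = SamModel.from_pretrained("facebook/sam-vit-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(image, input_points=[[[450, 600]]], return_tensors="pt")
outputs = model(**inputs)

# post_process_masks is forwarded to the image processor, as the class above shows.
masks = processor.post_process_masks(
    outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
)
```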
docs/transformers/build/lib/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py ADDED
@@ -0,0 +1,396 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Converting Meta SeamlessM4T checkpoints from seamless_communication to HF."""
16
+
17
+ import argparse
18
+ import os
19
+ from pathlib import Path
20
+
21
+ import torch
22
+ from accelerate.utils.modeling import find_tied_parameters
23
+ from seamless_communication.models.inference.translator import Translator
24
+
25
+ from transformers import (
26
+ SeamlessM4TConfig,
27
+ SeamlessM4TFeatureExtractor,
28
+ SeamlessM4TModel,
29
+ SeamlessM4TProcessor,
30
+ SeamlessM4TTokenizer,
31
+ )
32
+ from transformers.utils import logging
33
+
34
+
35
+ UNIT_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__", ] # fmt: skip
36
+ VOCODER_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",] # fmt: skip
37
+ MEDIUM_SUPPORTED_LANGUAGES = ["ace","ace_Latn","acm","acq","aeb","afr","ajp","aka","amh","apc","arb","ars","ary","arz","asm","ast","awa","ayr","azb","azj","bak","bam","ban","bel","bem","ben","bho","bjn","bjn_Latn","bod","bos","bug","bul","cat","ceb","ces","cjk","ckb","crh","cym","dan","deu","dik","dyu","dzo","ell","eng","epo","est","eus","ewe","fao","pes","fij","fin","fon","fra","fur","fuv","gla","gle","glg","grn","guj","hat","hau","heb","hin","hne","hrv","hun","hye","ibo","ilo","ind","isl","ita","jav","jpn","kab","kac","kam","kan","kas","kas_Deva","kat","knc","knc_Latn","kaz","kbp","kea","khm","kik","kin","kir","kmb","kon","kor","kmr","lao","lvs","lij","lim","lin","lit","lmo","ltg","ltz","lua","lug","luo","lus","mag","mai","mal","mar","min","mkd","plt","mlt","mni","khk","mos","mri","zsm","mya","nld","nno","nob","npi","nso","nus","nya","oci","gaz","ory","pag","pan","pap","pol","por","prs","pbt","quy","ron","run","rus","sag","san","sat","scn","shn","sin","slk","slv","smo","sna","snd","som","sot","spa","als","srd","srp","ssw","sun","swe","swh","szl","tam","tat","tel","tgk","tgl","tha","tir","taq","taq_Tfng","tpi","tsn","tso","tuk","tum","tur","twi","tzm","uig","ukr","umb","urd","uzn","vec","vie","war","wol","xho","ydd","yor","yue","cmn","cmn_Hant","zul",] # fmt: skip
38
+ LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",] # fmt: skip
39
+
40
+
41
+ def assert_param_count(model_1, model_2):
42
+ count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
43
+ count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
44
+ assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
45
+
46
+
47
+ def param_count(model):
48
+ return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
49
+
50
+
51
+ def _grab_best_device(use_gpu=True):
52
+ if torch.cuda.device_count() > 0 and use_gpu:
53
+ device = "cuda"
54
+ else:
55
+ device = "cpu"
56
+ return torch.device(device)
57
+
58
+
59
+ logging.set_verbosity_info()
60
+ logger = logging.get_logger(__name__)
61
+
62
+ vocoder_convert_list = [
63
+ ("ups", "hifi_gan.upsampler"),
64
+ ("conv_pre", "hifi_gan.conv_pre"),
65
+ ("resblocks", "hifi_gan.resblocks"),
66
+ ("conv_post", "hifi_gan.conv_post"),
67
+ ("lang", "language_embedding"),
68
+ ("spkr", "speaker_embedding"),
69
+ ("dict.", "unit_embedding."),
70
+ ("dur_predictor.conv1.0", "dur_predictor.conv1"),
71
+ ("dur_predictor.conv2.0", "dur_predictor.conv2"),
72
+ ]
73
+
74
+ # order is important
75
+ wav2vec_convert_list = [
76
+ ("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
77
+ ("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
78
+ ("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
79
+ ("speech_encoder.inner.layers", "encoder.layers"),
80
+ ("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
81
+ ("speech_encoder.adaptor_layers", "adapter.layers"),
82
+ ("inner_proj", "intermediate_dense"),
83
+ ("self_attn.output_proj", "self_attn.linear_out"),
84
+ ("output_proj", "output_dense"),
85
+ ("self_attn.k_proj", "self_attn.linear_k"),
86
+ ("self_attn.v_proj", "self_attn.linear_v"),
87
+ ("self_attn.q_proj", "self_attn.linear_q"),
88
+ ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
89
+ ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
90
+ ("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
91
+ ("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
92
+ ("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
93
+ ("conv.depthwise_conv", "conv_module.depthwise_conv"),
94
+ ("conv.batch_norm", "conv_module.batch_norm"),
95
+ ("conv_layer_norm", "conv_module.layer_norm"),
96
+ ("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
97
+ ("speech_encoder.proj2", "intermediate_ffn.output_dense"),
98
+ ("speech_encoder.layer_norm", "inner_layer_norm"),
99
+ ]
100
+
101
+ t2u_convert_list = [
102
+ ("t2u_model.final_proj", "lm_head"),
103
+ ("t2u_model.", "model."),
104
+ ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
105
+ ("encoder_decoder_attn", "cross_attention"),
106
+ ("linear_k", "k_proj"),
107
+ ("linear_v", "v_proj"),
108
+ ("linear_q", "q_proj"),
109
+ ("ffn.inner_proj", "ffn.fc1"),
110
+ ("ffn.output_proj", "ffn.fc2"),
111
+ ("output_proj", "out_proj"),
112
+ ("decoder_frontend.embed", "decoder.embed_tokens"),
113
+ ]
114
+
115
+ text_convert_list = [
116
+ ("text_encoder.", ""),
117
+ ("text_decoder.", ""),
118
+ ("text_encoder_frontend.embed", "embed_tokens"),
119
+ ("text_decoder_frontend.embed", "embed_tokens"),
120
+ ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
121
+ ("encoder_decoder_attn", "cross_attention"),
122
+ ("linear_k", "k_proj"),
123
+ ("linear_v", "v_proj"),
124
+ ("linear_q", "q_proj"),
125
+ ("ffn.inner_proj", "ffn.fc1"),
126
+ ("ffn.output_proj", "ffn.fc2"),
127
+ ("output_proj", "out_proj"),
128
+ ("final_proj", "lm_head"),
129
+ ]
130
+
131
+ CUR_PATH = os.path.dirname(os.path.abspath(__file__))
132
+ default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
133
+ CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
134
+
135
+
136
+ def _load_hf_config(model_type="medium"):
137
+ if model_type == "medium":
138
+ kwargs = {
139
+ "vocab_size": 256206,
140
+ "t2u_vocab_size": 10082,
141
+ "hidden_size": 1024,
142
+ "max_position_embeddings": 4096,
143
+ "encoder_layers": 12,
144
+ "decoder_layers": 12,
145
+ "encoder_ffn_dim": 4096,
146
+ "decoder_ffn_dim": 4096,
147
+ "t2u_encoder_layers": 4,
148
+ "t2u_decoder_layers": 4,
149
+ "speech_encoder_layers": 12,
150
+ }
151
+ return SeamlessM4TConfig(**kwargs)
152
+ else:
153
+ return SeamlessM4TConfig()
154
+
155
+
156
+ def _convert_model(
157
+ original_model,
158
+ hf_model,
159
+ convert_list,
160
+ device,
161
+ unwanted_prefix="model.",
162
+ filter_state_dict="speech",
163
+ exclude_state_dict=None,
164
+ ):
165
+ state_dict = original_model.state_dict()
166
+
167
+ # filter func
168
+ if isinstance(filter_state_dict, str):
169
+
170
+ def filter_func(x):
171
+ return filter_state_dict in x[0]
172
+
173
+ else:
174
+
175
+ def filter_func(item):
176
+ if exclude_state_dict is not None and exclude_state_dict in item[0]:
177
+ return False
178
+ for filter_el in filter_state_dict:
179
+ if filter_el in item[0]:
180
+ return True
181
+
182
+ return False
183
+
184
+ state_dict = dict(filter(filter_func, state_dict.items()))
185
+
186
+ for k, v in list(state_dict.items()):
187
+ new_k = k[len(unwanted_prefix) :]
188
+ for old_layer_name, new_layer_name in convert_list:
189
+ if old_layer_name in new_k:
190
+ new_k = new_k.replace(old_layer_name, new_layer_name)
191
+
192
+ # must do it by hand
193
+ if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
194
+ new_k = new_k.replace("layer_norm", "final_layer_norm")
195
+
196
+ state_dict[new_k] = state_dict.pop(k)
197
+
198
+ extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
199
+ extra_keys = set(extra_keys)
200
+ missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
201
+ missing_keys = set({k for k in missing_keys if "final_logits_bias" not in k})
202
+ if len(extra_keys) != 0:
203
+ raise ValueError(f"extra keys found: {extra_keys}")
204
+ if len(missing_keys) != 0:
205
+ raise ValueError(f"missing keys: {missing_keys}")
206
+ hf_model.load_state_dict(state_dict, strict=False)
207
+ n_params = param_count(hf_model)
208
+
209
+ logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params")
210
+
211
+ hf_model.eval()
212
+ hf_model.to(device)
213
+ del state_dict
214
+
215
+ return hf_model
216
+
217
+
218
+ def load_model(save_dir, model_type, repo_id):
219
+ """
220
+ Meta SeamlessM4T is made of 8 main components:
221
+ - speech_encoder (#1) and speech_encoder_frontend (#2)
222
+ - t2u_model (#3)
223
+ - text_encoder (#4) and text_encoder_frontend (#5)
224
+ - text_decoder (#6) [and text_decoder_frontend (#5) = equals to text_encoder_frontend]
225
+ - final_proj (#7)
226
+ - vocoder (#8)
227
+ """
228
+ device = _grab_best_device()
229
+ if model_type == "medium":
230
+ name = "seamlessM4T_medium"
231
+ else:
232
+ name = "seamlessM4T_large"
233
+
234
+ original_model = Translator(name, "vocoder_36langs", device, torch.float32)
235
+
236
+ ######### TOKENIZER
237
+
238
+ langs = MEDIUM_SUPPORTED_LANGUAGES if model_type == "medium" else LARGE_SUPPORTED_LANGUAGES
239
+ langs = [f"__{lang}__" for lang in langs]
240
+ vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")
241
+
242
+ save_dir = os.path.join(save_dir, name)
243
+ Path(save_dir).mkdir(exist_ok=True)
244
+
245
+ tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)
246
+
247
+ sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")
248
+
249
+ tokenizer.save_pretrained(save_dir)
250
+ tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)
251
+
252
+ if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
253
+ raise ValueError(
254
+ f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
255
+ )
256
+
257
+ ####### get language to ids dict
258
+ text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
259
+ # offset: vocoder unit vocab size + 5 (for EOS/PAD/BOS/UNK/MSK) + len(supported_languages)
260
+ t2u_lang_code_to_id = {
261
+ code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
262
+ for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
263
+ }
264
+ vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}
265
+
266
+ ######### FE
267
+
268
+ fe = SeamlessM4TFeatureExtractor(language_code=langs)
269
+
270
+ fe.save_pretrained(save_dir)
271
+ fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)
272
+
273
+ processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
274
+ processor.save_pretrained(save_dir)
275
+ processor.push_to_hub(repo_id=repo_id, create_pr=True)
276
+
277
+ processor = SeamlessM4TProcessor.from_pretrained(save_dir)
278
+
279
+ ######## Model
280
+
281
+ # init model
282
+ hf_config = _load_hf_config(model_type)
283
+ hf_model = SeamlessM4TModel(hf_config)
284
+
285
+ hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
286
+ hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
287
+ hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
288
+
289
+ # -1. take care of vocoder
290
+ # similarly to speech T5 must apply and remove weight norm
291
+ hf_model.vocoder.apply_weight_norm()
292
+ hf_model.vocoder = _convert_model(
293
+ original_model,
294
+ hf_model.vocoder,
295
+ vocoder_convert_list,
296
+ device,
297
+ unwanted_prefix="vocoder.code_generator.",
298
+ filter_state_dict="vocoder",
299
+ )
300
+ hf_model.vocoder.remove_weight_norm()
301
+
302
+ # 1. take care of speech encoder
303
+ wav2vec = hf_model.speech_encoder
304
+ hf_model.speech_encoder = _convert_model(
305
+ original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
306
+ )
307
+
308
+ # 2. take care of t2u
309
+
310
+ hf_model.t2u_model = _convert_model(
311
+ original_model,
312
+ hf_model.t2u_model,
313
+ t2u_convert_list,
314
+ device,
315
+ unwanted_prefix="model.",
316
+ filter_state_dict="t2u_model",
317
+ )
318
+
319
+ # 3. take care of text encoder
320
+ hf_model.text_encoder = _convert_model(
321
+ original_model,
322
+ hf_model.text_encoder,
323
+ text_convert_list,
324
+ device,
325
+ unwanted_prefix="model.",
326
+ filter_state_dict=["model.text_encoder"],
327
+ exclude_state_dict="t2u_model",
328
+ )
329
+
330
+ # 4. take care of text decoder
331
+ hf_model.text_decoder = _convert_model(
332
+ original_model,
333
+ hf_model.text_decoder,
334
+ text_convert_list,
335
+ device,
336
+ unwanted_prefix="model.",
337
+ filter_state_dict=["model.text_decoder"],
338
+ exclude_state_dict="t2u_model",
339
+ )
340
+
341
+ # 5. take care of final proj
342
+ hf_model.lm_head = _convert_model(
343
+ original_model,
344
+ hf_model.lm_head,
345
+ [("final_proj.", "")],
346
+ device,
347
+ unwanted_prefix="model.",
348
+ filter_state_dict=["model.final_proj"],
349
+ exclude_state_dict="t2u_model",
350
+ )
351
+
352
+ # sanity check
353
+ print(find_tied_parameters(hf_model))
354
+
355
+ count_1 = param_count(hf_model)
356
+ count_2 = param_count(original_model)
357
+
358
+ print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
359
+ print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")
360
+
361
+ del original_model
362
+
363
+ hf_model.generation_config._from_model_config = False
364
+ hf_model.save_pretrained(save_dir)
365
+ hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
366
+ hf_model = SeamlessM4TModel.from_pretrained(save_dir)
367
+
368
+
369
+ if __name__ == "__main__":
370
+ parser = argparse.ArgumentParser()
371
+ # Required parameters
372
+
373
+ parser.add_argument(
374
+ "--model_type",
375
+ default="medium",
376
+ type=str,
377
+ help="Model type.",
378
+ )
379
+
380
+ parser.add_argument(
381
+ "--save_dir",
382
+ default="/home/ubuntu/weights",
383
+ type=str,
384
+ help="Path to the output PyTorch model.",
385
+ )
386
+
387
+ parser.add_argument(
388
+ "--repo_id",
389
+ default="facebook/hf-seamless-m4t-medium",
390
+ type=str,
391
+ help="Repo ID.",
392
+ )
393
+
394
+ args = parser.parse_args()
395
+
396
+ load_model(args.save_dir, args.model_type, args.repo_id)
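For completeness, a hedged sketch of calling the conversion entry point directly instead of via the CLI; it assumes `seamless_communication` and the original checkpoints are available locally, that the script is importable from the working directory, and that the save directory and repo id below are placeholders matching the argparse defaults.

```python
# Placeholder save_dir / repo_id; load_model is the function defined in the script above
# and performs the tokenizer, feature-extractor and weight conversion end to end.
from convert_fairseq2_to_hf import load_model

load_model(
    save_dir="/home/ubuntu/weights",             # same as the script's default --save_dir
    model_type="medium",                         # any other value selects the large variant
    repo_id="facebook/hf-seamless-m4t-medium",   # same as the default --repo_id
)
```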
docs/transformers/build/lib/transformers/models/seamless_m4t/modeling_seamless_m4t.py ADDED
The diff for this file is too large to render. See raw diff
 
docs/transformers/build/lib/transformers/models/seamless_m4t/processing_seamless_m4t.py ADDED
@@ -0,0 +1,120 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Audio/Text processor class for SeamlessM4T
17
+ """
18
+
19
+ from ...processing_utils import ProcessorMixin
20
+
21
+
22
+ class SeamlessM4TProcessor(ProcessorMixin):
23
+ r"""
24
+ Constructs a SeamlessM4T processor which wraps a SeamlessM4T feature extractor and a SeamlessM4T tokenizer into a
25
+ single processor.
26
+
27
+ [`SeamlessM4TProcessor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and
28
+ [`SeamlessM4TTokenizerFast`]. See the [`~SeamlessM4TProcessor.__call__`] and [`~SeamlessM4TProcessor.decode`] for
29
+ more information.
30
+
31
+ Args:
32
+ feature_extractor ([`SeamlessM4TFeatureExtractor`]):
33
+ The audio processor is a required input.
34
+ tokenizer ([`SeamlessM4TTokenizerFast`]):
35
+ The tokenizer is a required input.
36
+ """
37
+
38
+ feature_extractor_class = "SeamlessM4TFeatureExtractor"
39
+ tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast")
40
+
41
+ def __init__(self, feature_extractor, tokenizer):
42
+ super().__init__(feature_extractor, tokenizer)
43
+
44
+ def __call__(self, text=None, audios=None, src_lang=None, tgt_lang=None, **kwargs):
45
+ """
46
+ Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
47
+ and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
48
+ `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
49
+ SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
50
+ to the docstring of the above two methods for more information.
51
+
52
+ Args:
53
+ text (`str`, `List[str]`, `List[List[str]]`):
54
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
55
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
56
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
57
+ audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
58
+ The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
59
+ of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
60
+ and T the sample length of the audio.
61
+ src_lang (`str`, *optional*):
62
+ The language code of the input texts/audios. If not specified, the last `src_lang` specified will be
63
+ used.
64
+ tgt_lang (`str`, *optional*):
65
+ The code of the target language. If not specified, the last `tgt_lang` specified will be used.
66
+ kwargs (*optional*):
67
+ Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
68
+ tokenizer.
69
+ Returns:
70
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
71
+
72
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
73
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
74
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
75
+ `None`).
76
+ - **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`.
77
+ """
78
+ sampling_rate = kwargs.pop("sampling_rate", None)
79
+
80
+ if text is None and audios is None:
81
+ raise ValueError("You have to specify either text or audios. Both cannot be none.")
82
+ elif text is not None and audios is not None:
83
+ raise ValueError(
84
+ "Text and audios are mutually exclusive when passed to `SeamlessM4T`. Specify one or another."
85
+ )
86
+ elif text is not None:
87
+ if tgt_lang is not None:
88
+ self.tokenizer.tgt_lang = tgt_lang
89
+ if src_lang is not None:
90
+ self.tokenizer.src_lang = src_lang
91
+ encoding = self.tokenizer(text, **kwargs)
92
+
93
+ return encoding
94
+
95
+ else:
96
+ encoding = self.feature_extractor(audios, sampling_rate=sampling_rate, **kwargs)
97
+ return encoding
98
+
99
+ def batch_decode(self, *args, **kwargs):
100
+ """
101
+ This method forwards all its arguments to SeamlessM4TTokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
102
+ Please refer to the docstring of this method for more information.
103
+ """
104
+ return self.tokenizer.batch_decode(*args, **kwargs)
105
+
106
+ def decode(self, *args, **kwargs):
107
+ """
108
+ This method forwards all its arguments to SeamlessM4TTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please
109
+ refer to the docstring of this method for more information.
110
+ """
111
+ return self.tokenizer.decode(*args, **kwargs)
112
+
113
+ @property
114
+ def model_input_names(self):
115
+ tokenizer_input_names = self.tokenizer.model_input_names
116
+ feature_extractor_input_names = self.feature_extractor.model_input_names
117
+ return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
118
+
119
+
120
+ __all__ = ["SeamlessM4TProcessor"]
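A brief sketch of the two mutually exclusive call modes described in the docstring above; the checkpoint name is the one used by the conversion script earlier in this folder, and the audio array is random noise standing in for a real 16 kHz waveform.

```python
# Random noise stands in for real speech; text and audios cannot be passed together.
import numpy as np
from transformers import SeamlessM4TProcessor

processor = SeamlessM4TProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

# Text mode: tokenizes with the requested source/target languages.
text_inputs = processor(text="Hello, world!", src_lang="eng", tgt_lang="fra", return_tensors="pt")

# Audio mode: returns input_features from the feature extractor instead.
waveform = np.random.randn(16000).astype(np.float32)
audio_inputs = processor(audios=waveform, sampling_rate=16000, return_tensors="pt")
```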
docs/transformers/build/lib/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py ADDED
@@ -0,0 +1,450 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Tokenization class for SeamlessM4T."""
16
+
17
+ import os
18
+ from shutil import copyfile
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ from tokenizers import processors
22
+
23
+ from ...tokenization_utils import (
24
+ BatchEncoding,
25
+ PreTokenizedInput,
26
+ TextInput,
27
+ )
28
+ from ...tokenization_utils_fast import PreTrainedTokenizerFast
29
+ from ...utils import PaddingStrategy, is_sentencepiece_available, logging
30
+
31
+
32
+ if is_sentencepiece_available():
33
+ from .tokenization_seamless_m4t import SeamlessM4TTokenizer
34
+ else:
35
+ SeamlessM4TTokenizer = None
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
40
+
41
+
42
+ class SeamlessM4TTokenizerFast(PreTrainedTokenizerFast):
43
+ """
44
+ Construct a "fast" SeamlessM4T tokenizer (backed by HuggingFace's *tokenizers* library). Based on
45
+ [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
46
+
47
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
48
+ refer to this superclass for more information regarding those methods.
49
+
50
+ The tokenization method is `<language code> <tokens> <eos>` for source language documents, and `<eos> <language
51
+ code> <tokens> <eos>` for target language documents.
52
+
53
+ Examples:
54
+
55
+ ```python
56
+ >>> from transformers import SeamlessM4TTokenizerFast
57
+
58
+ >>> tokenizer = SeamlessM4TTokenizerFast.from_pretrained(
59
+ ... "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra"
60
+ ... )
61
+ >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
62
+ >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
63
+ >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
64
+ ```
65
+
66
+ Args:
67
+ vocab_file (`str`, *optional*):
68
+ Path to the vocabulary file.
69
+ tokenizer_file (`str`, *optional*):
70
+ The path to a tokenizer file to use instead of the vocab file.
71
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
72
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
73
+
74
+ <Tip>
75
+
76
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
77
+ sequence. The token used is the `cls_token`.
78
+
79
+ </Tip>
80
+
81
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
82
+ The end of sequence token.
83
+
84
+ <Tip>
85
+
86
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
87
+ The token used is the `sep_token`.
88
+
89
+ </Tip>
90
+
91
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
92
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
93
+ sequence classification or for a text and a question for question answering. It is also used as the last
94
+ token of a sequence built with special tokens.
95
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
96
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
97
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
98
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
99
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
100
+ token instead.
101
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
102
+ The token used for padding, for example when batching sequences of different lengths.
103
+ src_lang (`str`, *optional*, defaults to `"eng"`):
104
+ The language to use as source language for translation.
105
+ tgt_lang (`str`, *optional*, defaults to `"fra"`):
106
+ The language to use as target language for translation.
107
+ additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
108
+ A tuple or a list of additional special tokens.
109
+ """
110
+
111
+ vocab_files_names = VOCAB_FILES_NAMES
112
+ slow_tokenizer_class = SeamlessM4TTokenizer
113
+ model_input_names = ["input_ids", "attention_mask"]
114
+
115
+ prefix_tokens: List[int] = []
116
+ suffix_tokens: List[int] = []
117
+
118
+ def __init__(
119
+ self,
120
+ vocab_file=None,
121
+ tokenizer_file=None,
122
+ bos_token="<s>",
123
+ eos_token="</s>",
124
+ sep_token="</s>",
125
+ cls_token="<s>",
126
+ unk_token="<unk>",
127
+ pad_token="<pad>",
128
+ src_lang="eng",
129
+ tgt_lang="fra",
130
+ additional_special_tokens=None,
131
+ **kwargs,
132
+ ):
133
+ super().__init__(
134
+ vocab_file=vocab_file,
135
+ tokenizer_file=tokenizer_file,
136
+ bos_token=bos_token,
137
+ eos_token=eos_token,
138
+ sep_token=sep_token,
139
+ cls_token=cls_token,
140
+ unk_token=unk_token,
141
+ pad_token=pad_token,
142
+ src_lang=src_lang,
143
+ tgt_lang=tgt_lang,
144
+ additional_special_tokens=additional_special_tokens,
145
+ **kwargs,
146
+ )
147
+
148
+ self.vocab_file = vocab_file
149
+ self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
150
+ self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
151
+ self.set_src_lang_special_tokens(self._src_lang)
152
+ self.set_tgt_lang_special_tokens(self._tgt_lang)
153
+
154
+ @property
155
+ def can_save_slow_tokenizer(self) -> bool:
156
+ return os.path.isfile(self.vocab_file) if self.vocab_file else False
157
+
158
+ @property
159
+ # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang
160
+ def src_lang(self) -> str:
161
+ return self._src_lang
162
+
163
+ @src_lang.setter
164
+ def src_lang(self, new_src_lang: str) -> None:
165
+ if "__" not in new_src_lang:
166
+ self._src_lang = f"__{new_src_lang}__"
167
+ else:
168
+ self._src_lang = new_src_lang
169
+ self.set_src_lang_special_tokens(self._src_lang)
170
+
171
+ @property
172
+ def tgt_lang(self) -> str:
173
+ return self._tgt_lang
174
+
175
+ @tgt_lang.setter
176
+ def tgt_lang(self, new_tgt_lang: str) -> None:
177
+ if "__" not in new_tgt_lang:
178
+ self._tgt_lang = f"__{new_tgt_lang}__"
179
+ else:
180
+ self._tgt_lang = new_tgt_lang
181
+ self.set_tgt_lang_special_tokens(self._tgt_lang)
182
+
183
+ def build_inputs_with_special_tokens(
184
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
185
+ ) -> List[int]:
186
+ """
187
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
188
+ adding special tokens. The special tokens depend on calling set_lang.
189
+
190
+ An SeamlessM4T sequence has the following format, where `X` represents the sequence:
191
+
192
+ - `input_ids` (for encoder) `[src_lang_code] X [eos]`
193
+ - `decoder_input_ids`: (for decoder) `[eos, tgt_lang_code] X [eos]`
194
+
195
+ BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
196
+ separator.
197
+
198
+ Args:
199
+ token_ids_0 (`List[int]`):
200
+ List of IDs to which the special tokens will be added.
201
+ token_ids_1 (`List[int]`, *optional*):
202
+ Optional second list of IDs for sequence pairs.
203
+
204
+ Returns:
205
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
206
+ """
207
+ if token_ids_1 is None:
208
+ return self.prefix_tokens + token_ids_0 + self.suffix_tokens
209
+ # We don't expect to process pairs, but leave the pair logic for API consistency
210
+ return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
211
+
212
+ # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.create_token_type_ids_from_sequences
213
+ def create_token_type_ids_from_sequences(
214
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
215
+ ) -> List[int]:
216
+ """
217
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. NLLB does not
218
+ make use of token type ids, therefore a list of zeros is returned.
219
+
220
+ Args:
221
+ token_ids_0 (`List[int]`):
222
+ List of IDs.
223
+ token_ids_1 (`List[int]`, *optional*):
224
+ Optional second list of IDs for sequence pairs.
225
+
226
+ Returns:
227
+ `List[int]`: List of zeros.
228
+
229
+ """
230
+
231
+ sep = [self.sep_token_id]
232
+ cls = [self.cls_token_id]
233
+
234
+ if token_ids_1 is None:
235
+ return len(cls + token_ids_0 + sep) * [0]
236
+ return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
237
+
238
+ def _build_translation_inputs(
239
+ self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
240
+ ):
241
+ """Used by translation pipeline, to prepare inputs for the generate function"""
242
+ if src_lang is None or tgt_lang is None:
243
+ raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
244
+ self.src_lang = src_lang
245
+ inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
246
+ if "__" not in tgt_lang:
247
+ tgt_lang = f"__{tgt_lang}__"
248
+ tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
249
+ inputs["forced_bos_token_id"] = tgt_lang_id
250
+ return inputs
251
+
252
+ # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.prepare_seq2seq_batch with "fra_Latn"->"fra", "eng_Latn"->"eng"
253
+ def prepare_seq2seq_batch(
254
+ self,
255
+ src_texts: List[str],
256
+ src_lang: str = "eng",
257
+ tgt_texts: Optional[List[str]] = None,
258
+ tgt_lang: str = "fra",
259
+ **kwargs,
260
+ ) -> BatchEncoding:
261
+ self.src_lang = src_lang
262
+ self.tgt_lang = tgt_lang
263
+ return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
264
+
265
+ # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_input_mode
266
+ def _switch_to_input_mode(self):
267
+ return self.set_src_lang_special_tokens(self.src_lang)
268
+
269
+ # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_target_mode
270
+ def _switch_to_target_mode(self):
271
+ return self.set_tgt_lang_special_tokens(self.tgt_lang)
272
+
273
+ def set_src_lang_special_tokens(self, src_lang) -> None:
274
+ """Reset the special tokens to the source lang setting.
275
+ Prefix=[src_lang_code], suffix=[eos].
276
+ """
277
+ self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
278
+
279
+ if self.cur_lang_code == self.unk_token_id:
280
+ logger.warning_once(
281
+ f"`tgt_lang={src_lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
282
+ )
283
+
284
+ self.init_kwargs["src_lang"] = src_lang
285
+
286
+ self.prefix_tokens = [self.cur_lang_code]
287
+ self.suffix_tokens = [self.eos_token_id]
288
+
289
+ prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
290
+ suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
291
+
292
+ self._tokenizer.post_processor = processors.TemplateProcessing(
293
+ single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
294
+ pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
295
+ special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
296
+ )
297
+
298
+ def set_tgt_lang_special_tokens(self, lang: str) -> None:
299
+ """Reset the special tokens to the target lang setting.
300
+ Prefix=[eos, tgt_lang_code] and suffix=[eos].
301
+ """
302
+ self.cur_lang_code = self.convert_tokens_to_ids(lang)
303
+
304
+ if self.cur_lang_code == self.unk_token_id:
305
+ logger.warning_once(
306
+ f"`tgt_lang={lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
307
+ )
308
+
309
+ self.init_kwargs["tgt_lang"] = lang
310
+
311
+ self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
312
+ self.suffix_tokens = [self.eos_token_id]
313
+
314
+ prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
315
+ suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
316
+
317
+ self._tokenizer.post_processor = processors.TemplateProcessing(
318
+ single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
319
+ pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
320
+ special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
321
+ )
322
+
323
+ # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.save_vocabulary
324
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
325
+ if not self.can_save_slow_tokenizer:
326
+ raise ValueError(
327
+ "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
328
+ "tokenizer."
329
+ )
330
+
331
+ if not os.path.isdir(save_directory):
332
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
333
+ return
334
+ out_vocab_file = os.path.join(
335
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
336
+ )
337
+
338
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
339
+ copyfile(self.vocab_file, out_vocab_file)
340
+
341
+ return (out_vocab_file,)
342
+
343
+ @classmethod
344
+ def _from_pretrained(
345
+ cls,
346
+ resolved_vocab_files,
347
+ pretrained_model_name_or_path,
348
+ init_configuration,
349
+ *init_inputs,
350
+ token=None,
351
+ cache_dir=None,
352
+ local_files_only=False,
353
+ _commit_hash=None,
354
+ _is_local=False,
355
+ **kwargs,
356
+ ):
357
+ tokenizer = super()._from_pretrained(
358
+ resolved_vocab_files,
359
+ pretrained_model_name_or_path,
360
+ init_configuration,
361
+ *init_inputs,
362
+ token=token,
363
+ cache_dir=cache_dir,
364
+ local_files_only=local_files_only,
365
+ _commit_hash=_commit_hash,
366
+ _is_local=_is_local,
367
+ **kwargs,
368
+ )
369
+
370
+ # ensure also set after from pretrained
371
+ tokenizer.set_src_lang_special_tokens(tokenizer._src_lang)
372
+ tokenizer.set_tgt_lang_special_tokens(tokenizer._tgt_lang)
373
+
374
+ return tokenizer
375
+
376
+ def __call__(
377
+ self,
378
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
379
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
380
+ text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
381
+ text_pair_target: Optional[
382
+ Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
383
+ ] = None,
384
+ padding: Union[bool, str, PaddingStrategy] = True,
385
+ pad_to_multiple_of: Optional[int] = 2,
386
+ src_lang: Optional[str] = None,
387
+ tgt_lang: Optional[str] = None,
388
+ **kwargs,
389
+ ):
390
+ """
391
+ Args:
392
+ text (`str`, `List[str]`, `List[List[str]]`, *optional*):
393
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
394
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
395
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
396
+ text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
397
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
398
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
399
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
400
+ text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
401
+ The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
402
+ list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
403
+ you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
404
+ text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
405
+ The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
406
+ list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
407
+ you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
408
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
409
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
410
+ index) among:
411
+
412
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
413
+ sequence is provided).
414
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
415
+ acceptable input length for the model if that argument is not provided.
416
+ - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
417
+ lengths).
418
+ pad_to_multiple_of (`int`, *optional*, defaults to 2):
419
+ If set, will pad the sequence to a multiple of the provided value.
420
+
421
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
422
+ `>= 7.0` (Volta).
423
+ src_lang (`str`, *optional*):
424
+ A string representing the source language. If not specified, the last `src_lang` specified (either
425
+ during initialization or when calling this tokenizer) will be used.
426
+ tgt_lang (`str`, *optional*):
427
+ A string representing the target language. If not specified, the last `tgt_lang` specified (either
428
+ during initialization or when calling this tokenizer) will be used.
429
+ kwargs (*optional*):
430
+ Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizerFast.__call__`].
431
+ """
432
+ if src_lang is not None:
433
+ self.src_lang = src_lang
434
+ if tgt_lang is not None:
435
+ self.tgt_lang = tgt_lang
436
+
437
+ output = super().__call__(
438
+ text=text,
439
+ text_pair=text_pair,
440
+ text_target=text_target,
441
+ text_pair_target=text_pair_target,
442
+ padding=padding,
443
+ pad_to_multiple_of=pad_to_multiple_of,
444
+ **kwargs,
445
+ )
446
+
447
+ return output
448
+
449
+
450
+ __all__ = ["SeamlessM4TTokenizerFast"]
docs/transformers/build/lib/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py ADDED
@@ -0,0 +1,404 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Converting Meta SeamlessM4Tv2 checkpoints from seamless_communication to HF."""
16
+
17
+ import argparse
18
+ import os
19
+ from pathlib import Path
20
+
21
+ import torch
22
+ from accelerate.utils.modeling import find_tied_parameters
23
+ from seamless_communication.inference import Translator
24
+
25
+ from transformers import (
26
+ SeamlessM4TFeatureExtractor,
27
+ SeamlessM4TProcessor,
28
+ SeamlessM4TTokenizer,
29
+ SeamlessM4Tv2Config,
30
+ SeamlessM4Tv2Model,
31
+ )
32
+ from transformers.utils import logging
33
+
34
+
35
+ # fmt: off
36
+ UNIT_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__", ]
37
+ # fmt: on
38
+
39
+ # fmt: off
40
+ VOCODER_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",]
41
+ # fmt: on
42
+
43
+ # fmt: off
44
+ LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",]
45
+ # fmt: on
46
+
47
+
48
+ def assert_param_count(model_1, model_2):
49
+ count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
50
+ count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
51
+ assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
52
+
53
+
54
+ def param_count(model):
55
+ return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
56
+
57
+
58
+ def _grab_best_device(use_gpu=True):
59
+ if torch.cuda.device_count() > 0 and use_gpu:
60
+ device = "cuda"
61
+ else:
62
+ device = "cpu"
63
+ return torch.device(device)
64
+
65
+
66
+ logging.set_verbosity_info()
67
+ logger = logging.get_logger(__name__)
68
+
69
+ vocoder_convert_list = [
70
+ ("ups", "hifi_gan.upsampler"),
71
+ ("conv_pre", "hifi_gan.conv_pre"),
72
+ ("resblocks", "hifi_gan.resblocks"),
73
+ ("conv_post", "hifi_gan.conv_post"),
74
+ ("lang", "language_embedding"),
75
+ ("spkr", "speaker_embedding"),
76
+ ("dict.", "unit_embedding."),
77
+ ("dur_predictor.conv1.0", "dur_predictor.conv1"),
78
+ ("dur_predictor.conv2.0", "dur_predictor.conv2"),
79
+ ]
80
+
81
+ # order is important
82
+ wav2vec_convert_list = [
83
+ ("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
84
+ ("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
85
+ ("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
86
+ ("speech_encoder.inner.layers", "encoder.layers"),
87
+ ("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
88
+ ("speech_encoder.adaptor_layers", "adapter.layers"),
89
+ ("inner_proj", "intermediate_dense"),
90
+ ("self_attn.output_proj", "self_attn.linear_out"),
91
+ ("output_proj", "output_dense"),
92
+ ("self_attn.k_proj", "self_attn.linear_k"),
93
+ ("self_attn.v_proj", "self_attn.linear_v"),
94
+ ("self_attn.q_proj", "self_attn.linear_q"),
95
+ ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
96
+ ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
97
+ ("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"),
98
+ ("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
99
+ ("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
100
+ ("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
101
+ ("conv.depthwise_conv", "conv_module.depthwise_conv"),
102
+ ("conv.batch_norm", "conv_module.batch_norm"),
103
+ ("conv.layer_norm", "conv_module.depthwise_layer_norm"),
104
+ ("conv_layer_norm", "conv_module.layer_norm"),
105
+ ("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
106
+ ("speech_encoder.proj2", "intermediate_ffn.output_dense"),
107
+ ("speech_encoder.layer_norm", "inner_layer_norm"),
108
+ ]
109
+
110
+ t2u_convert_list = [
111
+ ("t2u_model.final_proj", "lm_head"),
112
+ ("t2u_model.", "model."),
113
+ ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
114
+ ("encoder_decoder_attn", "cross_attention"),
115
+ ("linear_k", "k_proj"),
116
+ ("linear_v", "v_proj"),
117
+ ("linear_q", "q_proj"),
118
+ ("ffn.inner_proj", "ffn.fc1"),
119
+ ("ffn.output_proj", "ffn.fc2"),
120
+ ("output_proj", "out_proj"),
121
+ ("decoder_frontend.embed_char", "decoder.embed_char"),
122
+ ("decoder_frontend.pos_emb_alpha_char", "decoder.pos_emb_alpha_char"),
123
+ ("decoder_frontend.embed", "decoder.embed_tokens"),
124
+ ("decoder_frontend.pos_emb_alpha", "decoder.pos_emb_alpha"),
125
+ ("conv1d.conv", "conv"),
126
+ ("conv1d_layer_norm", "conv_layer_norm"),
127
+ ("decoder_frontend.variance_adaptor", "decoder"),
128
+ ("duration_predictor.conv1.0", "duration_predictor.conv1"),
129
+ ("duration_predictor.conv2.0", "duration_predictor.conv2"),
130
+ ]
131
+
132
+ text_convert_list = [
133
+ ("text_encoder.", ""),
134
+ ("text_decoder.", ""),
135
+ ("text_encoder_frontend.embed", "embed_tokens"),
136
+ ("text_decoder_frontend.embed", "embed_tokens"),
137
+ ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
138
+ ("encoder_decoder_attn", "cross_attention"),
139
+ ("linear_k", "k_proj"),
140
+ ("linear_v", "v_proj"),
141
+ ("linear_q", "q_proj"),
142
+ ("ffn.inner_proj", "ffn.fc1"),
143
+ ("ffn.output_proj", "ffn.fc2"),
144
+ ("output_proj", "out_proj"),
145
+ ("final_proj", "lm_head"),
146
+ ]
147
+
148
+ CUR_PATH = os.path.dirname(os.path.abspath(__file__))
149
+ default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
150
+ CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
151
+
152
+
153
+ def _load_hf_config():
154
+ return SeamlessM4Tv2Config()
155
+
156
+
157
+ def _convert_model(
158
+ original_model,
159
+ hf_model,
160
+ convert_list,
161
+ device,
162
+ unwanted_prefix="model.",
163
+ filter_state_dict="speech",
164
+ exclude_state_dict=None,
165
+ ):
166
+ state_dict = original_model.state_dict()
167
+
168
+ # filter func
169
+ if isinstance(filter_state_dict, str):
170
+
171
+ def filter_func(x):
172
+ return filter_state_dict in x[0]
173
+
174
+ else:
175
+
176
+ def filter_func(item):
177
+ if exclude_state_dict is not None and exclude_state_dict in item[0]:
178
+ return False
179
+ for filter_el in filter_state_dict:
180
+ if filter_el in item[0]:
181
+ return True
182
+
183
+ return False
184
+
185
+ state_dict = dict(filter(filter_func, state_dict.items()))
186
+
187
+ for k, v in list(state_dict.items()):
188
+ new_k = k[len(unwanted_prefix) :]
189
+ for old_layer_name, new_layer_name in convert_list:
190
+ if old_layer_name in new_k:
191
+ new_k = new_k.replace(old_layer_name, new_layer_name)
192
+
193
+ # must do it by hand
194
+ if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
195
+ new_k = new_k.replace("layer_norm", "final_layer_norm")
196
+
197
+ state_dict[new_k] = state_dict.pop(k)
198
+
199
+ extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
200
+ extra_keys = set(extra_keys)
201
+ missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
202
+ missing_keys = set({k for k in missing_keys if "final_logits_bias" not in k})
203
+ if len(extra_keys) != 0:
204
+ raise ValueError(f"extra keys found: {extra_keys}")
205
+ if len(missing_keys) != 0:
206
+ raise ValueError(f"missing keys: {missing_keys}")
207
+ hf_model.load_state_dict(state_dict, strict=False)
208
+ n_params = param_count(hf_model)
209
+
210
+ logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params")
211
+
212
+ hf_model.eval()
213
+ hf_model.to(device)
214
+ del state_dict
215
+
216
+ return hf_model
217
+
218
+
219
+ def load_model(save_dir, model_type, repo_id):
220
+ """
221
+ Meta SeamlessM4Tv2 is made of 8 main components:
222
+ - speech_encoder (#1) and speech_encoder_frontend (#2)
223
+ - t2u_model (#3)
224
+ - text_encoder (#4) and text_encoder_frontend (#5)
225
+ - text_decoder (#6) [and text_decoder_frontend (#5) = equals to text_encoder_frontend]
226
+ - final_proj (#7)
227
+ - vocoder (#8)
228
+ """
229
+ device = _grab_best_device()
230
+ name = "seamlessM4T_v2_large"
231
+
232
+ original_model = Translator(name, "vocoder_v2", device, dtype=torch.float32)
233
+
234
+ ######### TOKENIZER
235
+
236
+ langs = LARGE_SUPPORTED_LANGUAGES
237
+ langs = [f"__{lang}__" for lang in langs]
238
+ vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")
239
+
240
+ save_dir = os.path.join(save_dir, name)
241
+ Path(save_dir).mkdir(exist_ok=True)
242
+
243
+ tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)
244
+
245
+ sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")
246
+
247
+ tokenizer.save_pretrained(save_dir)
248
+ tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)
249
+
250
+ if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
251
+ raise ValueError(
252
+ f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
253
+ )
254
+
255
+ ####### get language to ids dict
256
+ text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
257
+ # offset: vocoder unit vocab size + 5 (for EOS/PAD/BOS/UNK/MSK) + len(supported_languages)
258
+ t2u_lang_code_to_id = {
259
+ code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
260
+ for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
261
+ }
262
+ vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}
263
+
264
+ ######### FE
265
+
266
+ fe = SeamlessM4TFeatureExtractor(language_code=langs)
267
+
268
+ fe.save_pretrained(save_dir)
269
+ fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)
270
+
271
+ processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
272
+ processor.save_pretrained(save_dir)
273
+ processor.push_to_hub(repo_id=repo_id, create_pr=True)
274
+
275
+ processor = SeamlessM4TProcessor.from_pretrained(save_dir)
276
+
277
+ ######## Model
278
+
279
+ # init config
280
+ hf_config = _load_hf_config()
281
+
282
+ ######## get id_to_text and char_to_id from original model tokenizers
283
+ id_to_text = {i: original_model.text_tokenizer.model.index_to_token(i) for i in range(hf_config.vocab_size)}
284
+ char_to_id = {
285
+ original_model.model.t2u_model.decoder_frontend.char_tokenizer.model.index_to_token(i): i for i in range(10904)
286
+ }
287
+
288
+ # init model
289
+ hf_model = SeamlessM4Tv2Model(hf_config)
290
+
291
+ hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
292
+ hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
293
+ hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
294
+ hf_model.generation_config.__setattr__("id_to_text", id_to_text)
295
+ hf_model.generation_config.__setattr__("char_to_id", char_to_id)
296
+
297
+ # -1. take care of vocoder
298
+ # similarly to speech T5 must apply and remove weight norm
299
+ hf_model.vocoder.apply_weight_norm()
300
+ hf_model.vocoder = _convert_model(
301
+ original_model,
302
+ hf_model.vocoder,
303
+ vocoder_convert_list,
304
+ device,
305
+ unwanted_prefix="vocoder.code_generator.",
306
+ filter_state_dict="vocoder",
307
+ )
308
+ hf_model.vocoder.remove_weight_norm()
309
+
310
+ # 1. take care of speech encoder
311
+ wav2vec = hf_model.speech_encoder
312
+ hf_model.speech_encoder = _convert_model(
313
+ original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
314
+ )
315
+
316
+ # 2. take care of t2u
317
+
318
+ hf_model.t2u_model = _convert_model(
319
+ original_model,
320
+ hf_model.t2u_model,
321
+ t2u_convert_list,
322
+ device,
323
+ unwanted_prefix="model.",
324
+ filter_state_dict="t2u_model",
325
+ )
326
+
327
+ # 3. take care of text encoder
328
+ hf_model.text_encoder = _convert_model(
329
+ original_model,
330
+ hf_model.text_encoder,
331
+ text_convert_list,
332
+ device,
333
+ unwanted_prefix="model.",
334
+ filter_state_dict=["model.text_encoder"],
335
+ exclude_state_dict="t2u_model",
336
+ )
337
+
338
+ # 4. take care of text decoder
339
+ hf_model.text_decoder = _convert_model(
340
+ original_model,
341
+ hf_model.text_decoder,
342
+ text_convert_list,
343
+ device,
344
+ unwanted_prefix="model.",
345
+ filter_state_dict=["model.text_decoder"],
346
+ exclude_state_dict="t2u_model",
347
+ )
348
+
349
+ # 5. take care of final proj
350
+ hf_model.lm_head = _convert_model(
351
+ original_model,
352
+ hf_model.lm_head,
353
+ [("final_proj.", "")],
354
+ device,
355
+ unwanted_prefix="model.",
356
+ filter_state_dict=["model.final_proj"],
357
+ exclude_state_dict="t2u_model",
358
+ )
359
+
360
+ # sanity check
361
+ print(find_tied_parameters(hf_model))
362
+
363
+ count_1 = param_count(hf_model)
364
+ count_2 = param_count(original_model)
365
+
366
+ print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
367
+ print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")
368
+
369
+ del original_model
370
+
371
+ hf_model.generation_config._from_model_config = False
372
+ hf_model.save_pretrained(save_dir)
373
+ hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
374
+ hf_model = SeamlessM4Tv2Model.from_pretrained(save_dir)
375
+
376
+
377
+ if __name__ == "__main__":
378
+ parser = argparse.ArgumentParser()
379
+ # Required parameters
380
+
381
+ parser.add_argument(
382
+ "--model_type",
383
+ default="large",
384
+ type=str,
385
+ help="Model type.",
386
+ )
387
+
388
+ parser.add_argument(
389
+ "--save_dir",
390
+ default="/home/ubuntu/weights_v2",
391
+ type=str,
392
+ help="Path to the output PyTorch model.",
393
+ )
394
+
395
+ parser.add_argument(
396
+ "--repo_id",
397
+ default="facebook/seamless-m4t-v2-large",
398
+ type=str,
399
+ help="Repo ID.",
400
+ )
401
+
402
+ args = parser.parse_args()
403
+
404
+ load_model(args.save_dir, args.model_type, args.repo_id)
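
As a side note on `_convert_model`, the key mapping is just prefix stripping followed by ordered substring substitutions. A minimal standalone sketch of that step (the example key is illustrative, and only two entries of `wav2vec_convert_list` are reproduced):

```python
# Sketch of the renaming loop inside _convert_model: strip the unwanted prefix,
# then apply the (old, new) substitutions in order.
convert_list = [
    ("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
    ("self_attn.k_proj", "self_attn.linear_k"),
]

def rename(key: str, unwanted_prefix: str = "model.") -> str:
    new_key = key[len(unwanted_prefix):]
    for old_name, new_name in convert_list:
        if old_name in new_key:
            new_key = new_key.replace(old_name, new_name)
    return new_key

print(rename("model.speech_encoder_frontend.model_dim_proj.weight"))
# feature_projection.projection.weight
```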
docs/transformers/build/lib/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py ADDED
The diff for this file is too large to render. See raw diff
 
docs/transformers/build/lib/transformers/models/segformer/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_segformer import *
22
+ from .feature_extraction_segformer import *
23
+ from .image_processing_segformer import *
24
+ from .modeling_segformer import *
25
+ from .modeling_tf_segformer import *
26
+ else:
27
+ import sys
28
+
29
+ _file = globals()["__file__"]
30
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/segformer/configuration_segformer.py ADDED
@@ -0,0 +1,171 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 NVIDIA and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """SegFormer model configuration"""
16
+
17
+ import warnings
18
+ from collections import OrderedDict
19
+ from typing import Mapping
20
+
21
+ from packaging import version
22
+
23
+ from ...configuration_utils import PretrainedConfig
24
+ from ...onnx import OnnxConfig
25
+ from ...utils import logging
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ class SegformerConfig(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`SegformerModel`]. It is used to instantiate an
34
+ SegFormer model according to the specified arguments, defining the model architecture. Instantiating a
35
+ configuration with the defaults will yield a similar configuration to that of the SegFormer
36
+ [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
37
+ architecture.
38
+
39
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
40
+ documentation from [`PretrainedConfig`] for more information.
41
+
42
+ Args:
43
+ num_channels (`int`, *optional*, defaults to 3):
44
+ The number of input channels.
45
+ num_encoder_blocks (`int`, *optional*, defaults to 4):
46
+ The number of encoder blocks (i.e. stages in the Mix Transformer encoder).
47
+ depths (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`):
48
+ The number of layers in each encoder block.
49
+ sr_ratios (`List[int]`, *optional*, defaults to `[8, 4, 2, 1]`):
50
+ Sequence reduction ratios in each encoder block.
51
+ hidden_sizes (`List[int]`, *optional*, defaults to `[32, 64, 160, 256]`):
52
+ Dimension of each of the encoder blocks.
53
+ patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
54
+ Patch size before each encoder block.
55
+ strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
56
+ Stride before each encoder block.
57
+ num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 5, 8]`):
58
+ Number of attention heads for each attention layer in each block of the Transformer encoder.
59
+ mlp_ratios (`List[int]`, *optional*, defaults to `[4, 4, 4, 4]`):
60
+ Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
61
+ encoder blocks.
62
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
63
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
64
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
65
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
66
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
67
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
68
+ The dropout ratio for the attention probabilities.
69
+ classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
70
+ The dropout probability before the classification head.
71
+ initializer_range (`float`, *optional*, defaults to 0.02):
72
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
73
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
74
+ The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
75
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
76
+ The epsilon used by the layer normalization layers.
77
+ decoder_hidden_size (`int`, *optional*, defaults to 256):
78
+ The dimension of the all-MLP decode head.
79
+ semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
80
+ The index that is ignored by the loss function of the semantic segmentation model.
81
+
82
+ Example:
83
+
84
+ ```python
85
+ >>> from transformers import SegformerModel, SegformerConfig
86
+
87
+ >>> # Initializing a SegFormer nvidia/segformer-b0-finetuned-ade-512-512 style configuration
88
+ >>> configuration = SegformerConfig()
89
+
90
+ >>> # Initializing a model from the nvidia/segformer-b0-finetuned-ade-512-512 style configuration
91
+ >>> model = SegformerModel(configuration)
92
+
93
+ >>> # Accessing the model configuration
94
+ >>> configuration = model.config
95
+ ```"""
96
+
97
+ model_type = "segformer"
98
+
99
+ def __init__(
100
+ self,
101
+ num_channels=3,
102
+ num_encoder_blocks=4,
103
+ depths=[2, 2, 2, 2],
104
+ sr_ratios=[8, 4, 2, 1],
105
+ hidden_sizes=[32, 64, 160, 256],
106
+ patch_sizes=[7, 3, 3, 3],
107
+ strides=[4, 2, 2, 2],
108
+ num_attention_heads=[1, 2, 5, 8],
109
+ mlp_ratios=[4, 4, 4, 4],
110
+ hidden_act="gelu",
111
+ hidden_dropout_prob=0.0,
112
+ attention_probs_dropout_prob=0.0,
113
+ classifier_dropout_prob=0.1,
114
+ initializer_range=0.02,
115
+ drop_path_rate=0.1,
116
+ layer_norm_eps=1e-6,
117
+ decoder_hidden_size=256,
118
+ semantic_loss_ignore_index=255,
119
+ **kwargs,
120
+ ):
121
+ super().__init__(**kwargs)
122
+
123
+ if "reshape_last_stage" in kwargs and kwargs["reshape_last_stage"] is False:
124
+ warnings.warn(
125
+ "Reshape_last_stage is set to False in this config. This argument is deprecated and will soon be"
126
+ " removed, as the behaviour will default to that of reshape_last_stage = True.",
127
+ FutureWarning,
128
+ )
129
+
130
+ self.num_channels = num_channels
131
+ self.num_encoder_blocks = num_encoder_blocks
132
+ self.depths = depths
133
+ self.sr_ratios = sr_ratios
134
+ self.hidden_sizes = hidden_sizes
135
+ self.patch_sizes = patch_sizes
136
+ self.strides = strides
137
+ self.mlp_ratios = mlp_ratios
138
+ self.num_attention_heads = num_attention_heads
139
+ self.hidden_act = hidden_act
140
+ self.hidden_dropout_prob = hidden_dropout_prob
141
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
142
+ self.classifier_dropout_prob = classifier_dropout_prob
143
+ self.initializer_range = initializer_range
144
+ self.drop_path_rate = drop_path_rate
145
+ self.layer_norm_eps = layer_norm_eps
146
+ self.decoder_hidden_size = decoder_hidden_size
147
+ self.reshape_last_stage = kwargs.get("reshape_last_stage", True)
148
+ self.semantic_loss_ignore_index = semantic_loss_ignore_index
149
+
150
+
151
+ class SegformerOnnxConfig(OnnxConfig):
152
+ torch_onnx_minimum_version = version.parse("1.11")
153
+
154
+ @property
155
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
156
+ return OrderedDict(
157
+ [
158
+ ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
159
+ ]
160
+ )
161
+
162
+ @property
163
+ def atol_for_validation(self) -> float:
164
+ return 1e-4
165
+
166
+ @property
167
+ def default_onnx_opset(self) -> int:
168
+ return 12
169
+
170
+
171
+ __all__ = ["SegformerConfig", "SegformerOnnxConfig"]
docs/transformers/build/lib/transformers/models/segformer/convert_segformer_original_to_pytorch.py ADDED
@@ -0,0 +1,387 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert SegFormer checkpoints."""
16
+
17
+ import argparse
18
+ import json
19
+ from collections import OrderedDict
20
+ from pathlib import Path
21
+
22
+ import requests
23
+ import torch
24
+ from huggingface_hub import hf_hub_download
25
+ from PIL import Image
26
+
27
+ from transformers import (
28
+ SegformerConfig,
29
+ SegformerForImageClassification,
30
+ SegformerForSemanticSegmentation,
31
+ SegformerImageProcessor,
32
+ )
33
+ from transformers.utils import logging
34
+
35
+
36
+ logging.set_verbosity_info()
37
+ logger = logging.get_logger(__name__)
38
+
39
+
40
+ def rename_keys(state_dict, encoder_only=False):
41
+ new_state_dict = OrderedDict()
42
+ for key, value in state_dict.items():
43
+ if encoder_only and not key.startswith("head"):
44
+ key = "segformer.encoder." + key
45
+ if key.startswith("backbone"):
46
+ key = key.replace("backbone", "segformer.encoder")
47
+ if "patch_embed" in key:
48
+ # replace for example patch_embed1 by patch_embeddings.0
49
+ idx = key[key.find("patch_embed") + len("patch_embed")]
50
+ key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx) - 1}")
51
+ if "norm" in key:
52
+ key = key.replace("norm", "layer_norm")
53
+ if "segformer.encoder.layer_norm" in key:
54
+ # replace for example layer_norm1 by layer_norm.0
55
+ idx = key[key.find("segformer.encoder.layer_norm") + len("segformer.encoder.layer_norm")]
56
+ key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx) - 1}")
57
+ if "layer_norm1" in key:
58
+ key = key.replace("layer_norm1", "layer_norm_1")
59
+ if "layer_norm2" in key:
60
+ key = key.replace("layer_norm2", "layer_norm_2")
61
+ if "block" in key:
62
+ # replace for example block1 by block.0
63
+ idx = key[key.find("block") + len("block")]
64
+ key = key.replace(f"block{idx}", f"block.{int(idx) - 1}")
65
+ if "attn.q" in key:
66
+ key = key.replace("attn.q", "attention.self.query")
67
+ if "attn.proj" in key:
68
+ key = key.replace("attn.proj", "attention.output.dense")
69
+ if "attn" in key:
70
+ key = key.replace("attn", "attention.self")
71
+ if "fc1" in key:
72
+ key = key.replace("fc1", "dense1")
73
+ if "fc2" in key:
74
+ key = key.replace("fc2", "dense2")
75
+ if "linear_pred" in key:
76
+ key = key.replace("linear_pred", "classifier")
77
+ if "linear_fuse" in key:
78
+ key = key.replace("linear_fuse.conv", "linear_fuse")
79
+ key = key.replace("linear_fuse.bn", "batch_norm")
80
+ if "linear_c" in key:
81
+ # replace for example linear_c4 by linear_c.3
82
+ idx = key[key.find("linear_c") + len("linear_c")]
83
+ key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx) - 1}")
84
+ if key.startswith("head"):
85
+ key = key.replace("head", "classifier")
86
+ new_state_dict[key] = value
87
+
88
+ return new_state_dict
89
+
90
+
91
+ def read_in_k_v(state_dict, config):
92
+ # for each of the encoder blocks:
93
+ for i in range(config.num_encoder_blocks):
94
+ for j in range(config.depths[i]):
95
+ # read in weights + bias of keys and values (which is a single matrix in the original implementation)
96
+ kv_weight = state_dict.pop(f"segformer.encoder.block.{i}.{j}.attention.self.kv.weight")
97
+ kv_bias = state_dict.pop(f"segformer.encoder.block.{i}.{j}.attention.self.kv.bias")
98
+ # next, add keys and values (in that order) to the state dict
99
+ state_dict[f"segformer.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[
100
+ : config.hidden_sizes[i], :
101
+ ]
102
+ state_dict[f"segformer.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]]
103
+ state_dict[f"segformer.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[
104
+ config.hidden_sizes[i] :, :
105
+ ]
106
+ state_dict[f"segformer.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[
107
+ config.hidden_sizes[i] :
108
+ ]
109
+
110
+
111
+ # We will verify our results on a COCO image
112
+ def prepare_img():
113
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
114
+ image = Image.open(requests.get(url, stream=True).raw)
115
+
116
+ return image
117
+
118
+
119
+ @torch.no_grad()
120
+ def convert_segformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path):
121
+ """
122
+ Copy/paste/tweak model's weights to our SegFormer structure.
123
+ """
124
+
125
+ # load default SegFormer configuration
126
+ config = SegformerConfig()
127
+ encoder_only = False
128
+
129
+ # set attributes based on model_name
130
+ repo_id = "huggingface/label-files"
131
+ if "segformer" in model_name:
132
+ size = model_name[len("segformer.") : len("segformer.") + 2]
133
+ if "ade" in model_name:
134
+ config.num_labels = 150
135
+ filename = "ade20k-id2label.json"
136
+ expected_shape = (1, 150, 128, 128)
137
+ elif "city" in model_name:
138
+ config.num_labels = 19
139
+ filename = "cityscapes-id2label.json"
140
+ expected_shape = (1, 19, 128, 128)
141
+ else:
142
+ raise ValueError(f"Model {model_name} not supported")
143
+ elif "mit" in model_name:
144
+ encoder_only = True
145
+ size = model_name[4:6]
146
+ config.num_labels = 1000
147
+ filename = "imagenet-1k-id2label.json"
148
+ expected_shape = (1, 1000)
149
+ else:
150
+ raise ValueError(f"Model {model_name} not supported")
151
+
152
+ # set config attributes
153
+ id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
154
+ id2label = {int(k): v for k, v in id2label.items()}
155
+ config.id2label = id2label
156
+ config.label2id = {v: k for k, v in id2label.items()}
157
+ if size == "b0":
158
+ pass
159
+ elif size == "b1":
160
+ config.hidden_sizes = [64, 128, 320, 512]
161
+ config.decoder_hidden_size = 256
162
+ elif size == "b2":
163
+ config.hidden_sizes = [64, 128, 320, 512]
164
+ config.decoder_hidden_size = 768
165
+ config.depths = [3, 4, 6, 3]
166
+ elif size == "b3":
167
+ config.hidden_sizes = [64, 128, 320, 512]
168
+ config.decoder_hidden_size = 768
169
+ config.depths = [3, 4, 18, 3]
170
+ elif size == "b4":
171
+ config.hidden_sizes = [64, 128, 320, 512]
172
+ config.decoder_hidden_size = 768
173
+ config.depths = [3, 8, 27, 3]
174
+ elif size == "b5":
175
+ config.hidden_sizes = [64, 128, 320, 512]
176
+ config.decoder_hidden_size = 768
177
+ config.depths = [3, 6, 40, 3]
178
+ else:
179
+ raise ValueError(f"Size {size} not supported")
180
+
181
+ # load image processor (only resize + normalize)
182
+ image_processor = SegformerImageProcessor(
183
+ image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
184
+ )
185
+
186
+ # prepare image
187
+ image = prepare_img()
188
+ pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
189
+
190
+ logger.info(f"Converting model {model_name}...")
191
+
192
+ # load original state dict
193
+ if encoder_only:
194
+ state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=True)
195
+ else:
196
+ state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=True)["state_dict"]
197
+
198
+ # rename keys
199
+ state_dict = rename_keys(state_dict, encoder_only=encoder_only)
200
+ if not encoder_only:
201
+ del state_dict["decode_head.conv_seg.weight"]
202
+ del state_dict["decode_head.conv_seg.bias"]
203
+
204
+ # key and value matrices need special treatment
205
+ read_in_k_v(state_dict, config)
206
+
207
+ # create HuggingFace model and load state dict
208
+ if encoder_only:
209
+ config.reshape_last_stage = False
210
+ model = SegformerForImageClassification(config)
211
+ else:
212
+ model = SegformerForSemanticSegmentation(config)
213
+ model.load_state_dict(state_dict)
214
+ model.eval()
215
+
216
+ # forward pass
217
+ outputs = model(pixel_values)
218
+ logits = outputs.logits
219
+
220
+ # set expected_slice based on model name
221
+ # ADE20k checkpoints
222
+ if model_name == "segformer.b0.512x512.ade.160k":
223
+ expected_slice = torch.tensor(
224
+ [
225
+ [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]],
226
+ [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]],
227
+ [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]],
228
+ ]
229
+ )
230
+ elif model_name == "segformer.b1.512x512.ade.160k":
231
+ expected_slice = torch.tensor(
232
+ [
233
+ [[-7.5820, -8.7231, -8.3215], [-8.0600, -10.3529, -10.0304], [-7.5208, -9.4103, -9.6239]],
234
+ [[-12.6918, -13.8994, -13.7137], [-13.3196, -15.7523, -15.4789], [-12.9343, -14.8757, -14.9689]],
235
+ [[-11.1911, -11.9421, -11.3243], [-11.3342, -13.6839, -13.3581], [-10.3909, -12.1832, -12.4858]],
236
+ ]
237
+ )
238
+ elif model_name == "segformer.b2.512x512.ade.160k":
239
+ expected_slice = torch.tensor(
240
+ [
241
+ [[-11.8173, -14.3850, -16.3128], [-14.5648, -16.5804, -18.6568], [-14.7223, -15.7387, -18.4218]],
242
+ [[-15.7290, -17.9171, -19.4423], [-18.3105, -19.9448, -21.4661], [-17.9296, -18.6497, -20.7910]],
243
+ [[-15.0783, -17.0336, -18.2789], [-16.8771, -18.6870, -20.1612], [-16.2454, -17.1426, -19.5055]],
244
+ ]
245
+ )
246
+ elif model_name == "segformer.b3.512x512.ade.160k":
247
+ expected_slice = torch.tensor(
248
+ [
249
+ [[-9.0878, -10.2081, -10.1891], [-9.3144, -10.7941, -10.9843], [-9.2294, -10.3855, -10.5704]],
250
+ [[-12.2316, -13.9068, -13.6102], [-12.9161, -14.3702, -14.3235], [-12.5233, -13.7174, -13.7932]],
251
+ [[-14.6275, -15.2490, -14.9727], [-14.3400, -15.9687, -16.2827], [-14.1484, -15.4033, -15.8937]],
252
+ ]
253
+ )
254
+ elif model_name == "segformer.b4.512x512.ade.160k":
255
+ expected_slice = torch.tensor(
256
+ [
257
+ [[-12.3144, -13.2447, -14.0802], [-13.3614, -14.5816, -15.6117], [-13.3340, -14.4433, -16.2219]],
258
+ [[-19.2781, -20.4128, -20.7506], [-20.6153, -21.6566, -22.0998], [-19.9800, -21.0430, -22.1494]],
259
+ [[-18.8739, -19.7804, -21.1834], [-20.1233, -21.6765, -23.2944], [-20.0315, -21.2641, -23.6944]],
260
+ ]
261
+ )
262
+ elif model_name == "segformer.b5.640x640.ade.160k":
263
+ expected_slice = torch.tensor(
264
+ [
265
+ [[-9.5524, -12.0835, -11.7348], [-10.5229, -13.6446, -14.5662], [-9.5842, -12.8851, -13.9414]],
266
+ [[-15.3432, -17.5323, -17.0818], [-16.3330, -18.9255, -19.2101], [-15.1340, -17.7848, -18.3971]],
267
+ [[-12.6072, -14.9486, -14.6631], [-13.7629, -17.0907, -17.7745], [-12.7899, -16.1695, -17.1671]],
268
+ ]
269
+ )
270
+ # Cityscapes checkpoints
271
+ elif model_name == "segformer.b0.1024x1024.city.160k":
272
+ expected_slice = torch.tensor(
273
+ [
274
+ [[-11.9295, -13.4057, -14.8106], [-13.3431, -14.8179, -15.3781], [-14.2836, -15.5942, -16.1588]],
275
+ [[-11.4906, -12.8067, -13.6564], [-13.1189, -14.0500, -14.1543], [-13.8748, -14.5136, -14.8789]],
276
+ [[0.5374, 0.1067, -0.4742], [0.1141, -0.2255, -0.7099], [-0.3000, -0.5924, -1.3105]],
277
+ ]
278
+ )
279
+ elif model_name == "segformer.b0.512x1024.city.160k":
280
+ expected_slice = torch.tensor(
281
+ [
282
+ [[-7.8217, -9.8767, -10.1717], [-9.4438, -10.9058, -11.4047], [-9.7939, -12.3495, -12.1079]],
283
+ [[-7.1514, -9.5336, -10.0860], [-9.7776, -11.6822, -11.8439], [-10.1411, -12.7655, -12.8972]],
284
+ [[0.3021, 0.0805, -0.2310], [-0.0328, -0.1605, -0.2714], [-0.1408, -0.5477, -0.6976]],
285
+ ]
286
+ )
287
+ elif model_name == "segformer.b0.640x1280.city.160k":
288
+ expected_slice = torch.tensor(
289
+ [
290
+ [
291
+ [-1.1372e01, -1.2787e01, -1.3477e01],
292
+ [-1.2536e01, -1.4194e01, -1.4409e01],
293
+ [-1.3217e01, -1.4888e01, -1.5327e01],
294
+ ],
295
+ [
296
+ [-1.4791e01, -1.7122e01, -1.8277e01],
297
+ [-1.7163e01, -1.9192e01, -1.9533e01],
298
+ [-1.7897e01, -1.9991e01, -2.0315e01],
299
+ ],
300
+ [
301
+ [7.6723e-01, 4.1921e-01, -7.7878e-02],
302
+ [4.7772e-01, 9.5557e-03, -2.8082e-01],
303
+ [3.6032e-01, -2.4826e-01, -5.1168e-01],
304
+ ],
305
+ ]
306
+ )
307
+ elif model_name == "segformer.b0.768x768.city.160k":
308
+ expected_slice = torch.tensor(
309
+ [
310
+ [[-9.4959, -11.3087, -11.7479], [-11.0025, -12.6540, -12.3319], [-11.4064, -13.0487, -12.9905]],
311
+ [[-9.8905, -11.3084, -12.0854], [-11.1726, -12.7698, -12.9583], [-11.5985, -13.3278, -14.1774]],
312
+ [[0.2213, 0.0192, -0.2466], [-0.1731, -0.4213, -0.4874], [-0.3126, -0.6541, -1.1389]],
313
+ ]
314
+ )
315
+ elif model_name == "segformer.b1.1024x1024.city.160k":
316
+ expected_slice = torch.tensor(
317
+ [
318
+ [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
319
+ [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
320
+ [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
321
+ ]
322
+ )
323
+ elif model_name == "segformer.b2.1024x1024.city.160k":
324
+ expected_slice = torch.tensor(
325
+ [
326
+ [[-16.0976, -16.4856, -17.3962], [-16.6234, -19.0342, -19.7685], [-16.0900, -18.0661, -19.1180]],
327
+ [[-18.4750, -18.8488, -19.5074], [-19.4030, -22.1570, -22.5977], [-19.1191, -20.8486, -22.3783]],
328
+ [[-4.5178, -5.5037, -6.5109], [-5.0884, -7.2174, -8.0334], [-4.4156, -5.8117, -7.2970]],
329
+ ]
330
+ )
331
+ elif model_name == "segformer.b3.1024x1024.city.160k":
332
+ expected_slice = torch.tensor(
333
+ [
334
+ [[-14.2081, -14.4732, -14.1977], [-14.5867, -16.4423, -16.6356], [-13.4441, -14.9685, -16.8696]],
335
+ [[-14.4576, -14.7073, -15.0451], [-15.0816, -17.6237, -17.9873], [-14.4213, -16.0199, -18.5992]],
336
+ [[-4.7349, -4.9588, -5.0966], [-4.3210, -6.9325, -7.2591], [-3.4312, -4.7484, -7.1917]],
337
+ ]
338
+ )
339
+ elif model_name == "segformer.b4.1024x1024.city.160k":
340
+ expected_slice = torch.tensor(
341
+ [
342
+ [[-11.7737, -11.9526, -11.3273], [-13.6692, -14.4574, -13.8878], [-13.8937, -14.6924, -15.9345]],
343
+ [[-14.6706, -14.5330, -14.1306], [-16.1502, -16.8180, -16.4269], [-16.8338, -17.8939, -20.1746]],
344
+ [[1.0491, 0.8289, 1.0310], [1.1044, 0.5219, 0.8055], [1.0899, 0.6926, 0.5590]],
345
+ ]
346
+ )
347
+ elif model_name == "segformer.b5.1024x1024.city.160k":
348
+ expected_slice = torch.tensor(
349
+ [
350
+ [[-12.5641, -13.4777, -13.0684], [-13.9587, -15.8983, -16.6557], [-13.3109, -15.7350, -16.3141]],
351
+ [[-14.7074, -15.4352, -14.5944], [-16.6353, -18.1663, -18.6120], [-15.1702, -18.0329, -18.1547]],
352
+ [[-1.7990, -2.0951, -1.7784], [-2.6397, -3.8245, -3.9686], [-1.5264, -2.8126, -2.9316]],
353
+ ]
354
+ )
355
+ else:
356
+ predicted_class_idx = logits.argmax(-1).item()
357
+ print("Predicted class:", model.config.id2label[predicted_class_idx])
358
+
359
+ # verify logits
360
+ if not encoder_only:
361
+ assert logits.shape == expected_shape
362
+ assert torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-2)
363
+
364
+ # finally, save model and image processor
365
+ logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
366
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
367
+ model.save_pretrained(pytorch_dump_folder_path)
368
+ image_processor.save_pretrained(pytorch_dump_folder_path)
369
+
370
+
371
+ if __name__ == "__main__":
372
+ parser = argparse.ArgumentParser()
373
+
374
+ parser.add_argument(
375
+ "--model_name",
376
+ default="segformer.b0.512x512.ade.160k",
377
+ type=str,
378
+ help="Name of the model you'd like to convert.",
379
+ )
380
+ parser.add_argument(
381
+ "--checkpoint_path", default=None, type=str, help="Path to the original PyTorch checkpoint (.pth file)."
382
+ )
383
+ parser.add_argument(
384
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
385
+ )
386
+ args = parser.parse_args()
387
+ convert_segformer_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path)
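
A short hedged inference sketch with an already-converted checkpoint; the repo id is the one named in the SegFormer config docstring, and a local `pytorch_dump_folder_path` produced by this script should work the same way:

```python
import requests
import torch
from PIL import Image
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor

# Same COCO test image as prepare_img() above.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"  # or a local dump folder
processor = SegformerImageProcessor.from_pretrained(checkpoint)
model = SegformerForSemanticSegmentation.from_pretrained(checkpoint)

with torch.no_grad():
    outputs = model(**processor(images=image, return_tensors="pt"))
print(outputs.logits.shape)  # torch.Size([1, 150, 128, 128]) for the ADE20K head
```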
docs/transformers/build/lib/transformers/models/segformer/feature_extraction_segformer.py ADDED
@@ -0,0 +1,38 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Feature extractor class for SegFormer."""
16
+
17
+ import warnings
18
+
19
+ from ...utils import logging
20
+ from ...utils.import_utils import requires
21
+ from .image_processing_segformer import SegformerImageProcessor
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ @requires(backends=("vision",))
28
+ class SegformerFeatureExtractor(SegformerImageProcessor):
29
+ def __init__(self, *args, **kwargs) -> None:
30
+ warnings.warn(
31
+ "The class SegformerFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
32
+ " Please use SegformerImageProcessor instead.",
33
+ FutureWarning,
34
+ )
35
+ super().__init__(*args, **kwargs)
36
+
37
+
38
+ __all__ = ["SegformerFeatureExtractor"]
docs/transformers/build/lib/transformers/models/segformer/image_processing_segformer.py ADDED
@@ -0,0 +1,484 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for Segformer."""
16
+
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+
21
+ from ...image_processing_utils import INIT_SERVICE_KWARGS, BaseImageProcessor, BatchFeature, get_size_dict
22
+ from ...image_transforms import resize, to_channel_dimension_format
23
+ from ...image_utils import (
24
+ IMAGENET_DEFAULT_MEAN,
25
+ IMAGENET_DEFAULT_STD,
26
+ ChannelDimension,
27
+ ImageInput,
28
+ PILImageResampling,
29
+ infer_channel_dimension_format,
30
+ is_scaled_image,
31
+ make_list_of_images,
32
+ to_numpy_array,
33
+ valid_images,
34
+ validate_preprocess_arguments,
35
+ )
36
+ from ...utils import (
37
+ TensorType,
38
+ filter_out_non_signature_kwargs,
39
+ is_torch_available,
40
+ is_torch_tensor,
41
+ is_vision_available,
42
+ logging,
43
+ )
44
+ from ...utils.deprecation import deprecate_kwarg
45
+ from ...utils.import_utils import requires
46
+
47
+
48
+ if is_vision_available():
49
+ import PIL.Image
50
+
51
+ if is_torch_available():
52
+ import torch
53
+
54
+
55
+ logger = logging.get_logger(__name__)
56
+
57
+
58
+ @requires(backends=("vision",))
59
+ class SegformerImageProcessor(BaseImageProcessor):
60
+ r"""
61
+ Constructs a Segformer image processor.
62
+
63
+ Args:
64
+ do_resize (`bool`, *optional*, defaults to `True`):
65
+ Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
66
+ size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
67
+ size (`Dict[str, int]` *optional*, defaults to `{"height": 512, "width": 512}`):
68
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
69
+ method.
70
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
71
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
72
+ `preprocess` method.
73
+ do_rescale (`bool`, *optional*, defaults to `True`):
74
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
75
+ parameter in the `preprocess` method.
76
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
77
+ Scale factor applied when rescaling the image. Can be overridden by the `rescale_factor` parameter in the
78
+ `preprocess` method.
79
+ do_normalize (`bool`, *optional*, defaults to `True`):
80
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
81
+ method.
82
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
83
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
84
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
85
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
86
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
87
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
88
+ do_reduce_labels (`bool`, *optional*, defaults to `False`):
89
+ Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
90
+ used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
91
+ background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the
92
+ `preprocess` method.
93
+ """
94
+
95
+ model_input_names = ["pixel_values"]
96
+
97
+ @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0")
98
+ @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS)
99
+ def __init__(
100
+ self,
101
+ do_resize: bool = True,
102
+ size: Dict[str, int] = None,
103
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
104
+ do_rescale: bool = True,
105
+ rescale_factor: Union[int, float] = 1 / 255,
106
+ do_normalize: bool = True,
107
+ image_mean: Optional[Union[float, List[float]]] = None,
108
+ image_std: Optional[Union[float, List[float]]] = None,
109
+ do_reduce_labels: bool = False,
110
+ **kwargs,
111
+ ) -> None:
112
+ super().__init__(**kwargs)
113
+ size = size if size is not None else {"height": 512, "width": 512}
114
+ size = get_size_dict(size)
115
+ self.do_resize = do_resize
116
+ self.size = size
117
+ self.resample = resample
118
+ self.do_rescale = do_rescale
119
+ self.rescale_factor = rescale_factor
120
+ self.do_normalize = do_normalize
121
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
122
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
123
+ self.do_reduce_labels = do_reduce_labels
124
+
125
+ @classmethod
126
+ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
127
+ """
128
+ Overrides the `from_dict` method from the base class to preserve support for the deprecated `reduce_labels` key in old configs.
129
+ """
130
+ image_processor_dict = image_processor_dict.copy()
131
+ if "reduce_labels" in image_processor_dict:
132
+ image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels")
133
+ return super().from_dict(image_processor_dict, **kwargs)
134
+
135
+ # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
136
+ def resize(
137
+ self,
138
+ image: np.ndarray,
139
+ size: Dict[str, int],
140
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
141
+ data_format: Optional[Union[str, ChannelDimension]] = None,
142
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
143
+ **kwargs,
144
+ ) -> np.ndarray:
145
+ """
146
+ Resize an image to `(size["height"], size["width"])`.
147
+
148
+ Args:
149
+ image (`np.ndarray`):
150
+ Image to resize.
151
+ size (`Dict[str, int]`):
152
+ Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
153
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
154
+ `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
155
+ data_format (`ChannelDimension` or `str`, *optional*):
156
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
157
+ image is used. Can be one of:
158
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
159
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
160
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
161
+ input_data_format (`ChannelDimension` or `str`, *optional*):
162
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
163
+ from the input image. Can be one of:
164
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
165
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
166
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
167
+
168
+ Returns:
169
+ `np.ndarray`: The resized image.
170
+ """
171
+ size = get_size_dict(size)
172
+ if "height" not in size or "width" not in size:
173
+ raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
174
+ output_size = (size["height"], size["width"])
175
+ return resize(
176
+ image,
177
+ size=output_size,
178
+ resample=resample,
179
+ data_format=data_format,
180
+ input_data_format=input_data_format,
181
+ **kwargs,
182
+ )
183
+
184
+ # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label
185
+ def reduce_label(self, label: ImageInput) -> np.ndarray:
186
+ label = to_numpy_array(label)
187
+ # Avoid using underflow conversion
188
+ label[label == 0] = 255
189
+ label = label - 1
190
+ label[label == 254] = 255
191
+ return label
192
+
193
+ def _preprocess(
194
+ self,
195
+ image: ImageInput,
196
+ do_reduce_labels: bool,
197
+ do_resize: bool,
198
+ do_rescale: bool,
199
+ do_normalize: bool,
200
+ size: Optional[Dict[str, int]] = None,
201
+ resample: PILImageResampling = None,
202
+ rescale_factor: Optional[float] = None,
203
+ image_mean: Optional[Union[float, List[float]]] = None,
204
+ image_std: Optional[Union[float, List[float]]] = None,
205
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
206
+ ):
207
+ if do_reduce_labels:
208
+ image = self.reduce_label(image)
209
+
210
+ if do_resize:
211
+ image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
212
+
213
+ if do_rescale:
214
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
215
+
216
+ if do_normalize:
217
+ image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
218
+
219
+ return image
220
+
221
+ def _preprocess_image(
222
+ self,
223
+ image: ImageInput,
224
+ do_resize: Optional[bool] = None,
225
+ size: Dict[str, int] = None,
226
+ resample: PILImageResampling = None,
227
+ do_rescale: Optional[bool] = None,
228
+ rescale_factor: Optional[float] = None,
229
+ do_normalize: Optional[bool] = None,
230
+ image_mean: Optional[Union[float, List[float]]] = None,
231
+ image_std: Optional[Union[float, List[float]]] = None,
232
+ data_format: Optional[Union[str, ChannelDimension]] = None,
233
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
234
+ ) -> np.ndarray:
235
+ """Preprocesses a single image."""
236
+ # All transformations expect numpy arrays.
237
+ image = to_numpy_array(image)
238
+ if do_rescale and is_scaled_image(image):
239
+ logger.warning_once(
240
+ "It looks like you are trying to rescale already rescaled images. If the input"
241
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
242
+ )
243
+ if input_data_format is None:
244
+ input_data_format = infer_channel_dimension_format(image)
245
+ image = self._preprocess(
246
+ image=image,
247
+ do_reduce_labels=False,
248
+ do_resize=do_resize,
249
+ size=size,
250
+ resample=resample,
251
+ do_rescale=do_rescale,
252
+ rescale_factor=rescale_factor,
253
+ do_normalize=do_normalize,
254
+ image_mean=image_mean,
255
+ image_std=image_std,
256
+ input_data_format=input_data_format,
257
+ )
258
+ if data_format is not None:
259
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
260
+ return image
261
+
262
+ def _preprocess_mask(
263
+ self,
264
+ segmentation_map: ImageInput,
265
+ do_reduce_labels: Optional[bool] = None,
266
+ do_resize: Optional[bool] = None,
267
+ size: Dict[str, int] = None,
268
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
269
+ ) -> np.ndarray:
270
+ """Preprocesses a single mask."""
271
+ segmentation_map = to_numpy_array(segmentation_map)
272
+ # Add channel dimension if missing - needed for certain transformations
273
+ if segmentation_map.ndim == 2:
274
+ added_channel_dim = True
275
+ segmentation_map = segmentation_map[None, ...]
276
+ input_data_format = ChannelDimension.FIRST
277
+ else:
278
+ added_channel_dim = False
279
+ if input_data_format is None:
280
+ input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)
281
+ # reduce zero label if needed
282
+ segmentation_map = self._preprocess(
283
+ image=segmentation_map,
284
+ do_reduce_labels=do_reduce_labels,
285
+ do_resize=do_resize,
286
+ resample=PILImageResampling.NEAREST,
287
+ size=size,
288
+ do_rescale=False,
289
+ do_normalize=False,
290
+ input_data_format=input_data_format,
291
+ )
292
+ # Remove extra channel dimension if added for processing
293
+ if added_channel_dim:
294
+ segmentation_map = segmentation_map.squeeze(0)
295
+ segmentation_map = segmentation_map.astype(np.int64)
296
+ return segmentation_map
297
+
298
+ def __call__(self, images, segmentation_maps=None, **kwargs):
299
+ """
300
+ Preprocesses a batch of images and optionally segmentation maps.
301
+
302
+ Overrides the `__call__` method of the `Preprocessor` class so that both images and segmentation maps can be
303
+ passed in as positional arguments.
304
+ """
305
+ return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
306
+
307
+ @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0")
308
+ @filter_out_non_signature_kwargs()
309
+ def preprocess(
310
+ self,
311
+ images: ImageInput,
312
+ segmentation_maps: Optional[ImageInput] = None,
313
+ do_resize: Optional[bool] = None,
314
+ size: Optional[Dict[str, int]] = None,
315
+ resample: PILImageResampling = None,
316
+ do_rescale: Optional[bool] = None,
317
+ rescale_factor: Optional[float] = None,
318
+ do_normalize: Optional[bool] = None,
319
+ image_mean: Optional[Union[float, List[float]]] = None,
320
+ image_std: Optional[Union[float, List[float]]] = None,
321
+ do_reduce_labels: Optional[bool] = None,
322
+ return_tensors: Optional[Union[str, TensorType]] = None,
323
+ data_format: ChannelDimension = ChannelDimension.FIRST,
324
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
325
+ ) -> PIL.Image.Image:
326
+ """
327
+ Preprocess an image or batch of images.
328
+
329
+ Args:
330
+ images (`ImageInput`):
331
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
332
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
333
+ segmentation_maps (`ImageInput`, *optional*):
334
+ Segmentation map to preprocess.
335
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
336
+ Whether to resize the image.
337
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
338
+ Size of the image after `resize` is applied.
339
+ resample (`int`, *optional*, defaults to `self.resample`):
340
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
341
+ has an effect if `do_resize` is set to `True`.
342
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
343
+ Whether to rescale the image values to the [0, 1] range.
344
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
345
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
346
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
347
+ Whether to normalize the image.
348
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
349
+ Image mean.
350
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
351
+ Image standard deviation.
352
+ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
353
+ Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
354
+ is used for background, and background itself is not included in all classes of a dataset (e.g.
355
+ ADE20k). The background label will be replaced by 255.
356
+ return_tensors (`str` or `TensorType`, *optional*):
357
+ The type of tensors to return. Can be one of:
358
+ - Unset: Return a list of `np.ndarray`.
359
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
360
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
361
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
362
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
363
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
364
+ The channel dimension format for the output image. Can be one of:
365
+ - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
366
+ - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
367
+ input_data_format (`ChannelDimension` or `str`, *optional*):
368
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
369
+ from the input image. Can be one of:
370
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
371
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
372
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
373
+ """
374
+ do_resize = do_resize if do_resize is not None else self.do_resize
375
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
376
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
377
+ do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels
378
+ resample = resample if resample is not None else self.resample
379
+ size = size if size is not None else self.size
380
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
381
+ image_mean = image_mean if image_mean is not None else self.image_mean
382
+ image_std = image_std if image_std is not None else self.image_std
383
+
384
+ images = make_list_of_images(images)
385
+
386
+ if segmentation_maps is not None:
387
+ segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2)
388
+
389
+ if not valid_images(images):
390
+ raise ValueError(
391
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
392
+ "torch.Tensor, tf.Tensor or jax.ndarray."
393
+ )
394
+ validate_preprocess_arguments(
395
+ do_rescale=do_rescale,
396
+ rescale_factor=rescale_factor,
397
+ do_normalize=do_normalize,
398
+ image_mean=image_mean,
399
+ image_std=image_std,
400
+ do_resize=do_resize,
401
+ size=size,
402
+ resample=resample,
403
+ )
404
+
405
+ images = [
406
+ self._preprocess_image(
407
+ image=img,
408
+ do_resize=do_resize,
409
+ resample=resample,
410
+ size=size,
411
+ do_rescale=do_rescale,
412
+ rescale_factor=rescale_factor,
413
+ do_normalize=do_normalize,
414
+ image_mean=image_mean,
415
+ image_std=image_std,
416
+ data_format=data_format,
417
+ input_data_format=input_data_format,
418
+ )
419
+ for img in images
420
+ ]
421
+
422
+ data = {"pixel_values": images}
423
+
424
+ if segmentation_maps is not None:
425
+ segmentation_maps = [
426
+ self._preprocess_mask(
427
+ segmentation_map=segmentation_map,
428
+ do_reduce_labels=do_reduce_labels,
429
+ do_resize=do_resize,
430
+ size=size,
431
+ input_data_format=input_data_format,
432
+ )
433
+ for segmentation_map in segmentation_maps
434
+ ]
435
+ data["labels"] = segmentation_maps
436
+
437
+ return BatchFeature(data=data, tensor_type=return_tensors)
438
+
439
+ # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->Segformer
440
+ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
441
+ """
442
+ Converts the output of [`SegformerForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
443
+
444
+ Args:
445
+ outputs ([`SegformerForSemanticSegmentation`]):
446
+ Raw outputs of the model.
447
+ target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
448
+ List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
449
+ predictions will not be resized.
450
+
451
+ Returns:
452
+ semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
453
+ segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
454
+ specified). Each entry of each `torch.Tensor` corresponds to a semantic class id.
455
+ """
456
+ # TODO: add support for other frameworks
457
+ logits = outputs.logits
458
+
459
+ # Resize logits and compute semantic segmentation maps
460
+ if target_sizes is not None:
461
+ if len(logits) != len(target_sizes):
462
+ raise ValueError(
463
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
464
+ )
465
+
466
+ if is_torch_tensor(target_sizes):
467
+ target_sizes = target_sizes.numpy()
468
+
469
+ semantic_segmentation = []
470
+
471
+ for idx in range(len(logits)):
472
+ resized_logits = torch.nn.functional.interpolate(
473
+ logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
474
+ )
475
+ semantic_map = resized_logits[0].argmax(dim=0)
476
+ semantic_segmentation.append(semantic_map)
477
+ else:
478
+ semantic_segmentation = logits.argmax(dim=1)
479
+ semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
480
+
481
+ return semantic_segmentation
482
+
483
+
484
+ __all__ = ["SegformerImageProcessor"]
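To tie the pieces of this processor together, here is a hedged end-to-end sketch: preprocessing a dummy image and mask, then mapping model logits back to a per-pixel class map with `post_process_semantic_segmentation`. The checkpoint is the one referenced later in this diff and the dummy inputs are illustrative.

```python
# Minimal round-trip sketch for SegformerImageProcessor (assumes torch and the
# nvidia/segformer-b0-finetuned-ade-512-512 checkpoint referenced elsewhere in this diff).
import numpy as np
import torch
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation

processor = SegformerImageProcessor(do_reduce_labels=False)
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # dummy RGB image
mask = np.random.randint(0, 150, (480, 640), dtype=np.uint8)      # dummy segmentation map

# Images are resized/rescaled/normalized; masks are resized with nearest neighbour and cast to int64.
inputs = processor(images=image, segmentation_maps=mask, return_tensors="pt")
print(inputs["pixel_values"].shape, inputs["labels"].shape)  # (1, 3, 512, 512), (1, 512, 512)

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# Resize logits back to the original image size and take the per-pixel argmax.
seg = processor.post_process_semantic_segmentation(outputs, target_sizes=[(480, 640)])[0]
print(seg.shape)  # torch.Size([480, 640])
```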
docs/transformers/build/lib/transformers/models/segformer/modeling_segformer.py ADDED
@@ -0,0 +1,840 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 NVIDIA The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch SegFormer model."""
16
+
17
+ import math
18
+ from typing import Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.utils.checkpoint
22
+ from torch import nn
23
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
24
+
25
+ from ...activations import ACT2FN
26
+ from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput, SemanticSegmenterOutput
27
+ from ...modeling_utils import PreTrainedModel
28
+ from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
29
+ from ...utils import (
30
+ add_code_sample_docstrings,
31
+ add_start_docstrings,
32
+ add_start_docstrings_to_model_forward,
33
+ logging,
34
+ replace_return_docstrings,
35
+ )
36
+ from .configuration_segformer import SegformerConfig
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ # General docstring
43
+ _CONFIG_FOR_DOC = "SegformerConfig"
44
+
45
+ # Base docstring
46
+ _CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
47
+ _EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
48
+
49
+ # Image classification docstring
50
+ _IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
51
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
52
+
53
+
54
+ class SegFormerImageClassifierOutput(ImageClassifierOutput):
55
+ """
56
+ Base class for outputs of image classification models.
57
+
58
+ Args:
59
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
60
+ Classification (or regression if config.num_labels==1) loss.
61
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
62
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
63
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
64
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
65
+ one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
66
+ called feature maps) of the model at the output of each stage.
67
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
68
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
69
+ sequence_length)`.
70
+
71
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
72
+ heads.
73
+ """
74
+
75
+ loss: Optional[torch.FloatTensor] = None
76
+ logits: Optional[torch.FloatTensor] = None
77
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
78
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
79
+
80
+
81
+ # Copied from transformers.models.beit.modeling_beit.drop_path
82
+ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
83
+ """
84
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
85
+
86
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
87
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
88
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
89
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
90
+ argument.
91
+ """
92
+ if drop_prob == 0.0 or not training:
93
+ return input
94
+ keep_prob = 1 - drop_prob
95
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
96
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
97
+ random_tensor.floor_() # binarize
98
+ output = input.div(keep_prob) * random_tensor
99
+ return output
100
+
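Because the stochastic-depth docstring above is mostly prose, a small self-contained check of the intended behaviour may help; the function is reproduced here so the snippet runs on its own and is not part of the model code.

```python
# Illustrative check of drop_path semantics (same logic as the implementation above).
import torch

def drop_path(input, drop_prob=0.0, training=False):
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = (keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)).floor_()
    return input.div(keep_prob) * random_tensor

x = torch.ones(4, 3, 2)
print(drop_path(x, drop_prob=0.5, training=False))  # identity at inference time

out = drop_path(x, drop_prob=0.5, training=True)
# Each sample is either zeroed entirely or scaled by 1 / keep_prob = 2.0,
# so the expected value of every element stays 1.0.
print(out[:, 0, 0])
```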
101
+
102
+ # Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Segformer
103
+ class SegformerDropPath(nn.Module):
104
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
105
+
106
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
107
+ super().__init__()
108
+ self.drop_prob = drop_prob
109
+
110
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
111
+ return drop_path(hidden_states, self.drop_prob, self.training)
112
+
113
+ def extra_repr(self) -> str:
114
+ return "p={}".format(self.drop_prob)
115
+
116
+
117
+ class SegformerOverlapPatchEmbeddings(nn.Module):
118
+ """Construct the overlapping patch embeddings."""
119
+
120
+ def __init__(self, patch_size, stride, num_channels, hidden_size):
121
+ super().__init__()
122
+ self.proj = nn.Conv2d(
123
+ num_channels,
124
+ hidden_size,
125
+ kernel_size=patch_size,
126
+ stride=stride,
127
+ padding=patch_size // 2,
128
+ )
129
+
130
+ self.layer_norm = nn.LayerNorm(hidden_size)
131
+
132
+ def forward(self, pixel_values):
133
+ embeddings = self.proj(pixel_values)
134
+ _, _, height, width = embeddings.shape
135
+ # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
136
+ # this can be fed to a Transformer layer
137
+ embeddings = embeddings.flatten(2).transpose(1, 2)
138
+ embeddings = self.layer_norm(embeddings)
139
+ return embeddings, height, width
140
+
141
+
142
+ class SegformerEfficientSelfAttention(nn.Module):
143
+ """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
144
+ paper](https://arxiv.org/abs/2102.12122)."""
145
+
146
+ def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
147
+ super().__init__()
148
+ self.hidden_size = hidden_size
149
+ self.num_attention_heads = num_attention_heads
150
+
151
+ if self.hidden_size % self.num_attention_heads != 0:
152
+ raise ValueError(
153
+ f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
154
+ f"heads ({self.num_attention_heads})"
155
+ )
156
+
157
+ self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
158
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
159
+
160
+ self.query = nn.Linear(self.hidden_size, self.all_head_size)
161
+ self.key = nn.Linear(self.hidden_size, self.all_head_size)
162
+ self.value = nn.Linear(self.hidden_size, self.all_head_size)
163
+
164
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
165
+
166
+ self.sr_ratio = sequence_reduction_ratio
167
+ if sequence_reduction_ratio > 1:
168
+ self.sr = nn.Conv2d(
169
+ hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
170
+ )
171
+ self.layer_norm = nn.LayerNorm(hidden_size)
172
+
173
+ def transpose_for_scores(self, hidden_states):
174
+ new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
175
+ hidden_states = hidden_states.view(new_shape)
176
+ return hidden_states.permute(0, 2, 1, 3)
177
+
178
+ def forward(
179
+ self,
180
+ hidden_states,
181
+ height,
182
+ width,
183
+ output_attentions=False,
184
+ ):
185
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
186
+
187
+ if self.sr_ratio > 1:
188
+ batch_size, seq_len, num_channels = hidden_states.shape
189
+ # Reshape to (batch_size, num_channels, height, width)
190
+ hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
191
+ # Apply sequence reduction
192
+ hidden_states = self.sr(hidden_states)
193
+ # Reshape back to (batch_size, seq_len, num_channels)
194
+ hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
195
+ hidden_states = self.layer_norm(hidden_states)
196
+
197
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
198
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
199
+
200
+ # Take the dot product between "query" and "key" to get the raw attention scores.
201
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
202
+
203
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
204
+
205
+ # Normalize the attention scores to probabilities.
206
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
207
+
208
+ # This is actually dropping out entire tokens to attend to, which might
209
+ # seem a bit unusual, but is taken from the original Transformer paper.
210
+ attention_probs = self.dropout(attention_probs)
211
+
212
+ context_layer = torch.matmul(attention_probs, value_layer)
213
+
214
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
215
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
216
+ context_layer = context_layer.view(new_context_layer_shape)
217
+
218
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
219
+
220
+ return outputs
221
+
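To make the sequence-reduction trick concrete, the following standalone sketch mirrors the shape arithmetic of `SegformerEfficientSelfAttention` for one stage; the dimensions are illustrative, not taken from a released checkpoint.

```python
# Shape sketch of the PvT-style sequence reduction used by SegformerEfficientSelfAttention.
import torch
from torch import nn

batch_size, height, width, hidden_size, sr_ratio = 2, 128, 128, 32, 8
seq_len = height * width                                   # 16384 query positions
hidden_states = torch.randn(batch_size, seq_len, hidden_size)

# Queries keep the full resolution...
queries = nn.Linear(hidden_size, hidden_size)(hidden_states)
print(queries.shape)                                       # (2, 16384, 32)

# ...while keys/values are computed on a spatially downsampled copy.
sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr_ratio, stride=sr_ratio)
reduced = hidden_states.permute(0, 2, 1).reshape(batch_size, hidden_size, height, width)
reduced = sr(reduced).reshape(batch_size, hidden_size, -1).permute(0, 2, 1)
print(reduced.shape)                                       # (2, 256, 32): seq_len / sr_ratio**2

# Attention cost therefore scales with seq_len * (seq_len / sr_ratio**2) instead of seq_len**2.
```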
222
+
223
+ class SegformerSelfOutput(nn.Module):
224
+ def __init__(self, config, hidden_size):
225
+ super().__init__()
226
+ self.dense = nn.Linear(hidden_size, hidden_size)
227
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
228
+
229
+ def forward(self, hidden_states, input_tensor):
230
+ hidden_states = self.dense(hidden_states)
231
+ hidden_states = self.dropout(hidden_states)
232
+ return hidden_states
233
+
234
+
235
+ class SegformerAttention(nn.Module):
236
+ def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
237
+ super().__init__()
238
+ self.self = SegformerEfficientSelfAttention(
239
+ config=config,
240
+ hidden_size=hidden_size,
241
+ num_attention_heads=num_attention_heads,
242
+ sequence_reduction_ratio=sequence_reduction_ratio,
243
+ )
244
+ self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
245
+ self.pruned_heads = set()
246
+
247
+ def prune_heads(self, heads):
248
+ if len(heads) == 0:
249
+ return
250
+ heads, index = find_pruneable_heads_and_indices(
251
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
252
+ )
253
+
254
+ # Prune linear layers
255
+ self.self.query = prune_linear_layer(self.self.query, index)
256
+ self.self.key = prune_linear_layer(self.self.key, index)
257
+ self.self.value = prune_linear_layer(self.self.value, index)
258
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
259
+
260
+ # Update hyper params and store pruned heads
261
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
262
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
263
+ self.pruned_heads = self.pruned_heads.union(heads)
264
+
265
+ def forward(self, hidden_states, height, width, output_attentions=False):
266
+ self_outputs = self.self(hidden_states, height, width, output_attentions)
267
+
268
+ attention_output = self.output(self_outputs[0], hidden_states)
269
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
270
+ return outputs
271
+
272
+
273
+ class SegformerDWConv(nn.Module):
274
+ def __init__(self, dim=768):
275
+ super().__init__()
276
+ self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
277
+
278
+ def forward(self, hidden_states, height, width):
279
+ batch_size, seq_len, num_channels = hidden_states.shape
280
+ hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
281
+ hidden_states = self.dwconv(hidden_states)
282
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
283
+
284
+ return hidden_states
285
+
286
+
287
+ class SegformerMixFFN(nn.Module):
288
+ def __init__(self, config, in_features, hidden_features=None, out_features=None):
289
+ super().__init__()
290
+ out_features = out_features or in_features
291
+ self.dense1 = nn.Linear(in_features, hidden_features)
292
+ self.dwconv = SegformerDWConv(hidden_features)
293
+ if isinstance(config.hidden_act, str):
294
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
295
+ else:
296
+ self.intermediate_act_fn = config.hidden_act
297
+ self.dense2 = nn.Linear(hidden_features, out_features)
298
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
299
+
300
+ def forward(self, hidden_states, height, width):
301
+ hidden_states = self.dense1(hidden_states)
302
+ hidden_states = self.dwconv(hidden_states, height, width)
303
+ hidden_states = self.intermediate_act_fn(hidden_states)
304
+ hidden_states = self.dropout(hidden_states)
305
+ hidden_states = self.dense2(hidden_states)
306
+ hidden_states = self.dropout(hidden_states)
307
+ return hidden_states
308
+
309
+
310
+ class SegformerLayer(nn.Module):
311
+ """This corresponds to the Block class in the original implementation."""
312
+
313
+ def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
314
+ super().__init__()
315
+ self.layer_norm_1 = nn.LayerNorm(hidden_size)
316
+ self.attention = SegformerAttention(
317
+ config,
318
+ hidden_size=hidden_size,
319
+ num_attention_heads=num_attention_heads,
320
+ sequence_reduction_ratio=sequence_reduction_ratio,
321
+ )
322
+ self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
323
+ self.layer_norm_2 = nn.LayerNorm(hidden_size)
324
+ mlp_hidden_size = int(hidden_size * mlp_ratio)
325
+ self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
326
+
327
+ def forward(self, hidden_states, height, width, output_attentions=False):
328
+ self_attention_outputs = self.attention(
329
+ self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention
330
+ height,
331
+ width,
332
+ output_attentions=output_attentions,
333
+ )
334
+
335
+ attention_output = self_attention_outputs[0]
336
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
337
+
338
+ # first residual connection (with stochastic depth)
339
+ attention_output = self.drop_path(attention_output)
340
+ hidden_states = attention_output + hidden_states
341
+
342
+ mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
343
+
344
+ # second residual connection (with stochastic depth)
345
+ mlp_output = self.drop_path(mlp_output)
346
+ layer_output = mlp_output + hidden_states
347
+
348
+ outputs = (layer_output,) + outputs
349
+
350
+ return outputs
351
+
352
+
353
+ class SegformerEncoder(nn.Module):
354
+ def __init__(self, config):
355
+ super().__init__()
356
+ self.config = config
357
+
358
+ # stochastic depth decay rule
359
+ drop_path_decays = [
360
+ x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")
361
+ ]
362
+
363
+ # patch embeddings
364
+ embeddings = []
365
+ for i in range(config.num_encoder_blocks):
366
+ embeddings.append(
367
+ SegformerOverlapPatchEmbeddings(
368
+ patch_size=config.patch_sizes[i],
369
+ stride=config.strides[i],
370
+ num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
371
+ hidden_size=config.hidden_sizes[i],
372
+ )
373
+ )
374
+ self.patch_embeddings = nn.ModuleList(embeddings)
375
+
376
+ # Transformer blocks
377
+ blocks = []
378
+ cur = 0
379
+ for i in range(config.num_encoder_blocks):
380
+ # each block consists of layers
381
+ layers = []
382
+ if i != 0:
383
+ cur += config.depths[i - 1]
384
+ for j in range(config.depths[i]):
385
+ layers.append(
386
+ SegformerLayer(
387
+ config,
388
+ hidden_size=config.hidden_sizes[i],
389
+ num_attention_heads=config.num_attention_heads[i],
390
+ drop_path=drop_path_decays[cur + j],
391
+ sequence_reduction_ratio=config.sr_ratios[i],
392
+ mlp_ratio=config.mlp_ratios[i],
393
+ )
394
+ )
395
+ blocks.append(nn.ModuleList(layers))
396
+
397
+ self.block = nn.ModuleList(blocks)
398
+
399
+ # Layer norms
400
+ self.layer_norm = nn.ModuleList(
401
+ [nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
402
+ )
403
+
404
+ def forward(
405
+ self,
406
+ pixel_values: torch.FloatTensor,
407
+ output_attentions: Optional[bool] = False,
408
+ output_hidden_states: Optional[bool] = False,
409
+ return_dict: Optional[bool] = True,
410
+ ) -> Union[Tuple, BaseModelOutput]:
411
+ all_hidden_states = () if output_hidden_states else None
412
+ all_self_attentions = () if output_attentions else None
413
+
414
+ batch_size = pixel_values.shape[0]
415
+
416
+ hidden_states = pixel_values
417
+ for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
418
+ embedding_layer, block_layer, norm_layer = x
419
+ # first, obtain patch embeddings
420
+ hidden_states, height, width = embedding_layer(hidden_states)
421
+ # second, send embeddings through blocks
422
+ for i, blk in enumerate(block_layer):
423
+ layer_outputs = blk(hidden_states, height, width, output_attentions)
424
+ hidden_states = layer_outputs[0]
425
+ if output_attentions:
426
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
427
+ # third, apply layer norm
428
+ hidden_states = norm_layer(hidden_states)
429
+ # fourth, optionally reshape back to (batch_size, num_channels, height, width)
430
+ if idx != len(self.patch_embeddings) - 1 or (
431
+ idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage
432
+ ):
433
+ hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
434
+ if output_hidden_states:
435
+ all_hidden_states = all_hidden_states + (hidden_states,)
436
+
437
+ if not return_dict:
438
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
439
+ return BaseModelOutput(
440
+ last_hidden_state=hidden_states,
441
+ hidden_states=all_hidden_states,
442
+ attentions=all_self_attentions,
443
+ )
444
+
445
+
446
+ class SegformerPreTrainedModel(PreTrainedModel):
447
+ """
448
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
449
+ models.
450
+ """
451
+
452
+ config_class = SegformerConfig
453
+ base_model_prefix = "segformer"
454
+ main_input_name = "pixel_values"
455
+
456
+ def _init_weights(self, module):
457
+ """Initialize the weights"""
458
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
459
+ # Slightly different from the TF version which uses truncated_normal for initialization
460
+ # cf https://github.com/pytorch/pytorch/pull/5617
461
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
462
+ if module.bias is not None:
463
+ module.bias.data.zero_()
464
+ elif isinstance(module, nn.Embedding):
465
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
466
+ if module.padding_idx is not None:
467
+ module.weight.data[module.padding_idx].zero_()
468
+ elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
469
+ module.bias.data.zero_()
470
+ module.weight.data.fill_(1.0)
471
+
472
+
473
+ SEGFORMER_START_DOCSTRING = r"""
474
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
475
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
476
+ behavior.
477
+
478
+ Parameters:
479
+ config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
480
+ Initializing with a config file does not load the weights associated with the model, only the
481
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
482
+ """
483
+
484
+ SEGFORMER_INPUTS_DOCSTRING = r"""
485
+
486
+ Args:
487
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
488
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
489
+ [`AutoImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
490
+
491
+ output_attentions (`bool`, *optional*):
492
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
493
+ tensors for more detail.
494
+ output_hidden_states (`bool`, *optional*):
495
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
496
+ more detail.
497
+ return_dict (`bool`, *optional*):
498
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
499
+ """
500
+
501
+
502
+ @add_start_docstrings(
503
+ "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
504
+ SEGFORMER_START_DOCSTRING,
505
+ )
506
+ class SegformerModel(SegformerPreTrainedModel):
507
+ def __init__(self, config):
508
+ super().__init__(config)
509
+ self.config = config
510
+
511
+ # hierarchical Transformer encoder
512
+ self.encoder = SegformerEncoder(config)
513
+
514
+ # Initialize weights and apply final processing
515
+ self.post_init()
516
+
517
+ def _prune_heads(self, heads_to_prune):
518
+ """
519
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
520
+ class PreTrainedModel
521
+ """
522
+ for layer, heads in heads_to_prune.items():
523
+ self.encoder.layer[layer].attention.prune_heads(heads)
524
+
525
+ @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
526
+ @add_code_sample_docstrings(
527
+ checkpoint=_CHECKPOINT_FOR_DOC,
528
+ output_type=BaseModelOutput,
529
+ config_class=_CONFIG_FOR_DOC,
530
+ modality="vision",
531
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
532
+ )
533
+ def forward(
534
+ self,
535
+ pixel_values: torch.FloatTensor,
536
+ output_attentions: Optional[bool] = None,
537
+ output_hidden_states: Optional[bool] = None,
538
+ return_dict: Optional[bool] = None,
539
+ ) -> Union[Tuple, BaseModelOutput]:
540
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
541
+ output_hidden_states = (
542
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
543
+ )
544
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
545
+
546
+ encoder_outputs = self.encoder(
547
+ pixel_values,
548
+ output_attentions=output_attentions,
549
+ output_hidden_states=output_hidden_states,
550
+ return_dict=return_dict,
551
+ )
552
+ sequence_output = encoder_outputs[0]
553
+
554
+ if not return_dict:
555
+ return (sequence_output,) + encoder_outputs[1:]
556
+
557
+ return BaseModelOutput(
558
+ last_hidden_state=sequence_output,
559
+ hidden_states=encoder_outputs.hidden_states,
560
+ attentions=encoder_outputs.attentions,
561
+ )
562
+
563
+
564
+ @add_start_docstrings(
565
+ """
566
+ SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
567
+ states) e.g. for ImageNet.
568
+ """,
569
+ SEGFORMER_START_DOCSTRING,
570
+ )
571
+ class SegformerForImageClassification(SegformerPreTrainedModel):
572
+ def __init__(self, config):
573
+ super().__init__(config)
574
+
575
+ self.num_labels = config.num_labels
576
+ self.segformer = SegformerModel(config)
577
+
578
+ # Classifier head
579
+ self.classifier = nn.Linear(config.hidden_sizes[-1], config.num_labels)
580
+
581
+ # Initialize weights and apply final processing
582
+ self.post_init()
583
+
584
+ @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
585
+ @add_code_sample_docstrings(
586
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
587
+ output_type=SegFormerImageClassifierOutput,
588
+ config_class=_CONFIG_FOR_DOC,
589
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
590
+ )
591
+ def forward(
592
+ self,
593
+ pixel_values: Optional[torch.FloatTensor] = None,
594
+ labels: Optional[torch.LongTensor] = None,
595
+ output_attentions: Optional[bool] = None,
596
+ output_hidden_states: Optional[bool] = None,
597
+ return_dict: Optional[bool] = None,
598
+ ) -> Union[Tuple, SegFormerImageClassifierOutput]:
599
+ r"""
600
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
601
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
602
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
603
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
604
+ """
605
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
606
+
607
+ outputs = self.segformer(
608
+ pixel_values,
609
+ output_attentions=output_attentions,
610
+ output_hidden_states=output_hidden_states,
611
+ return_dict=return_dict,
612
+ )
613
+
614
+ sequence_output = outputs[0]
615
+
616
+ # convert last hidden states to (batch_size, height*width, hidden_size)
617
+ batch_size = sequence_output.shape[0]
618
+ if self.config.reshape_last_stage:
619
+ # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
620
+ sequence_output = sequence_output.permute(0, 2, 3, 1)
621
+ sequence_output = sequence_output.reshape(batch_size, -1, self.config.hidden_sizes[-1])
622
+
623
+ # global average pooling
624
+ sequence_output = sequence_output.mean(dim=1)
625
+
626
+ logits = self.classifier(sequence_output)
627
+
628
+ loss = None
629
+ if labels is not None:
630
+ if self.config.problem_type is None:
631
+ if self.num_labels == 1:
632
+ self.config.problem_type = "regression"
633
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
634
+ self.config.problem_type = "single_label_classification"
635
+ else:
636
+ self.config.problem_type = "multi_label_classification"
637
+
638
+ if self.config.problem_type == "regression":
639
+ loss_fct = MSELoss()
640
+ if self.num_labels == 1:
641
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
642
+ else:
643
+ loss = loss_fct(logits, labels)
644
+ elif self.config.problem_type == "single_label_classification":
645
+ loss_fct = CrossEntropyLoss()
646
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
647
+ elif self.config.problem_type == "multi_label_classification":
648
+ loss_fct = BCEWithLogitsLoss()
649
+ loss = loss_fct(logits, labels)
650
+ if not return_dict:
651
+ output = (logits,) + outputs[1:]
652
+ return ((loss,) + output) if loss is not None else output
653
+
654
+ return SegFormerImageClassifierOutput(
655
+ loss=loss,
656
+ logits=logits,
657
+ hidden_states=outputs.hidden_states,
658
+ attentions=outputs.attentions,
659
+ )
660
+
661
+
662
+ class SegformerMLP(nn.Module):
663
+ """
664
+ Linear Embedding.
665
+ """
666
+
667
+ def __init__(self, config: SegformerConfig, input_dim):
668
+ super().__init__()
669
+ self.proj = nn.Linear(input_dim, config.decoder_hidden_size)
670
+
671
+ def forward(self, hidden_states: torch.Tensor):
672
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
673
+ hidden_states = self.proj(hidden_states)
674
+ return hidden_states
675
+
676
+
677
+ class SegformerDecodeHead(SegformerPreTrainedModel):
678
+ def __init__(self, config):
679
+ super().__init__(config)
680
+ # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
681
+ mlps = []
682
+ for i in range(config.num_encoder_blocks):
683
+ mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i])
684
+ mlps.append(mlp)
685
+ self.linear_c = nn.ModuleList(mlps)
686
+
687
+ # the following 3 layers implement the ConvModule of the original implementation
688
+ self.linear_fuse = nn.Conv2d(
689
+ in_channels=config.decoder_hidden_size * config.num_encoder_blocks,
690
+ out_channels=config.decoder_hidden_size,
691
+ kernel_size=1,
692
+ bias=False,
693
+ )
694
+ self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size)
695
+ self.activation = nn.ReLU()
696
+
697
+ self.dropout = nn.Dropout(config.classifier_dropout_prob)
698
+ self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1)
699
+
700
+ self.config = config
701
+
702
+ def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor:
703
+ batch_size = encoder_hidden_states[-1].shape[0]
704
+
705
+ all_hidden_states = ()
706
+ for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c):
707
+ if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3:
708
+ height = width = int(math.sqrt(encoder_hidden_state.shape[-1]))
709
+ encoder_hidden_state = (
710
+ encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
711
+ )
712
+
713
+ # unify channel dimension
714
+ height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
715
+ encoder_hidden_state = mlp(encoder_hidden_state)
716
+ encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
717
+ encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width)
718
+ # upsample
719
+ encoder_hidden_state = nn.functional.interpolate(
720
+ encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False
721
+ )
722
+ all_hidden_states += (encoder_hidden_state,)
723
+
724
+ hidden_states = self.linear_fuse(torch.cat(all_hidden_states[::-1], dim=1))
725
+ hidden_states = self.batch_norm(hidden_states)
726
+ hidden_states = self.activation(hidden_states)
727
+ hidden_states = self.dropout(hidden_states)
728
+
729
+ # logits are of shape (batch_size, num_labels, height/4, width/4)
730
+ logits = self.classifier(hidden_states)
731
+
732
+ return logits
733
+
734
+
735
+ @add_start_docstrings(
736
+ """SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
737
+ SEGFORMER_START_DOCSTRING,
738
+ )
739
+ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
740
+ def __init__(self, config):
741
+ super().__init__(config)
742
+ self.segformer = SegformerModel(config)
743
+ self.decode_head = SegformerDecodeHead(config)
744
+
745
+ # Initialize weights and apply final processing
746
+ self.post_init()
747
+
748
+ @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
749
+ @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
750
+ def forward(
751
+ self,
752
+ pixel_values: torch.FloatTensor,
753
+ labels: Optional[torch.LongTensor] = None,
754
+ output_attentions: Optional[bool] = None,
755
+ output_hidden_states: Optional[bool] = None,
756
+ return_dict: Optional[bool] = None,
757
+ ) -> Union[Tuple, SemanticSegmenterOutput]:
758
+ r"""
759
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
760
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
761
+ config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
762
+
763
+ Returns:
764
+
765
+ Examples:
766
+
767
+ ```python
768
+ >>> from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
769
+ >>> from PIL import Image
770
+ >>> import requests
771
+
772
+ >>> image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
773
+ >>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
774
+
775
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
776
+ >>> image = Image.open(requests.get(url, stream=True).raw)
777
+
778
+ >>> inputs = image_processor(images=image, return_tensors="pt")
779
+ >>> outputs = model(**inputs)
780
+ >>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)
781
+ >>> list(logits.shape)
782
+ [1, 150, 128, 128]
783
+ ```"""
784
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
785
+ output_hidden_states = (
786
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
787
+ )
788
+
789
+ if labels is not None and self.config.num_labels < 1:
790
+ raise ValueError(f"Number of labels should be >=0: {self.config.num_labels}")
791
+
792
+ outputs = self.segformer(
793
+ pixel_values,
794
+ output_attentions=output_attentions,
795
+ output_hidden_states=True, # we need the intermediate hidden states
796
+ return_dict=return_dict,
797
+ )
798
+
799
+ encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
800
+
801
+ logits = self.decode_head(encoder_hidden_states)
802
+
803
+ loss = None
804
+ if labels is not None:
805
+ # upsample logits to the images' original size
806
+ upsampled_logits = nn.functional.interpolate(
807
+ logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
808
+ )
809
+ if self.config.num_labels > 1:
810
+ loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
811
+ loss = loss_fct(upsampled_logits, labels)
812
+ elif self.config.num_labels == 1:
813
+ valid_mask = ((labels >= 0) & (labels != self.config.semantic_loss_ignore_index)).float()
814
+ loss_fct = BCEWithLogitsLoss(reduction="none")
815
+ loss = loss_fct(upsampled_logits.squeeze(1), labels.float())
816
+ loss = (loss * valid_mask).mean()
817
+
818
+ if not return_dict:
819
+ if output_hidden_states:
820
+ output = (logits,) + outputs[1:]
821
+ else:
822
+ output = (logits,) + outputs[2:]
823
+ return ((loss,) + output) if loss is not None else output
824
+
825
+ return SemanticSegmenterOutput(
826
+ loss=loss,
827
+ logits=logits,
828
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
829
+ attentions=outputs.attentions,
830
+ )
831
+
832
+
833
+ __all__ = [
834
+ "SegformerDecodeHead",
835
+ "SegformerForImageClassification",
836
+ "SegformerForSemanticSegmentation",
837
+ "SegformerLayer",
838
+ "SegformerModel",
839
+ "SegformerPreTrainedModel",
840
+ ]
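
Since the decode head predicts at 1/4 of the input resolution, inference typically ends with the same bilinear upsampling used in the loss above before taking a per-pixel argmax. A minimal sketch of that post-processing (illustrative, assuming the same `nvidia/segformer-b0-finetuned-ade-512-512` checkpoint as the docstring example):

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation

image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (1, num_labels, height/4, width/4)

# illustrative post-processing: upsample to the original image size, mirroring the
# bilinear interpolation used for the loss, then take the class index per pixel
upsampled = torch.nn.functional.interpolate(
    logits, size=image.size[::-1], mode="bilinear", align_corners=False
)
segmentation_map = upsampled.argmax(dim=1)[0]  # (height, width)
```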
docs/transformers/build/lib/transformers/models/segformer/modeling_tf_segformer.py ADDED
@@ -0,0 +1,1045 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 NVIDIA The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """TensorFlow SegFormer model."""
16
+
17
+ from __future__ import annotations
18
+
19
+ import math
20
+ from typing import Optional, Tuple, Union
21
+
22
+ import tensorflow as tf
23
+
24
+ from ...activations_tf import get_tf_activation
25
+ from ...file_utils import (
26
+ add_code_sample_docstrings,
27
+ add_start_docstrings,
28
+ add_start_docstrings_to_model_forward,
29
+ replace_return_docstrings,
30
+ )
31
+ from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput
32
+ from ...modeling_tf_utils import (
33
+ TFPreTrainedModel,
34
+ TFSequenceClassificationLoss,
35
+ keras,
36
+ keras_serializable,
37
+ unpack_inputs,
38
+ )
39
+ from ...tf_utils import shape_list, stable_softmax
40
+ from ...utils import logging
41
+ from .configuration_segformer import SegformerConfig
42
+
43
+
44
+ logger = logging.get_logger(__name__)
45
+
46
+ # General docstring
47
+ _CONFIG_FOR_DOC = "SegformerConfig"
48
+
49
+ # Base docstring
50
+ _CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
51
+ _EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
52
+
53
+ # Image classification docstring
54
+ _IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
55
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
56
+
57
+
58
+ # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer
59
+ class TFSegformerDropPath(keras.layers.Layer):
60
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
61
+ References:
62
+ (1) github.com:rwightman/pytorch-image-models
63
+ """
64
+
65
+ def __init__(self, drop_path: float, **kwargs):
66
+ super().__init__(**kwargs)
67
+ self.drop_path = drop_path
68
+
69
+ def call(self, x: tf.Tensor, training=None):
70
+ if training:
71
+ keep_prob = 1 - self.drop_path
72
+ shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
73
+ random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
74
+ random_tensor = tf.floor(random_tensor)
75
+ return (x / keep_prob) * random_tensor
76
+ return x
77
+
78
+
79
+ class TFSegformerOverlapPatchEmbeddings(keras.layers.Layer):
80
+ """Construct the overlapping patch embeddings."""
81
+
82
+ def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs):
83
+ super().__init__(**kwargs)
84
+ self.padding = keras.layers.ZeroPadding2D(padding=patch_size // 2)
85
+ self.proj = keras.layers.Conv2D(
86
+ filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj"
87
+ )
88
+
89
+ self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
90
+ self.num_channels = num_channels
91
+ self.hidden_size = hidden_size
92
+
93
+ def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]:
94
+ embeddings = self.proj(self.padding(pixel_values))
95
+ height = shape_list(embeddings)[1]
96
+ width = shape_list(embeddings)[2]
97
+ hidden_dim = shape_list(embeddings)[3]
98
+ # (batch_size, height, width, num_channels) -> (batch_size, height*width, num_channels)
99
+ # this can be fed to a Transformer layer
100
+ embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim))
101
+ embeddings = self.layer_norm(embeddings)
102
+ return embeddings, height, width
103
+
104
+ def build(self, input_shape=None):
105
+ if self.built:
106
+ return
107
+ self.built = True
108
+ if getattr(self, "proj", None) is not None:
109
+ with tf.name_scope(self.proj.name):
110
+ self.proj.build([None, None, None, self.num_channels])
111
+ if getattr(self, "layer_norm", None) is not None:
112
+ with tf.name_scope(self.layer_norm.name):
113
+ self.layer_norm.build([None, None, self.hidden_size])
114
+
115
+
116
+ class TFSegformerEfficientSelfAttention(keras.layers.Layer):
117
+ """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
118
+ paper](https://arxiv.org/abs/2102.12122)."""
119
+
120
+ def __init__(
121
+ self,
122
+ config: SegformerConfig,
123
+ hidden_size: int,
124
+ num_attention_heads: int,
125
+ sequence_reduction_ratio: int,
126
+ **kwargs,
127
+ ):
128
+ super().__init__(**kwargs)
129
+ self.hidden_size = hidden_size
130
+ self.num_attention_heads = num_attention_heads
131
+
132
+ if self.hidden_size % self.num_attention_heads != 0:
133
+ raise ValueError(
134
+ f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
135
+ f"heads ({self.num_attention_heads})"
136
+ )
137
+
138
+ self.attention_head_size = self.hidden_size // self.num_attention_heads
139
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
140
+ self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
141
+
142
+ self.query = keras.layers.Dense(self.all_head_size, name="query")
143
+ self.key = keras.layers.Dense(self.all_head_size, name="key")
144
+ self.value = keras.layers.Dense(self.all_head_size, name="value")
145
+
146
+ self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
147
+
148
+ self.sr_ratio = sequence_reduction_ratio
149
+ if sequence_reduction_ratio > 1:
150
+ self.sr = keras.layers.Conv2D(
151
+ filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr"
152
+ )
153
+ self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
154
+
155
+ def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor:
156
+ # Reshape from [batch_size, seq_length, all_head_size]
157
+ # to [batch_size, seq_length, num_attention_heads, attention_head_size]
158
+ batch_size = shape_list(tensor)[0]
159
+ tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
160
+
161
+ # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size]
162
+ # to [batch_size, num_attention_heads, seq_length, attention_head_size]
163
+ return tf.transpose(tensor, perm=[0, 2, 1, 3])
164
+
165
+ def call(
166
+ self,
167
+ hidden_states: tf.Tensor,
168
+ height: int,
169
+ width: int,
170
+ output_attentions: bool = False,
171
+ training: bool = False,
172
+ ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
173
+ batch_size = shape_list(hidden_states)[0]
174
+ num_channels = shape_list(hidden_states)[2]
175
+
176
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
177
+
178
+ if self.sr_ratio > 1:
179
+ # Reshape to (batch_size, height, width, num_channels)
180
+ hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
181
+ # Apply sequence reduction
182
+ hidden_states = self.sr(hidden_states)
183
+ # Reshape back to (batch_size, seq_len, num_channels)
184
+ hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels))
185
+ hidden_states = self.layer_norm(hidden_states)
186
+
187
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
188
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
189
+
190
+ # Take the dot product between "query" and "key" to get the raw attention scores.
191
+ attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
192
+
193
+ scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
194
+ attention_scores = tf.divide(attention_scores, scale)
195
+
196
+ # Normalize the attention scores to probabilities.
197
+ attention_probs = stable_softmax(logits=attention_scores, axis=-1)
198
+
199
+ # This is actually dropping out entire tokens to attend to, which might
200
+ # seem a bit unusual, but is taken from the original Transformer paper.
201
+ attention_probs = self.dropout(attention_probs, training=training)
202
+
203
+ context_layer = tf.matmul(attention_probs, value_layer)
204
+
205
+ context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
206
+ # (batch_size, seq_len_q, all_head_size)
207
+ context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size))
208
+
209
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
210
+ return outputs
211
+
212
+ def build(self, input_shape=None):
213
+ if self.built:
214
+ return
215
+ self.built = True
216
+ if getattr(self, "query", None) is not None:
217
+ with tf.name_scope(self.query.name):
218
+ self.query.build([None, None, self.hidden_size])
219
+ if getattr(self, "key", None) is not None:
220
+ with tf.name_scope(self.key.name):
221
+ self.key.build([None, None, self.hidden_size])
222
+ if getattr(self, "value", None) is not None:
223
+ with tf.name_scope(self.value.name):
224
+ self.value.build([None, None, self.hidden_size])
225
+ if getattr(self, "sr", None) is not None:
226
+ with tf.name_scope(self.sr.name):
227
+ self.sr.build([None, None, None, self.hidden_size])
228
+ if getattr(self, "layer_norm", None) is not None:
229
+ with tf.name_scope(self.layer_norm.name):
230
+ self.layer_norm.build([None, None, self.hidden_size])
231
+
232
+
233
+ class TFSegformerSelfOutput(keras.layers.Layer):
234
+ def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs):
235
+ super().__init__(**kwargs)
236
+ self.dense = keras.layers.Dense(hidden_size, name="dense")
237
+ self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
238
+ self.hidden_size = hidden_size
239
+
240
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
241
+ hidden_states = self.dense(hidden_states)
242
+ hidden_states = self.dropout(hidden_states, training=training)
243
+ return hidden_states
244
+
245
+ def build(self, input_shape=None):
246
+ if self.built:
247
+ return
248
+ self.built = True
249
+ if getattr(self, "dense", None) is not None:
250
+ with tf.name_scope(self.dense.name):
251
+ self.dense.build([None, None, self.hidden_size])
252
+
253
+
254
+ class TFSegformerAttention(keras.layers.Layer):
255
+ def __init__(
256
+ self,
257
+ config: SegformerConfig,
258
+ hidden_size: int,
259
+ num_attention_heads: int,
260
+ sequence_reduction_ratio: int,
261
+ **kwargs,
262
+ ):
263
+ super().__init__(**kwargs)
264
+ self.self = TFSegformerEfficientSelfAttention(
265
+ config=config,
266
+ hidden_size=hidden_size,
267
+ num_attention_heads=num_attention_heads,
268
+ sequence_reduction_ratio=sequence_reduction_ratio,
269
+ name="self",
270
+ )
271
+ self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output")
272
+
273
+ def call(
274
+ self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False
275
+ ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
276
+ self_outputs = self.self(hidden_states, height, width, output_attentions)
277
+
278
+ attention_output = self.dense_output(self_outputs[0])
279
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
280
+ return outputs
281
+
282
+ def build(self, input_shape=None):
283
+ if self.built:
284
+ return
285
+ self.built = True
286
+ if getattr(self, "self", None) is not None:
287
+ with tf.name_scope(self.self.name):
288
+ self.self.build(None)
289
+ if getattr(self, "dense_output", None) is not None:
290
+ with tf.name_scope(self.dense_output.name):
291
+ self.dense_output.build(None)
292
+
293
+
294
+ class TFSegformerDWConv(keras.layers.Layer):
295
+ def __init__(self, dim: int = 768, **kwargs):
296
+ super().__init__(**kwargs)
297
+ self.depthwise_convolution = keras.layers.Conv2D(
298
+ filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv"
299
+ )
300
+ self.dim = dim
301
+
302
+ def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor:
303
+ batch_size = shape_list(hidden_states)[0]
304
+ num_channels = shape_list(hidden_states)[-1]
305
+ hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
306
+ hidden_states = self.depthwise_convolution(hidden_states)
307
+
308
+ new_height = shape_list(hidden_states)[1]
309
+ new_width = shape_list(hidden_states)[2]
310
+ num_channels = shape_list(hidden_states)[3]
311
+ hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels))
312
+ return hidden_states
313
+
314
+ def build(self, input_shape=None):
315
+ if self.built:
316
+ return
317
+ self.built = True
318
+ if getattr(self, "depthwise_convolution", None) is not None:
319
+ with tf.name_scope(self.depthwise_convolution.name):
320
+ self.depthwise_convolution.build([None, None, None, self.dim])
321
+
322
+
323
+ class TFSegformerMixFFN(keras.layers.Layer):
324
+ def __init__(
325
+ self,
326
+ config: SegformerConfig,
327
+ in_features: int,
328
+ hidden_features: Optional[int] = None,
329
+ out_features: Optional[int] = None,
330
+ **kwargs,
331
+ ):
332
+ super().__init__(**kwargs)
333
+ out_features = out_features or in_features
334
+ self.dense1 = keras.layers.Dense(hidden_features, name="dense1")
335
+ self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv")
336
+ if isinstance(config.hidden_act, str):
337
+ self.intermediate_act_fn = get_tf_activation(config.hidden_act)
338
+ else:
339
+ self.intermediate_act_fn = config.hidden_act
340
+ self.dense2 = keras.layers.Dense(out_features, name="dense2")
341
+ self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
342
+ self.hidden_features = hidden_features
343
+ self.in_features = in_features
344
+
345
+ def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
346
+ hidden_states = self.dense1(hidden_states)
347
+ hidden_states = self.depthwise_convolution(hidden_states, height, width)
348
+ hidden_states = self.intermediate_act_fn(hidden_states)
349
+ hidden_states = self.dropout(hidden_states, training=training)
350
+ hidden_states = self.dense2(hidden_states)
351
+ hidden_states = self.dropout(hidden_states, training=training)
352
+ return hidden_states
353
+
354
+ def build(self, input_shape=None):
355
+ if self.built:
356
+ return
357
+ self.built = True
358
+ if getattr(self, "dense1", None) is not None:
359
+ with tf.name_scope(self.dense1.name):
360
+ self.dense1.build([None, None, self.in_features])
361
+ if getattr(self, "depthwise_convolution", None) is not None:
362
+ with tf.name_scope(self.depthwise_convolution.name):
363
+ self.depthwise_convolution.build(None)
364
+ if getattr(self, "dense2", None) is not None:
365
+ with tf.name_scope(self.dense2.name):
366
+ self.dense2.build([None, None, self.hidden_features])
367
+
368
+
369
+ class TFSegformerLayer(keras.layers.Layer):
370
+ """This corresponds to the Block class in the original implementation."""
371
+
372
+ def __init__(
373
+ self,
374
+ config,
375
+ hidden_size: int,
376
+ num_attention_heads: int,
377
+ drop_path: float,
378
+ sequence_reduction_ratio: int,
379
+ mlp_ratio: int,
380
+ **kwargs,
381
+ ):
382
+ super().__init__(**kwargs)
383
+ self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1")
384
+ self.attention = TFSegformerAttention(
385
+ config,
386
+ hidden_size=hidden_size,
387
+ num_attention_heads=num_attention_heads,
388
+ sequence_reduction_ratio=sequence_reduction_ratio,
389
+ name="attention",
390
+ )
391
+ self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else keras.layers.Activation("linear")
392
+ self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2")
393
+ mlp_hidden_size = int(hidden_size * mlp_ratio)
394
+ self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp")
395
+ self.hidden_size = hidden_size
396
+
397
+ def call(
398
+ self,
399
+ hidden_states: tf.Tensor,
400
+ height: int,
401
+ width: int,
402
+ output_attentions: bool = False,
403
+ training: bool = False,
404
+ ) -> Tuple:
405
+ self_attention_outputs = self.attention(
406
+ self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention
407
+ height,
408
+ width,
409
+ output_attentions=output_attentions,
410
+ training=training,
411
+ )
412
+
413
+ attention_output = self_attention_outputs[0]
414
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
415
+
416
+ # first residual connection (with stochastic depth)
417
+ attention_output = self.drop_path(attention_output, training=training)
418
+ hidden_states = attention_output + hidden_states
419
+ mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
420
+
421
+ # second residual connection (with stochastic depth)
422
+ mlp_output = self.drop_path(mlp_output, training=training)
423
+ layer_output = mlp_output + hidden_states
424
+
425
+ outputs = (layer_output,) + outputs
426
+
427
+ return outputs
428
+
429
+ def build(self, input_shape=None):
430
+ if self.built:
431
+ return
432
+ self.built = True
433
+ if getattr(self, "layer_norm_1", None) is not None:
434
+ with tf.name_scope(self.layer_norm_1.name):
435
+ self.layer_norm_1.build([None, None, self.hidden_size])
436
+ if getattr(self, "attention", None) is not None:
437
+ with tf.name_scope(self.attention.name):
438
+ self.attention.build(None)
439
+ if getattr(self, "layer_norm_2", None) is not None:
440
+ with tf.name_scope(self.layer_norm_2.name):
441
+ self.layer_norm_2.build([None, None, self.hidden_size])
442
+ if getattr(self, "mlp", None) is not None:
443
+ with tf.name_scope(self.mlp.name):
444
+ self.mlp.build(None)
445
+
446
+
447
+ class TFSegformerEncoder(keras.layers.Layer):
448
+ def __init__(self, config: SegformerConfig, **kwargs):
449
+ super().__init__(**kwargs)
450
+ self.config = config
451
+
452
+ # stochastic depth decay rule
453
+ drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
454
+
455
+ # patch embeddings
456
+ embeddings = []
457
+ for i in range(config.num_encoder_blocks):
458
+ embeddings.append(
459
+ TFSegformerOverlapPatchEmbeddings(
460
+ patch_size=config.patch_sizes[i],
461
+ stride=config.strides[i],
462
+ num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
463
+ hidden_size=config.hidden_sizes[i],
464
+ name=f"patch_embeddings.{i}",
465
+ )
466
+ )
467
+ self.embeddings = embeddings
468
+
469
+ # Transformer blocks
470
+ blocks = []
471
+ cur = 0
472
+ for i in range(config.num_encoder_blocks):
473
+ # each block consists of layers
474
+ layers = []
475
+ if i != 0:
476
+ cur += config.depths[i - 1]
477
+ for j in range(config.depths[i]):
478
+ layers.append(
479
+ TFSegformerLayer(
480
+ config,
481
+ hidden_size=config.hidden_sizes[i],
482
+ num_attention_heads=config.num_attention_heads[i],
483
+ drop_path=drop_path_decays[cur + j],
484
+ sequence_reduction_ratio=config.sr_ratios[i],
485
+ mlp_ratio=config.mlp_ratios[i],
486
+ name=f"block.{i}.{j}",
487
+ )
488
+ )
489
+ blocks.append(layers)
490
+
491
+ self.block = blocks
492
+
493
+ # Layer norms
494
+ self.layer_norms = [
495
+ keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}")
496
+ for i in range(config.num_encoder_blocks)
497
+ ]
498
+
499
+ def call(
500
+ self,
501
+ pixel_values: tf.Tensor,
502
+ output_attentions: Optional[bool] = False,
503
+ output_hidden_states: Optional[bool] = False,
504
+ return_dict: Optional[bool] = True,
505
+ training: bool = False,
506
+ ) -> Union[Tuple, TFBaseModelOutput]:
507
+ all_hidden_states = () if output_hidden_states else None
508
+ all_self_attentions = () if output_attentions else None
509
+
510
+ batch_size = shape_list(pixel_values)[0]
511
+
512
+ hidden_states = pixel_values
513
+ for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)):
514
+ embedding_layer, block_layer, norm_layer = x
515
+ # first, obtain patch embeddings
516
+ hidden_states, height, width = embedding_layer(hidden_states)
517
+
518
+ # second, send embeddings through blocks
519
+ # (each block consists of multiple layers i.e., list of layers)
520
+ for i, blk in enumerate(block_layer):
521
+ layer_outputs = blk(
522
+ hidden_states,
523
+ height,
524
+ width,
525
+ output_attentions,
526
+ training=training,
527
+ )
528
+ hidden_states = layer_outputs[0]
529
+ if output_attentions:
530
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
531
+
532
+ # third, apply layer norm
533
+ hidden_states = norm_layer(hidden_states)
534
+
535
+ # fourth, optionally reshape back to (batch_size, height, width, num_channels)
536
+ if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage):
537
+ num_channels = shape_list(hidden_states)[-1]
538
+ hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
539
+
540
+ if output_hidden_states:
541
+ all_hidden_states = all_hidden_states + (hidden_states,)
542
+
543
+ if not return_dict:
544
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
545
+ return TFBaseModelOutput(
546
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
547
+ )
548
+
549
+ def build(self, input_shape=None):
550
+ if self.built:
551
+ return
552
+ self.built = True
553
+ if getattr(self, "layer_norms", None) is not None:
554
+ for layer, shape in zip(self.layer_norms, self.config.hidden_sizes):
555
+ with tf.name_scope(layer.name):
556
+ layer.build([None, None, shape])
557
+ if getattr(self, "block", None) is not None:
558
+ for block in self.block:
559
+ for layer in block:
560
+ with tf.name_scope(layer.name):
561
+ layer.build(None)
562
+ if getattr(self, "embeddings", None) is not None:
563
+ for layer in self.embeddings:
564
+ with tf.name_scope(layer.name):
565
+ layer.build(None)
566
+
567
+
568
+ @keras_serializable
569
+ class TFSegformerMainLayer(keras.layers.Layer):
570
+ config_class = SegformerConfig
571
+
572
+ def __init__(self, config: SegformerConfig, **kwargs):
573
+ super().__init__(**kwargs)
574
+
575
+ self.config = config
576
+ # hierarchical Transformer encoder
577
+ self.encoder = TFSegformerEncoder(config, name="encoder")
578
+
579
+ @unpack_inputs
580
+ def call(
581
+ self,
582
+ pixel_values: tf.Tensor,
583
+ output_attentions: Optional[bool] = None,
584
+ output_hidden_states: Optional[bool] = None,
585
+ return_dict: Optional[bool] = None,
586
+ training: bool = False,
587
+ ) -> Union[Tuple, TFBaseModelOutput]:
588
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
589
+ output_hidden_states = (
590
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
591
+ )
592
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
593
+
594
+ # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
595
+ # So change the input format from `NCHW` to `NHWC`.
596
+ # shape = (batch_size, in_height, in_width, in_channels=num_channels)
597
+ pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
598
+
599
+ encoder_outputs = self.encoder(
600
+ pixel_values,
601
+ output_attentions=output_attentions,
602
+ output_hidden_states=output_hidden_states,
603
+ return_dict=return_dict,
604
+ training=training,
605
+ )
606
+ sequence_output = encoder_outputs[0]
607
+ # Change to NCHW output format to have uniformity in the modules
608
+ sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2])
609
+
610
+ # Change the other hidden state outputs to NCHW as well
611
+ if output_hidden_states:
612
+ hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
613
+
614
+ if not return_dict:
615
+ if tf.greater(len(encoder_outputs[1:]), 0):
616
+ transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0])
617
+ return (sequence_output,) + (transposed_encoder_outputs,)
618
+ else:
619
+ return (sequence_output,) + encoder_outputs[1:]
620
+
621
+ return TFBaseModelOutput(
622
+ last_hidden_state=sequence_output,
623
+ hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
624
+ attentions=encoder_outputs.attentions,
625
+ )
626
+
627
+ def build(self, input_shape=None):
628
+ if self.built:
629
+ return
630
+ self.built = True
631
+ if getattr(self, "encoder", None) is not None:
632
+ with tf.name_scope(self.encoder.name):
633
+ self.encoder.build(None)
634
+
635
+
636
+ class TFSegformerPreTrainedModel(TFPreTrainedModel):
637
+ """
638
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
639
+ models.
640
+ """
641
+
642
+ config_class = SegformerConfig
643
+ base_model_prefix = "segformer"
644
+ main_input_name = "pixel_values"
645
+
646
+ @property
647
+ def input_signature(self):
648
+ return {"pixel_values": tf.TensorSpec(shape=(None, self.config.num_channels, 512, 512), dtype=tf.float32)}
649
+
650
+
651
+ SEGFORMER_START_DOCSTRING = r"""
652
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
653
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
654
+ etc.)
655
+
656
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
657
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
658
+ behavior.
659
+
660
+ Parameters:
661
+ config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
662
+ Initializing with a config file does not load the weights associated with the model, only the
663
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
664
+ """
665
+
666
+ SEGFORMER_INPUTS_DOCSTRING = r"""
667
+
668
+ Args:
669
+ pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, each example having the shape `(batch_size, num_channels, height, width)`):
670
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
671
+ [`SegformerImageProcessor.__call__`] for details.
672
+
673
+ output_attentions (`bool`, *optional*):
674
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
675
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
676
+ config will be used instead.
677
+
678
+ output_hidden_states (`bool`, *optional*):
679
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
680
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
681
+ used instead.
682
+
683
+ return_dict (`bool`, *optional*):
684
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
685
+ eager mode, in graph mode the value will always be set to True.
686
+
687
+ training (`bool`, *optional*, defaults to `False`):
688
+ Whether or not to use the model in training mode (some modules like dropout modules have different
689
+ behaviors between training and evaluation).
690
+ """
691
+
692
+
693
+ @add_start_docstrings(
694
+ "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
695
+ SEGFORMER_START_DOCSTRING,
696
+ )
697
+ class TFSegformerModel(TFSegformerPreTrainedModel):
698
+ def __init__(self, config: SegformerConfig, *inputs, **kwargs):
699
+ super().__init__(config, *inputs, **kwargs)
700
+ self.config = config
701
+
702
+ # hierarchical Transformer encoder
703
+ self.segformer = TFSegformerMainLayer(config, name="segformer")
704
+
705
+ @unpack_inputs
706
+ @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
707
+ @add_code_sample_docstrings(
708
+ checkpoint=_CHECKPOINT_FOR_DOC,
709
+ output_type=TFBaseModelOutput,
710
+ config_class=_CONFIG_FOR_DOC,
711
+ modality="vision",
712
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
713
+ )
714
+ def call(
715
+ self,
716
+ pixel_values: tf.Tensor,
717
+ output_attentions: Optional[bool] = None,
718
+ output_hidden_states: Optional[bool] = None,
719
+ return_dict: Optional[bool] = None,
720
+ training: bool = False,
721
+ ) -> Union[Tuple, TFBaseModelOutput]:
722
+ outputs = self.segformer(
723
+ pixel_values,
724
+ output_attentions=output_attentions,
725
+ output_hidden_states=output_hidden_states,
726
+ return_dict=return_dict,
727
+ training=training,
728
+ )
729
+ return outputs
730
+
731
+ def build(self, input_shape=None):
732
+ if self.built:
733
+ return
734
+ self.built = True
735
+ if getattr(self, "segformer", None) is not None:
736
+ with tf.name_scope(self.segformer.name):
737
+ self.segformer.build(None)
738
+
739
+
740
+ @add_start_docstrings(
741
+ """
742
+ SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
743
+ states) e.g. for ImageNet.
744
+ """,
745
+ SEGFORMER_START_DOCSTRING,
746
+ )
747
+ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss):
748
+ def __init__(self, config: SegformerConfig, *inputs, **kwargs):
749
+ super().__init__(config, *inputs, **kwargs)
750
+
751
+ self.num_labels = config.num_labels
752
+ self.segformer = TFSegformerMainLayer(config, name="segformer")
753
+
754
+ # Classifier head
755
+ self.classifier = keras.layers.Dense(config.num_labels, name="classifier")
756
+ self.config = config
757
+
758
+ @unpack_inputs
759
+ @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
760
+ @add_code_sample_docstrings(
761
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
762
+ output_type=TFSequenceClassifierOutput,
763
+ config_class=_CONFIG_FOR_DOC,
764
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
765
+ )
766
+ def call(
767
+ self,
768
+ pixel_values: tf.Tensor | None = None,
769
+ labels: tf.Tensor | None = None,
770
+ output_attentions: Optional[bool] = None,
771
+ output_hidden_states: Optional[bool] = None,
772
+ return_dict: Optional[bool] = None,
773
+ ) -> Union[Tuple, TFSequenceClassifierOutput]:
774
+ outputs = self.segformer(
775
+ pixel_values,
776
+ output_attentions=output_attentions,
777
+ output_hidden_states=output_hidden_states,
778
+ return_dict=return_dict,
779
+ )
780
+
781
+ sequence_output = outputs[0]
782
+
783
+ # convert last hidden states to (batch_size, height*width, hidden_size)
784
+ batch_size = shape_list(sequence_output)[0]
785
+ sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1])
786
+ sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1]))
787
+
788
+ # global average pooling
789
+ sequence_output = tf.reduce_mean(sequence_output, axis=1)
790
+
791
+ logits = self.classifier(sequence_output)
792
+
793
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
794
+
795
+ if not return_dict:
796
+ output = (logits,) + outputs[1:]
797
+ return ((loss,) + output) if loss is not None else output
798
+
799
+ return TFSequenceClassifierOutput(
800
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
801
+ )
802
+
803
+ def build(self, input_shape=None):
804
+ if self.built:
805
+ return
806
+ self.built = True
807
+ if getattr(self, "segformer", None) is not None:
808
+ with tf.name_scope(self.segformer.name):
809
+ self.segformer.build(None)
810
+ if getattr(self, "classifier", None) is not None:
811
+ with tf.name_scope(self.classifier.name):
812
+ self.classifier.build([None, None, self.config.hidden_sizes[-1]])
813
+
814
+
815
+ class TFSegformerMLP(keras.layers.Layer):
816
+ """
817
+ Linear Embedding.
818
+ """
819
+
820
+ def __init__(self, input_dim: int, config: SegformerConfig, **kwargs):
821
+ super().__init__(**kwargs)
822
+ self.proj = keras.layers.Dense(config.decoder_hidden_size, name="proj")
823
+ self.input_dim = input_dim
824
+
825
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
826
+ height = shape_list(hidden_states)[1]
827
+ width = shape_list(hidden_states)[2]
828
+ hidden_dim = shape_list(hidden_states)[-1]
829
+ hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim))
830
+ hidden_states = self.proj(hidden_states)
831
+ return hidden_states
832
+
833
+ def build(self, input_shape=None):
834
+ if self.built:
835
+ return
836
+ self.built = True
837
+ if getattr(self, "proj", None) is not None:
838
+ with tf.name_scope(self.proj.name):
839
+ self.proj.build([None, None, self.input_dim])
840
+
841
+
842
+ class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
843
+ def __init__(self, config: SegformerConfig, **kwargs):
844
+ super().__init__(config, **kwargs)
845
+ # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
846
+ mlps = []
847
+ for i in range(config.num_encoder_blocks):
848
+ mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}")
849
+ mlps.append(mlp)
850
+ self.mlps = mlps
851
+
852
+ # the following 3 layers implement the ConvModule of the original implementation
853
+ self.linear_fuse = keras.layers.Conv2D(
854
+ filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse"
855
+ )
856
+ self.batch_norm = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm")
857
+ self.activation = keras.layers.Activation("relu")
858
+
859
+ self.dropout = keras.layers.Dropout(config.classifier_dropout_prob)
860
+ self.classifier = keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier")
861
+
862
+ self.config = config
863
+
864
+ def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
865
+ all_hidden_states = ()
866
+ for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps):
867
+ if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3:
868
+ height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32))
869
+ height = width = tf.cast(height, tf.int32)
870
+ channel_dim = shape_list(encoder_hidden_state)[-1]
871
+ encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
872
+
873
+ # unify channel dimension
874
+ encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1])
875
+ height, width = shape_list(encoder_hidden_state)[1:3]
876
+ encoder_hidden_state = mlp(encoder_hidden_state)
877
+ channel_dim = shape_list(encoder_hidden_state)[-1]
878
+ encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
879
+
880
+ # upsample
881
+ temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1])
882
+ upsample_resolution = shape_list(temp_state)[1:-1]
883
+ encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear")
884
+ all_hidden_states += (encoder_hidden_state,)
885
+
886
+ hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1))
887
+ hidden_states = self.batch_norm(hidden_states, training=training)
888
+ hidden_states = self.activation(hidden_states)
889
+ hidden_states = self.dropout(hidden_states, training=training)
890
+
891
+ # logits of shape (batch_size, height/4, width/4, num_labels)
892
+ logits = self.classifier(hidden_states)
893
+
894
+ return logits
895
+
896
+ def build(self, input_shape=None):
897
+ if self.built:
898
+ return
899
+ self.built = True
900
+ if getattr(self, "linear_fuse", None) is not None:
901
+ with tf.name_scope(self.linear_fuse.name):
902
+ self.linear_fuse.build(
903
+ [None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks]
904
+ )
905
+ if getattr(self, "batch_norm", None) is not None:
906
+ with tf.name_scope(self.batch_norm.name):
907
+ self.batch_norm.build([None, None, None, self.config.decoder_hidden_size])
908
+ if getattr(self, "classifier", None) is not None:
909
+ with tf.name_scope(self.classifier.name):
910
+ self.classifier.build([None, None, None, self.config.decoder_hidden_size])
911
+ if getattr(self, "mlps", None) is not None:
912
+ for layer in self.mlps:
913
+ with tf.name_scope(layer.name):
914
+ layer.build(None)
915
+
916
+
917
+ @add_start_docstrings(
918
+ """SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
919
+ SEGFORMER_START_DOCSTRING,
920
+ )
921
+ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
922
+ def __init__(self, config: SegformerConfig, **kwargs):
923
+ super().__init__(config, **kwargs)
924
+ self.segformer = TFSegformerMainLayer(config, name="segformer")
925
+ self.decode_head = TFSegformerDecodeHead(config, name="decode_head")
926
+
927
+ def hf_compute_loss(self, logits, labels):
928
+ # upsample logits to the images' original size
929
+ # `labels` is of shape (batch_size, height, width)
930
+ label_interp_shape = shape_list(labels)[1:]
931
+
932
+ upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
933
+ # compute weighted loss
934
+ loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
935
+
936
+ def masked_loss(real, pred):
937
+ unmasked_loss = loss_fct(real, pred)
938
+ mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype)
939
+ masked_loss = unmasked_loss * mask
940
+ # Reduction strategy in a similar spirit to
941
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210
942
+ reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)
943
+ return tf.reshape(reduced_masked_loss, (1,))
944
+
945
+ return masked_loss(labels, upsampled_logits)
946
+
947
+ @unpack_inputs
948
+ @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
949
+ @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
950
+ def call(
951
+ self,
952
+ pixel_values: tf.Tensor,
953
+ labels: tf.Tensor | None = None,
954
+ output_attentions: Optional[bool] = None,
955
+ output_hidden_states: Optional[bool] = None,
956
+ return_dict: Optional[bool] = None,
957
+ ) -> Union[Tuple, TFSemanticSegmenterOutput]:
958
+ r"""
959
+ labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
960
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
961
+ config.num_labels - 1]`. If `config.num_labels > 1`, a (per-pixel) classification loss is computed
962
+ (Cross-Entropy).
963
+
964
+ Returns:
965
+
966
+ Examples:
967
+
968
+ ```python
969
+ >>> from transformers import AutoImageProcessor, TFSegformerForSemanticSegmentation
970
+ >>> from PIL import Image
971
+ >>> import requests
972
+
973
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
974
+ >>> image = Image.open(requests.get(url, stream=True).raw)
975
+
976
+ >>> image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
977
+ >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
978
+
979
+ >>> inputs = image_processor(images=image, return_tensors="tf")
980
+ >>> outputs = model(**inputs, training=False)
981
+ >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
982
+ >>> logits = outputs.logits
983
+ >>> list(logits.shape)
984
+ [1, 150, 128, 128]
985
+ ```"""
986
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
987
+ output_hidden_states = (
988
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
989
+ )
990
+
991
+ if labels is not None and not self.config.num_labels > 1:
992
+ raise ValueError("The number of labels should be greater than one")
993
+
994
+ outputs = self.segformer(
995
+ pixel_values,
996
+ output_attentions=output_attentions,
997
+ output_hidden_states=True, # we need the intermediate hidden states
998
+ return_dict=return_dict,
999
+ )
1000
+
1001
+ encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
1002
+
1003
+ logits = self.decode_head(encoder_hidden_states)
1004
+
1005
+ loss = None
1006
+ if labels is not None:
1007
+ loss = self.hf_compute_loss(logits=logits, labels=labels)
1008
+
1009
+ # make logits of shape (batch_size, num_labels, height, width) to
1010
+ # keep them consistent across APIs
1011
+ logits = tf.transpose(logits, perm=[0, 3, 1, 2])
1012
+
1013
+ if not return_dict:
1014
+ if output_hidden_states:
1015
+ output = (logits,) + outputs[1:]
1016
+ else:
1017
+ output = (logits,) + outputs[2:]
1018
+ return ((loss,) + output) if loss is not None else output
1019
+
1020
+ return TFSemanticSegmenterOutput(
1021
+ loss=loss,
1022
+ logits=logits,
1023
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
1024
+ attentions=outputs.attentions,
1025
+ )
1026
+
1027
+ def build(self, input_shape=None):
1028
+ if self.built:
1029
+ return
1030
+ self.built = True
1031
+ if getattr(self, "segformer", None) is not None:
1032
+ with tf.name_scope(self.segformer.name):
1033
+ self.segformer.build(None)
1034
+ if getattr(self, "decode_head", None) is not None:
1035
+ with tf.name_scope(self.decode_head.name):
1036
+ self.decode_head.build(None)
1037
+
1038
+
1039
+ __all__ = [
1040
+ "TFSegformerDecodeHead",
1041
+ "TFSegformerForImageClassification",
1042
+ "TFSegformerForSemanticSegmentation",
1043
+ "TFSegformerModel",
1044
+ "TFSegformerPreTrainedModel",
1045
+ ]
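
The TF variant returns logits in NCHW for parity with the PyTorch model, while `tf.image.resize` expects channels-last. A minimal sketch of one way to post-process the outputs (illustrative, assuming the same checkpoint as the PyTorch example):

```python
import requests
import tensorflow as tf
from PIL import Image
from transformers import AutoImageProcessor, TFSegformerForSemanticSegmentation

image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="tf")
logits = model(**inputs, training=False).logits  # (1, num_labels, height/4, width/4)

# illustrative post-processing: move channels last for tf.image.resize,
# upsample to the input size, then take the class index per pixel
logits = tf.transpose(logits, perm=[0, 2, 3, 1])
upsampled = tf.image.resize(logits, size=image.size[::-1], method="bilinear")
segmentation_map = tf.argmax(upsampled, axis=-1)[0]  # (height, width)
```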
docs/transformers/build/lib/transformers/models/seggpt/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_seggpt import *
22
+ from .image_processing_seggpt import *
23
+ from .modeling_seggpt import *
24
+ else:
25
+ import sys
26
+
27
+ _file = globals()["__file__"]
28
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
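
The `_LazyModule` indirection defers the heavy imports until an attribute is first accessed, but from the caller's side it behaves like a regular eager import. A small illustrative sketch:

```python
# resolved lazily on first attribute access thanks to _LazyModule
from transformers.models.seggpt import SegGptConfig

config = SegGptConfig()
print(config.model_type)  # "seggpt"
```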
docs/transformers/build/lib/transformers/models/seggpt/configuration_seggpt.py ADDED
@@ -0,0 +1,143 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """SegGpt model configuration"""
16
+
17
+ from ...configuration_utils import PretrainedConfig
18
+ from ...utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class SegGptConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`SegGptModel`]. It is used to instantiate a SegGPT
27
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
28
+ defaults will yield a similar configuration to that of the SegGPT
29
+ [BAAI/seggpt-vit-large](https://huggingface.co/BAAI/seggpt-vit-large) architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+ Args:
35
+ hidden_size (`int`, *optional*, defaults to 1024):
36
+ Dimensionality of the encoder layers and the pooler layer.
37
+ num_hidden_layers (`int`, *optional*, defaults to 24):
38
+ Number of hidden layers in the Transformer encoder.
39
+ num_attention_heads (`int`, *optional*, defaults to 16):
40
+ Number of attention heads for each attention layer in the Transformer encoder.
41
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
42
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
43
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
44
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
45
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
46
+ initializer_range (`float`, *optional*, defaults to 0.02):
47
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
48
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
49
+ The epsilon used by the layer normalization layers.
50
+ image_size (`List[int]`, *optional*, defaults to `[896, 448]`):
51
+ The size (resolution) of each image.
52
+ patch_size (`int`, *optional*, defaults to 16):
53
+ The size (resolution) of each patch.
54
+ num_channels (`int`, *optional*, defaults to 3):
55
+ The number of input channels.
56
+ qkv_bias (`bool`, *optional*, defaults to `True`):
57
+ Whether to add a bias to the queries, keys and values.
58
+ mlp_dim (`int`, *optional*):
59
+ The dimensionality of the MLP layer in the Transformer encoder. If unset, defaults to
60
+ `hidden_size` * 4.
61
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
62
+ The drop path rate for the dropout layers.
63
+ pretrain_image_size (`int`, *optional*, defaults to 224):
64
+ The pretrained size of the absolute position embeddings.
65
+ decoder_hidden_size (`int`, *optional*, defaults to 64):
66
+ Hidden size for decoder.
67
+ use_relative_position_embeddings (`bool`, *optional*, defaults to `True`):
68
+ Whether to use relative position embeddings in the attention layers.
69
+ merge_index (`int`, *optional*, defaults to 2):
70
+ The index of the encoder layer to merge the embeddings.
71
+ intermediate_hidden_state_indices (`List[int]`, *optional*, defaults to `[5, 11, 17, 23]`):
72
+ The indices of the encoder layers which we store as features for the decoder.
73
+ beta (`float`, *optional*, defaults to 0.01):
74
+ Regularization factor for SegGptLoss (smooth-l1 loss).
75
+
76
+ Example:
77
+
78
+ ```python
79
+ >>> from transformers import SegGptConfig, SegGptModel
80
+
81
+ >>> # Initializing a SegGPT seggpt-vit-large style configuration
82
+ >>> configuration = SegGptConfig()
83
+
84
+ >>> # Initializing a model (with random weights) from the seggpt-vit-large style configuration
85
+ >>> model = SegGptModel(configuration)
86
+
87
+ >>> # Accessing the model configuration
88
+ >>> configuration = model.config
89
+ ```"""
90
+
91
+ model_type = "seggpt"
92
+
93
+ def __init__(
94
+ self,
95
+ hidden_size=1024,
96
+ num_hidden_layers=24,
97
+ num_attention_heads=16,
98
+ hidden_act="gelu",
99
+ hidden_dropout_prob=0.0,
100
+ initializer_range=0.02,
101
+ layer_norm_eps=1e-6,
102
+ image_size=[896, 448],
103
+ patch_size=16,
104
+ num_channels=3,
105
+ qkv_bias=True,
106
+ mlp_dim=None,
107
+ drop_path_rate=0.1,
108
+ pretrain_image_size=224,
109
+ decoder_hidden_size=64,
110
+ use_relative_position_embeddings=True,
111
+ merge_index=2,
112
+ intermediate_hidden_state_indices=[5, 11, 17, 23],
113
+ beta=0.01,
114
+ **kwargs,
115
+ ):
116
+ super().__init__(**kwargs)
117
+
118
+ if merge_index > min(intermediate_hidden_state_indices):
119
+ raise ValueError(
120
+ f"Merge index must be less than the minimum encoder output index, but got {merge_index=} and {intermediate_hidden_state_indices=}"
121
+ )
122
+ self.hidden_size = hidden_size
123
+ self.num_hidden_layers = num_hidden_layers
124
+ self.num_attention_heads = num_attention_heads
125
+ self.hidden_act = hidden_act
126
+ self.hidden_dropout_prob = hidden_dropout_prob
127
+ self.initializer_range = initializer_range
128
+ self.layer_norm_eps = layer_norm_eps
129
+ self.image_size = image_size
130
+ self.patch_size = patch_size
131
+ self.num_channels = num_channels
132
+ self.qkv_bias = qkv_bias
133
+ self.drop_path_rate = drop_path_rate
134
+ self.pretrain_image_size = pretrain_image_size
135
+ self.decoder_hidden_size = decoder_hidden_size
136
+ self.use_relative_position_embeddings = use_relative_position_embeddings
137
+ self.merge_index = merge_index
138
+ self.intermediate_hidden_state_indices = intermediate_hidden_state_indices
139
+ self.beta = beta
140
+ self.mlp_dim = int(hidden_size * 4) if mlp_dim is None else mlp_dim
141
+
142
+
143
+ __all__ = ["SegGptConfig"]
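
Note that the constructor validates `merge_index` against `intermediate_hidden_state_indices`: merging must happen before the first encoder layer whose hidden state is stored for the decoder. A short sketch of the check (values chosen for illustration):

```python
from transformers import SegGptConfig

# fine: merge_index (2) is below the smallest stored index (5)
config = SegGptConfig(merge_index=2, intermediate_hidden_state_indices=[5, 11, 17, 23])

# raises ValueError: merge_index must be less than min(intermediate_hidden_state_indices)
try:
    SegGptConfig(merge_index=6, intermediate_hidden_state_indices=[5, 11, 17, 23])
except ValueError as err:
    print(err)
```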
docs/transformers/build/lib/transformers/models/seggpt/convert_seggpt_to_hf.py ADDED
@@ -0,0 +1,221 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert SegGPT checkpoints from the original repository.
16
+
17
+ URL: https://github.com/baaivision/Painter/tree/main/SegGPT
18
+ """
19
+
20
+ import argparse
21
+
22
+ import requests
23
+ import torch
24
+ from PIL import Image
25
+
26
+ from transformers import SegGptConfig, SegGptForImageSegmentation, SegGptImageProcessor
27
+ from transformers.utils import logging
28
+
29
+
30
+ logging.set_verbosity_info()
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ # here we list all keys to be renamed (original name on the left, our name on the right)
35
+ def create_rename_keys(config):
36
+ rename_keys = []
37
+
38
+ # fmt: off
39
+
40
+ # rename embedding and its parameters
41
+ rename_keys.append(("patch_embed.proj.weight", "model.embeddings.patch_embeddings.projection.weight"))
42
+ rename_keys.append(("patch_embed.proj.bias", "model.embeddings.patch_embeddings.projection.bias"))
43
+ rename_keys.append(("mask_token", "model.embeddings.mask_token"))
44
+ rename_keys.append(("segment_token_x", "model.embeddings.segment_token_input"))
45
+ rename_keys.append(("segment_token_y", "model.embeddings.segment_token_prompt"))
46
+ rename_keys.append(("type_token_cls", "model.embeddings.type_token_semantic"))
47
+ rename_keys.append(("type_token_ins", "model.embeddings.type_token_instance"))
48
+ rename_keys.append(("pos_embed", "model.embeddings.position_embeddings"))
49
+
50
+ # rename decoder and other
51
+ rename_keys.append(("norm.weight", "model.encoder.layernorm.weight"))
52
+ rename_keys.append(("norm.bias", "model.encoder.layernorm.bias"))
53
+ rename_keys.append(("decoder_embed.weight", "decoder.decoder_embed.weight"))
54
+ rename_keys.append(("decoder_embed.bias", "decoder.decoder_embed.bias"))
55
+ rename_keys.append(("decoder_pred.0.weight", "decoder.decoder_pred.conv.weight"))
56
+ rename_keys.append(("decoder_pred.0.bias", "decoder.decoder_pred.conv.bias"))
57
+ rename_keys.append(("decoder_pred.1.weight", "decoder.decoder_pred.layernorm.weight"))
58
+ rename_keys.append(("decoder_pred.1.bias", "decoder.decoder_pred.layernorm.bias"))
59
+ rename_keys.append(("decoder_pred.3.weight", "decoder.decoder_pred.head.weight"))
60
+ rename_keys.append(("decoder_pred.3.bias", "decoder.decoder_pred.head.bias"))
61
+
62
+ # rename blocks
63
+ for i in range(config.num_hidden_layers):
64
+ rename_keys.append((f"blocks.{i}.attn.qkv.weight", f"model.encoder.layers.{i}.attention.qkv.weight"))
65
+ rename_keys.append((f"blocks.{i}.attn.qkv.bias", f"model.encoder.layers.{i}.attention.qkv.bias"))
66
+ rename_keys.append((f"blocks.{i}.attn.proj.weight", f"model.encoder.layers.{i}.attention.proj.weight"))
67
+ rename_keys.append((f"blocks.{i}.attn.proj.bias", f"model.encoder.layers.{i}.attention.proj.bias"))
68
+ rename_keys.append((f"blocks.{i}.attn.rel_pos_h", f"model.encoder.layers.{i}.attention.rel_pos_h"))
69
+ rename_keys.append((f"blocks.{i}.attn.rel_pos_w", f"model.encoder.layers.{i}.attention.rel_pos_w"))
70
+
71
+ rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"model.encoder.layers.{i}.mlp.lin1.weight"))
72
+ rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"model.encoder.layers.{i}.mlp.lin1.bias"))
73
+ rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"model.encoder.layers.{i}.mlp.lin2.weight"))
74
+ rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"model.encoder.layers.{i}.mlp.lin2.bias"))
75
+
76
+ rename_keys.append((f"blocks.{i}.norm1.weight", f"model.encoder.layers.{i}.layernorm_before.weight"))
77
+ rename_keys.append((f"blocks.{i}.norm1.bias", f"model.encoder.layers.{i}.layernorm_before.bias"))
78
+ rename_keys.append((f"blocks.{i}.norm2.weight", f"model.encoder.layers.{i}.layernorm_after.weight"))
79
+ rename_keys.append((f"blocks.{i}.norm2.bias", f"model.encoder.layers.{i}.layernorm_after.bias"))
80
+
81
+ # fmt: on
82
+
83
+ return rename_keys
84
+
85
+
86
+ def rename_key(dct, old, new):
87
+ val = dct.pop(old)
88
+ dct[new] = val
89
+
90
+
91
+ # We will verify our results on spongebob images
92
+ def prepare_input():
93
+ image_input_url = (
94
+ "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
95
+ )
96
+ image_prompt_url = (
97
+ "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
98
+ )
99
+ mask_prompt_url = (
100
+ "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
101
+ )
102
+
103
+ image_input = Image.open(requests.get(image_input_url, stream=True).raw)
104
+ image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
105
+ mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw)
106
+
107
+ return image_input, image_prompt, mask_prompt
108
+
109
+
110
+ @torch.no_grad()
111
+ def convert_seggpt_checkpoint(args):
112
+ model_name = args.model_name
113
+ pytorch_dump_folder_path = args.pytorch_dump_folder_path
114
+ verify_logits = args.verify_logits
115
+ push_to_hub = args.push_to_hub
116
+
117
+ # Define default SegGPT configuration
118
+ config = SegGptConfig()
119
+
120
+ # Load original checkpoint
121
+ checkpoint_url = "https://huggingface.co/BAAI/SegGpt/resolve/main/seggpt_vit_large.pth"
122
+ original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
123
+
124
+ # Rename keys
125
+ new_state_dict = original_state_dict.copy()
126
+ rename_keys = create_rename_keys(config)
127
+
128
+ for src, dest in rename_keys:
129
+ rename_key(new_state_dict, src, dest)
130
+
131
+ # Load HF model
132
+ model = SegGptForImageSegmentation(config)
133
+ model.eval()
134
+ missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
135
+ print("Missing keys:", missing_keys)
136
+ print("Unexpected keys:", unexpected_keys)
137
+
138
+ input_img, prompt_img, prompt_mask = prepare_input()
139
+ image_processor = SegGptImageProcessor()
140
+ inputs = image_processor(images=input_img, prompt_images=prompt_img, prompt_masks=prompt_mask, return_tensors="pt")
141
+
142
+ expected_prompt_pixel_values = torch.tensor(
143
+ [
144
+ [[-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965]],
145
+ [[1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583]],
146
+ [[2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088]],
147
+ ]
148
+ )
149
+
150
+ expected_pixel_values = torch.tensor(
151
+ [
152
+ [[1.6324, 1.6153, 1.5810], [1.6153, 1.5982, 1.5810], [1.5810, 1.5639, 1.5639]],
153
+ [[1.2731, 1.2556, 1.2206], [1.2556, 1.2381, 1.2031], [1.2206, 1.2031, 1.1681]],
154
+ [[1.6465, 1.6465, 1.6465], [1.6465, 1.6465, 1.6465], [1.6291, 1.6291, 1.6291]],
155
+ ]
156
+ )
157
+
158
+ expected_prompt_masks = torch.tensor(
159
+ [
160
+ [[-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179]],
161
+ [[-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357]],
162
+ [[-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044]],
163
+ ]
164
+ )
165
+
166
+ assert torch.allclose(inputs.pixel_values[0, :, :3, :3], expected_pixel_values, atol=1e-4)
167
+ assert torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4)
168
+ assert torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4)
169
+
170
+ torch.manual_seed(2)
171
+ outputs = model(**inputs)
172
+ print(outputs)
173
+
174
+ if verify_logits:
175
+ expected_output = torch.tensor(
176
+ [
177
+ [[-2.1208, -2.1190, -2.1198], [-2.1237, -2.1228, -2.1227], [-2.1232, -2.1226, -2.1228]],
178
+ [[-2.0405, -2.0396, -2.0403], [-2.0434, -2.0434, -2.0433], [-2.0428, -2.0432, -2.0434]],
179
+ [[-1.8102, -1.8088, -1.8099], [-1.8131, -1.8126, -1.8129], [-1.8130, -1.8128, -1.8131]],
180
+ ]
181
+ )
182
+ assert torch.allclose(outputs.pred_masks[0, :, :3, :3], expected_output, atol=1e-4)
183
+ print("Looks good!")
184
+ else:
185
+ print("Converted without verifying logits")
186
+
187
+ if pytorch_dump_folder_path is not None:
188
+ print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}")
189
+ model.save_pretrained(pytorch_dump_folder_path)
190
+ image_processor.save_pretrained(pytorch_dump_folder_path)
191
+
192
+ if push_to_hub:
193
+ print(f"Pushing model and processor for {model_name} to hub")
194
+ model.push_to_hub(f"EduardoPacheco/{model_name}")
195
+ image_processor.push_to_hub(f"EduardoPacheco/{model_name}")
196
+
197
+
198
+ if __name__ == "__main__":
199
+ parser = argparse.ArgumentParser()
200
+ # Required parameters
201
+ parser.add_argument(
202
+ "--model_name",
203
+ default="seggpt-vit-large",
204
+ type=str,
205
+ choices=["seggpt-vit-large"],
206
+ help="Name of the SegGpt model you'd like to convert.",
207
+ )
208
+ parser.add_argument(
209
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
210
+ )
211
+ parser.add_argument(
212
+ "--verify_logits",
213
+ action="store_false",
214
+ help="Whether or not to verify the logits against the original implementation.",
215
+ )
216
+ parser.add_argument(
217
+ "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
218
+ )
219
+
220
+ args = parser.parse_args()
221
+ convert_seggpt_checkpoint(args)
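Usage note (not part of the upstream diff): a hedged sketch of driving the conversion entry point programmatically. The module import path is an assumption (it depends on where convert_seggpt_to_hf.py sits on your PYTHONPATH), and the checkpoint download requires network access; the command-line equivalent simply passes the argparse flags defined above.
from argparse import Namespace
# Hypothetical import path; adjust to wherever the script above is located.
from convert_seggpt_to_hf import convert_seggpt_checkpoint
# Mirrors: python convert_seggpt_to_hf.py --model_name seggpt-vit-large \
#              --pytorch_dump_folder_path ./seggpt-vit-large
# Note that --verify_logits is declared with store_false, so logits are verified by default.
args = Namespace(
    model_name="seggpt-vit-large",
    pytorch_dump_folder_path="./seggpt-vit-large",
    verify_logits=True,
    push_to_hub=False,
)
convert_seggpt_checkpoint(args)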
docs/transformers/build/lib/transformers/models/seggpt/image_processing_seggpt.py ADDED
@@ -0,0 +1,618 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for SegGPT."""
16
+
17
+ from typing import Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+
21
+ from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
22
+ from ...image_transforms import resize, to_channel_dimension_format
23
+ from ...image_utils import (
24
+ IMAGENET_DEFAULT_MEAN,
25
+ IMAGENET_DEFAULT_STD,
26
+ ChannelDimension,
27
+ ImageInput,
28
+ PILImageResampling,
29
+ infer_channel_dimension_format,
30
+ is_scaled_image,
31
+ make_list_of_images,
32
+ to_numpy_array,
33
+ valid_images,
34
+ )
35
+ from ...utils import TensorType, is_torch_available, is_vision_available, logging, requires_backends
36
+
37
+
38
+ if is_torch_available():
39
+ import torch
40
+
41
+ if is_vision_available():
42
+ pass
43
+
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+
48
+ # See https://arxiv.org/pdf/2212.02499.pdf at 3.1 Redefining Output Spaces as "Images" - Semantic Segmentation from PAINTER paper
49
+ # Taken from https://github.com/Abdullah-Meda/Painter/blob/main/Painter/data/coco_semseg/gen_color_coco_panoptic_segm.py#L31
50
+ def build_palette(num_labels: int) -> List[Tuple[int, int, int]]:
51
+ base = int(num_labels ** (1 / 3)) + 1
52
+ margin = 256 // base
53
+
54
+ # we assume that class_idx 0 is the background which is mapped to black
55
+ color_list = [(0, 0, 0)]
56
+ for location in range(num_labels):
57
+ num_seq_r = location // base**2
58
+ num_seq_g = (location % base**2) // base
59
+ num_seq_b = location % base
60
+
61
+ R = 255 - num_seq_r * margin
62
+ G = 255 - num_seq_g * margin
63
+ B = 255 - num_seq_b * margin
64
+
65
+ color_list.append((R, G, B))
66
+
67
+ return color_list
68
+
69
+
70
+ def mask_to_rgb(
71
+ mask: np.ndarray, palette: Optional[List[Tuple[int, int, int]]] = None, data_format: Optional[ChannelDimension] = None
72
+ ) -> np.ndarray:
73
+ data_format = data_format if data_format is not None else ChannelDimension.FIRST
74
+
75
+ if palette is not None:
76
+ height, width = mask.shape
77
+
78
+ rgb_mask = np.zeros((3, height, width), dtype=np.uint8)
79
+
80
+ classes_in_mask = np.unique(mask)
81
+
82
+ for class_idx in classes_in_mask:
83
+ rgb_value = palette[class_idx]
84
+ class_mask = (mask == class_idx).astype(np.uint8)
85
+ class_mask = np.expand_dims(class_mask, axis=-1)
86
+ class_rgb_mask = class_mask * np.array(rgb_value)
87
+ class_rgb_mask = np.moveaxis(class_rgb_mask, -1, 0)
88
+ rgb_mask += class_rgb_mask.astype(np.uint8)
89
+
90
+ rgb_mask = np.clip(rgb_mask, 0, 255).astype(np.uint8)
91
+
92
+ else:
93
+ rgb_mask = np.repeat(mask[None, ...], 3, axis=0)
94
+
95
+ return to_channel_dimension_format(rgb_mask, data_format)
96
+
97
+
98
+ class SegGptImageProcessor(BaseImageProcessor):
99
+ r"""
100
+ Constructs a SegGpt image processor.
101
+
102
+ Args:
103
+ do_resize (`bool`, *optional*, defaults to `True`):
104
+ Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
105
+ size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
106
+ size (`dict`, *optional*, defaults to `{"height": 448, "width": 448}`):
107
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
108
+ method.
109
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
110
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
111
+ `preprocess` method.
112
+ do_rescale (`bool`, *optional*, defaults to `True`):
113
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
114
+ parameter in the `preprocess` method.
115
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
116
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
117
+ `preprocess` method.
118
+ do_normalize (`bool`, *optional*, defaults to `True`):
119
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
120
+ method.
121
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
122
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
123
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
124
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
125
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
126
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
127
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
128
+ Whether to convert the prompt mask to RGB format. Can be overridden by the `do_convert_rgb` parameter in the
129
+ `preprocess` method.
130
+ """
131
+
132
+ model_input_names = ["pixel_values"]
133
+
134
+ def __init__(
135
+ self,
136
+ do_resize: bool = True,
137
+ size: Optional[Dict[str, int]] = None,
138
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
139
+ do_rescale: bool = True,
140
+ rescale_factor: Union[int, float] = 1 / 255,
141
+ do_normalize: bool = True,
142
+ image_mean: Optional[Union[float, List[float]]] = None,
143
+ image_std: Optional[Union[float, List[float]]] = None,
144
+ do_convert_rgb: bool = True,
145
+ **kwargs,
146
+ ) -> None:
147
+ super().__init__(**kwargs)
148
+ size = size if size is not None else {"height": 448, "width": 448}
149
+ size = get_size_dict(size)
150
+ self.do_resize = do_resize
151
+ self.do_rescale = do_rescale
152
+ self.do_normalize = do_normalize
153
+ self.size = size
154
+ self.resample = resample
155
+ self.rescale_factor = rescale_factor
156
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
157
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
158
+ self.do_convert_rgb = do_convert_rgb
159
+
160
+ def get_palette(self, num_labels: int) -> List[Tuple[int, int, int]]:
161
+ """Build a palette to map the prompt mask from a single channel to a 3 channel RGB.
162
+
163
+ Args:
164
+ num_labels (`int`):
165
+ Number of classes in the segmentation task (excluding the background).
166
+
167
+ Returns:
168
+ `List[Tuple[int, int, int]]`: Palette to map the prompt mask from a single channel to a 3 channel RGB.
169
+ """
170
+ return build_palette(num_labels)
171
+
172
+ def mask_to_rgb(
173
+ self,
174
+ image: np.ndarray,
175
+ palette: Optional[List[Tuple[int, int, int]]] = None,
176
+ data_format: Optional[Union[str, ChannelDimension]] = None,
177
+ ) -> np.ndarray:
178
+ """Converts a segmentation map to RGB format.
179
+
180
+ Args:
181
+ image (`np.ndarray`):
182
+ Segmentation map with dimensions (height, width) where pixel values represent the class index.
183
+ palette (`List[Tuple[int, int, int]]`, *optional*, defaults to `None`):
184
+ Palette to use to convert the mask to RGB format. If unset, the mask is duplicated across the channel
185
+ dimension.
186
+ data_format (`ChannelDimension` or `str`, *optional*):
187
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
188
+ image is used. Can be one of:
189
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
190
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
191
+
192
+ Returns:
193
+ `np.ndarray`: The mask in RGB format.
194
+ """
195
+ return mask_to_rgb(image, palette=palette, data_format=data_format)
196
+
197
+ # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
198
+ def resize(
199
+ self,
200
+ image: np.ndarray,
201
+ size: Dict[str, int],
202
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
203
+ data_format: Optional[Union[str, ChannelDimension]] = None,
204
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
205
+ **kwargs,
206
+ ) -> np.ndarray:
207
+ """
208
+ Resize an image to `(size["height"], size["width"])`.
209
+
210
+ Args:
211
+ image (`np.ndarray`):
212
+ Image to resize.
213
+ size (`Dict[str, int]`):
214
+ Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
215
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
216
+ `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
217
+ data_format (`ChannelDimension` or `str`, *optional*):
218
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
219
+ image is used. Can be one of:
220
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
221
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
222
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
223
+ input_data_format (`ChannelDimension` or `str`, *optional*):
224
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
225
+ from the input image. Can be one of:
226
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
227
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
228
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
229
+
230
+ Returns:
231
+ `np.ndarray`: The resized image.
232
+ """
233
+ size = get_size_dict(size)
234
+ if "height" not in size or "width" not in size:
235
+ raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
236
+ output_size = (size["height"], size["width"])
237
+ return resize(
238
+ image,
239
+ size=output_size,
240
+ resample=resample,
241
+ data_format=data_format,
242
+ input_data_format=input_data_format,
243
+ **kwargs,
244
+ )
245
+
246
+ def _preprocess_step(
247
+ self,
248
+ images: ImageInput,
249
+ do_resize: Optional[bool] = None,
250
+ size: Dict[str, int] = None,
251
+ resample: PILImageResampling = None,
252
+ do_rescale: Optional[bool] = None,
253
+ rescale_factor: Optional[float] = None,
254
+ do_normalize: Optional[bool] = None,
255
+ image_mean: Optional[Union[float, List[float]]] = None,
256
+ image_std: Optional[Union[float, List[float]]] = None,
257
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
258
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
259
+ do_convert_rgb: Optional[bool] = None,
260
+ num_labels: Optional[int] = None,
261
+ **kwargs,
262
+ ):
263
+ """
264
+ Preprocess an image or batch of images.
265
+
266
+ Args:
267
+ images (`ImageInput`):
268
+ Image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
269
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
270
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
271
+ Whether to resize the image.
272
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
273
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
274
+ resizing.
275
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
276
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has
277
+ an effect if `do_resize` is set to `True`.
278
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
279
+ Whether to rescale the image values between [0 - 1].
280
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
281
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
282
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
283
+ Whether to normalize the image.
284
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
285
+ Image mean to use if `do_normalize` is set to `True`.
286
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
287
+ Image standard deviation to use if `do_normalize` is set to `True`.
288
+ return_tensors (`str` or `TensorType`, *optional*):
289
+ The type of tensors to return. Can be one of:
290
+ - Unset: Return a list of `np.ndarray`.
291
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
292
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
293
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
294
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
295
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
296
+ The channel dimension format for the output image. Can be one of:
297
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
298
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
299
+ - Unset: Use the channel dimension format of the input image.
300
+ input_data_format (`ChannelDimension` or `str`, *optional*):
301
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
302
+ from the input image. Can be one of:
303
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
304
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
305
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
306
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
307
+ Whether to convert the prompt mask to RGB format. If `num_labels` is specified, a palette will be built
308
+ to map the prompt mask from a single channel to a 3 channel RGB. If unset, the prompt mask is duplicated
309
+ across the channel dimension. Must be set to `False` if the prompt mask is already in RGB format.
310
+ num_labels: (`int`, *optional*):
311
+ Number of classes in the segmentation task (excluding the background). If specified, a palette will be
312
+ built, assuming that class_idx 0 is the background, to map the prompt mask from a single class_idx
313
+ channel to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed
314
+ through as is if it is already in RGB format or being duplicated across the channel dimension.
315
+ """
316
+ do_resize = do_resize if do_resize is not None else self.do_resize
317
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
318
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
319
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
320
+ resample = resample if resample is not None else self.resample
321
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
322
+ image_mean = image_mean if image_mean is not None else self.image_mean
323
+ image_std = image_std if image_std is not None else self.image_std
324
+
325
+ size = size if size is not None else self.size
326
+ size_dict = get_size_dict(size)
327
+
328
+ # If segmentation map is passed we expect 2D images
329
+ images = make_list_of_images(images, expected_ndims=2 if do_convert_rgb else 3)
330
+
331
+ if not valid_images(images):
332
+ raise ValueError(
333
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
334
+ "torch.Tensor, tf.Tensor or jax.ndarray."
335
+ )
336
+
337
+ if do_resize and size is None:
338
+ raise ValueError("Size must be specified if do_resize is True.")
339
+
340
+ if do_rescale and rescale_factor is None:
341
+ raise ValueError("Rescale factor must be specified if do_rescale is True.")
342
+
343
+ if do_normalize and (image_mean is None or image_std is None):
344
+ raise ValueError("Image mean and std must be specified if do_normalize is True.")
345
+
346
+ # All transformations expect numpy arrays.
347
+ images = [to_numpy_array(image) for image in images]
348
+
349
+ if do_rescale and is_scaled_image(images[0]):
350
+ logger.warning_once(
351
+ "It looks like you are trying to rescale already rescaled images. If the input"
352
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
353
+ )
354
+
355
+ if input_data_format is None and not do_convert_rgb:
356
+ # We assume that all images have the same channel dimension format.
357
+ input_data_format = infer_channel_dimension_format(images[0])
358
+
359
+ if do_convert_rgb:
360
+ palette = self.get_palette(num_labels) if num_labels is not None else None
361
+ # Since this is the input for the next transformations, its format should be the same as the input_data_format
362
+ images = [
363
+ self.mask_to_rgb(image=image, palette=palette, data_format=ChannelDimension.FIRST) for image in images
364
+ ]
365
+ input_data_format = ChannelDimension.FIRST
366
+
367
+ if do_resize:
368
+ images = [
369
+ self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
370
+ for image in images
371
+ ]
372
+
373
+ if do_rescale:
374
+ images = [
375
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
376
+ for image in images
377
+ ]
378
+
379
+ if do_normalize:
380
+ images = [
381
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
382
+ for image in images
383
+ ]
384
+
385
+ images = [
386
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
387
+ ]
388
+
389
+ return images
390
+
391
+ def preprocess(
392
+ self,
393
+ images: Optional[ImageInput] = None,
394
+ prompt_images: Optional[ImageInput] = None,
395
+ prompt_masks: Optional[ImageInput] = None,
396
+ do_resize: Optional[bool] = None,
397
+ size: Dict[str, int] = None,
398
+ resample: PILImageResampling = None,
399
+ do_rescale: Optional[bool] = None,
400
+ rescale_factor: Optional[float] = None,
401
+ do_normalize: Optional[bool] = None,
402
+ image_mean: Optional[Union[float, List[float]]] = None,
403
+ image_std: Optional[Union[float, List[float]]] = None,
404
+ do_convert_rgb: Optional[bool] = None,
405
+ num_labels: Optional[int] = None,
406
+ return_tensors: Optional[Union[str, TensorType]] = None,
407
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
408
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
409
+ **kwargs,
410
+ ):
411
+ """
412
+ Preprocess an image or batch of images.
413
+
414
+ Args:
415
+ images (`ImageInput`):
416
+ Image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
417
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
418
+ prompt_images (`ImageInput`):
419
+ Prompt image to _preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
420
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
421
+ prompt_masks (`ImageInput`):
422
+ Prompt mask from the prompt image to _preprocess; it specifies the prompt_masks value in the preprocessed output.
423
+ Can either be in the format of segmentation maps (no channels) or RGB images. If in the format of
424
+ RGB images, `do_convert_rgb` should be set to `False`. If in the format of segmentation maps, specifying
425
+ `num_labels` is recommended to build a palette to map the prompt mask from a single channel to
426
+ a 3 channel RGB. If `num_labels` is not specified, the prompt mask will be duplicated across the channel
427
+ dimension.
428
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
429
+ Whether to resize the image.
430
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
431
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
432
+ resizing.
433
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
434
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has
435
+ an effect if `do_resize` is set to `True`. Doesn't apply to prompt mask as it is resized using nearest.
436
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
437
+ Whether to rescale the image values between [0 - 1].
438
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
439
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
440
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
441
+ Whether to normalize the image.
442
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
443
+ Image mean to use if `do_normalize` is set to `True`.
444
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
445
+ Image standard deviation to use if `do_normalize` is set to `True`.
446
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
447
+ Whether to convert the prompt mask to RGB format. If `num_labels` is specified, a palette will be built
448
+ to map the prompt mask from a single channel to a 3 channel RGB. If unset, the prompt mask is duplicated
449
+ across the channel dimension. Must be set to `False` if the prompt mask is already in RGB format.
450
+ num_labels: (`int`, *optional*):
451
+ Number of classes in the segmentation task (excluding the background). If specified, a palette will be
452
+ built, assuming that class_idx 0 is the background, to map the prompt mask from a plain segmentation map
453
+ with no channels to a 3 channel RGB. Not specifying this will result in the prompt mask either being passed
454
+ through as is if it is already in RGB format (if `do_convert_rgb` is false) or being duplicated
455
+ across the channel dimension.
456
+ return_tensors (`str` or `TensorType`, *optional*):
457
+ The type of tensors to return. Can be one of:
458
+ - Unset: Return a list of `np.ndarray`.
459
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
460
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
461
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
462
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
463
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
464
+ The channel dimension format for the output image. Can be one of:
465
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
466
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
467
+ - Unset: Use the channel dimension format of the input image.
468
+ input_data_format (`ChannelDimension` or `str`, *optional*):
469
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
470
+ from the input image. Can be one of:
471
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
472
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
473
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
474
+ """
475
+ if all(v is None for v in [images, prompt_images, prompt_masks]):
476
+ raise ValueError("At least one of images, prompt_images, prompt_masks must be specified.")
477
+
478
+ data = {}
479
+
480
+ if images is not None:
481
+ images = self._preprocess_step(
482
+ images,
483
+ is_mask=False,
484
+ do_resize=do_resize,
485
+ size=size,
486
+ resample=resample,
487
+ do_rescale=do_rescale,
488
+ rescale_factor=rescale_factor,
489
+ do_normalize=do_normalize,
490
+ image_mean=image_mean,
491
+ image_std=image_std,
492
+ do_convert_rgb=False,
493
+ data_format=data_format,
494
+ input_data_format=input_data_format,
495
+ **kwargs,
496
+ )
497
+
498
+ data["pixel_values"] = images
499
+
500
+ if prompt_images is not None:
501
+ prompt_images = self._preprocess_step(
502
+ prompt_images,
503
+ is_mask=False,
504
+ do_resize=do_resize,
505
+ size=size,
506
+ resample=resample,
507
+ do_rescale=do_rescale,
508
+ rescale_factor=rescale_factor,
509
+ do_normalize=do_normalize,
510
+ image_mean=image_mean,
511
+ image_std=image_std,
512
+ do_convert_rgb=False,
513
+ data_format=data_format,
514
+ input_data_format=input_data_format,
515
+ **kwargs,
516
+ )
517
+
518
+ data["prompt_pixel_values"] = prompt_images
519
+
520
+ if prompt_masks is not None:
521
+ prompt_masks = self._preprocess_step(
522
+ prompt_masks,
523
+ do_resize=do_resize,
524
+ size=size,
525
+ resample=PILImageResampling.NEAREST,
526
+ do_rescale=do_rescale,
527
+ rescale_factor=rescale_factor,
528
+ do_normalize=do_normalize,
529
+ image_mean=image_mean,
530
+ image_std=image_std,
531
+ do_convert_rgb=do_convert_rgb,
532
+ num_labels=num_labels,
533
+ data_format=data_format,
534
+ input_data_format=input_data_format,
535
+ **kwargs,
536
+ )
537
+
538
+ data["prompt_masks"] = prompt_masks
539
+
540
+ return BatchFeature(data=data, tensor_type=return_tensors)
541
+
542
+ def post_process_semantic_segmentation(
543
+ self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None, num_labels: Optional[int] = None
544
+ ):
545
+ """
546
+ Converts a [`SegGptImageSegmentationOutput`] into segmentation maps. Only supports
547
+ PyTorch.
548
+
549
+ Args:
550
+ outputs ([`SegGptImageSegmentationOutput`]):
551
+ Raw outputs of the model.
552
+ target_sizes (`List[Tuple[int, int]]`, *optional*):
553
+ List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
554
+ final size (height, width) of each prediction. If left to None, predictions will not be resized.
555
+ num_labels (`int`, *optional*):
556
+ Number of classes in the segmentation task (excluding the background). If specified, a palette will be
557
+ built, assuming that class_idx 0 is the background, to map prediction masks from RGB values to class
558
+ indices. This value should be the same used when preprocessing inputs.
559
+ Returns:
560
+ semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
561
+ segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
562
+ specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
563
+ """
564
+ requires_backends(self, ["torch"])
565
+ # batch_size x num_channels x 2*height x width
566
+ masks = outputs.pred_masks
567
+
568
+ # Predicted mask and prompt are concatenated in the height dimension
569
+ # batch_size x num_channels x height x width
570
+ masks = masks[:, :, masks.shape[2] // 2 :, :]
571
+
572
+ # To unnormalize we need to permute to channel last
573
+ # batch_size x height x width x num_channels
574
+ std = torch.tensor(self.image_std).to(masks.device)
575
+ mean = torch.tensor(self.image_mean).to(masks.device)
576
+
577
+ masks = masks.permute(0, 2, 3, 1) * std + mean
578
+
579
+ # batch_size x num_channels x height x width
580
+ masks = masks.permute(0, 3, 1, 2)
581
+
582
+ # Clip to match with palette if specified
583
+ masks = torch.clip(masks * 255, 0, 255)
584
+
585
+ semantic_segmentation = []
586
+ palette_tensor = None
587
+ palette = self.get_palette(num_labels) if num_labels is not None else None
588
+ if palette is not None:
589
+ palette_tensor = torch.tensor(palette).to(device=masks.device, dtype=torch.float)
590
+ _, num_channels, _, _ = masks.shape
591
+ palette_tensor = palette_tensor.view(1, 1, num_labels + 1, num_channels)
592
+
593
+ for idx, mask in enumerate(masks):
594
+ if target_sizes is not None:
595
+ mask = torch.nn.functional.interpolate(
596
+ mask.unsqueeze(0),
597
+ size=target_sizes[idx],
598
+ mode="nearest",
599
+ )[0]
600
+
601
+ if num_labels is not None:
602
+ channels, height, width = mask.shape
603
+ dist = mask.permute(1, 2, 0).view(height, width, 1, channels)
604
+ dist = dist - palette_tensor
605
+ dist = torch.pow(dist, 2)
606
+ dist = torch.sum(dist, dim=-1)
607
+ pred = dist.argmin(dim=-1)
608
+
609
+ else:
610
+ # If no palette is specified SegGpt will try to paint using the mask class idx as RGB
611
+ pred = mask.mean(dim=0).int()
612
+
613
+ semantic_segmentation.append(pred)
614
+
615
+ return semantic_segmentation
616
+
617
+
618
+ __all__ = ["SegGptImageProcessor"]
docs/transformers/build/lib/transformers/models/seggpt/modeling_seggpt.py ADDED
@@ -0,0 +1,1031 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch SegGpt model."""
16
+
17
+ import collections.abc
18
+ from dataclasses import dataclass
19
+ from typing import Dict, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.nn import functional as F
25
+
26
+ from ...activations import ACT2FN
27
+ from ...modeling_utils import PreTrainedModel
28
+ from ...utils import (
29
+ ModelOutput,
30
+ add_start_docstrings,
31
+ add_start_docstrings_to_model_forward,
32
+ logging,
33
+ replace_return_docstrings,
34
+ torch_int,
35
+ )
36
+ from .configuration_seggpt import SegGptConfig
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+ # General docstring
42
+ _CONFIG_FOR_DOC = "SegGptConfig"
43
+
44
+ # Base docstring
45
+ _CHECKPOINT_FOR_DOC = "BAAI/seggpt-vit-large"
46
+ _EXPECTED_OUTPUT_SHAPE = [3, 896, 448]
47
+
48
+
49
+ @dataclass
50
+ class SegGptEncoderOutput(ModelOutput):
51
+ """
52
+ Output type of [`SegGptEncoderOutput`].
53
+ Args:
54
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
55
+ Sequence of hidden-states at the output of the last layer of the model.
56
+ hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
57
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
58
+ of shape `(batch_size, patch_height, patch_width, hidden_size)`.
59
+ attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
60
+ Tuple of *torch.FloatTensor* (one for each layer) of shape
61
+ `(batch_size, num_heads, seq_len, seq_len)`.
62
+ intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
63
+ Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
64
+ Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
65
+ Additionally, each feature passes through a LayerNorm.
66
+ """
67
+
68
+ last_hidden_state: torch.FloatTensor
69
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
70
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
71
+ intermediate_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
72
+
73
+
74
+ @dataclass
75
+ class SegGptImageSegmentationOutput(ModelOutput):
76
+ """
77
+ Output type of [`SegGptImageSegmentationOutput`].
78
+
79
+ Args:
80
+ loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
81
+ The loss value.
82
+ pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
83
+ The predicted masks.
84
+ hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
85
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
86
+ of shape `(batch_size, patch_height, patch_width, hidden_size)`.
87
+ attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
88
+ Tuple of `torch.FloatTensor` (one for each layer) of shape
89
+ `(batch_size, num_heads, seq_len, seq_len)`.
90
+ """
91
+
92
+ loss: Optional[torch.FloatTensor] = None
93
+ pred_masks: Optional[torch.FloatTensor] = None
94
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
95
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
96
+
97
+
98
+ # Copied from transformers.models.sam.modeling_sam.SamPatchEmbeddings with Sam->SegGpt
99
+ class SegGptPatchEmbeddings(nn.Module):
100
+ """
101
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
102
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
103
+ Transformer.
104
+ """
105
+
106
+ def __init__(self, config):
107
+ super().__init__()
108
+ image_size, patch_size = config.image_size, config.patch_size
109
+ num_channels, hidden_size = config.num_channels, config.hidden_size
110
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
111
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
112
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
113
+ self.image_size = image_size
114
+ self.patch_size = patch_size
115
+ self.num_channels = num_channels
116
+ self.num_patches = num_patches
117
+
118
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
119
+
120
+ def forward(self, pixel_values):
121
+ batch_size, num_channels, height, width = pixel_values.shape
122
+ if num_channels != self.num_channels:
123
+ raise ValueError(
124
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
125
+ )
126
+ if height != self.image_size[0] or width != self.image_size[1]:
127
+ raise ValueError(
128
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
129
+ )
130
+ embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
131
+ return embeddings
132
+
133
+
134
+ class SegGptEmbeddings(nn.Module):
135
+ """
136
+ Construct the embeddings from patch, position embeddings for input and prompt.
137
+ """
138
+
139
+ def __init__(self, config: SegGptConfig) -> None:
140
+ super().__init__()
141
+
142
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
143
+ self.segment_token_input = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
144
+ self.segment_token_prompt = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
145
+ # token for seg types
146
+ self.type_token_semantic = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
147
+ self.type_token_instance = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
148
+
149
+ self.patch_embeddings = SegGptPatchEmbeddings(config)
150
+
151
+ num_positions = (config.pretrain_image_size // config.patch_size) ** 2 + 1
152
+ self.position_embeddings = nn.Parameter(torch.randn(1, num_positions, config.hidden_size))
153
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
154
+
155
+ def interpolate_pos_encoding(self, height: int, width: int) -> torch.Tensor:
156
+ patch_pos_embed = self.position_embeddings[:, 1:]
157
+ num_patches = patch_pos_embed.shape[1]
158
+ pretrain_patch_size = torch_int(num_patches**0.5)
159
+
160
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
161
+ if torch.jit.is_tracing() or pretrain_patch_size != height or pretrain_patch_size != width:
162
+ patch_pos_embed = F.interpolate(
163
+ patch_pos_embed.reshape(1, pretrain_patch_size, pretrain_patch_size, -1).permute(0, 3, 1, 2),
164
+ size=(height, width),
165
+ mode="bicubic",
166
+ align_corners=False,
167
+ )
168
+
169
+ return patch_pos_embed.permute(0, 2, 3, 1)
170
+ else:
171
+ return patch_pos_embed.reshape(1, height, width, -1)
172
+
173
+ def forward(
174
+ self,
175
+ pixel_values: torch.Tensor,
176
+ prompt_pixel_values: torch.Tensor,
177
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
178
+ embedding_type: Optional[str] = None,
179
+ ) -> torch.Tensor:
180
+ input_embeddings = self.patch_embeddings(pixel_values)
181
+ prompt_embeddings = self.patch_embeddings(prompt_pixel_values)
182
+
183
+ batch_size, patch_height, patch_width, _ = input_embeddings.shape
184
+
185
+ mask_token = self.mask_token.expand(batch_size, patch_height, patch_width, -1)
186
+ # replace the masked visual tokens by mask_token
187
+ w = bool_masked_pos.unsqueeze(-1).type_as(mask_token).reshape(-1, patch_height, patch_width, 1)
188
+ prompt_embeddings = prompt_embeddings * (1 - w) + mask_token * w
189
+
190
+ embedding_type = embedding_type if embedding_type is not None else "instance"
191
+
192
+ # add positional encoding to each token
193
+ pos_embed = self.interpolate_pos_encoding(patch_height, patch_width)
194
+
195
+ # add segment token
196
+ input_embeddings = input_embeddings + self.segment_token_input
197
+ prompt_embeddings = prompt_embeddings + self.segment_token_prompt
198
+
199
+ # add position embedding skipping CLS
200
+ input_embeddings = input_embeddings + pos_embed
201
+ prompt_embeddings = prompt_embeddings + pos_embed
202
+
203
+ # add type embedding to each token
204
+ if embedding_type == "semantic":
205
+ type_embedding = self.type_token_semantic
206
+ elif embedding_type == "instance":
207
+ type_embedding = self.type_token_instance
208
+ else:
209
+ raise ValueError(f"Embedding type should be either 'semantic' or 'instance', but got {embedding_type}")
210
+
211
+ input_embeddings = input_embeddings + type_embedding
212
+ prompt_embeddings = prompt_embeddings + type_embedding
213
+
214
+ embeddings = torch.cat((input_embeddings, prompt_embeddings), dim=0)
215
+
216
+ return embeddings
217
+
218
+
219
+ class SegGptAttention(nn.Module):
220
+ """Multi-head Attention block with relative position embeddings."""
221
+
222
+ def __init__(self, config):
223
+ super().__init__()
224
+ image_size, patch_size = config.image_size, config.patch_size
225
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
226
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
227
+
228
+ input_size = (image_size[0] // config.patch_size, image_size[1] // config.patch_size)
229
+ head_dim = config.hidden_size // config.num_attention_heads
230
+
231
+ self.num_attention_heads = config.num_attention_heads
232
+ self.scale = head_dim**-0.5
233
+
234
+ self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
235
+ self.proj = nn.Linear(config.hidden_size, config.hidden_size)
236
+
237
+ self.use_relative_position_embeddings = config.use_relative_position_embeddings
238
+ if self.use_relative_position_embeddings:
239
+ if input_size is None:
240
+ raise ValueError("Input size must be provided if using relative positional encoding.")
241
+
242
+ # initialize relative positional embeddings
243
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
244
+ self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
245
+
246
+ def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
247
+ """
248
+ Get relative positional embeddings according to the relative positions of
249
+ query and key sizes.
250
+
251
+ Args:
252
+ q_size (int):
253
+ size of the query.
254
+ k_size (int):
255
+ size of key k.
256
+ rel_pos (`torch.Tensor`):
257
+ relative position embeddings (L, channel).
258
+
259
+ Returns:
260
+ Extracted positional embeddings according to relative positions.
261
+ """
262
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
263
+ # Interpolate rel pos.
264
+ rel_pos_resized = F.interpolate(
265
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
266
+ size=max_rel_dist,
267
+ mode="linear",
268
+ )
269
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
270
+
271
+ # Scale the coords with short length if shapes for q and k are different.
272
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
273
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
274
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
275
+
276
+ return rel_pos_resized[relative_coords.long()]
277
+
278
+ def add_decomposed_rel_pos(
279
+ self,
280
+ attn: torch.Tensor,
281
+ query: torch.Tensor,
282
+ rel_pos_h: torch.Tensor,
283
+ rel_pos_w: torch.Tensor,
284
+ q_size: Tuple[int, int],
285
+ k_size: Tuple[int, int],
286
+ ) -> torch.Tensor:
287
+ """
288
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
289
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
290
+
291
+ Args:
292
+ attn (`torch.Tensor`):
293
+ attention map.
294
+ query (`torch.Tensor`):
295
+ query q in the attention layer with shape (batch_size, query_height * query_width, channel).
296
+ rel_pos_h (`torch.Tensor`):
297
+ relative position embeddings (Lh, channel) for height axis.
298
+ rel_pos_w (`torch.Tensor`):
299
+ relative position embeddings (Lw, channel) for width axis.
300
+ q_size (tuple):
301
+ spatial sequence size of query q with (query_height, query_width).
302
+ k_size (tuple):
303
+ spatial sequence size of key k with (key_height, key_width).
304
+
305
+ Returns:
306
+ attn (`torch.Tensor`):
307
+ attention map with added relative positional embeddings.
308
+ """
309
+ query_height, query_width = q_size
310
+ key_height, key_width = k_size
311
+ relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
312
+ relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
313
+
314
+ batch_size, _, dim = query.shape
315
+ reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
316
+ rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
317
+ rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
318
+ attn = attn.reshape(batch_size, query_height, query_width, key_height, key_width)
319
+ attn = attn + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
320
+ attn = attn.reshape(batch_size, query_height * query_width, key_height * key_width)
321
+ return attn
322
+
323
+ def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor:
324
+ batch_size, height, width, _ = hidden_states.shape
325
+ # qkv with shape (3, batch_size, nHead, height * width, channel)
326
+ qkv = (
327
+ self.qkv(hidden_states)
328
+ .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
329
+ .permute(2, 0, 3, 1, 4)
330
+ )
331
+ # q, k, v with shape (batch_size * nHead, height * width, channel)
332
+ query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
333
+
334
+ attn_weights = (query * self.scale) @ key.transpose(-2, -1)
335
+
336
+ if self.use_relative_position_embeddings:
337
+ attn_weights = self.add_decomposed_rel_pos(
338
+ attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
339
+ )
340
+
341
+ attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
342
+
343
+ if output_attentions:
344
+ # this operation is a bit awkward, but it's required to
345
+ # make sure that attn_weights keeps its gradient.
346
+ # In order to do so, attn_weights have to reshaped
347
+ # twice and have to be reused in the following
348
+ attn_weights_reshaped = attn_weights.view(batch_size, self.num_attention_heads, height * width, -1)
349
+ attn_weights = attn_weights_reshaped.view(batch_size * self.num_attention_heads, height * width, -1)
350
+ else:
351
+ attn_weights_reshaped = None
352
+
353
+ attn_output = (attn_weights @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
354
+ attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
355
+
356
+ attn_output = self.proj(attn_output)
357
+
358
+ return (attn_output, attn_weights_reshaped)
359
+
360
+
361
+ # Copied from transformers.models.sam.modeling_sam.SamMLPBlock with SamMLPBlock->SegGptMlp
362
+ class SegGptMlp(nn.Module):
363
+ def __init__(self, config):
364
+ super().__init__()
365
+ self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
366
+ self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
367
+ self.act = ACT2FN[config.hidden_act]
368
+
369
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
370
+ hidden_states = self.lin1(hidden_states)
371
+ hidden_states = self.act(hidden_states)
372
+ hidden_states = self.lin2(hidden_states)
373
+ return hidden_states
374
+
375
+
376
+ # Copied from transformers.models.beit.modeling_beit.drop_path
377
+ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
378
+ """
379
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
380
+
381
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
382
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
383
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
384
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
385
+ argument.
386
+ """
387
+ if drop_prob == 0.0 or not training:
388
+ return input
389
+ keep_prob = 1 - drop_prob
390
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
391
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
392
+ random_tensor.floor_() # binarize
393
+ output = input.div(keep_prob) * random_tensor
394
+ return output
395
+
396
+
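# Minimal illustration of the stochastic-depth behaviour of drop_path above: it is the
# identity at inference time, while in training each sample is either zeroed (with
# probability drop_prob) or rescaled by 1 / keep_prob so the expected value is preserved.
import torch

x = torch.ones(4, 3)
assert torch.equal(drop_path(x, drop_prob=0.5, training=False), x)  # identity in eval mode

torch.manual_seed(0)
out = drop_path(x, drop_prob=0.5, training=True)
print(out)  # each row is either all zeros or all 2.0 (= 1 / keep_prob)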
397
+ # Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->SegGpt
398
+ class SegGptDropPath(nn.Module):
399
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
400
+
401
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
402
+ super().__init__()
403
+ self.drop_prob = drop_prob
404
+
405
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
406
+ return drop_path(hidden_states, self.drop_prob, self.training)
407
+
408
+ def extra_repr(self) -> str:
409
+ return "p={}".format(self.drop_prob)
410
+
411
+
412
+ class SegGptLayer(nn.Module):
413
+ def __init__(self, config: SegGptConfig, drop_path_rate: float) -> None:
414
+ super().__init__()
415
+ self.attention = SegGptAttention(config)
416
+ self.mlp = SegGptMlp(config)
417
+ self.drop_path = SegGptDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
418
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
419
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
420
+
421
+ def forward(
422
+ self,
423
+ hidden_states: torch.Tensor,
424
+ ensemble_cond: int,
425
+ feature_ensemble: bool = False,
426
+ output_attentions: bool = False,
427
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
428
+ self_attention_outputs = self.attention(
429
+ self.layernorm_before(hidden_states), # in SegGpt, layernorm is applied before self-attention
430
+ output_attentions=output_attentions,
431
+ )
432
+ attention_output = self_attention_outputs[0]
433
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
434
+
435
+ if feature_ensemble and attention_output.shape[0] // 2 >= ensemble_cond:
436
+ prompt, inputs = attention_output.split(attention_output.shape[1] // 2, dim=1)
437
+ if ensemble_cond == 2:
438
+ num_prompts = attention_output.shape[0] // 2
439
+ inputs = inputs.reshape(2, num_prompts, -1)
440
+ inputs = inputs.mean(dim=1, keepdim=True).expand_as(inputs)
441
+ inputs = inputs.reshape(*prompt.shape)
442
+ else:
443
+ inputs = inputs.mean(dim=0, keepdim=True).expand_as(inputs)
444
+ attention_output = torch.cat([prompt, inputs], dim=1)
445
+
446
+ # first residual connection
447
+ hidden_states = self.drop_path(attention_output) + hidden_states
448
+ residual = hidden_states
449
+
450
+ hidden_states = self.layernorm_after(hidden_states)
451
+ hidden_states = self.mlp(hidden_states)
452
+ hidden_states = residual + self.drop_path(hidden_states)
453
+
454
+ outputs = (hidden_states,) + outputs
455
+
456
+ return outputs
457
+
458
+
459
+ class SegGptEncoder(nn.Module):
460
+ def __init__(self, config: SegGptConfig) -> None:
461
+ super().__init__()
462
+ self.config = config
463
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
464
+ self.layers = nn.ModuleList([SegGptLayer(config, dpr[i]) for i in range(config.num_hidden_layers)])
465
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
466
+ self.gradient_checkpointing = False
467
+
468
+ def forward(
469
+ self,
470
+ hidden_states: torch.Tensor,
471
+ feature_ensemble: bool = False,
472
+ output_attentions: bool = False,
473
+ output_hidden_states: bool = False,
474
+ return_dict: bool = True,
475
+ ) -> Union[tuple, SegGptEncoderOutput]:
476
+ all_hidden_states = () if output_hidden_states else None
477
+ all_self_attentions = () if output_attentions else None
478
+ intermediate_hidden_states = []
479
+
480
+ for i, layer_module in enumerate(self.layers):
481
+ if output_hidden_states:
482
+ all_hidden_states = all_hidden_states + (hidden_states,)
483
+
484
+ # Condition to check if we have the appropriate number of prompts to ensemble
485
+ ensemble_cond = 2 if self.config.merge_index > i else 1
486
+
487
+ if self.gradient_checkpointing and self.training:
488
+ layer_outputs = self._gradient_checkpointing_func(
489
+ layer_module.__call__,
490
+ hidden_states,
491
+ ensemble_cond,
492
+ feature_ensemble,
493
+ output_attentions,
494
+ )
495
+ else:
496
+ layer_outputs = layer_module(hidden_states, ensemble_cond, feature_ensemble, output_attentions)
497
+
498
+ hidden_states = layer_outputs[0]
499
+
500
+ if i == self.config.merge_index:
501
+ hidden_states = (
502
+ hidden_states[: hidden_states.shape[0] // 2] + hidden_states[hidden_states.shape[0] // 2 :]
503
+ ) * 0.5
504
+
505
+ if i in self.config.intermediate_hidden_state_indices:
506
+ intermediate_hidden_states.append(self.layernorm(hidden_states))
507
+
508
+ if output_attentions:
509
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
510
+
511
+ if output_hidden_states:
512
+ all_hidden_states = all_hidden_states + (hidden_states,)
513
+
514
+ if not return_dict:
515
+ return tuple(
516
+ v
517
+ for v in [hidden_states, all_hidden_states, all_self_attentions, intermediate_hidden_states]
518
+ if v is not None
519
+ )
520
+ return SegGptEncoderOutput(
521
+ last_hidden_state=hidden_states,
522
+ hidden_states=all_hidden_states,
523
+ attentions=all_self_attentions,
524
+ intermediate_hidden_states=intermediate_hidden_states,
525
+ )
526
+
527
+
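# Sketch of the batch-merging step applied at `config.merge_index` in SegGptEncoder above:
# the two halves of the batch dimension are averaged element-wise into a single half-sized
# batch. Sizes below are illustrative only.
import torch

hidden_states = torch.arange(4 * 2 * 3, dtype=torch.float32).reshape(4, 2, 3)
merged = (hidden_states[: hidden_states.shape[0] // 2] + hidden_states[hidden_states.shape[0] // 2 :]) * 0.5
print(merged.shape)  # torch.Size([2, 2, 3])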
528
+ # Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->SegGpt
529
+ class SegGptLayerNorm(nn.Module):
530
+ r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
531
+ The `data_format` argument specifies the ordering of the dimensions in the inputs: channels_last corresponds to
+ inputs with shape (batch_size, height, width, channels) while channels_first corresponds to inputs with shape
+ (batch_size, channels, height, width).
533
+ """
534
+
535
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
536
+ super().__init__()
537
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
538
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
539
+ self.eps = eps
540
+ self.data_format = data_format
541
+ if self.data_format not in ["channels_last", "channels_first"]:
542
+ raise NotImplementedError(f"Unsupported data format: {self.data_format}")
543
+ self.normalized_shape = (normalized_shape,)
544
+
545
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
546
+ if self.data_format == "channels_last":
547
+ x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
548
+ elif self.data_format == "channels_first":
549
+ input_dtype = x.dtype
550
+ x = x.float()
551
+ u = x.mean(1, keepdim=True)
552
+ s = (x - u).pow(2).mean(1, keepdim=True)
553
+ x = (x - u) / torch.sqrt(s + self.eps)
554
+ x = x.to(dtype=input_dtype)
555
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
556
+ return x
557
+
558
+
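# Sketch: the "channels_first" branch of SegGptLayerNorm above matches (up to float
# precision) a standard layer_norm over the channel axis after permuting to channels_last.
import torch

ln = SegGptLayerNorm(normalized_shape=8, data_format="channels_first")
x = torch.randn(2, 8, 4, 4)  # (batch, channels, height, width)
reference = torch.nn.functional.layer_norm(
    x.permute(0, 2, 3, 1), (8,), ln.weight, ln.bias, ln.eps
).permute(0, 3, 1, 2)
print(torch.allclose(ln(x), reference, atol=1e-5))  # True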
559
+ class SegGptDecoderHead(nn.Module):
560
+ def __init__(self, config):
561
+ super().__init__()
562
+ self.conv = nn.Conv2d(
563
+ config.decoder_hidden_size,
564
+ config.decoder_hidden_size,
565
+ kernel_size=3,
566
+ padding=1,
567
+ )
568
+ self.layernorm = SegGptLayerNorm(
569
+ normalized_shape=config.decoder_hidden_size, eps=config.layer_norm_eps, data_format="channels_first"
570
+ )
571
+ self.act_fct = ACT2FN[config.hidden_act]
572
+ self.head = nn.Conv2d(config.decoder_hidden_size, 3, kernel_size=1, bias=True) # decoder to patch
573
+
574
+ def forward(self, hidden_states: torch.FloatTensor):
575
+ hidden_states = self.conv(hidden_states)
576
+ hidden_states = self.layernorm(hidden_states)
577
+ hidden_states = self.act_fct(hidden_states)
578
+ hidden_states = self.head(hidden_states)
579
+
580
+ return hidden_states
581
+
582
+
583
+ class SegGptDecoder(nn.Module):
584
+ def __init__(self, config):
585
+ super().__init__()
586
+ self.decoder_embed = nn.Linear(
587
+ config.hidden_size * len(config.intermediate_hidden_state_indices),
588
+ config.patch_size**2 * config.decoder_hidden_size,
589
+ bias=True,
590
+ )
591
+ self.decoder_pred = SegGptDecoderHead(config)
592
+ self.patch_size = config.patch_size
593
+ self.decoder_hidden_size = config.decoder_hidden_size
594
+ self.config = config
595
+
596
+ def _reshape_hidden_states(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
597
+ batch_size, patch_height, patch_width, _ = hidden_states.shape
598
+ hidden_states = hidden_states.reshape(
599
+ batch_size, patch_height, patch_width, self.patch_size, self.patch_size, self.decoder_hidden_size
600
+ )
601
+ hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
602
+ hidden_states = hidden_states.reshape(
603
+ shape=(batch_size, -1, patch_height * self.patch_size, patch_width * self.patch_size)
604
+ )
605
+
606
+ return hidden_states
607
+
608
+ def forward(self, hidden_states: torch.FloatTensor):
609
+ hidden_states = self.decoder_embed(hidden_states)
610
+ hidden_states = self._reshape_hidden_states(hidden_states)
611
+ hidden_states = self.decoder_pred(hidden_states)
612
+
613
+ return hidden_states
614
+
615
+
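# Sketch of SegGptDecoder._reshape_hidden_states above with concrete (made-up) sizes: each
# spatial token's feature vector is unfolded into a patch_size x patch_size block of pixels.
import torch

batch, patch_h, patch_w, patch_size, decoder_dim = 1, 6, 3, 4, 8
x = torch.randn(batch, patch_h, patch_w, patch_size**2 * decoder_dim)
x = x.reshape(batch, patch_h, patch_w, patch_size, patch_size, decoder_dim)
x = x.permute(0, 5, 1, 3, 2, 4)
x = x.reshape(batch, -1, patch_h * patch_size, patch_w * patch_size)
print(x.shape)  # torch.Size([1, 8, 24, 12]), i.e. (batch, decoder_hidden_size, height, width)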
616
+ class SegGptPreTrainedModel(PreTrainedModel):
617
+ """
618
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
619
+ models.
620
+ """
621
+
622
+ config_class = SegGptConfig
623
+ base_model_prefix = "model"
624
+ main_input_name = "pixel_values"
625
+ supports_gradient_checkpointing = True
626
+ _no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]
627
+
628
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
629
+ """Initialize the weights"""
630
+ std = self.config.initializer_range
631
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
632
+ # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
633
+ # `trunc_normal_cpu` not implemented in `half` issues
634
+ module.weight.data = nn.init.trunc_normal_(module.weight.data.to(torch.float32), mean=0.0, std=std).to(
635
+ module.weight.dtype
636
+ )
637
+ if module.bias is not None:
638
+ module.bias.data.zero_()
639
+ elif isinstance(module, nn.LayerNorm):
640
+ module.bias.data.zero_()
641
+ module.weight.data.fill_(1.0)
642
+ elif isinstance(module, SegGptAttention):
643
+ module.rel_pos_h.data = nn.init.trunc_normal_(
644
+ module.rel_pos_h.data.to(torch.float32),
645
+ mean=0.0,
646
+ std=std,
647
+ ).to(module.rel_pos_h.dtype)
648
+
649
+ module.rel_pos_w.data = nn.init.trunc_normal_(
650
+ module.rel_pos_w.data.to(torch.float32),
651
+ mean=0.0,
652
+ std=std,
653
+ ).to(module.rel_pos_w.dtype)
654
+
655
+ elif isinstance(module, SegGptEmbeddings):
656
+ module.position_embeddings.data = nn.init.trunc_normal_(
657
+ module.position_embeddings.data.to(torch.float32),
658
+ mean=0.0,
659
+ std=std,
660
+ ).to(module.position_embeddings.dtype)
661
+
662
+ torch.nn.init.normal_(module.mask_token, std=std)
663
+ torch.nn.init.normal_(module.segment_token_input, std=std)
664
+ torch.nn.init.normal_(module.segment_token_prompt, std=std)
665
+ torch.nn.init.normal_(module.type_token_semantic, std=std)
666
+ torch.nn.init.normal_(module.type_token_instance, std=std)
667
+
668
+
669
+ SEGGPT_START_DOCSTRING = r"""
670
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
671
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
672
+ behavior.
673
+
674
+ Parameters:
675
+ config ([`SegGptConfig`]): Model configuration class with all the parameters of the model.
676
+ Initializing with a config file does not load the weights associated with the model, only the
677
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
678
+ """
679
+
680
+ SEGGPT_INPUTS_DOCSTRING = r"""
681
+ Args:
682
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
683
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`]
684
+ for details.
685
+
686
+ prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
687
+ Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
688
+ [`SegGptImageProcessor.__call__`] for details.
689
+
690
+ prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
691
+ Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
692
+ details.
693
+
694
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
695
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
696
+
697
+ feature_ensemble (`bool`, *optional*):
698
+ Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
699
+ if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
700
+ be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
701
+
702
+ embedding_type (`str`, *optional*):
703
+ Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
704
+ instance or semantic.
705
+
706
+ output_attentions (`bool`, *optional*):
707
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
708
+ tensors for more detail.
709
+ output_hidden_states (`bool`, *optional*):
710
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
711
+ more detail.
712
+ return_dict (`bool`, *optional*):
713
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
714
+ """
715
+
716
+
717
+ @add_start_docstrings(
718
+ "The bare SegGpt Model transformer outputting raw hidden-states without any specific head on top.",
719
+ SEGGPT_START_DOCSTRING,
720
+ )
721
+ class SegGptModel(SegGptPreTrainedModel):
722
+ def __init__(self, config: SegGptConfig):
723
+ super().__init__(config)
724
+ self.config = config
725
+
726
+ self.embeddings = SegGptEmbeddings(config)
727
+ self.encoder = SegGptEncoder(config)
728
+
729
+ # Initialize weights and apply final processing
730
+ self.post_init()
731
+
732
+ def get_input_embeddings(self) -> SegGptPatchEmbeddings:
733
+ return self.embeddings.patch_embeddings
734
+
735
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
736
+ """
737
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
738
+ class PreTrainedModel
739
+ """
740
+ for layer, heads in heads_to_prune.items():
741
+ self.encoder.layer[layer].attention.prune_heads(heads)
742
+
743
+ @add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
744
+ @replace_return_docstrings(output_type=SegGptEncoderOutput, config_class=_CONFIG_FOR_DOC)
745
+ def forward(
746
+ self,
747
+ pixel_values: torch.Tensor,
748
+ prompt_pixel_values: torch.Tensor,
749
+ prompt_masks: torch.Tensor,
750
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
751
+ feature_ensemble: Optional[bool] = None,
752
+ embedding_type: Optional[str] = None,
753
+ labels: Optional[torch.FloatTensor] = None,
754
+ output_attentions: Optional[bool] = None,
755
+ output_hidden_states: Optional[bool] = None,
756
+ return_dict: Optional[bool] = None,
757
+ ) -> Union[Tuple, SegGptEncoderOutput]:
758
+ r"""
759
+ labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
760
+ Ground truth mask for input images.
761
+
762
+ Returns:
763
+
764
+ Examples:
765
+
766
+ ```python
767
+ >>> from transformers import SegGptImageProcessor, SegGptModel
768
+ >>> from PIL import Image
769
+ >>> import requests
770
+
771
+ >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
772
+ >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
773
+ >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
774
+
775
+ >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
776
+ >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
777
+ >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")
778
+
779
+ >>> checkpoint = "BAAI/seggpt-vit-large"
780
+ >>> model = SegGptModel.from_pretrained(checkpoint)
781
+ >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)
782
+
783
+ >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
784
+
785
+ >>> outputs = model(**inputs)
786
+ >>> list(outputs.last_hidden_state.shape)
787
+ [1, 56, 28, 1024]
788
+ ```
789
+ """
790
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
791
+ output_hidden_states = (
792
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
793
+ )
794
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
795
+ feature_ensemble = feature_ensemble if feature_ensemble is not None else False
796
+
797
+ expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
798
+ pixel_values = pixel_values.to(expected_dtype)
799
+ prompt_pixel_values = prompt_pixel_values.to(expected_dtype)
800
+
801
+ # Prepare inputs
802
+ pixel_values = torch.cat((prompt_pixel_values, pixel_values), dim=2)
803
+ prompt_pixel_values = (
804
+ torch.cat((prompt_masks, prompt_masks), dim=2)
805
+ if labels is None
806
+ else torch.cat((prompt_masks, labels), dim=2)
807
+ )
808
+
809
+ if bool_masked_pos is None and labels is not None:
810
+ logger.warning_once(
811
+ "Labels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos."
812
+ )
813
+
814
+ # We concatenate along the height axis so SegGPT can handle the pair as a single image, hence we need to mask
+ # the portion of the mask-prompt pixels that is destined for the prediction, as they don't add any information.
+ # This is only the case for inference. In training, the concatenation of prompt mask and label is masked
+ # and reconstructed together (In-Context Painting).
818
+ if bool_masked_pos is None:
819
+ num_patches = self.embeddings.patch_embeddings.num_patches
820
+ bool_masked_pos_zeros = torch.zeros(num_patches // 2, dtype=torch.bool, device=pixel_values.device)
821
+ bool_masked_pos_ones = torch.ones(
822
+ num_patches - num_patches // 2, dtype=torch.bool, device=pixel_values.device
823
+ )
824
+ bool_masked_pos = torch.cat([bool_masked_pos_zeros, bool_masked_pos_ones])
825
+ bool_masked_pos = bool_masked_pos.unsqueeze(0)
826
+
827
+ embedding_output = self.embeddings(
828
+ pixel_values, prompt_pixel_values, embedding_type=embedding_type, bool_masked_pos=bool_masked_pos
829
+ )
830
+
831
+ encoder_outputs = self.encoder(
832
+ embedding_output,
833
+ feature_ensemble=feature_ensemble,
834
+ output_attentions=output_attentions,
835
+ output_hidden_states=output_hidden_states,
836
+ return_dict=return_dict,
837
+ )
838
+
839
+ return encoder_outputs
840
+
841
+
842
+ def patchify(tensor: torch.Tensor, patch_size: int) -> torch.Tensor:
843
+ batch_size, num_channels, height, width = tensor.shape
844
+ patch_height = height // patch_size
845
+ patch_width = width // patch_size
846
+
847
+ tensor = tensor.reshape(shape=(batch_size, num_channels, patch_height, patch_size, patch_width, patch_size))
848
+ tensor = tensor.permute(0, 2, 4, 3, 5, 1)
849
+ tensor = tensor.reshape(shape=(batch_size, patch_height * patch_width, patch_size**2 * 3))
850
+
851
+ return tensor
852
+
853
+
854
+ def unpatchify(tensor: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
855
+ batch_size = tensor.shape[0]
856
+ patch_size = int((tensor.shape[-1] / 3) ** 0.5)
857
+ if patch_height * patch_width != tensor.shape[1]:
858
+ raise ValueError(
859
+ f"Number of patches {tensor.shape[1]} does not match patch height ({patch_height}) and width ({patch_width})."
860
+ )
861
+
862
+ tensor = tensor.reshape(shape=(batch_size, patch_height, patch_width, patch_size, patch_size, 3))
863
+ tensor = tensor.permute(0, 5, 1, 3, 2, 4)
864
+ tensor = tensor.reshape(shape=(batch_size, 3, patch_height * patch_size, patch_width * patch_size))
865
+
866
+ return tensor
867
+
868
+
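# Quick sanity sketch: unpatchify inverts patchify for 3-channel tensors whose height and
# width are multiples of patch_size (the sizes below are arbitrary).
import torch

images = torch.randn(2, 3, 32, 48)
patches = patchify(images, patch_size=16)             # (2, 2 * 3, 16 * 16 * 3) = (2, 6, 768)
restored = unpatchify(patches, patch_height=2, patch_width=3)
print(patches.shape, torch.equal(images, restored))   # torch.Size([2, 6, 768]) True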
869
+ class SegGptLoss(nn.Module):
870
+ def __init__(self, config):
871
+ super().__init__()
872
+ self.beta = config.beta
873
+ self.patch_size = config.patch_size
874
+
875
+ def forward(
876
+ self,
877
+ prompt_masks: torch.FloatTensor,
878
+ pred_masks: torch.FloatTensor,
879
+ labels: torch.FloatTensor,
880
+ bool_masked_pos: torch.BoolTensor,
881
+ ):
882
+ """Computes the L1 loss between the predicted masks and the ground truth masks.
883
+
884
+ Args:
885
+ prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
886
+ Pixel values from mask prompt.
887
+
888
+ pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
889
+ Predicted masks.
890
+
891
+ labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
892
+ Ground truth mask for input images.
893
+
894
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
895
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
896
+
897
+ Returns:
898
+ `torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
899
+ """
900
+ ground_truth = torch.cat((prompt_masks, labels), dim=2)
901
+
902
+ mask = bool_masked_pos[:, :, None].repeat(1, 1, self.patch_size**2 * 3)
903
+ mask = unpatchify(mask, ground_truth.shape[2] // self.patch_size, ground_truth.shape[3] // self.patch_size)
904
+
905
+ loss = F.smooth_l1_loss(pred_masks, ground_truth, reduction="none", beta=self.beta)
906
+ loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
907
+
908
+ return loss
909
+
910
+
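# Hedged sketch of the masking in SegGptLoss above: with the default inference-style mask
# (zeros for the prompt half, ones for the prediction half) only the lower half of the
# concatenated image contributes to the loss. `SimpleNamespace` stands in for a SegGptConfig.
import torch
from types import SimpleNamespace

loss_fn = SegGptLoss(SimpleNamespace(beta=0.01, patch_size=4))
prompt_masks = torch.zeros(1, 3, 8, 8)
labels = torch.ones(1, 3, 8, 8)
pred_masks = torch.zeros(1, 3, 16, 8)  # prediction over cat(prompt_masks, labels) along height
num_patches = (16 // 4) * (8 // 4)
bool_masked_pos = torch.cat([torch.zeros(num_patches // 2), torch.ones(num_patches // 2)]).bool()[None]
print(loss_fn(prompt_masks, pred_masks, labels, bool_masked_pos))  # ~0.995: smooth-L1 on the masked half only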
911
+ @add_start_docstrings(
912
+ "SegGpt model with a decoder on top for one-shot image segmentation.",
913
+ SEGGPT_START_DOCSTRING,
914
+ )
915
+ class SegGptForImageSegmentation(SegGptPreTrainedModel):
916
+ def __init__(self, config: SegGptConfig):
917
+ super().__init__(config)
918
+ self.config = config
919
+
920
+ self.model = SegGptModel(config)
921
+ self.decoder = SegGptDecoder(config)
922
+
923
+ # Initialize weights and apply final processing
924
+ self.post_init()
925
+
926
+ @add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
927
+ @replace_return_docstrings(output_type=SegGptImageSegmentationOutput, config_class=_CONFIG_FOR_DOC)
928
+ def forward(
929
+ self,
930
+ pixel_values: torch.Tensor,
931
+ prompt_pixel_values: torch.Tensor,
932
+ prompt_masks: torch.Tensor,
933
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
934
+ feature_ensemble: Optional[bool] = None,
935
+ embedding_type: Optional[str] = None,
936
+ labels: Optional[torch.FloatTensor] = None,
937
+ output_attentions: Optional[bool] = None,
938
+ output_hidden_states: Optional[bool] = None,
939
+ return_dict: Optional[bool] = None,
940
+ ) -> Union[Tuple, SegGptImageSegmentationOutput]:
941
+ r"""
942
+ labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
943
+ Ground truth mask for input images.
944
+
945
+ Returns:
946
+
947
+ Examples:
948
+
949
+ ```python
950
+ >>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
951
+ >>> from PIL import Image
952
+ >>> import requests
953
+
954
+ >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
955
+ >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
956
+ >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
957
+
958
+ >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
959
+ >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
960
+ >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")
961
+
962
+ >>> checkpoint = "BAAI/seggpt-vit-large"
963
+ >>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
964
+ >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)
965
+
966
+ >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
967
+ >>> outputs = model(**inputs)
968
+ >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
969
+ >>> print(list(result.shape))
970
+ [170, 297]
971
+ ```
972
+ """
973
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
974
+ output_hidden_states = (
975
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
976
+ )
977
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
978
+
979
+ if bool_masked_pos is None:
980
+ num_patches = self.model.embeddings.patch_embeddings.num_patches
981
+ bool_masked_pos_zeros = torch.zeros(num_patches // 2, dtype=torch.bool, device=pixel_values.device)
982
+ bool_masked_pos_ones = torch.ones(
983
+ num_patches - num_patches // 2, dtype=torch.bool, device=pixel_values.device
984
+ )
985
+ bool_masked_pos = torch.cat([bool_masked_pos_zeros, bool_masked_pos_ones])
986
+ bool_masked_pos = bool_masked_pos.unsqueeze(0)
987
+
988
+ outputs = self.model(
989
+ pixel_values=pixel_values,
990
+ prompt_pixel_values=prompt_pixel_values,
991
+ prompt_masks=prompt_masks,
992
+ bool_masked_pos=bool_masked_pos,
993
+ feature_ensemble=feature_ensemble,
994
+ embedding_type=embedding_type,
995
+ labels=labels,
996
+ output_attentions=output_attentions,
997
+ output_hidden_states=output_hidden_states,
998
+ return_dict=return_dict,
999
+ )
1000
+
1001
+ intermediate_hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[-1]
1002
+ intermediate_hidden_states = torch.cat(intermediate_hidden_states, dim=-1)
1003
+ pred_masks = self.decoder(intermediate_hidden_states)
1004
+
1005
+ loss = None
1006
+ if labels is not None:
1007
+ loss_fn = SegGptLoss(self.config)
1008
+ loss = loss_fn(prompt_masks, pred_masks, labels, bool_masked_pos)
1009
+
1010
+ if not return_dict:
1011
+ output = (pred_masks,)
1012
+ if output_hidden_states:
1013
+ output = output + (outputs[1],)
1014
+
1015
+ if output_attentions:
1016
+ idx = 2 if output_hidden_states else 1
1017
+ output = output + (outputs[idx],)
1018
+
1019
+ if loss is not None:
1020
+ output = (loss,) + output
1021
+ return output
1022
+
1023
+ return SegGptImageSegmentationOutput(
1024
+ loss=loss,
1025
+ pred_masks=pred_masks,
1026
+ hidden_states=outputs.hidden_states,
1027
+ attentions=outputs.attentions,
1028
+ )
1029
+
1030
+
1031
+ __all__ = ["SegGptModel", "SegGptPreTrainedModel", "SegGptForImageSegmentation"]
docs/transformers/build/lib/transformers/models/sew/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_sew import *
22
+ from .modeling_sew import *
23
+ else:
24
+ import sys
25
+
26
+ _file = globals()["__file__"]
27
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/sew/configuration_sew.py ADDED
@@ -0,0 +1,256 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 ASAPP Inc. and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """SEW model configuration"""
16
+
17
+ import functools
18
+ import operator
19
+
20
+ from ...configuration_utils import PretrainedConfig
21
+ from ...utils import logging
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class SEWConfig(PretrainedConfig):
28
+ r"""
29
+ This is the configuration class to store the configuration of a [`SEWModel`]. It is used to instantiate a SEW model
30
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
31
+ defaults will yield a similar configuration to that of the SEW
32
+ [asapp/sew-tiny-100k](https://huggingface.co/asapp/sew-tiny-100k) architecture.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+
38
+ Args:
39
+ vocab_size (`int`, *optional*, defaults to 32):
40
+ Vocabulary size of the SEW model. Defines the number of different tokens that can be represented by the
41
+ `inputs_ids` passed when calling [`SEW`].
42
+ hidden_size (`int`, *optional*, defaults to 768):
43
+ Dimensionality of the encoder layers and the pooler layer.
44
+ num_hidden_layers (`int`, *optional*, defaults to 12):
45
+ Number of hidden layers in the Transformer encoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 12):
47
+ Number of attention heads for each attention layer in the Transformer encoder.
48
+ intermediate_size (`int`, *optional*, defaults to 3072):
49
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
50
+ squeeze_factor (`int`, *optional*, defaults to 2):
51
+ Sequence length downsampling factor after the encoder and upsampling factor after the transformer.
52
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
53
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
54
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
55
+ hidden_dropout (`float`, *optional*, defaults to 0.1):
56
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
57
+ activation_dropout (`float`, *optional*, defaults to 0.1):
58
+ The dropout ratio for activations inside the fully connected layer.
59
+ attention_dropout (`float`, *optional*, defaults to 0.1):
60
+ The dropout ratio for the attention probabilities.
61
+ final_dropout (`float`, *optional*, defaults to 0.1):
62
+ The dropout probability for the final projection layer of [`SEWForCTC`].
63
+ layerdrop (`float`, *optional*, defaults to 0.1):
64
+ The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more
65
+ details.
66
+ initializer_range (`float`, *optional*, defaults to 0.02):
67
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
68
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
69
+ The epsilon used by the layer normalization layers.
70
+ feat_extract_norm (`str`, *optional*, defaults to `"group"`):
71
+ The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
72
+ normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
73
+ convolutional layers.
74
+ feat_proj_dropout (`float`, *optional*, defaults to 0.0):
75
+ The dropout probability for output of the feature encoder.
76
+ feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
77
+ The non-linear activation function (function or string) in the 1D convolutional layers of the feature
78
+ extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
79
+ conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
80
+ A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
81
+ feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
82
+ conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
83
+ A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
84
+ of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
85
+ conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
86
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
87
+ length of *conv_kernel* defines the number of convolutional layers and has to match the length of
88
+ *conv_dim*.
89
+ conv_bias (`bool`, *optional*, defaults to `False`):
90
+ Whether the 1D convolutional layers have a bias.
91
+ num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
92
+ Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
93
+ embeddings layer.
94
+ num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
95
+ Number of groups of 1D convolutional positional embeddings layer.
96
+ apply_spec_augment (`bool`, *optional*, defaults to `True`):
97
+ Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
98
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
99
+ Recognition](https://arxiv.org/abs/1904.08779).
100
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
101
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
102
+ procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
103
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
104
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
105
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
106
+ mask_time_length (`int`, *optional*, defaults to 10):
107
+ Length of vector span along the time axis.
108
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
109
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+ irrespective of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+ mask_time_min_masks''
112
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
113
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
114
+ masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
115
+ the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
116
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
117
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
118
+ True`.
119
+ mask_feature_length (`int`, *optional*, defaults to 10):
120
+ Length of vector span along the feature axis.
121
+ mask_feature_min_masks (`int`, *optional*, defaults to 0),:
122
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
123
+ step, irrespectively of `mask_feature_prob`. Only relevant if
124
+ ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
125
+ ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
126
+ Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
127
+ instance of [`SEWForCTC`].
128
+ ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
129
+ Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
130
+ occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
131
+ of [`SEWForCTC`].
132
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
133
+ Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
134
+ instance of [`Wav2Vec2ForSequenceClassification`].
135
+ classifier_proj_size (`int`, *optional*, defaults to 256):
136
+ Dimensionality of the projection before token mean-pooling for classification.
137
+
138
+ Example:
139
+
140
+ ```python
141
+ >>> from transformers import SEWConfig, SEWModel
142
+
143
+ >>> # Initializing a SEW asapp/sew-tiny-100k style configuration
144
+ >>> configuration = SEWConfig()
145
+
146
+ >>> # Initializing a model (with random weights) from the asapp/sew-tiny-100k style configuration
147
+ >>> model = SEWModel(configuration)
148
+
149
+ >>> # Accessing the model configuration
150
+ >>> configuration = model.config
151
+ ```"""
152
+
153
+ model_type = "sew"
154
+
155
+ def __init__(
156
+ self,
157
+ vocab_size=32,
158
+ hidden_size=768,
159
+ num_hidden_layers=12,
160
+ num_attention_heads=12,
161
+ intermediate_size=3072,
162
+ squeeze_factor=2,
163
+ hidden_act="gelu",
164
+ hidden_dropout=0.1,
165
+ activation_dropout=0.1,
166
+ attention_dropout=0.1,
167
+ feat_proj_dropout=0.0,
168
+ final_dropout=0.1,
169
+ layerdrop=0.1,
170
+ initializer_range=0.02,
171
+ layer_norm_eps=1e-5,
172
+ feat_extract_norm="group",
173
+ feat_extract_activation="gelu",
174
+ conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
175
+ conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),
176
+ conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),
177
+ conv_bias=False,
178
+ num_conv_pos_embeddings=128,
179
+ num_conv_pos_embedding_groups=16,
180
+ apply_spec_augment=True,
181
+ mask_time_prob=0.05,
182
+ mask_time_length=10,
183
+ mask_time_min_masks=2,
184
+ mask_feature_prob=0.0,
185
+ mask_feature_length=10,
186
+ mask_feature_min_masks=0,
187
+ ctc_loss_reduction="mean",
188
+ ctc_zero_infinity=False,
189
+ use_weighted_layer_sum=False,
190
+ classifier_proj_size=256,
191
+ pad_token_id=0,
192
+ bos_token_id=1,
193
+ eos_token_id=2,
194
+ **kwargs,
195
+ ):
196
+ super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
197
+ self.hidden_size = hidden_size
198
+ self.feat_extract_norm = feat_extract_norm
199
+ self.feat_extract_activation = feat_extract_activation
200
+ self.conv_dim = list(conv_dim)
201
+ self.conv_stride = list(conv_stride)
202
+ self.conv_kernel = list(conv_kernel)
203
+ self.conv_bias = conv_bias
204
+ self.num_conv_pos_embeddings = num_conv_pos_embeddings
205
+ self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
206
+ self.num_feat_extract_layers = len(self.conv_dim)
207
+ self.num_hidden_layers = num_hidden_layers
208
+ self.intermediate_size = intermediate_size
209
+ self.squeeze_factor = squeeze_factor
210
+ self.hidden_act = hidden_act
211
+ self.num_attention_heads = num_attention_heads
212
+ self.hidden_dropout = hidden_dropout
213
+ self.attention_dropout = attention_dropout
214
+ self.activation_dropout = activation_dropout
215
+ self.feat_proj_dropout = feat_proj_dropout
216
+ self.final_dropout = final_dropout
217
+ self.layerdrop = layerdrop
218
+ self.layer_norm_eps = layer_norm_eps
219
+ self.initializer_range = initializer_range
220
+ self.vocab_size = vocab_size
221
+
222
+ if (
223
+ (len(self.conv_stride) != self.num_feat_extract_layers)
224
+ or (len(self.conv_kernel) != self.num_feat_extract_layers)
225
+ or (len(self.conv_dim) != self.num_feat_extract_layers)
226
+ ):
227
+ raise ValueError(
228
+ "Configuration for convolutional layers is incorrect. "
229
+ "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
230
+ f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
231
+ f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
232
+ )
233
+
234
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
235
+ self.apply_spec_augment = apply_spec_augment
236
+ self.mask_time_prob = mask_time_prob
237
+ self.mask_time_length = mask_time_length
238
+ self.mask_time_min_masks = mask_time_min_masks
239
+ self.mask_feature_prob = mask_feature_prob
240
+ self.mask_feature_length = mask_feature_length
241
+ self.mask_feature_min_masks = mask_feature_min_masks
242
+
243
+ # ctc loss
244
+ self.ctc_loss_reduction = ctc_loss_reduction
245
+ self.ctc_zero_infinity = ctc_zero_infinity
246
+
247
+ # sequence classification
248
+ self.use_weighted_layer_sum = use_weighted_layer_sum
249
+ self.classifier_proj_size = classifier_proj_size
250
+
251
+ @property
252
+ def inputs_to_logits_ratio(self):
253
+ return functools.reduce(operator.mul, self.conv_stride, 1)
254
+
255
+
256
+ __all__ = ["SEWConfig"]
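# Sketch of the `inputs_to_logits_ratio` property above: with the default conv_stride the
# convolutional feature encoder downsamples the raw waveform by a factor of 5 * 2**6 = 320.
import functools
import operator

conv_stride = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)
print(functools.reduce(operator.mul, conv_stride, 1))  # 320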
docs/transformers/build/lib/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,305 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert SEW checkpoint."""
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+
21
+ import fairseq
22
+ import torch
23
+ from fairseq.data import Dictionary
24
+
25
+ # Register SEW's fairseq modules
26
+ from sew_asapp import tasks # noqa: F401
27
+
28
+ from transformers import (
29
+ SEWConfig,
30
+ SEWForCTC,
31
+ SEWModel,
32
+ Wav2Vec2CTCTokenizer,
33
+ Wav2Vec2FeatureExtractor,
34
+ Wav2Vec2Processor,
35
+ logging,
36
+ )
37
+
38
+
39
+ logging.set_verbosity_info()
40
+ logger = logging.get_logger(__name__)
41
+
42
+ MAPPING = {
43
+ "post_extract_proj": "feature_projection",
44
+ "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
45
+ "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
46
+ "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
47
+ "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
48
+ "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
49
+ "self_attn_layer_norm": "encoder.layers.*.layer_norm",
50
+ "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
51
+ "fc2": "encoder.layers.*.feed_forward.output_dense",
52
+ "final_layer_norm": "encoder.layers.*.final_layer_norm",
53
+ "encoder.upsample.0": "encoder.upsample.projection",
54
+ "encoder.layer_norm": "encoder.layer_norm",
55
+ "w2v_model.layer_norm": "layer_norm",
56
+ "w2v_encoder.proj": "lm_head",
57
+ "mask_emb": "masked_spec_embed",
58
+ }
59
+
60
+
61
+ def set_recursively(hf_pointer, key, value, full_name, weight_type):
62
+ for attribute in key.split("."):
63
+ hf_pointer = getattr(hf_pointer, attribute)
64
+
65
+ if weight_type is not None:
66
+ hf_shape = getattr(hf_pointer, weight_type).shape
67
+ else:
68
+ hf_shape = hf_pointer.shape
69
+
70
+ assert hf_shape == value.shape, (
71
+ f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
72
+ f" {value.shape} for {full_name}"
73
+ )
74
+
75
+ if weight_type == "weight":
76
+ hf_pointer.weight.data = value
77
+ elif weight_type == "weight_g":
78
+ hf_pointer.weight_g.data = value
79
+ elif weight_type == "weight_v":
80
+ hf_pointer.weight_v.data = value
81
+ elif weight_type == "bias":
82
+ hf_pointer.bias.data = value
83
+ else:
84
+ hf_pointer.data = value
85
+
86
+ logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
87
+
88
+
89
+ def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
90
+ unused_weights = []
91
+ fairseq_dict = fairseq_model.state_dict()
92
+
93
+ feature_extractor = hf_model.sew.feature_extractor if is_finetuned else hf_model.feature_extractor
94
+
95
+ for name, value in fairseq_dict.items():
96
+ is_used = False
97
+ if "conv_layers" in name:
98
+ load_conv_layer(
99
+ name,
100
+ value,
101
+ feature_extractor,
102
+ unused_weights,
103
+ hf_model.config.feat_extract_norm == "group",
104
+ )
105
+ is_used = True
106
+ else:
107
+ for key, mapped_key in MAPPING.items():
108
+ mapped_key = "sew." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
109
+
110
+ if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
111
+ is_used = True
112
+ if "*" in mapped_key:
113
+ layer_index = name.split(key)[0].split(".")[-2]
114
+ mapped_key = mapped_key.replace("*", layer_index)
115
+ if "weight_g" in name:
116
+ weight_type = "weight_g"
117
+ elif "weight_v" in name:
118
+ weight_type = "weight_v"
119
+ elif "weight" in name:
120
+ weight_type = "weight"
121
+ elif "bias" in name:
122
+ weight_type = "bias"
123
+ else:
124
+ weight_type = None
125
+ set_recursively(hf_model, mapped_key, value, name, weight_type)
126
+ continue
127
+ if not is_used:
128
+ unused_weights.append(name)
129
+
130
+ logger.warning(f"Unused weights: {unused_weights}")
131
+
132
+
133
+ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
134
+ name = full_name.split("conv_layers.")[-1]
135
+ items = name.split(".")
136
+ layer_id = int(items[0])
137
+ type_id = int(items[1])
138
+
139
+ if type_id == 0:
140
+ if "bias" in name:
141
+ assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
142
+ f"{full_name} has size {value.shape}, but"
143
+ f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
144
+ )
145
+ feature_extractor.conv_layers[layer_id].conv.bias.data = value
146
+ logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
147
+ elif "weight" in name:
148
+ assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
149
+ f"{full_name} has size {value.shape}, but"
150
+ f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
151
+ )
152
+ feature_extractor.conv_layers[layer_id].conv.weight.data = value
153
+ logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
154
+ elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
155
+ if "bias" in name:
156
+ assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
157
+ f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
158
+ " found."
159
+ )
160
+ feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
161
+ logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
162
+ elif "weight" in name:
163
+ assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
164
+ f"{full_name} has size {value.shape}, but"
165
+ f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
166
+ )
167
+ feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
168
+ logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
169
+ else:
170
+ unused_weights.append(full_name)
171
+
172
+
173
+ def convert_config(model, is_finetuned):
174
+ config = SEWConfig()
175
+ if is_finetuned:
176
+ fs_config = model.w2v_encoder.w2v_model.cfg
177
+ else:
178
+ fs_config = model.cfg
179
+
180
+ config.conv_bias = fs_config.conv_bias
181
+ conv_layers = eval(fs_config.conv_feature_layers)
182
+ config.conv_dim = [x[0] for x in conv_layers]
183
+ config.conv_kernel = [x[1] for x in conv_layers]
184
+ config.conv_stride = [x[2] for x in conv_layers]
185
+ config.feat_extract_activation = "gelu"
186
+ config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
187
+ config.final_dropout = 0.0
188
+ config.hidden_act = fs_config.activation_fn.name
189
+ config.hidden_size = fs_config.encoder_embed_dim
190
+ config.initializer_range = 0.02
191
+ config.intermediate_size = fs_config.encoder_ffn_embed_dim
192
+ config.layer_norm_eps = 1e-5
193
+ config.layerdrop = fs_config.encoder_layerdrop
194
+ config.num_attention_heads = fs_config.encoder_attention_heads
195
+ config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
196
+ config.num_conv_pos_embeddings = fs_config.conv_pos
197
+ config.num_feat_extract_layers = len(conv_layers)
198
+ config.num_hidden_layers = fs_config.encoder_layers
199
+ config.squeeze_factor = fs_config.squeeze_factor
200
+
201
+ # take care of any params that are overridden by the Wav2VecCtc model
202
+ if is_finetuned:
203
+ fs_config = model.cfg
204
+ config.final_dropout = fs_config.final_dropout
205
+ config.layerdrop = fs_config.layerdrop
206
+ config.activation_dropout = fs_config.activation_dropout
207
+ config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
208
+ config.attention_dropout = fs_config.attention_dropout
209
+ config.feat_proj_dropout = fs_config.dropout_input
210
+ config.hidden_dropout = fs_config.dropout
211
+ config.mask_feature_length = fs_config.mask_channel_length
212
+ config.mask_feature_prob = fs_config.mask_channel_prob
213
+ config.mask_time_length = fs_config.mask_length
214
+ config.mask_time_prob = fs_config.mask_prob
215
+
216
+ config.feature_extractor_type = "Wav2Vec2FeatureExtractor"
217
+ config.tokenizer_class = "Wav2Vec2CTCTokenizer"
218
+
219
+ return config
220
+
221
+
222
+ @torch.no_grad()
223
+ def convert_sew_checkpoint(
224
+ checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
225
+ ):
226
+ """
227
+ Copy/paste/tweak model's weights to transformers design.
228
+ """
229
+
230
+ if is_finetuned:
231
+ model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
232
+ [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
233
+ )
234
+ else:
235
+ model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
236
+
237
+ if config_path is not None:
238
+ config = SEWConfig.from_pretrained(config_path)
239
+ else:
240
+ config = convert_config(model[0], is_finetuned)
241
+ model = model[0].eval()
242
+
243
+ return_attention_mask = True if config.feat_extract_norm == "layer" else False
244
+ feature_extractor = Wav2Vec2FeatureExtractor(
245
+ feature_size=1,
246
+ sampling_rate=16000,
247
+ padding_value=0,
248
+ do_normalize=True,
249
+ return_attention_mask=return_attention_mask,
250
+ )
251
+
252
+ if is_finetuned:
253
+ if dict_path:
254
+ target_dict = Dictionary.load(dict_path)
255
+
256
+ # important change bos & pad token id since CTC symbol is <pad> and
257
+ # not <s> as in fairseq
258
+ target_dict.indices[target_dict.bos_word] = target_dict.pad_index
259
+ target_dict.indices[target_dict.pad_word] = target_dict.bos_index
260
+ config.bos_token_id = target_dict.pad_index
261
+ config.pad_token_id = target_dict.bos_index
262
+ config.eos_token_id = target_dict.eos_index
263
+ config.vocab_size = len(target_dict.symbols)
264
+ vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
265
+ if not os.path.isdir(pytorch_dump_folder_path):
266
+ logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
267
+ return
268
+ os.makedirs(pytorch_dump_folder_path, exist_ok=True)
269
+ with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
270
+ json.dump(target_dict.indices, vocab_handle)
271
+ tokenizer = Wav2Vec2CTCTokenizer(
272
+ vocab_path,
273
+ unk_token=target_dict.unk_word,
274
+ pad_token=target_dict.pad_word,
275
+ bos_token=target_dict.bos_word,
276
+ eos_token=target_dict.eos_word,
277
+ word_delimiter_token="|",
278
+ do_lower_case=False,
279
+ )
280
+ processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
281
+ processor.save_pretrained(pytorch_dump_folder_path)
282
+
283
+ hf_model = SEWForCTC(config)
284
+ else:
285
+ hf_model = SEWModel(config)
286
+ feature_extractor.save_pretrained(pytorch_dump_folder_path)
287
+
288
+ recursively_load_weights(model, hf_model, is_finetuned)
289
+
290
+ hf_model.save_pretrained(pytorch_dump_folder_path)
291
+
292
+
293
+ if __name__ == "__main__":
294
+ parser = argparse.ArgumentParser()
295
+ parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
296
+ parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
297
+ parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
298
+ parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
299
+ parser.add_argument(
300
+ "--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
301
+ )
302
+ args = parser.parse_args()
303
+ convert_sew_checkpoint(
304
+ args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
305
+ )
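A minimal sketch of driving the conversion entry point above from Python rather than through the CLI, assuming the script is importable on the Python path; the checkpoint, dictionary, and output paths are hypothetical placeholders:

# Hypothetical invocation of the converter defined above (all paths are placeholders).
from convert_sew_original_pytorch_checkpoint_to_pytorch import convert_sew_checkpoint

convert_sew_checkpoint(
    checkpoint_path="/path/to/sew_finetuned.pt",      # fairseq checkpoint (placeholder)
    pytorch_dump_folder_path="/path/to/output_dir",   # must already exist as a directory
    config_path=None,                                 # None -> config is derived from the checkpoint
    dict_path="/path/to/dict.ltr.txt",                # fairseq dictionary, needed for the fine-tuned case
    is_finetuned=True,
)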
docs/transformers/build/lib/transformers/models/sew/modeling_sew.py ADDED
@@ -0,0 +1,1498 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 ASAPP Inc. and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch SEW model."""
16
+
17
+ import math
18
+ import warnings
19
+ from typing import Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+
27
+ from ...activations import ACT2FN
28
+ from ...integrations.deepspeed import is_deepspeed_zero3_enabled
29
+ from ...integrations.fsdp import is_fsdp_managed_module
30
+ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
31
+ from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
32
+ from ...modeling_utils import PreTrainedModel
33
+ from ...utils import (
34
+ add_code_sample_docstrings,
35
+ add_start_docstrings,
36
+ add_start_docstrings_to_model_forward,
37
+ logging,
38
+ )
39
+ from .configuration_sew import SEWConfig
40
+
41
+
42
+ if is_flash_attn_available():
43
+ from ...modeling_flash_attention_utils import _flash_attention_forward
44
+
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
+ _HIDDEN_STATES_START_POSITION = 1
50
+
51
+ # General docstring
52
+ _CONFIG_FOR_DOC = "SEWConfig"
53
+
54
+ # Base docstring
55
+ _CHECKPOINT_FOR_DOC = "asapp/sew-tiny-100k-ft-ls100h"
56
+ _EXPECTED_OUTPUT_SHAPE = [1, 292, 512]
57
+
58
+ # CTC docstring
59
+ _CTC_EXPECTED_OUTPUT = (
60
+ "'MISTER QUILTER IS THE APPOSTILE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPOLLE'"
61
+ )
62
+ _CTC_EXPECTED_LOSS = 0.42
63
+
64
+ # Audio class docstring
65
+ _SEQ_CLASS_CHECKPOINT = "anton-l/sew-mid-100k-ft-keyword-spotting"
66
+ _SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
67
+ _SEQ_CLASS_EXPECTED_LOSS = 9.52
68
+
69
+
70
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
71
+ def _compute_mask_indices(
72
+ shape: Tuple[int, int],
73
+ mask_prob: float,
74
+ mask_length: int,
75
+ attention_mask: Optional[torch.LongTensor] = None,
76
+ min_masks: int = 0,
77
+ ) -> np.ndarray:
78
+ """
79
+ Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
80
+ ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
81
+ CPU as part of the preprocessing during training.
82
+
83
+ Args:
84
+ shape: The shape for which to compute masks. This should be of a tuple of size 2 where
85
+ the first element is the batch size and the second element is the length of the axis to span.
86
+ mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
87
+ independently generated mask spans of length `mask_length` is computed by
88
+ `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
89
+ actual percentage will be smaller.
90
+ mask_length: size of the mask
91
+ min_masks: minimum number of masked spans
92
+ attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
93
+ each batch dimension.
94
+ """
95
+ batch_size, sequence_length = shape
96
+
97
+ if mask_length < 1:
98
+ raise ValueError("`mask_length` has to be bigger than 0.")
99
+
100
+ if mask_length > sequence_length:
101
+ raise ValueError(
102
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
103
+ f" and `sequence_length`: {sequence_length}"
104
+ )
105
+
106
+ # epsilon is used for probabilistic rounding
107
+ epsilon = np.random.rand(1).item()
108
+
109
+ def compute_num_masked_span(input_length):
110
+ """Given input length, compute how many spans should be masked"""
111
+ num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
112
+ num_masked_span = max(num_masked_span, min_masks)
113
+
114
+ # make sure num masked span <= sequence_length
115
+ if num_masked_span * mask_length > sequence_length:
116
+ num_masked_span = sequence_length // mask_length
117
+
118
+ # make sure num_masked span is also <= input_length - (mask_length - 1)
119
+ if input_length - (mask_length - 1) < num_masked_span:
120
+ num_masked_span = max(input_length - (mask_length - 1), 0)
121
+
122
+ return num_masked_span
123
+
124
+ # compute number of masked spans in batch
125
+ input_lengths = (
126
+ attention_mask.detach().sum(-1).tolist()
127
+ if attention_mask is not None
128
+ else [sequence_length for _ in range(batch_size)]
129
+ )
130
+
131
+ # SpecAugment mask to fill
132
+ spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
133
+ spec_aug_mask_idxs = []
134
+
135
+ max_num_masked_span = compute_num_masked_span(sequence_length)
136
+
137
+ if max_num_masked_span == 0:
138
+ return spec_aug_mask
139
+
140
+ for input_length in input_lengths:
141
+ # compute num of masked spans for this input
142
+ num_masked_span = compute_num_masked_span(input_length)
143
+
144
+ # get random indices to mask
145
+ spec_aug_mask_idx = np.random.choice(
146
+ np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
147
+ )
148
+
149
+ # pick first sampled index that will serve as a dummy index to pad vector
150
+ # to ensure same dimension for all batches due to probabilistic rounding
151
+ # Picking first sample just pads those vectors twice.
152
+ if len(spec_aug_mask_idx) == 0:
153
+ # this case can only happen if `input_length` is strictly smaller than
154
+ # `sequence_length` in which case the last token has to be a padding
155
+ # token which we can use as a dummy mask id
156
+ dummy_mask_idx = sequence_length - 1
157
+ else:
158
+ dummy_mask_idx = spec_aug_mask_idx[0]
159
+
160
+ spec_aug_mask_idx = np.concatenate(
161
+ [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
162
+ )
163
+ spec_aug_mask_idxs.append(spec_aug_mask_idx)
164
+
165
+ spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
166
+
167
+ # expand masked indices to masked spans
168
+ spec_aug_mask_idxs = np.broadcast_to(
169
+ spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
170
+ )
171
+ spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
172
+
173
+ # add offset to the starting indexes so that indexes now create a span
174
+ offsets = np.arange(mask_length)[None, None, :]
175
+ offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
176
+ batch_size, max_num_masked_span * mask_length
177
+ )
178
+ spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
179
+
180
+ # ensure that we cannot have indices larger than sequence_length
181
+ if spec_aug_mask_idxs.max() > sequence_length - 1:
182
+ spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
183
+
184
+ # scatter indices to mask
185
+ np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
186
+
187
+ return spec_aug_mask
188
+
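A quick, illustrative use of the span-masking helper above, assuming the module is importable as `transformers.models.sew.modeling_sew` (the layout shown in this diff); the shape, probability, and span length are arbitrary toy values:

# Toy SpecAugment-style mask: batch of 2 sequences, 100 frames each.
from transformers.models.sew.modeling_sew import _compute_mask_indices

mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10, min_masks=1)
print(mask.shape)         # (2, 100) boolean numpy array
print(mask.sum(axis=-1))  # number of masked frames per sequence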
189
+
190
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SEW
191
+ class SEWNoLayerNormConvLayer(nn.Module):
192
+ def __init__(self, config, layer_id=0):
193
+ super().__init__()
194
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
195
+ self.out_conv_dim = config.conv_dim[layer_id]
196
+
197
+ self.conv = nn.Conv1d(
198
+ self.in_conv_dim,
199
+ self.out_conv_dim,
200
+ kernel_size=config.conv_kernel[layer_id],
201
+ stride=config.conv_stride[layer_id],
202
+ bias=config.conv_bias,
203
+ )
204
+ self.activation = ACT2FN[config.feat_extract_activation]
205
+
206
+ def forward(self, hidden_states):
207
+ hidden_states = self.conv(hidden_states)
208
+ hidden_states = self.activation(hidden_states)
209
+ return hidden_states
210
+
211
+
212
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SEW
213
+ class SEWLayerNormConvLayer(nn.Module):
214
+ def __init__(self, config, layer_id=0):
215
+ super().__init__()
216
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
217
+ self.out_conv_dim = config.conv_dim[layer_id]
218
+
219
+ self.conv = nn.Conv1d(
220
+ self.in_conv_dim,
221
+ self.out_conv_dim,
222
+ kernel_size=config.conv_kernel[layer_id],
223
+ stride=config.conv_stride[layer_id],
224
+ bias=config.conv_bias,
225
+ )
226
+ self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
227
+ self.activation = ACT2FN[config.feat_extract_activation]
228
+
229
+ def forward(self, hidden_states):
230
+ hidden_states = self.conv(hidden_states)
231
+
232
+ hidden_states = hidden_states.transpose(-2, -1)
233
+ hidden_states = self.layer_norm(hidden_states)
234
+ hidden_states = hidden_states.transpose(-2, -1)
235
+
236
+ hidden_states = self.activation(hidden_states)
237
+ return hidden_states
238
+
239
+
240
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SEW
241
+ class SEWGroupNormConvLayer(nn.Module):
242
+ def __init__(self, config, layer_id=0):
243
+ super().__init__()
244
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
245
+ self.out_conv_dim = config.conv_dim[layer_id]
246
+
247
+ self.conv = nn.Conv1d(
248
+ self.in_conv_dim,
249
+ self.out_conv_dim,
250
+ kernel_size=config.conv_kernel[layer_id],
251
+ stride=config.conv_stride[layer_id],
252
+ bias=config.conv_bias,
253
+ )
254
+ self.activation = ACT2FN[config.feat_extract_activation]
255
+
256
+ self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
257
+
258
+ def forward(self, hidden_states):
259
+ hidden_states = self.conv(hidden_states)
260
+ hidden_states = self.layer_norm(hidden_states)
261
+ hidden_states = self.activation(hidden_states)
262
+ return hidden_states
263
+
264
+
265
+ class SEWPositionalConvEmbedding(nn.Module):
266
+ def __init__(self, config):
267
+ super().__init__()
268
+ self.conv = nn.Conv1d(
269
+ config.hidden_size,
270
+ config.hidden_size,
271
+ kernel_size=config.num_conv_pos_embeddings,
272
+ padding=config.num_conv_pos_embeddings // 2,
273
+ groups=config.num_conv_pos_embedding_groups,
274
+ stride=config.squeeze_factor,
275
+ )
276
+
277
+ weight_norm = nn.utils.weight_norm
278
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
279
+ weight_norm = nn.utils.parametrizations.weight_norm
280
+
281
+ if is_deepspeed_zero3_enabled():
282
+ import deepspeed
283
+
284
+ with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
285
+ self.conv = weight_norm(self.conv, name="weight", dim=2)
286
+ if hasattr(self.conv, "parametrizations"):
287
+ weight_g = self.conv.parametrizations.weight.original0
288
+ weight_v = self.conv.parametrizations.weight.original1
289
+ else:
290
+ weight_g = self.conv.weight_g
291
+ weight_v = self.conv.weight_v
292
+ deepspeed.zero.register_external_parameter(self, weight_v)
293
+ deepspeed.zero.register_external_parameter(self, weight_g)
294
+ else:
295
+ self.conv = weight_norm(self.conv, name="weight", dim=2)
296
+
297
+ self.padding = SEWSamePadLayer(config.num_conv_pos_embeddings)
298
+ self.activation = ACT2FN[config.feat_extract_activation]
299
+
300
+ def forward(self, hidden_states):
301
+ hidden_states = self.conv(hidden_states)
302
+ hidden_states = self.padding(hidden_states)
303
+ hidden_states = self.activation(hidden_states)
304
+
305
+ return hidden_states
306
+
307
+
308
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->SEW
309
+ class SEWSamePadLayer(nn.Module):
310
+ def __init__(self, num_conv_pos_embeddings):
311
+ super().__init__()
312
+ self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
313
+
314
+ def forward(self, hidden_states):
315
+ if self.num_pad_remove > 0:
316
+ hidden_states = hidden_states[:, :, : -self.num_pad_remove]
317
+ return hidden_states
318
+
319
+
320
+ class SEWUpsampling(nn.Module):
321
+ def __init__(self, config):
322
+ super().__init__()
323
+ self.projection = nn.Linear(config.hidden_size, config.hidden_size * config.squeeze_factor)
324
+ self.activation = ACT2FN[config.feat_extract_activation]
325
+ self.squeeze_factor = config.squeeze_factor
326
+
327
+ def forward(self, hidden_states):
328
+ hidden_states = self.projection(hidden_states)
329
+ hidden_states = self.activation(hidden_states)
330
+
331
+ if self.squeeze_factor > 1:
332
+ # transform embedding channels to sequence length
333
+ bsz, src_len, src_embed_dim = hidden_states.size()
334
+ tgt_len = src_len * self.squeeze_factor
335
+ tgt_embed_dim = src_embed_dim // self.squeeze_factor
336
+ hidden_states = hidden_states.reshape(bsz, src_len, self.squeeze_factor, tgt_embed_dim)
337
+ hidden_states = hidden_states.reshape(bsz, tgt_len, tgt_embed_dim)
338
+
339
+ return hidden_states
340
+
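A shape-only sketch of the reshape performed in `SEWUpsampling.forward` above, with arbitrary sizes and a `squeeze_factor` of 2 chosen purely for illustration: the projection widens the channel dimension by the squeeze factor, and the two reshapes fold that factor back into the time axis.

# After the projection, channels = hidden * squeeze_factor; the reshapes restore the time resolution.
import torch

bsz, src_len, hidden, squeeze_factor = 1, 50, 512, 2
projected = torch.randn(bsz, src_len, hidden * squeeze_factor)  # stand-in for projection + activation output
upsampled = projected.reshape(bsz, src_len, squeeze_factor, hidden).reshape(
    bsz, src_len * squeeze_factor, hidden
)
print(upsampled.shape)  # torch.Size([1, 100, 512])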
341
+
342
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEW
343
+ class SEWFeatureEncoder(nn.Module):
344
+ """Construct the features from raw audio waveform"""
345
+
346
+ def __init__(self, config):
347
+ super().__init__()
348
+
349
+ if config.feat_extract_norm == "group":
350
+ conv_layers = [SEWGroupNormConvLayer(config, layer_id=0)] + [
351
+ SEWNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
352
+ ]
353
+ elif config.feat_extract_norm == "layer":
354
+ conv_layers = [SEWLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
355
+ else:
356
+ raise ValueError(
357
+ f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
358
+ )
359
+ self.conv_layers = nn.ModuleList(conv_layers)
360
+ self.gradient_checkpointing = False
361
+ self._requires_grad = True
362
+
363
+ def _freeze_parameters(self):
364
+ for param in self.parameters():
365
+ param.requires_grad = False
366
+ self._requires_grad = False
367
+
368
+ def forward(self, input_values):
369
+ hidden_states = input_values[:, None]
370
+
371
+ # make sure hidden_states require grad for gradient_checkpointing
372
+ if self._requires_grad and self.training:
373
+ hidden_states.requires_grad = True
374
+
375
+ for conv_layer in self.conv_layers:
376
+ if self._requires_grad and self.gradient_checkpointing and self.training:
377
+ hidden_states = self._gradient_checkpointing_func(
378
+ conv_layer.__call__,
379
+ hidden_states,
380
+ )
381
+ else:
382
+ hidden_states = conv_layer(hidden_states)
383
+
384
+ return hidden_states
385
+
386
+
387
+ class SEWFeatureExtractor(SEWFeatureEncoder):
388
+ def __init__(self, config):
389
+ super().__init__(config)
390
+ warnings.warn(
391
+ f"The class `{self.__class__.__name__}` has been deprecated "
392
+ "and will be removed in Transformers v5. "
393
+ f"Use `{self.__class__.__bases__[0].__name__}` instead.",
394
+ FutureWarning,
395
+ )
396
+
397
+
398
+ # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW
399
+ class SEWAttention(nn.Module):
400
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
401
+
402
+ def __init__(
403
+ self,
404
+ embed_dim: int,
405
+ num_heads: int,
406
+ dropout: float = 0.0,
407
+ is_decoder: bool = False,
408
+ bias: bool = True,
409
+ is_causal: bool = False,
410
+ config: Optional[SEWConfig] = None,
411
+ ):
412
+ super().__init__()
413
+ self.embed_dim = embed_dim
414
+ self.num_heads = num_heads
415
+ self.dropout = dropout
416
+ self.head_dim = embed_dim // num_heads
417
+ self.config = config
418
+
419
+ if (self.head_dim * num_heads) != self.embed_dim:
420
+ raise ValueError(
421
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
422
+ f" and `num_heads`: {num_heads})."
423
+ )
424
+ self.scaling = self.head_dim**-0.5
425
+ self.is_decoder = is_decoder
426
+ self.is_causal = is_causal
427
+
428
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
429
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
430
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
431
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
432
+
433
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
434
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
435
+
436
+ def forward(
437
+ self,
438
+ hidden_states: torch.Tensor,
439
+ key_value_states: Optional[torch.Tensor] = None,
440
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
441
+ attention_mask: Optional[torch.Tensor] = None,
442
+ layer_head_mask: Optional[torch.Tensor] = None,
443
+ output_attentions: bool = False,
444
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
445
+ """Input shape: Batch x Time x Channel"""
446
+
447
+ # if key_value_states are provided this layer is used as a cross-attention layer
448
+ # for the decoder
449
+ is_cross_attention = key_value_states is not None
450
+
451
+ bsz, tgt_len, _ = hidden_states.size()
452
+
453
+ # get query proj
454
+ query_states = self.q_proj(hidden_states) * self.scaling
455
+ # get key, value proj
456
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
457
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
458
+ # the provided `key_value_states` to support prefix tuning
459
+ if (
460
+ is_cross_attention
461
+ and past_key_value is not None
462
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
463
+ ):
464
+ # reuse k,v, cross_attentions
465
+ key_states = past_key_value[0]
466
+ value_states = past_key_value[1]
467
+ elif is_cross_attention:
468
+ # cross_attentions
469
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
470
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
471
+ elif past_key_value is not None:
472
+ # reuse k, v, self_attention
473
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
474
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
475
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
476
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
477
+ else:
478
+ # self_attention
479
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
480
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
481
+
482
+ if self.is_decoder:
483
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
484
+ # Further calls to cross_attention layer can then reuse all cross-attention
485
+ # key/value_states (first "if" case)
486
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
487
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
488
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
489
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
490
+ past_key_value = (key_states, value_states)
491
+
492
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
493
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
494
+ key_states = key_states.reshape(*proj_shape)
495
+ value_states = value_states.reshape(*proj_shape)
496
+
497
+ src_len = key_states.size(1)
498
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
499
+
500
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
501
+ raise ValueError(
502
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
503
+ f" {attn_weights.size()}"
504
+ )
505
+
506
+ if attention_mask is not None:
507
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
508
+ raise ValueError(
509
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
510
+ )
511
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
512
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
513
+
514
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
515
+
516
+ if layer_head_mask is not None:
517
+ if layer_head_mask.size() != (self.num_heads,):
518
+ raise ValueError(
519
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
520
+ f" {layer_head_mask.size()}"
521
+ )
522
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
523
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
524
+
525
+ if output_attentions:
526
+ # this operation is a bit awkward, but it's required to
527
+ # make sure that attn_weights keeps its gradient.
528
+ # In order to do so, attn_weights have to be reshaped
529
+ # twice and have to be reused in the following
530
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
531
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
532
+ else:
533
+ attn_weights_reshaped = None
534
+
535
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
536
+
537
+ attn_output = torch.bmm(attn_probs, value_states)
538
+
539
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
540
+ raise ValueError(
541
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
542
+ f" {attn_output.size()}"
543
+ )
544
+
545
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
546
+ attn_output = attn_output.transpose(1, 2)
547
+
548
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
549
+ # partitioned across GPUs when using tensor-parallelism.
550
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
551
+
552
+ attn_output = self.out_proj(attn_output)
553
+
554
+ return attn_output, attn_weights_reshaped, past_key_value
555
+
556
+
557
+ # Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->SEW
558
+ class SEWFlashAttention2(SEWAttention):
559
+ """
560
+ SEW flash attention module. This module inherits from `SEWAttention`, as the weights of the module stay
561
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
562
+ flash attention and deal with padding tokens in case the input contains any of them.
563
+ """
564
+
565
+ def __init__(self, *args, **kwargs):
566
+ super().__init__(*args, **kwargs)
567
+
568
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
569
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
570
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
571
+ self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
572
+
573
+ def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
574
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
575
+
576
+ def forward(
577
+ self,
578
+ hidden_states: torch.Tensor,
579
+ key_value_states: Optional[torch.Tensor] = None,
580
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
581
+ attention_mask: Optional[torch.Tensor] = None,
582
+ layer_head_mask: Optional[torch.Tensor] = None,
583
+ output_attentions: bool = False,
584
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
585
+ # SEWFlashAttention2 attention does not support output_attentions
586
+ if output_attentions:
587
+ raise ValueError("SEWFlashAttention2 attention does not support output_attentions")
588
+
589
+ # if key_value_states are provided this layer is used as a cross-attention layer
590
+ # for the decoder
591
+ is_cross_attention = key_value_states is not None
592
+
593
+ bsz, q_len, _ = hidden_states.size()
594
+
595
+ # get query proj
596
+ query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
597
+ # get key, value proj
598
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
599
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
600
+ # the provided `key_value_states` to support prefix tuning
601
+ if (
602
+ is_cross_attention
603
+ and past_key_value is not None
604
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
605
+ ):
606
+ # reuse k,v, cross_attentions
607
+ key_states = past_key_value[0].transpose(1, 2)
608
+ value_states = past_key_value[1].transpose(1, 2)
609
+ elif is_cross_attention:
610
+ # cross_attentions
611
+ key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
612
+ value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
613
+ elif past_key_value is not None:
614
+ # reuse k, v, self_attention
615
+ key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
616
+ value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
617
+ key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
618
+ value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
619
+ else:
620
+ # self_attention
621
+ key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
622
+ value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
623
+
624
+ if self.is_decoder:
625
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
626
+ # Further calls to cross_attention layer can then reuse all cross-attention
627
+ # key/value_states (first "if" case)
628
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
629
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
630
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
631
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
632
+ past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
633
+
634
+ kv_seq_len = key_states.shape[-2]
635
+ if past_key_value is not None:
636
+ kv_seq_len += past_key_value[0].shape[-2]
637
+
638
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
639
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
640
+ # cast them back to the correct dtype just to be sure everything works as expected.
641
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
642
+ # in fp32. (LlamaRMSNorm handles it correctly)
643
+
644
+ input_dtype = query_states.dtype
645
+ if input_dtype == torch.float32:
646
+ if torch.is_autocast_enabled():
647
+ target_dtype = torch.get_autocast_gpu_dtype()
648
+ # Handle the case where the model is quantized
649
+ elif hasattr(self.config, "_pre_quantization_dtype"):
650
+ target_dtype = self.config._pre_quantization_dtype
651
+ else:
652
+ target_dtype = self.q_proj.weight.dtype
653
+
654
+ logger.warning_once(
655
+ f"The input hidden states seem to have been silently cast to float32; this might be related to"
656
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
657
+ f" {target_dtype}."
658
+ )
659
+
660
+ query_states = query_states.to(target_dtype)
661
+ key_states = key_states.to(target_dtype)
662
+ value_states = value_states.to(target_dtype)
663
+
664
+ attn_output = _flash_attention_forward(
665
+ query_states,
666
+ key_states,
667
+ value_states,
668
+ attention_mask,
669
+ q_len,
670
+ dropout=self.dropout if self.training else 0.0,
671
+ is_causal=self.is_causal,
672
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
673
+ )
674
+
675
+ attn_output = attn_output.reshape(bsz, q_len, -1)
676
+ attn_output = self.out_proj(attn_output)
677
+
678
+ if not output_attentions:
679
+ attn_weights = None
680
+
681
+ return attn_output, attn_weights, past_key_value
682
+
683
+
684
+ class SEWSdpaAttention(SEWAttention):
685
+ # Copied from transformers.models.bart.modeling_bart.BartSdpaAttention.forward with Bart->SEW
686
+ def forward(
687
+ self,
688
+ hidden_states: torch.Tensor,
689
+ key_value_states: Optional[torch.Tensor] = None,
690
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
691
+ attention_mask: Optional[torch.Tensor] = None,
692
+ layer_head_mask: Optional[torch.Tensor] = None,
693
+ output_attentions: bool = False,
694
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
695
+ """Input shape: Batch x Time x Channel"""
696
+ if output_attentions or layer_head_mask is not None:
697
+ # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
698
+ logger.warning_once(
699
+ "SEWModel is using SEWSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
700
+ ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
701
+ )
702
+ return super().forward(
703
+ hidden_states,
704
+ key_value_states=key_value_states,
705
+ past_key_value=past_key_value,
706
+ attention_mask=attention_mask,
707
+ layer_head_mask=layer_head_mask,
708
+ output_attentions=output_attentions,
709
+ )
710
+
711
+ # if key_value_states are provided this layer is used as a cross-attention layer
712
+ # for the decoder
713
+ is_cross_attention = key_value_states is not None
714
+
715
+ bsz, tgt_len, _ = hidden_states.size()
716
+
717
+ # get query proj
718
+ query_states = self.q_proj(hidden_states)
719
+ # get key, value proj
720
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
721
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
722
+ # the provided `key_value_states` to support prefix tuning
723
+ if (
724
+ is_cross_attention
725
+ and past_key_value is not None
726
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
727
+ ):
728
+ # reuse k,v, cross_attentions
729
+ key_states = past_key_value[0]
730
+ value_states = past_key_value[1]
731
+ elif is_cross_attention:
732
+ # cross_attentions
733
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
734
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
735
+ elif past_key_value is not None:
736
+ # reuse k, v, self_attention
737
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
738
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
739
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
740
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
741
+ else:
742
+ # self_attention
743
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
744
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
745
+
746
+ if self.is_decoder:
747
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
748
+ # Further calls to cross_attention layer can then reuse all cross-attention
749
+ # key/value_states (first "if" case)
750
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
751
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
752
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
753
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
754
+ past_key_value = (key_states, value_states)
755
+
756
+ query_states = self._shape(query_states, tgt_len, bsz)
757
+
758
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
759
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
760
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
761
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
762
+
763
+ # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
764
+ # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
765
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
766
+ query_states,
767
+ key_states,
768
+ value_states,
769
+ attn_mask=attention_mask,
770
+ dropout_p=self.dropout if self.training else 0.0,
771
+ is_causal=is_causal,
772
+ )
773
+
774
+ if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
775
+ raise ValueError(
776
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
777
+ f" {attn_output.size()}"
778
+ )
779
+
780
+ attn_output = attn_output.transpose(1, 2)
781
+
782
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
783
+ # partitioned across GPUs when using tensor-parallelism.
784
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
785
+
786
+ attn_output = self.out_proj(attn_output)
787
+
788
+ return attn_output, None, past_key_value
789
+
790
+
791
+ SEW_ATTENTION_CLASSES = {
792
+ "eager": SEWAttention,
793
+ "sdpa": SEWSdpaAttention,
794
+ "flash_attention_2": SEWFlashAttention2,
795
+ }
796
+
797
+
798
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->SEW
799
+ class SEWFeedForward(nn.Module):
800
+ def __init__(self, config):
801
+ super().__init__()
802
+ self.intermediate_dropout = nn.Dropout(config.activation_dropout)
803
+
804
+ self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
805
+ if isinstance(config.hidden_act, str):
806
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
807
+ else:
808
+ self.intermediate_act_fn = config.hidden_act
809
+
810
+ self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
811
+ self.output_dropout = nn.Dropout(config.hidden_dropout)
812
+
813
+ def forward(self, hidden_states):
814
+ hidden_states = self.intermediate_dense(hidden_states)
815
+ hidden_states = self.intermediate_act_fn(hidden_states)
816
+ hidden_states = self.intermediate_dropout(hidden_states)
817
+
818
+ hidden_states = self.output_dense(hidden_states)
819
+ hidden_states = self.output_dropout(hidden_states)
820
+ return hidden_states
821
+
822
+
823
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Wav2Vec2->SEW, WAV2VEC2->SEW
824
+ class SEWEncoderLayer(nn.Module):
825
+ def __init__(self, config):
826
+ super().__init__()
827
+ self.attention = SEW_ATTENTION_CLASSES[config._attn_implementation](
828
+ embed_dim=config.hidden_size,
829
+ num_heads=config.num_attention_heads,
830
+ dropout=config.attention_dropout,
831
+ is_decoder=False,
832
+ )
833
+
834
+ self.dropout = nn.Dropout(config.hidden_dropout)
835
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
836
+ self.feed_forward = SEWFeedForward(config)
837
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
838
+
839
+ def forward(self, hidden_states, attention_mask=None, output_attentions=False):
840
+ attn_residual = hidden_states
841
+ hidden_states, attn_weights, _ = self.attention(
842
+ hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
843
+ )
844
+ hidden_states = self.dropout(hidden_states)
845
+ hidden_states = attn_residual + hidden_states
846
+
847
+ hidden_states = self.layer_norm(hidden_states)
848
+ hidden_states = hidden_states + self.feed_forward(hidden_states)
849
+ hidden_states = self.final_layer_norm(hidden_states)
850
+
851
+ outputs = (hidden_states,)
852
+
853
+ if output_attentions:
854
+ outputs += (attn_weights,)
855
+
856
+ return outputs
857
+
858
+
859
+ class SEWEncoder(nn.Module):
860
+ def __init__(self, config):
861
+ super().__init__()
862
+ self.config = config
863
+ self.pos_conv_embed = SEWPositionalConvEmbedding(config)
864
+ self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
865
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
866
+ self.dropout = nn.Dropout(config.hidden_dropout)
867
+ self.layers = nn.ModuleList([SEWEncoderLayer(config) for _ in range(config.num_hidden_layers)])
868
+ self.upsample = SEWUpsampling(config)
869
+ self.gradient_checkpointing = False
870
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
871
+
872
+ def forward(
873
+ self,
874
+ hidden_states,
875
+ attention_mask=None,
876
+ output_attentions=False,
877
+ output_hidden_states=False,
878
+ return_dict=True,
879
+ ):
880
+ all_hidden_states = () if output_hidden_states else None
881
+ all_self_attentions = () if output_attentions else None
882
+
883
+ if attention_mask is not None:
884
+ expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
885
+ if self._use_flash_attention_2:
886
+ # make sure padded tokens output 0
887
+ hidden_states[~expand_attention_mask] = 0.0
888
+ # 2d mask is passed through the layers
889
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
890
+ else:
891
+ # make sure padded tokens output 0
892
+ hidden_states[~expand_attention_mask] = 0.0
893
+ input_lengths = (attention_mask.long()).sum(-1)
894
+ # apply pooling formula to get real output_lengths
895
+ output_lengths = input_lengths // self.config.squeeze_factor
896
+ max_encoder_length = hidden_states.shape[1] // self.config.squeeze_factor
897
+ attention_ids = (
898
+ torch.arange(0, max_encoder_length, device=output_lengths.device)
899
+ .view(1, -1)
900
+ .expand(output_lengths.shape[0], -1)
901
+ )
902
+ attention_mask = (attention_ids < output_lengths.view(-1, 1)).long()
903
+
904
+ # extend attention_mask
905
+ attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
906
+ attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
907
+ attention_mask = attention_mask.expand(
908
+ attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
909
+ )
910
+
911
+ n_input_timesteps = hidden_states.shape[1]
912
+
913
+ hidden_states = hidden_states.transpose(1, 2)
914
+ position_embeddings = self.pos_conv_embed(hidden_states)
915
+ pooled_hidden_states = self.pool(hidden_states)
916
+ min_length = min(position_embeddings.size(-1), pooled_hidden_states.size(-1))
917
+ hidden_states = pooled_hidden_states[..., :min_length] + position_embeddings[..., :min_length]
918
+ hidden_states = hidden_states.transpose(1, 2)
919
+
920
+ hidden_states = self.layer_norm(hidden_states)
921
+ hidden_states = self.dropout(hidden_states)
922
+
923
+ synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
924
+
925
+ for layer in self.layers:
926
+ if output_hidden_states:
927
+ all_hidden_states = all_hidden_states + (hidden_states,)
928
+
929
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
930
+ dropout_probability = torch.rand([])
931
+
932
+ skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
933
+ if not skip_the_layer or synced_gpus:
934
+ # under fsdp or deepspeed zero3 all gpus must run in sync
935
+ if self.gradient_checkpointing and self.training:
936
+ layer_outputs = self._gradient_checkpointing_func(
937
+ layer.__call__,
938
+ hidden_states,
939
+ attention_mask,
940
+ output_attentions,
941
+ )
942
+ else:
943
+ layer_outputs = layer(
944
+ hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
945
+ )
946
+ hidden_states = layer_outputs[0]
947
+
948
+ if skip_the_layer:
949
+ layer_outputs = (None, None)
950
+
951
+ if output_attentions:
952
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
953
+
954
+ if output_hidden_states:
955
+ all_hidden_states = all_hidden_states + (hidden_states,)
956
+
957
+ hidden_states = self.upsample(hidden_states)
958
+ if hidden_states.shape[1] < n_input_timesteps:
959
+ hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, n_input_timesteps - hidden_states.shape[1]))
960
+
961
+ if not return_dict:
962
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
963
+ return BaseModelOutput(
964
+ last_hidden_state=hidden_states,
965
+ hidden_states=all_hidden_states,
966
+ attentions=all_self_attentions,
967
+ )
968
+
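A rough sketch of the mask down-sampling that `SEWEncoder.forward` performs above before the transformer layers, using toy lengths and an assumed `squeeze_factor` of 2 (not tied to any real configuration):

# Frame-level mask with 10 valid frames and 6 padding frames, pooled by squeeze_factor = 2.
import torch

squeeze_factor = 2
attention_mask = torch.tensor([[1] * 10 + [0] * 6])
input_lengths = attention_mask.long().sum(-1)                   # tensor([10])
output_lengths = input_lengths // squeeze_factor                # tensor([5])
max_encoder_length = attention_mask.shape[1] // squeeze_factor  # 8 pooled frames
attention_ids = torch.arange(max_encoder_length).view(1, -1).expand(output_lengths.shape[0], -1)
pooled_mask = (attention_ids < output_lengths.view(-1, 1)).long()
print(pooled_mask)  # tensor([[1, 1, 1, 1, 1, 0, 0, 0]])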
969
+
970
+ class SEWPreTrainedModel(PreTrainedModel):
971
+ """
972
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
973
+ models.
974
+ """
975
+
976
+ config_class = SEWConfig
977
+ base_model_prefix = "sew"
978
+ main_input_name = "input_values"
979
+ supports_gradient_checkpointing = True
980
+ _supports_flash_attn_2 = True
981
+ _supports_sdpa = True
982
+
983
+ def _init_weights(self, module):
984
+ """Initialize the weights"""
985
+ if isinstance(module, SEWPositionalConvEmbedding):
986
+ nn.init.normal_(
987
+ module.conv.weight,
988
+ mean=0,
989
+ std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
990
+ )
991
+ nn.init.constant_(module.conv.bias, 0)
992
+ elif isinstance(module, nn.Linear):
993
+ # Slightly different from the TF version which uses truncated_normal for initialization
994
+ # cf https://github.com/pytorch/pytorch/pull/5617
995
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
996
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
997
+ module.bias.data.zero_()
998
+ module.weight.data.fill_(1.0)
999
+ elif isinstance(module, nn.Conv1d):
1000
+ if is_deepspeed_zero3_enabled():
1001
+ import deepspeed
1002
+
1003
+ if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
1004
+ with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
1005
+ nn.init.kaiming_normal_(module.weight.data)
1006
+ else:
1007
+ with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
1008
+ nn.init.kaiming_normal_(module.weight.data)
1009
+ else:
1010
+ nn.init.kaiming_normal_(module.weight.data)
1011
+
1012
+ if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
1013
+ module.bias.data.zero_()
1014
+
1015
+ def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
1016
+ """
1017
+ Computes the output length of the convolutional layers
1018
+ """
1019
+
1020
+ def _conv_out_length(input_length, kernel_size, stride):
1021
+ # 1D convolutional layer output length formula taken
1022
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
1023
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
1024
+
1025
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
1026
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
1027
+
1028
+ return input_lengths
1029
+
1030
+ def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
1031
+ output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
1032
+ batch_size = attention_mask.shape[0]
1033
+
1034
+ attention_mask = torch.zeros(
1035
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
1036
+ )
1037
+ # these two operations make sure that all values before the output length indices are attended to
1038
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
1039
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
1040
+ return attention_mask
1041
+
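To make the convolutional length bookkeeping in `_get_feat_extract_output_lengths` above concrete, a minimal sketch of the same `(length - kernel) // stride + 1` recurrence follows; the kernel and stride tuples are placeholders, not the actual `SEWConfig` values:

# Stand-alone version of the conv output-length recurrence, with placeholder kernels/strides.
def conv_output_lengths(input_length, kernels=(10, 3, 3), strides=(5, 2, 2)):
    for kernel_size, stride in zip(kernels, strides):
        input_length = (input_length - kernel_size) // stride + 1
    return input_length

print(conv_output_lengths(16000))  # feature frames produced from a 1 s, 16 kHz waveform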
1042
+
1043
+ SEW_START_DOCSTRING = r"""
1044
+ SEW was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech
1045
+ Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger,
1046
+ Yoav Artzi.
1047
+
1048
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1049
+ library implements for all its models (such as downloading or saving, etc.).
1050
+
1051
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
1052
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
1053
+ behavior.
1054
+
1055
+ Parameters:
1056
+ config ([`SEWConfig`]): Model configuration class with all the parameters of the model.
1057
+ Initializing with a config file does not load the weights associated with the model, only the
1058
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1059
+ """
1060
+
1061
+
1062
+ SEW_INPUTS_DOCSTRING = r"""
1063
+ Args:
1064
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
1065
+ Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
1066
+ into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
1067
+ soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
1068
+ conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
1069
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1070
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
1071
+ 1]`:
1072
+
1073
+ - 1 for tokens that are **not masked**,
1074
+ - 0 for tokens that are **masked**.
1075
+
1076
+ [What are attention masks?](../glossary#attention-mask)
1077
+
1078
+ output_attentions (`bool`, *optional*):
1079
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1080
+ tensors for more detail.
1081
+ output_hidden_states (`bool`, *optional*):
1082
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1083
+ more detail.
1084
+ return_dict (`bool`, *optional*):
1085
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1086
+ """
1087
+
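As a hedged end-to-end sketch of the input pipeline described in the docstring above, the snippet below runs the checkpoint named in this file's docstring constants on a dummy waveform; it assumes the Hub checkpoint is reachable and that its feature extractor handles padding and normalization as documented:

# Illustrative forward pass through the bare SEW model (requires network access to the Hub).
import torch
from transformers import AutoFeatureExtractor, SEWModel

feature_extractor = AutoFeatureExtractor.from_pretrained("asapp/sew-tiny-100k-ft-ls100h")
model = SEWModel.from_pretrained("asapp/sew-tiny-100k-ft-ls100h")

waveform = torch.zeros(16000).numpy()  # 1 second of silence at 16 kHz, stand-in for real audio
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch, frames, hidden_size)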
1088
+
1089
+ @add_start_docstrings(
1090
+ "The bare SEW Model transformer outputting raw hidden-states without any specific head on top.",
1091
+ SEW_START_DOCSTRING,
1092
+ )
1093
+ class SEWModel(SEWPreTrainedModel):
1094
+ def __init__(self, config: SEWConfig):
1095
+ super().__init__(config)
1096
+ self.config = config
1097
+ self.feature_extractor = SEWFeatureEncoder(config)
1098
+ self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
1099
+
1100
+ self.project_features = config.conv_dim[-1] != config.hidden_size
1101
+ if self.project_features:
1102
+ self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
1103
+ self.feature_dropout = nn.Dropout(config.feat_proj_dropout)
1104
+
1105
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
1106
+ self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
1107
+
1108
+ self.encoder = SEWEncoder(config)
1109
+
1110
+ # Initialize weights and apply final processing
1111
+ self.post_init()
1112
+
1113
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
1114
+ def _mask_hidden_states(
1115
+ self,
1116
+ hidden_states: torch.FloatTensor,
1117
+ mask_time_indices: Optional[torch.FloatTensor] = None,
1118
+ attention_mask: Optional[torch.LongTensor] = None,
1119
+ ):
1120
+ """
1121
+ Masks extracted features along time axis and/or along feature axis according to
1122
+ [SpecAugment](https://arxiv.org/abs/1904.08779).
1123
+ """
1124
+
1125
+ # `config.apply_spec_augment` can set masking to False
1126
+ if not getattr(self.config, "apply_spec_augment", True):
1127
+ return hidden_states
1128
+
1129
+ # generate indices & apply SpecAugment along time axis
1130
+ batch_size, sequence_length, hidden_size = hidden_states.size()
1131
+
1132
+ if mask_time_indices is not None:
1133
+ # apply SpecAugment along time axis with given mask_time_indices
1134
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1135
+ elif self.config.mask_time_prob > 0 and self.training:
1136
+ mask_time_indices = _compute_mask_indices(
1137
+ (batch_size, sequence_length),
1138
+ mask_prob=self.config.mask_time_prob,
1139
+ mask_length=self.config.mask_time_length,
1140
+ attention_mask=attention_mask,
1141
+ min_masks=self.config.mask_time_min_masks,
1142
+ )
1143
+ mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
1144
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1145
+
1146
+ if self.config.mask_feature_prob > 0 and self.training:
1147
+ # generate indices & apply SpecAugment along feature axis
1148
+ mask_feature_indices = _compute_mask_indices(
1149
+ (batch_size, hidden_size),
1150
+ mask_prob=self.config.mask_feature_prob,
1151
+ mask_length=self.config.mask_feature_length,
1152
+ min_masks=self.config.mask_feature_min_masks,
1153
+ )
1154
+ mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
1155
+ mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
1156
+ hidden_states[mask_feature_indices] = 0
1157
+
1158
+ return hidden_states
1159
+
1160
+ @add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
1161
+ @add_code_sample_docstrings(
1162
+ checkpoint=_CHECKPOINT_FOR_DOC,
1163
+ output_type=BaseModelOutput,
1164
+ config_class=_CONFIG_FOR_DOC,
1165
+ modality="audio",
1166
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
1167
+ )
1168
+ def forward(
1169
+ self,
1170
+ input_values: Optional[torch.Tensor],
1171
+ attention_mask: Optional[torch.Tensor] = None,
1172
+ mask_time_indices: Optional[torch.FloatTensor] = None,
1173
+ output_attentions: Optional[bool] = None,
1174
+ output_hidden_states: Optional[bool] = None,
1175
+ return_dict: Optional[bool] = None,
1176
+ ) -> Union[Tuple, BaseModelOutput]:
1177
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1178
+ output_hidden_states = (
1179
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1180
+ )
1181
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1182
+
1183
+ extract_features = self.feature_extractor(input_values)
1184
+ extract_features = extract_features.transpose(1, 2)
1185
+ extract_features = self.layer_norm(extract_features)
1186
+
1187
+ if self.project_features:
1188
+ extract_features = self.feature_projection(extract_features)
1189
+ hidden_states = self.feature_dropout(extract_features)
1190
+
1191
+ if attention_mask is not None:
1192
+ # compute reduced attention_mask corresponding to feature vectors
1193
+ attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1194
+
1195
+ hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
1196
+
1197
+ encoder_outputs = self.encoder(
1198
+ hidden_states,
1199
+ attention_mask=attention_mask,
1200
+ output_attentions=output_attentions,
1201
+ output_hidden_states=output_hidden_states,
1202
+ return_dict=return_dict,
1203
+ )
1204
+
1205
+ hidden_states = encoder_outputs[0]
1206
+
1207
+ if not return_dict:
1208
+ return (hidden_states,) + encoder_outputs[1:]
1209
+
1210
+ return BaseModelOutput(
1211
+ last_hidden_state=hidden_states,
1212
+ hidden_states=encoder_outputs.hidden_states,
1213
+ attentions=encoder_outputs.attentions,
1214
+ )
1215
+
1216
+
1217
+ @add_start_docstrings(
1218
+ """SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
1219
+ SEW_START_DOCSTRING,
1220
+ )
1221
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->SEW, wav2vec2->sew, WAV2VEC2->SEW
1222
+ class SEWForCTC(SEWPreTrainedModel):
1223
+ def __init__(self, config, target_lang: Optional[str] = None):
1224
+ super().__init__(config)
1225
+
1226
+ self.sew = SEWModel(config)
1227
+ self.dropout = nn.Dropout(config.final_dropout)
1228
+
1229
+ self.target_lang = target_lang
1230
+
1231
+ if config.vocab_size is None:
1232
+ raise ValueError(
1233
+ f"You are trying to instantiate {self.__class__} with a configuration that "
1234
+ "does not define the vocabulary size of the language model head. Please "
1235
+ "instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
1236
+ "or define `vocab_size` of your model's configuration."
1237
+ )
1238
+ output_hidden_size = (
1239
+ config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
1240
+ )
1241
+ self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
1242
+
1243
+ # Initialize weights and apply final processing
1244
+ self.post_init()
1245
+
1246
+ def tie_weights(self):
1247
+ """
1248
+ This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
1249
+ passing `target_lang=...` to `from_pretrained(...)`.
1250
+
1251
+ This method is **not** supposed to be called by the user and is prone to be changed in the future.
1252
+ """
1253
+
1254
+ # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
1255
+ # correctly load adapter layers for SEW so that we do not have to introduce a new API to
1256
+ # [`PreTrainedModel`]. While slightly hacky, SEW never has to tie input and output embeddings, so that it is
1257
+ # ok to repurpose this function here.
1258
+ target_lang = self.target_lang
1259
+
1260
+ if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
1261
+ raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
1262
+ elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
1263
+ logger.info("By default `target_lang` is set to 'eng'.")
1264
+ elif target_lang is not None:
1265
+ self.load_adapter(target_lang, force_load=True)
1266
+
1267
+ def freeze_feature_extractor(self):
1268
+ """
1269
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1270
+ not be updated during training.
1271
+ """
1272
+ warnings.warn(
1273
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
1274
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
1275
+ FutureWarning,
1276
+ )
1277
+ self.freeze_feature_encoder()
1278
+
1279
+ def freeze_feature_encoder(self):
1280
+ """
1281
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1282
+ not be updated during training.
1283
+ """
1284
+ self.sew.feature_extractor._freeze_parameters()
1285
+
1286
+ def freeze_base_model(self):
1287
+ """
1288
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1289
+ be updated during training. Only the classification head will be updated.
1290
+ """
1291
+ for param in self.sew.parameters():
1292
+ param.requires_grad = False
1293
+
1294
+ @add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
1295
+ @add_code_sample_docstrings(
1296
+ checkpoint=_CHECKPOINT_FOR_DOC,
1297
+ output_type=CausalLMOutput,
1298
+ config_class=_CONFIG_FOR_DOC,
1299
+ expected_output=_CTC_EXPECTED_OUTPUT,
1300
+ expected_loss=_CTC_EXPECTED_LOSS,
1301
+ )
1302
+ def forward(
1303
+ self,
1304
+ input_values: Optional[torch.Tensor],
1305
+ attention_mask: Optional[torch.Tensor] = None,
1306
+ output_attentions: Optional[bool] = None,
1307
+ output_hidden_states: Optional[bool] = None,
1308
+ return_dict: Optional[bool] = None,
1309
+ labels: Optional[torch.Tensor] = None,
1310
+ ) -> Union[Tuple, CausalLMOutput]:
1311
+ r"""
1312
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
1313
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
1314
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
1315
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
1316
+ config.vocab_size - 1]`.
1317
+ """
1318
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1319
+
1320
+ if labels is not None and labels.max() >= self.config.vocab_size:
1321
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
1322
+
1323
+ outputs = self.sew(
1324
+ input_values,
1325
+ attention_mask=attention_mask,
1326
+ output_attentions=output_attentions,
1327
+ output_hidden_states=output_hidden_states,
1328
+ return_dict=return_dict,
1329
+ )
1330
+
1331
+ hidden_states = outputs[0]
1332
+ hidden_states = self.dropout(hidden_states)
1333
+
1334
+ logits = self.lm_head(hidden_states)
1335
+
1336
+ loss = None
1337
+ if labels is not None:
1338
+ # retrieve loss input_lengths from attention_mask
1339
+ attention_mask = (
1340
+ attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
1341
+ )
1342
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
1343
+
1344
+ # assuming that padded tokens are filled with -100
1345
+ # when not being attended to
1346
+ labels_mask = labels >= 0
1347
+ target_lengths = labels_mask.sum(-1)
1348
+ flattened_targets = labels.masked_select(labels_mask)
1349
+
1350
+ # ctc_loss doesn't support fp16
1351
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
1352
+
1353
+ with torch.backends.cudnn.flags(enabled=False):
1354
+ loss = nn.functional.ctc_loss(
1355
+ log_probs,
1356
+ flattened_targets,
1357
+ input_lengths,
1358
+ target_lengths,
1359
+ blank=self.config.pad_token_id,
1360
+ reduction=self.config.ctc_loss_reduction,
1361
+ zero_infinity=self.config.ctc_zero_infinity,
1362
+ )
1363
+
1364
+ if not return_dict:
1365
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1366
+ return ((loss,) + output) if loss is not None else output
1367
+
1368
+ return CausalLMOutput(
1369
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
1370
+ )
1371
+
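A minimal transcription sketch with `SEWForCTC` and greedy CTC decoding; the checkpoint id is an assumption and the silent input is only a stand-in:

```python
import torch
from transformers import AutoProcessor, SEWForCTC

ckpt = "asapp/sew-tiny-100k-ft-ls100h"  # assumed fine-tuned checkpoint id
processor = AutoProcessor.from_pretrained(ckpt)
model = SEWForCTC.from_pretrained(ckpt)

inputs = processor([0.0] * 16000, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# greedy CTC decoding: argmax per frame, then collapse repeats and blanks
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription)
```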
1372
+
1373
+ @add_start_docstrings(
1374
+ """
1375
+ SEW Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB
1376
+ Keyword Spotting.
1377
+ """,
1378
+ SEW_START_DOCSTRING,
1379
+ )
1380
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->SEW, wav2vec2->sew, WAV2VEC2->SEW
1381
+ class SEWForSequenceClassification(SEWPreTrainedModel):
1382
+ def __init__(self, config):
1383
+ super().__init__(config)
1384
+
1385
+ if hasattr(config, "add_adapter") and config.add_adapter:
1386
+ raise ValueError(
1387
+ "Sequence classification does not support the use of SEW adapters (config.add_adapter=True)"
1388
+ )
1389
+ self.sew = SEWModel(config)
1390
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1391
+ if config.use_weighted_layer_sum:
1392
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1393
+ self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
1394
+ self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
1395
+
1396
+ # Initialize weights and apply final processing
1397
+ self.post_init()
1398
+
1399
+ def freeze_feature_extractor(self):
1400
+ """
1401
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1402
+ not be updated during training.
1403
+ """
1404
+ warnings.warn(
1405
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
1406
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
1407
+ FutureWarning,
1408
+ )
1409
+ self.freeze_feature_encoder()
1410
+
1411
+ def freeze_feature_encoder(self):
1412
+ """
1413
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1414
+ not be updated during training.
1415
+ """
1416
+ self.sew.feature_extractor._freeze_parameters()
1417
+
1418
+ def freeze_base_model(self):
1419
+ """
1420
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1421
+ be updated during training. Only the classification head will be updated.
1422
+ """
1423
+ for param in self.sew.parameters():
1424
+ param.requires_grad = False
1425
+
1426
+ @add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
1427
+ @add_code_sample_docstrings(
1428
+ checkpoint=_SEQ_CLASS_CHECKPOINT,
1429
+ output_type=SequenceClassifierOutput,
1430
+ config_class=_CONFIG_FOR_DOC,
1431
+ modality="audio",
1432
+ expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
1433
+ expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
1434
+ )
1435
+ def forward(
1436
+ self,
1437
+ input_values: Optional[torch.Tensor],
1438
+ attention_mask: Optional[torch.Tensor] = None,
1439
+ output_attentions: Optional[bool] = None,
1440
+ output_hidden_states: Optional[bool] = None,
1441
+ return_dict: Optional[bool] = None,
1442
+ labels: Optional[torch.Tensor] = None,
1443
+ ) -> Union[Tuple, SequenceClassifierOutput]:
1444
+ r"""
1445
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1446
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1447
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1448
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1449
+ """
1450
+
1451
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1452
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
1453
+
1454
+ outputs = self.sew(
1455
+ input_values,
1456
+ attention_mask=attention_mask,
1457
+ output_attentions=output_attentions,
1458
+ output_hidden_states=output_hidden_states,
1459
+ return_dict=return_dict,
1460
+ )
1461
+
1462
+ if self.config.use_weighted_layer_sum:
1463
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1464
+ hidden_states = torch.stack(hidden_states, dim=1)
1465
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1466
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1467
+ else:
1468
+ hidden_states = outputs[0]
1469
+
1470
+ hidden_states = self.projector(hidden_states)
1471
+ if attention_mask is None:
1472
+ pooled_output = hidden_states.mean(dim=1)
1473
+ else:
1474
+ padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1475
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
1476
+ hidden_states[~expand_padding_mask] = 0.0
1477
+ pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
1478
+
1479
+ logits = self.classifier(pooled_output)
1480
+
1481
+ loss = None
1482
+ if labels is not None:
1483
+ loss_fct = CrossEntropyLoss()
1484
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1485
+
1486
+ if not return_dict:
1487
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1488
+ return ((loss,) + output) if loss is not None else output
1489
+
1490
+ return SequenceClassifierOutput(
1491
+ loss=loss,
1492
+ logits=logits,
1493
+ hidden_states=outputs.hidden_states,
1494
+ attentions=outputs.attentions,
1495
+ )
1496
+
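A minimal audio-classification sketch with `SEWForSequenceClassification`; the checkpoint path is a placeholder, not a real Hub id:

```python
import torch
from transformers import AutoFeatureExtractor, SEWForSequenceClassification

# placeholder path; substitute a SEW checkpoint fine-tuned for audio classification
ckpt = "path/to/sew-audio-classification-checkpoint"
feature_extractor = AutoFeatureExtractor.from_pretrained(ckpt)
model = SEWForSequenceClassification.from_pretrained(ckpt)

inputs = feature_extractor([0.0] * 16000, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted_label = model.config.id2label[int(logits.argmax(dim=-1))]
print(predicted_label)
```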
1497
+
1498
+ __all__ = ["SEWForCTC", "SEWForSequenceClassification", "SEWModel", "SEWPreTrainedModel"]
docs/transformers/build/lib/transformers/models/sew_d/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_sew_d import *
22
+ from .modeling_sew_d import *
23
+ else:
24
+ import sys
25
+
26
+ _file = globals()["__file__"]
27
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
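A quick check of what the lazy module above provides: the heavy submodules are only imported when a public symbol is first accessed, but names resolve as usual.

```python
import transformers.models.sew_d as sew_d

# accessing the attribute triggers the import of configuration_sew_d
config = sew_d.SEWDConfig()
print(config.model_type)  # "sew-d"
```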
docs/transformers/build/lib/transformers/models/sew_d/configuration_sew_d.py ADDED
@@ -0,0 +1,291 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 ASAPP Inc. and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """SEW-D model configuration"""
16
+
17
+ import functools
18
+ import operator
19
+
20
+ from ...configuration_utils import PretrainedConfig
21
+ from ...utils import logging
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class SEWDConfig(PretrainedConfig):
28
+ r"""
29
+ This is the configuration class to store the configuration of a [`SEWDModel`]. It is used to instantiate a SEW-D
30
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
31
+ defaults will yield a similar configuration to that of the SEW-D
32
+ [asapp/sew-d-tiny-100k](https://huggingface.co/asapp/sew-d-tiny-100k) architecture.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+
38
+ Args:
39
+ vocab_size (`int`, *optional*, defaults to 32):
40
+ Vocabulary size of the SEW-D model. Defines the number of different tokens that can be represented by the
41
+ `input_ids` passed when calling [`SEWDModel`].
42
+ hidden_size (`int`, *optional*, defaults to 768):
43
+ Dimensionality of the encoder layers and the pooler layer.
44
+ num_hidden_layers (`int`, *optional*, defaults to 12):
45
+ Number of hidden layers in the Transformer encoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 12):
47
+ Number of attention heads for each attention layer in the Transformer encoder.
48
+ intermediate_size (`int`, *optional*, defaults to 3072):
49
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
50
+ squeeze_factor (`int`, *optional*, defaults to 2):
51
+ Sequence length downsampling factor after the encoder and upsampling factor after the transformer.
52
+ max_position_embeddings (`int`, *optional*, defaults to 512):
53
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
54
+ just in case (e.g., 512 or 1024 or 2048).
55
+ position_buckets (`int`, *optional*, defaults to 256):
56
+ The maximum size of relative position embeddings.
57
+ share_att_key (`bool`, *optional*, defaults to `True`):
58
+ Whether to share attention key with c2p and p2c.
59
+ relative_attention (`bool`, *optional*, defaults to `True`):
60
+ Whether to use relative position encoding.
61
+ pos_att_type (`Tuple[str]`, *optional*, defaults to `("p2c", "c2p")`):
62
+ The type of relative position attention; it can be a combination of `"p2c"` and `"c2p"`, e.g. `("p2c",)` or
+ `("p2c", "c2p")`.
64
+ norm_rel_ebd (`str`, *optional*, defaults to `"layer_norm"`):
65
+ Whether to use layer norm in relative embedding (`"layer_norm"` if yes)
66
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_python"`):
67
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
68
+ `"relu"`, `"selu"`, `"gelu_python"` and `"gelu_new"` are supported.
69
+ hidden_dropout (`float`, *optional*, defaults to 0.1):
70
+ Deprecated. Not used by the model and will be removed in a future version.
71
+ activation_dropout (`float`, *optional*, defaults to 0.1):
72
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
73
+ attention_dropout (`float`, *optional*, defaults to 0.1):
74
+ The dropout ratio for the attention probabilities.
75
+ final_dropout (`float`, *optional*, defaults to 0.1):
76
+ The dropout probability for the final projection layer of [`SEWDForCTC`].
77
+ initializer_range (`float`, *optional*, defaults to 0.02):
78
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
79
+ layer_norm_eps (`float`, *optional*, defaults to 1e-7):
80
+ The epsilon used by the layer normalization layers in the transformer encoder.
81
+ feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5):
82
+ The epsilon used by the layer normalization after the feature encoder.
83
+ feat_extract_norm (`str`, *optional*, defaults to `"group"`):
84
+ The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
85
+ normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
86
+ convolutional layers.
87
+ feat_proj_dropout (`float`, *optional*, defaults to 0.0):
88
+ The dropout probability for output of the feature encoder.
89
+ feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
90
+ The non-linear activation function (function or string) in the 1D convolutional layers of the feature
91
+ extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
92
+ conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
93
+ A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
94
+ feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
95
+ conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
96
+ A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
97
+ of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
98
+ conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
99
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
100
+ length of *conv_kernel* defines the number of convolutional layers and has to match the length of
101
+ *conv_dim*.
102
+ conv_bias (`bool`, *optional*, defaults to `False`):
103
+ Whether the 1D convolutional layers have a bias.
104
+ num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
105
+ Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
106
+ embeddings layer.
107
+ num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
108
+ Number of groups of 1D convolutional positional embeddings layer.
109
+ apply_spec_augment (`bool`, *optional*, defaults to `True`):
110
+ Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
111
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
112
+ Recognition](https://arxiv.org/abs/1904.08779).
113
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
114
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
115
+ procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
116
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
117
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
118
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
119
+ mask_time_length (`int`, *optional*, defaults to 10):
120
+ Length of vector span along the time axis.
121
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+ irrespective of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
+ mask_time_min_masks`.
125
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
126
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
127
+ masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
+ the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
129
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
130
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
131
+ True`.
132
+ mask_feature_length (`int`, *optional*, defaults to 10):
133
+ Length of vector span along the feature axis.
134
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+ step, irrespective of `mask_feature_prob`. Only relevant if
+ `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
138
+ diversity_loss_weight (`float`, *optional*, defaults to 0.1):
139
+ The weight of the codebook diversity loss component.
140
+ ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
141
+ Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
142
+ instance of [`SEWDForCTC`].
143
+ ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
144
+ Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
145
+ occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
146
+ of [`SEWDForCTC`].
147
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
148
+ Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
149
+ instance of [`SEWDForSequenceClassification`].
150
+ classifier_proj_size (`int`, *optional*, defaults to 256):
151
+ Dimensionality of the projection before token mean-pooling for classification.
152
+
153
+ Example:
154
+
155
+ ```python
156
+ >>> from transformers import SEWDConfig, SEWDModel
157
+
158
+ >>> # Initializing a SEW-D asapp/sew-d-tiny-100k style configuration
159
+ >>> configuration = SEWDConfig()
160
+
161
+ >>> # Initializing a model (with random weights) from the asapp/sew-d-tiny-100k style configuration
162
+ >>> model = SEWDModel(configuration)
163
+
164
+ >>> # Accessing the model configuration
165
+ >>> configuration = model.config
166
+ ```"""
167
+
168
+ model_type = "sew-d"
169
+
170
+ def __init__(
171
+ self,
172
+ vocab_size=32,
173
+ hidden_size=768,
174
+ num_hidden_layers=12,
175
+ num_attention_heads=12,
176
+ intermediate_size=3072,
177
+ squeeze_factor=2,
178
+ max_position_embeddings=512,
179
+ position_buckets=256,
180
+ share_att_key=True,
181
+ relative_attention=True,
182
+ pos_att_type=("p2c", "c2p"),
183
+ norm_rel_ebd="layer_norm",
184
+ hidden_act="gelu_python",
185
+ hidden_dropout=0.1,
186
+ activation_dropout=0.1,
187
+ attention_dropout=0.1,
188
+ feat_proj_dropout=0.0,
189
+ final_dropout=0.1,
190
+ initializer_range=0.02,
191
+ layer_norm_eps=1e-7,
192
+ feature_layer_norm_eps=1e-5,
193
+ feat_extract_norm="group",
194
+ feat_extract_activation="gelu",
195
+ conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
196
+ conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),
197
+ conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),
198
+ conv_bias=False,
199
+ num_conv_pos_embeddings=128,
200
+ num_conv_pos_embedding_groups=16,
201
+ apply_spec_augment=True,
202
+ mask_time_prob=0.05,
203
+ mask_time_length=10,
204
+ mask_time_min_masks=2,
205
+ mask_feature_prob=0.0,
206
+ mask_feature_length=10,
207
+ mask_feature_min_masks=0,
208
+ ctc_loss_reduction="mean",
209
+ ctc_zero_infinity=False,
210
+ use_weighted_layer_sum=False,
211
+ classifier_proj_size=256,
212
+ pad_token_id=0,
213
+ bos_token_id=1,
214
+ eos_token_id=2,
215
+ **kwargs,
216
+ ):
217
+ super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
218
+ self.hidden_size = hidden_size
219
+ self.feat_extract_norm = feat_extract_norm
220
+ self.feat_extract_activation = feat_extract_activation
221
+ self.conv_dim = list(conv_dim)
222
+ self.conv_stride = list(conv_stride)
223
+ self.conv_kernel = list(conv_kernel)
224
+ self.conv_bias = conv_bias
225
+ self.num_conv_pos_embeddings = num_conv_pos_embeddings
226
+ self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
227
+ self.num_feat_extract_layers = len(self.conv_dim)
228
+ self.num_hidden_layers = num_hidden_layers
229
+ self.intermediate_size = intermediate_size
230
+ self.squeeze_factor = squeeze_factor
231
+ self.max_position_embeddings = max_position_embeddings
232
+ self.position_buckets = position_buckets
233
+ self.share_att_key = share_att_key
234
+ self.relative_attention = relative_attention
235
+ self.norm_rel_ebd = norm_rel_ebd
236
+ self.pos_att_type = list(pos_att_type)
237
+ self.hidden_act = hidden_act
238
+ self.num_attention_heads = num_attention_heads
239
+ self._hidden_dropout = hidden_dropout
240
+ self.attention_dropout = attention_dropout
241
+ self.activation_dropout = activation_dropout
242
+ self.feat_proj_dropout = feat_proj_dropout
243
+ self.final_dropout = final_dropout
244
+ self.layer_norm_eps = layer_norm_eps
245
+ self.feature_layer_norm_eps = feature_layer_norm_eps
246
+ self.initializer_range = initializer_range
247
+ self.vocab_size = vocab_size
248
+
249
+ if (
250
+ (len(self.conv_stride) != self.num_feat_extract_layers)
251
+ or (len(self.conv_kernel) != self.num_feat_extract_layers)
252
+ or (len(self.conv_dim) != self.num_feat_extract_layers)
253
+ ):
254
+ raise ValueError(
255
+ "Configuration for convolutional layers is incorrect. "
256
+ "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
257
+ f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
258
+ f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
259
+ )
260
+
261
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
262
+ self.apply_spec_augment = apply_spec_augment
263
+ self.mask_time_prob = mask_time_prob
264
+ self.mask_time_length = mask_time_length
265
+ self.mask_time_min_masks = mask_time_min_masks
266
+ self.mask_feature_prob = mask_feature_prob
267
+ self.mask_feature_length = mask_feature_length
268
+ self.mask_feature_min_masks = mask_feature_min_masks
269
+
270
+ # ctc loss
271
+ self.ctc_loss_reduction = ctc_loss_reduction
272
+ self.ctc_zero_infinity = ctc_zero_infinity
273
+
274
+ # sequence classification
275
+ self.use_weighted_layer_sum = use_weighted_layer_sum
276
+ self.classifier_proj_size = classifier_proj_size
277
+
278
+ @property
279
+ def inputs_to_logits_ratio(self):
280
+ return functools.reduce(operator.mul, self.conv_stride, 1)
281
+
282
+ def to_dict(self):
283
+ """
284
+ Serializes this instance to a Python dictionary.
285
+ """
286
+ output = super().to_dict()
287
+ output["hidden_dropout"] = output.pop("_hidden_dropout")
288
+ return output
289
+
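A small sketch of the `to_dict` behaviour above: the internally stored `_hidden_dropout` is exposed again under its public name when the config is serialized.

```python
from transformers import SEWDConfig

config = SEWDConfig(hidden_dropout=0.2)
serialized = config.to_dict()
# `to_dict` renames the internal `_hidden_dropout` back to `hidden_dropout`
assert "_hidden_dropout" not in serialized
assert serialized["hidden_dropout"] == 0.2
```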
290
+
291
+ __all__ = ["SEWDConfig"]
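The `inputs_to_logits_ratio` property defined above is just the product of the convolutional strides; with the defaults that works out to 320 input samples per encoder frame (20 ms at 16 kHz):

```python
from transformers import SEWDConfig

config = SEWDConfig()
# (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1) multiplies out to 5 * 2**6 = 320
assert config.inputs_to_logits_ratio == 320
```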
docs/transformers/build/lib/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,317 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert SEW-D checkpoint."""
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+
21
+ import fairseq
22
+ import torch
23
+ from fairseq.data import Dictionary
24
+
25
+ # Register SEW's fairseq modules
26
+ from sew_asapp import tasks # noqa: F401
27
+
28
+ from transformers import (
29
+ SEWDConfig,
30
+ SEWDForCTC,
31
+ SEWDModel,
32
+ Wav2Vec2CTCTokenizer,
33
+ Wav2Vec2FeatureExtractor,
34
+ Wav2Vec2Processor,
35
+ logging,
36
+ )
37
+
38
+
39
+ logging.set_verbosity_info()
40
+ logger = logging.get_logger(__name__)
41
+
42
+ MAPPING = {
43
+ "post_extract_proj": "feature_projection",
44
+ "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
45
+ "attention.self.query_proj": "encoder.encoder.layer.*.attention.self.query_proj",
46
+ "attention.self.key_proj": "encoder.encoder.layer.*.attention.self.key_proj",
47
+ "attention.self.value_proj": "encoder.encoder.layer.*.attention.self.value_proj",
48
+ "attention.output.dense": "encoder.encoder.layer.*.attention.output.dense",
49
+ "attention.output.LayerNorm": "encoder.encoder.layer.*.attention.output.LayerNorm",
50
+ "intermediate.dense": "encoder.encoder.layer.*.intermediate.dense",
51
+ "output.dense": "encoder.encoder.layer.*.output.dense",
52
+ "output.LayerNorm": "encoder.encoder.layer.*.output.LayerNorm",
53
+ "encoder.encoder.rel_embeddings": "encoder.encoder.rel_embeddings",
54
+ "encoder.encoder.LayerNorm": "encoder.encoder.LayerNorm",
55
+ "encoder.upsample.0": "encoder.upsample.projection",
56
+ "encoder.layer_norm": "encoder.layer_norm",
57
+ "w2v_model.layer_norm": "layer_norm",
58
+ "w2v_encoder.proj": "lm_head",
59
+ "mask_emb": "masked_spec_embed",
60
+ }
61
+
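For illustration, this is how `recursively_load_weights` below turns a fairseq parameter name into a Transformers one using this mapping; the fairseq key is a made-up example:

```python
# excerpt of the mapping above, plus a hypothetical fairseq parameter name
mapping_entry = {"attention.self.query_proj": "encoder.encoder.layer.*.attention.self.query_proj"}
name = "encoder.layers.3.attention.self.query_proj.weight"
key = "attention.self.query_proj"

mapped_key = "sew_d." + mapping_entry[key]       # fine-tuned models get the "sew_d." prefix
layer_index = name.split(key)[0].split(".")[-2]  # -> "3"
mapped_key = mapped_key.replace("*", layer_index)
print(mapped_key)  # sew_d.encoder.encoder.layer.3.attention.self.query_proj
```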
62
+
63
+ def set_recursively(hf_pointer, key, value, full_name, weight_type):
64
+ for attribute in key.split("."):
65
+ hf_pointer = getattr(hf_pointer, attribute)
66
+
67
+ if weight_type is not None:
68
+ hf_shape = getattr(hf_pointer, weight_type).shape
69
+ else:
70
+ hf_shape = hf_pointer.shape
71
+
72
+ assert hf_shape == value.shape, (
73
+ f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
74
+ f" {value.shape} for {full_name}"
75
+ )
76
+
77
+ if weight_type == "weight":
78
+ hf_pointer.weight.data = value
79
+ elif weight_type == "weight_g":
80
+ hf_pointer.weight_g.data = value
81
+ elif weight_type == "weight_v":
82
+ hf_pointer.weight_v.data = value
83
+ elif weight_type == "bias":
84
+ hf_pointer.bias.data = value
85
+ else:
86
+ hf_pointer.data = value
87
+
88
+ logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
89
+
90
+
91
+ def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
92
+ unused_weights = []
93
+ fairseq_dict = fairseq_model.state_dict()
94
+
95
+ feature_extractor = hf_model.sew_d.feature_extractor if is_finetuned else hf_model.feature_extractor
96
+
97
+ for name, value in fairseq_dict.items():
98
+ is_used = False
99
+ if "conv_layers" in name:
100
+ load_conv_layer(
101
+ name,
102
+ value,
103
+ feature_extractor,
104
+ unused_weights,
105
+ hf_model.config.feat_extract_norm == "group",
106
+ )
107
+ is_used = True
108
+ else:
109
+ for key, mapped_key in MAPPING.items():
110
+ mapped_key = "sew_d." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
111
+
112
+ if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
113
+ is_used = True
114
+ if "*" in mapped_key:
115
+ layer_index = name.split(key)[0].split(".")[-2]
116
+ if not layer_index.isnumeric():
117
+ continue
118
+ mapped_key = mapped_key.replace("*", layer_index)
119
+ if "weight_g" in name:
120
+ weight_type = "weight_g"
121
+ elif "weight_v" in name:
122
+ weight_type = "weight_v"
123
+ elif "weight" in name:
124
+ weight_type = "weight"
125
+ elif "bias" in name:
126
+ weight_type = "bias"
127
+ else:
128
+ weight_type = None
129
+ set_recursively(hf_model, mapped_key, value, name, weight_type)
130
+ continue
131
+ if not is_used:
132
+ unused_weights.append(name)
133
+
134
+ logger.warning(f"Unused weights: {unused_weights}")
135
+
136
+
137
+ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
138
+ name = full_name.split("conv_layers.")[-1]
139
+ items = name.split(".")
140
+ layer_id = int(items[0])
141
+ type_id = int(items[1])
142
+
143
+ if type_id == 0:
144
+ if "bias" in name:
145
+ assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
146
+ f"{full_name} has size {value.shape}, but"
147
+ f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
148
+ )
149
+ feature_extractor.conv_layers[layer_id].conv.bias.data = value
150
+ logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
151
+ elif "weight" in name:
152
+ assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
153
+ f"{full_name} has size {value.shape}, but"
154
+ f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
155
+ )
156
+ feature_extractor.conv_layers[layer_id].conv.weight.data = value
157
+ logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
158
+ elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
159
+ if "bias" in name:
160
+ assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
161
+ f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was"
162
+ " found."
163
+ )
164
+ feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
165
+ logger.info(f"Feat extract layer norm bias of layer {layer_id} was initialized from {full_name}.")
166
+ elif "weight" in name:
167
+ assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
168
+ f"{full_name} has size {value.shape}, but"
169
+ f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
170
+ )
171
+ feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
172
+ logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
173
+ else:
174
+ unused_weights.append(full_name)
175
+
176
+
177
+ def convert_config(model, is_finetuned):
178
+ config = SEWDConfig()
179
+ if is_finetuned:
180
+ fs_config = model.w2v_encoder.w2v_model.cfg
181
+ else:
182
+ fs_config = model.cfg
183
+
184
+ config.conv_bias = fs_config.conv_bias
185
+ conv_layers = eval(fs_config.conv_feature_layers)
186
+ config.conv_dim = [x[0] for x in conv_layers]
187
+ config.conv_kernel = [x[1] for x in conv_layers]
188
+ config.conv_stride = [x[2] for x in conv_layers]
189
+ config.feat_extract_activation = "gelu"
190
+ config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
191
+ config.final_dropout = 0.0
192
+ config.hidden_act = fs_config.activation_fn.name
193
+ config.hidden_size = fs_config.encoder_embed_dim
194
+ config.initializer_range = 0.02
195
+ config.intermediate_size = fs_config.encoder_ffn_embed_dim
196
+ config.layer_norm_eps = 1e-5
197
+ config.layerdrop = fs_config.encoder_layerdrop
198
+ config.num_attention_heads = fs_config.encoder_attention_heads
199
+ config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
200
+ config.num_conv_pos_embeddings = fs_config.conv_pos
201
+ config.num_feat_extract_layers = len(conv_layers)
202
+ config.num_hidden_layers = fs_config.encoder_layers
203
+ config.squeeze_factor = fs_config.squeeze_factor
204
+ # DeBERTa-specific parameters:
205
+ config.max_position_embeddings = fs_config.max_position_embeddings
206
+ config.position_buckets = fs_config.position_buckets
207
+ config.share_att_key = fs_config.share_att_key
208
+ config.relative_attention = fs_config.relative_attention
209
+ config.position_biased_input = fs_config.position_biased_input
210
+ config.pos_att_type = tuple(fs_config.pos_att_type.split("|"))
211
+ config.norm_rel_ebd = fs_config.norm_rel_ebd
212
+
213
+ # take care of any params that are overridden by the Wav2VecCtc model
214
+ if is_finetuned:
215
+ fs_config = model.cfg
216
+ config.final_dropout = fs_config.final_dropout
217
+ config.layerdrop = fs_config.layerdrop
218
+ config.activation_dropout = fs_config.activation_dropout
219
+ config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
220
+ config.attention_dropout = fs_config.attention_dropout
221
+ config.feat_proj_dropout = fs_config.dropout_input
222
+ config.hidden_dropout = fs_config.dropout
223
+ config.mask_feature_length = fs_config.mask_channel_length
224
+ config.mask_feature_prob = fs_config.mask_channel_prob
225
+ config.mask_time_length = fs_config.mask_length
226
+ config.mask_time_prob = fs_config.mask_prob
227
+
228
+ config.feature_extractor_type = "Wav2Vec2FeatureExtractor"
229
+ config.tokenizer_class = "Wav2Vec2CTCTokenizer"
230
+
231
+ return config
232
+
233
+
234
+ @torch.no_grad()
235
+ def convert_sew_checkpoint(
236
+ checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
237
+ ):
238
+ """
239
+ Copy/paste/tweak model's weights to transformers design.
240
+ """
241
+
242
+ if is_finetuned:
243
+ model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
244
+ [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
245
+ )
246
+ else:
247
+ model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
248
+
249
+ if config_path is not None:
250
+ config = SEWDConfig.from_pretrained(config_path)
251
+ else:
252
+ config = convert_config(model[0], is_finetuned)
253
+ model = model[0].eval()
254
+
255
+ return_attention_mask = True if config.feat_extract_norm == "layer" else False
256
+ feature_extractor = Wav2Vec2FeatureExtractor(
257
+ feature_size=1,
258
+ sampling_rate=16000,
259
+ padding_value=0,
260
+ do_normalize=True,
261
+ return_attention_mask=return_attention_mask,
262
+ )
263
+
264
+ if is_finetuned:
265
+ if dict_path:
266
+ target_dict = Dictionary.load(dict_path)
267
+
268
+ # important change bos & pad token id since CTC symbol is <pad> and
269
+ # not <s> as in fairseq
270
+ target_dict.indices[target_dict.bos_word] = target_dict.pad_index
271
+ target_dict.indices[target_dict.pad_word] = target_dict.bos_index
272
+ config.bos_token_id = target_dict.pad_index
273
+ config.pad_token_id = target_dict.bos_index
274
+ config.eos_token_id = target_dict.eos_index
275
+ config.vocab_size = len(target_dict.symbols)
276
+ vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
277
+ if not os.path.isdir(pytorch_dump_folder_path):
278
+ logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
279
+ return
280
+ os.makedirs(pytorch_dump_folder_path, exist_ok=True)
281
+ with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
282
+ json.dump(target_dict.indices, vocab_handle)
283
+ tokenizer = Wav2Vec2CTCTokenizer(
284
+ vocab_path,
285
+ unk_token=target_dict.unk_word,
286
+ pad_token=target_dict.pad_word,
287
+ bos_token=target_dict.bos_word,
288
+ eos_token=target_dict.eos_word,
289
+ word_delimiter_token="|",
290
+ do_lower_case=False,
291
+ )
292
+ processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
293
+ processor.save_pretrained(pytorch_dump_folder_path)
294
+
295
+ hf_model = SEWDForCTC(config)
296
+ else:
297
+ hf_model = SEWDModel(config)
298
+ feature_extractor.save_pretrained(pytorch_dump_folder_path)
299
+
300
+ recursively_load_weights(model, hf_model, is_finetuned)
301
+
302
+ hf_model.save_pretrained(pytorch_dump_folder_path)
303
+
304
+
305
+ if __name__ == "__main__":
306
+ parser = argparse.ArgumentParser()
307
+ parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
308
+ parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
309
+ parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
310
+ parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
311
+ parser.add_argument(
312
+ "--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
313
+ )
314
+ args = parser.parse_args()
315
+ convert_sew_checkpoint(
316
+ args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
317
+ )
docs/transformers/build/lib/transformers/models/sew_d/modeling_sew_d.py ADDED
@@ -0,0 +1,1748 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 ASAPP Inc. and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch SEW-D model."""
16
+
17
+ import math
18
+ import warnings
19
+ from collections.abc import Sequence
20
+ from typing import Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import CrossEntropyLoss, LayerNorm
27
+
28
+ from ...activations import ACT2FN
29
+ from ...integrations.deepspeed import is_deepspeed_zero3_enabled
30
+ from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
31
+ from ...modeling_utils import PreTrainedModel
32
+ from ...pytorch_utils import softmax_backward_data
33
+ from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
34
+ from .configuration_sew_d import SEWDConfig
35
+
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+ _HIDDEN_STATES_START_POSITION = 1
40
+
41
+
42
+ # General docstring
43
+ _CONFIG_FOR_DOC = "SEWDConfig"
44
+
45
+ # Base docstring
46
+ _CHECKPOINT_FOR_DOC = "asapp/sew-d-tiny-100k-ft-ls100h"
47
+ _EXPECTED_OUTPUT_SHAPE = [1, 292, 384]
48
+
49
+ # CTC docstring
50
+ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTIL OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
51
+ _CTC_EXPECTED_LOSS = 0.21
52
+
53
+ # Audio class docstring
54
+ _SEQ_CLASS_CHECKPOINT = "anton-l/sew-d-mid-400k-ft-keyword-spotting"
55
+ _SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
56
+ _SEQ_CLASS_EXPECTED_LOSS = 3.16
57
+
58
+
59
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
60
+ def _compute_mask_indices(
61
+ shape: Tuple[int, int],
62
+ mask_prob: float,
63
+ mask_length: int,
64
+ attention_mask: Optional[torch.LongTensor] = None,
65
+ min_masks: int = 0,
66
+ ) -> np.ndarray:
67
+ """
68
+ Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
69
+ ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
70
+ CPU as part of the preprocessing during training.
71
+
72
+ Args:
73
+ shape: The shape for which to compute masks. This should be of a tuple of size 2 where
74
+ the first element is the batch size and the second element is the length of the axis to span.
75
+ mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
76
+ independently generated mask spans of length `mask_length` is computed by
77
+ `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
78
+ actual percentage will be smaller.
79
+ mask_length: size of the mask
80
+ min_masks: minimum number of masked spans
81
+ attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
82
+ each batch dimension.
83
+ """
84
+ batch_size, sequence_length = shape
85
+
86
+ if mask_length < 1:
87
+ raise ValueError("`mask_length` has to be bigger than 0.")
88
+
89
+ if mask_length > sequence_length:
90
+ raise ValueError(
91
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
92
+ f" and `sequence_length`: {sequence_length}`"
93
+ )
94
+
95
+ # epsilon is used for probabilistic rounding
96
+ epsilon = np.random.rand(1).item()
97
+
98
+ def compute_num_masked_span(input_length):
99
+ """Given input length, compute how many spans should be masked"""
100
+ num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
101
+ num_masked_span = max(num_masked_span, min_masks)
102
+
103
+ # make sure num masked span <= sequence_length
104
+ if num_masked_span * mask_length > sequence_length:
105
+ num_masked_span = sequence_length // mask_length
106
+
107
+ # make sure num_masked span is also <= input_length - (mask_length - 1)
108
+ if input_length - (mask_length - 1) < num_masked_span:
109
+ num_masked_span = max(input_length - (mask_length - 1), 0)
110
+
111
+ return num_masked_span
112
+
113
+ # compute number of masked spans in batch
114
+ input_lengths = (
115
+ attention_mask.detach().sum(-1).tolist()
116
+ if attention_mask is not None
117
+ else [sequence_length for _ in range(batch_size)]
118
+ )
119
+
120
+ # SpecAugment mask to fill
121
+ spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
122
+ spec_aug_mask_idxs = []
123
+
124
+ max_num_masked_span = compute_num_masked_span(sequence_length)
125
+
126
+ if max_num_masked_span == 0:
127
+ return spec_aug_mask
128
+
129
+ for input_length in input_lengths:
130
+ # compute num of masked spans for this input
131
+ num_masked_span = compute_num_masked_span(input_length)
132
+
133
+ # get random indices to mask
134
+ spec_aug_mask_idx = np.random.choice(
135
+ np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
136
+ )
137
+
138
+ # pick first sampled index that will serve as a dummy index to pad vector
139
+ # to ensure same dimension for all batches due to probabilistic rounding
140
+ # Picking first sample just pads those vectors twice.
141
+ if len(spec_aug_mask_idx) == 0:
142
+ # this case can only happen if `input_length` is strictly smaller than
143
+ # `sequence_length` in which case the last token has to be a padding
144
+ # token which we can use as a dummy mask id
145
+ dummy_mask_idx = sequence_length - 1
146
+ else:
147
+ dummy_mask_idx = spec_aug_mask_idx[0]
148
+
149
+ spec_aug_mask_idx = np.concatenate(
150
+ [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
151
+ )
152
+ spec_aug_mask_idxs.append(spec_aug_mask_idx)
153
+
154
+ spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
155
+
156
+ # expand masked indices to masked spans
157
+ spec_aug_mask_idxs = np.broadcast_to(
158
+ spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
159
+ )
160
+ spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
161
+
162
+ # add offset to the starting indexes so that indexes now create a span
163
+ offsets = np.arange(mask_length)[None, None, :]
164
+ offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
165
+ batch_size, max_num_masked_span * mask_length
166
+ )
167
+ spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
168
+
169
+ # ensure that we cannot have indices larger than sequence_length
170
+ if spec_aug_mask_idxs.max() > sequence_length - 1:
171
+ spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
172
+
173
+ # scatter indices to mask
174
+ np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
175
+
176
+ return spec_aug_mask
177
+
178
+
179
+ def make_log_bucket_position(relative_pos, bucket_size, max_position):
180
+ sign = torch.sign(relative_pos)
181
+ mid = bucket_size // 2
182
+ abs_pos = torch.where(
183
+ (relative_pos < mid) & (relative_pos > -mid),
184
+ torch.tensor(mid - 1).type_as(relative_pos),
185
+ torch.abs(relative_pos),
186
+ )
187
+ log_pos = (
188
+ torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid
189
+ )
190
+ bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), log_pos * sign)
191
+ return bucket_pos
192
+
193
+
194
+ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None):
195
+ """
196
+ Build relative position according to the query and key
197
+
198
+ We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
199
+ \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
200
+ P_k\\)
201
+
202
+ Args:
203
+ query_size (int): the length of query
204
+ key_size (int): the length of key
205
+ bucket_size (int): the size of position bucket
206
+ max_position (int): the maximum allowed absolute position
207
+ device (`torch.device`): the device on which tensors will be created.
208
+
209
+ Return:
210
+ `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
211
+ """
212
+
213
+ q_ids = torch.arange(0, query_size, device=device)
214
+ k_ids = torch.arange(0, key_size, device=device)
215
+ rel_pos_ids = q_ids[:, None] - k_ids[None, :]
216
+ if bucket_size > 0 and max_position > 0:
217
+ rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
218
+ rel_pos_ids = rel_pos_ids.to(torch.long)
219
+ rel_pos_ids = rel_pos_ids[:query_size, :]
220
+ rel_pos_ids = rel_pos_ids.unsqueeze(0)
221
+ return rel_pos_ids
222
+
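A small check of the relative-position helper above with bucketing disabled (`bucket_size` and `max_position` left at -1), so each entry is simply query index minus key index:

```python
from transformers.models.sew_d.modeling_sew_d import build_relative_position

rel_pos = build_relative_position(query_size=4, key_size=4, bucket_size=-1, max_position=-1)
print(rel_pos.shape)  # torch.Size([1, 4, 4])
print(rel_pos[0])
# tensor([[ 0, -1, -2, -3],
#         [ 1,  0, -1, -2],
#         [ 2,  1,  0, -1],
#         [ 3,  2,  1,  0]])
```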
223
+
224
+ @torch.jit.script
225
+ # Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
226
+ def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
227
+ return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
228
+
229
+
230
+ @torch.jit.script
231
+ # Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
232
+ def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
233
+ return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
234
+
235
+
236
+ @torch.jit.script
237
+ # Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
238
+ def pos_dynamic_expand(pos_index, p2c_att, key_layer):
239
+ return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
240
+
241
+
242
+ def get_mask(input, local_context):
243
+ if not isinstance(local_context, DropoutContext):
244
+ dropout = local_context
245
+ mask = None
246
+ else:
247
+ dropout = local_context.dropout
248
+ dropout *= local_context.scale
249
+ mask = local_context.mask if local_context.reuse_mask else None
250
+
251
+ if dropout > 0 and mask is None:
252
+ mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool)
253
+
254
+ if isinstance(local_context, DropoutContext):
255
+ if local_context.mask is None:
256
+ local_context.mask = mask
257
+
258
+ return mask, dropout
259
+
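+ # Illustrative sketch (hypothetical helper, not from the original module): when
+ # get_mask is given a bare dropout probability instead of a DropoutContext, it
+ # samples a fresh Bernoulli drop mask and returns it with the probability.
+ def _example_get_mask():
+     x = torch.randn(2, 3)
+     mask, dropout = get_mask(x, 0.5)
+     # dropout == 0.5; mask is a bool tensor, True where an element is dropped
+     return mask, dropout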
260
+
261
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SEWD
262
+ class SEWDNoLayerNormConvLayer(nn.Module):
263
+ def __init__(self, config, layer_id=0):
264
+ super().__init__()
265
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
266
+ self.out_conv_dim = config.conv_dim[layer_id]
267
+
268
+ self.conv = nn.Conv1d(
269
+ self.in_conv_dim,
270
+ self.out_conv_dim,
271
+ kernel_size=config.conv_kernel[layer_id],
272
+ stride=config.conv_stride[layer_id],
273
+ bias=config.conv_bias,
274
+ )
275
+ self.activation = ACT2FN[config.feat_extract_activation]
276
+
277
+ def forward(self, hidden_states):
278
+ hidden_states = self.conv(hidden_states)
279
+ hidden_states = self.activation(hidden_states)
280
+ return hidden_states
281
+
282
+
283
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SEWD
284
+ class SEWDLayerNormConvLayer(nn.Module):
285
+ def __init__(self, config, layer_id=0):
286
+ super().__init__()
287
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
288
+ self.out_conv_dim = config.conv_dim[layer_id]
289
+
290
+ self.conv = nn.Conv1d(
291
+ self.in_conv_dim,
292
+ self.out_conv_dim,
293
+ kernel_size=config.conv_kernel[layer_id],
294
+ stride=config.conv_stride[layer_id],
295
+ bias=config.conv_bias,
296
+ )
297
+ self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
298
+ self.activation = ACT2FN[config.feat_extract_activation]
299
+
300
+ def forward(self, hidden_states):
301
+ hidden_states = self.conv(hidden_states)
302
+
303
+ hidden_states = hidden_states.transpose(-2, -1)
304
+ hidden_states = self.layer_norm(hidden_states)
305
+ hidden_states = hidden_states.transpose(-2, -1)
306
+
307
+ hidden_states = self.activation(hidden_states)
308
+ return hidden_states
309
+
310
+
311
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SEWD
312
+ class SEWDGroupNormConvLayer(nn.Module):
313
+ def __init__(self, config, layer_id=0):
314
+ super().__init__()
315
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
316
+ self.out_conv_dim = config.conv_dim[layer_id]
317
+
318
+ self.conv = nn.Conv1d(
319
+ self.in_conv_dim,
320
+ self.out_conv_dim,
321
+ kernel_size=config.conv_kernel[layer_id],
322
+ stride=config.conv_stride[layer_id],
323
+ bias=config.conv_bias,
324
+ )
325
+ self.activation = ACT2FN[config.feat_extract_activation]
326
+
327
+ self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
328
+
329
+ def forward(self, hidden_states):
330
+ hidden_states = self.conv(hidden_states)
331
+ hidden_states = self.layer_norm(hidden_states)
332
+ hidden_states = self.activation(hidden_states)
333
+ return hidden_states
334
+
335
+
336
+ # Copied from transformers.models.sew.modeling_sew.SEWPositionalConvEmbedding with SEW->SEWD
337
+ class SEWDPositionalConvEmbedding(nn.Module):
338
+ def __init__(self, config):
339
+ super().__init__()
340
+ self.conv = nn.Conv1d(
341
+ config.hidden_size,
342
+ config.hidden_size,
343
+ kernel_size=config.num_conv_pos_embeddings,
344
+ padding=config.num_conv_pos_embeddings // 2,
345
+ groups=config.num_conv_pos_embedding_groups,
346
+ stride=config.squeeze_factor,
347
+ )
348
+
349
+ weight_norm = nn.utils.weight_norm
350
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
351
+ weight_norm = nn.utils.parametrizations.weight_norm
352
+
353
+ if is_deepspeed_zero3_enabled():
354
+ import deepspeed
355
+
356
+ with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
357
+ self.conv = weight_norm(self.conv, name="weight", dim=2)
358
+ if hasattr(self.conv, "parametrizations"):
359
+ weight_g = self.conv.parametrizations.weight.original0
360
+ weight_v = self.conv.parametrizations.weight.original1
361
+ else:
362
+ weight_g = self.conv.weight_g
363
+ weight_v = self.conv.weight_v
364
+ deepspeed.zero.register_external_parameter(self, weight_v)
365
+ deepspeed.zero.register_external_parameter(self, weight_g)
366
+ else:
367
+ self.conv = weight_norm(self.conv, name="weight", dim=2)
368
+
369
+ self.padding = SEWDSamePadLayer(config.num_conv_pos_embeddings)
370
+ self.activation = ACT2FN[config.feat_extract_activation]
371
+
372
+ def forward(self, hidden_states):
373
+ hidden_states = self.conv(hidden_states)
374
+ hidden_states = self.padding(hidden_states)
375
+ hidden_states = self.activation(hidden_states)
376
+
377
+ return hidden_states
378
+
379
+
380
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->SEW
381
+ class SEWDSamePadLayer(nn.Module):
382
+ def __init__(self, num_conv_pos_embeddings):
383
+ super().__init__()
384
+ self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
385
+
386
+ def forward(self, hidden_states):
387
+ if self.num_pad_remove > 0:
388
+ hidden_states = hidden_states[:, :, : -self.num_pad_remove]
389
+ return hidden_states
390
+
391
+
392
+ # Copied from transformers.models.sew.modeling_sew.SEWUpsampling with SEW->SEWD
393
+ class SEWDUpsampling(nn.Module):
394
+ def __init__(self, config):
395
+ super().__init__()
396
+ self.projection = nn.Linear(config.hidden_size, config.hidden_size * config.squeeze_factor)
397
+ self.activation = ACT2FN[config.feat_extract_activation]
398
+ self.squeeze_factor = config.squeeze_factor
399
+
400
+ def forward(self, hidden_states):
401
+ hidden_states = self.projection(hidden_states)
402
+ hidden_states = self.activation(hidden_states)
403
+
404
+ if self.squeeze_factor > 1:
405
+ # transform embedding channels to sequence length
406
+ bsz, src_len, src_embed_dim = hidden_states.size()
407
+ tgt_len = src_len * self.squeeze_factor
408
+ tgt_embed_dim = src_embed_dim // self.squeeze_factor
409
+ hidden_states = hidden_states.reshape(bsz, src_len, self.squeeze_factor, tgt_embed_dim)
410
+ hidden_states = hidden_states.reshape(bsz, tgt_len, tgt_embed_dim)
411
+
412
+ return hidden_states
413
+
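+ # Illustrative note (not from the original module): SEWDUpsampling trades embedding
+ # channels for sequence length. With hypothetical hidden_size=768 and squeeze_factor=2,
+ # an input of shape (batch, seq, 768) is projected to (batch, seq, 1536) and then
+ # reshaped to (batch, 2 * seq, 768), undoing the pooling applied in SEWDEncoder.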
414
+
415
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD
416
+ class SEWDFeatureEncoder(nn.Module):
417
+ """Construct the features from raw audio waveform"""
418
+
419
+ def __init__(self, config):
420
+ super().__init__()
421
+
422
+ if config.feat_extract_norm == "group":
423
+ conv_layers = [SEWDGroupNormConvLayer(config, layer_id=0)] + [
424
+ SEWDNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
425
+ ]
426
+ elif config.feat_extract_norm == "layer":
427
+ conv_layers = [SEWDLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
428
+ else:
429
+ raise ValueError(
430
+ f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
431
+ )
432
+ self.conv_layers = nn.ModuleList(conv_layers)
433
+ self.gradient_checkpointing = False
434
+ self._requires_grad = True
435
+
436
+ def _freeze_parameters(self):
437
+ for param in self.parameters():
438
+ param.requires_grad = False
439
+ self._requires_grad = False
440
+
441
+ def forward(self, input_values):
442
+ hidden_states = input_values[:, None]
443
+
444
+ # make sure hidden_states require grad for gradient_checkpointing
445
+ if self._requires_grad and self.training:
446
+ hidden_states.requires_grad = True
447
+
448
+ for conv_layer in self.conv_layers:
449
+ if self._requires_grad and self.gradient_checkpointing and self.training:
450
+ hidden_states = self._gradient_checkpointing_func(
451
+ conv_layer.__call__,
452
+ hidden_states,
453
+ )
454
+ else:
455
+ hidden_states = conv_layer(hidden_states)
456
+
457
+ return hidden_states
458
+
459
+
460
+ class SEWDFeatureExtractor(SEWDFeatureEncoder):
461
+ def __init__(self, config):
462
+ super().__init__(config)
463
+ warnings.warn(
464
+ f"The class `{self.__class__.__name__}` has been depreciated "
465
+ "and will be removed in Transformers v5. "
466
+ f"Use `{self.__class__.__bases__[0].__name__}` instead.",
467
+ FutureWarning,
468
+ )
469
+
470
+
471
+ class ContextPooler(nn.Module):
472
+ def __init__(self, config):
473
+ super().__init__()
474
+ self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
475
+ self.dropout = StableDropout(config.pooler_dropout)
476
+ self.config = config
477
+
478
+ def forward(self, hidden_states):
479
+ # We "pool" the model by simply taking the hidden state corresponding
480
+ # to the first token.
481
+
482
+ context_token = hidden_states[:, 0]
483
+ context_token = self.dropout(context_token)
484
+ pooled_output = self.dense(context_token)
485
+ pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
486
+ return pooled_output
487
+
488
+ @property
489
+ def output_dim(self):
490
+ return self.config.hidden_size
491
+
492
+
493
+ class XSoftmax(torch.autograd.Function):
494
+ """
495
+ Masked Softmax which is optimized for saving memory
496
+
497
+ Args:
498
+ input (`torch.tensor`): The input tensor that will apply softmax.
499
+ mask (`torch.IntTensor`):
500
+ The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
501
+ dim (int): The dimension along which the softmax is applied
502
+
503
+ Example:
504
+
505
+ ```python
506
+ >>> import torch
507
+ >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
508
+
509
+ >>> # Make a tensor
510
+ >>> x = torch.randn([4, 20, 100])
511
+
512
+ >>> # Create a mask
513
+ >>> mask = (x > 0).int()
514
+
515
+ >>> # Specify the dimension to apply softmax
516
+ >>> dim = -1
517
+
518
+ >>> y = XSoftmax.apply(x, mask, dim)
519
+ ```"""
520
+
521
+ @staticmethod
522
+ def forward(ctx, input, mask, dim):
523
+ ctx.dim = dim
524
+ rmask = ~(mask.to(torch.bool))
525
+
526
+ output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
527
+ output = torch.softmax(output, ctx.dim)
528
+ output.masked_fill_(rmask, 0)
529
+ ctx.save_for_backward(output)
530
+ return output
531
+
532
+ @staticmethod
533
+ def backward(ctx, grad_output):
534
+ (output,) = ctx.saved_tensors
535
+ inputGrad = softmax_backward_data(ctx, grad_output, output, ctx.dim, output)
536
+ return inputGrad, None, None
537
+
538
+ @staticmethod
539
+ def symbolic(g, self, mask, dim):
540
+ import torch.onnx.symbolic_helper as sym_help
541
+ from torch.onnx.symbolic_opset9 import masked_fill, softmax
542
+
543
+ mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
544
+ r_mask = g.op(
545
+ "Cast",
546
+ g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
547
+ to_i=sym_help.cast_pytorch_to_onnx["Bool"],
548
+ )
549
+ output = masked_fill(
550
+ g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
551
+ )
552
+ output = softmax(g, output, dim)
553
+ return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
554
+
555
+
556
+ class DropoutContext:
557
+ def __init__(self):
558
+ self.dropout = 0
559
+ self.mask = None
560
+ self.scale = 1
561
+ self.reuse_mask = True
562
+
563
+
564
+ class XDropout(torch.autograd.Function):
565
+ """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
566
+
567
+ @staticmethod
568
+ def forward(ctx, input, local_ctx):
569
+ mask, dropout = get_mask(input, local_ctx)
570
+ ctx.scale = 1.0 / (1 - dropout)
571
+ if dropout > 0:
572
+ ctx.save_for_backward(mask)
573
+ return input.masked_fill(mask, 0) * ctx.scale
574
+ else:
575
+ return input
576
+
577
+ @staticmethod
578
+ def backward(ctx, grad_output):
579
+ if ctx.scale > 1:
580
+ (mask,) = ctx.saved_tensors
581
+ return grad_output.masked_fill(mask, 0) * ctx.scale, None
582
+ else:
583
+ return grad_output, None
584
+
585
+ @staticmethod
586
+ def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
587
+ from torch.onnx import symbolic_opset12
588
+
589
+ dropout_p = local_ctx
590
+ if isinstance(local_ctx, DropoutContext):
591
+ dropout_p = local_ctx.dropout
592
+ # StableDropout only calls this function when training.
593
+ train = True
594
+ # TODO: We should check if the opset_version being used to export
595
+ # is > 12 here, but there's no good way to do that. As-is, if the
596
+ # opset_version < 12, export will fail with a CheckerError.
597
+ # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like:
598
+ # if opset_version < 12:
599
+ # return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
600
+ return symbolic_opset12.dropout(g, input, dropout_p, train)
601
+
602
+
603
+ class StableDropout(nn.Module):
604
+ """
605
+ Optimized dropout module for stabilizing training
606
+
607
+ Args:
608
+ drop_prob (float): the dropout probability
609
+ """
610
+
611
+ def __init__(self, drop_prob):
612
+ super().__init__()
613
+ self.drop_prob = drop_prob
614
+ self.count = 0
615
+ self.context_stack = None
616
+
617
+ def forward(self, x):
618
+ """
619
+ Call the module
620
+
621
+ Args:
622
+ x (`torch.tensor`): The input tensor to which dropout is applied
623
+ """
624
+ if self.training and self.drop_prob > 0:
625
+ return XDropout.apply(x, self.get_context())
626
+ return x
627
+
628
+ def clear_context(self):
629
+ self.count = 0
630
+ self.context_stack = None
631
+
632
+ def init_context(self, reuse_mask=True, scale=1):
633
+ if self.context_stack is None:
634
+ self.context_stack = []
635
+ self.count = 0
636
+ for c in self.context_stack:
637
+ c.reuse_mask = reuse_mask
638
+ c.scale = scale
639
+
640
+ def get_context(self):
641
+ if self.context_stack is not None:
642
+ if self.count >= len(self.context_stack):
643
+ self.context_stack.append(DropoutContext())
644
+ ctx = self.context_stack[self.count]
645
+ ctx.dropout = self.drop_prob
646
+ self.count += 1
647
+ return ctx
648
+ else:
649
+ return self.drop_prob
650
+
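+ # Illustrative sketch (hypothetical helper, not from the original module): StableDropout
+ # behaves like nn.Dropout -- identity in eval mode, masked and rescaled by 1 / (1 - p)
+ # in training mode (via XDropout).
+ def _example_stable_dropout():
+     drop = StableDropout(0.1)
+     x = torch.randn(2, 4)
+     drop.eval()
+     assert torch.equal(drop(x), x)  # no-op outside training
+     drop.train()
+     return drop(x)  # some elements zeroed, the rest scaled by 1 / 0.9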
651
+
652
+ class SEWDSelfOutput(nn.Module):
653
+ def __init__(self, config):
654
+ super().__init__()
655
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
656
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
657
+ self.dropout = nn.Dropout(config.activation_dropout)
658
+
659
+ def forward(self, hidden_states, input_tensor):
660
+ hidden_states = self.dense(hidden_states)
661
+ hidden_states = self.dropout(hidden_states)
662
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
663
+ return hidden_states
664
+
665
+
666
+ class DisentangledSelfAttention(nn.Module):
667
+ """
668
+ Disentangled self-attention module
669
+
670
+ Parameters:
671
+ config (`DebertaV2Config`):
672
+ A model config class instance with the configuration to build a new model. The schema is similar to
673
+ *BertConfig*; for more details, please refer to [`DebertaV2Config`].
674
+
675
+ """
676
+
677
+ def __init__(self, config):
678
+ super().__init__()
679
+ if config.hidden_size % config.num_attention_heads != 0:
680
+ raise ValueError(
681
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
682
+ f"heads ({config.num_attention_heads})"
683
+ )
684
+ self.num_attention_heads = config.num_attention_heads
685
+ _attention_head_size = config.hidden_size // config.num_attention_heads
686
+ self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
687
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
688
+ self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
689
+ self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
690
+ self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
691
+
692
+ self.share_att_key = getattr(config, "share_att_key", False)
693
+ self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
694
+ self.relative_attention = getattr(config, "relative_attention", False)
695
+
696
+ if self.relative_attention:
697
+ self.position_buckets = getattr(config, "position_buckets", -1)
698
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
699
+ if self.max_relative_positions < 1:
700
+ self.max_relative_positions = config.max_position_embeddings
701
+ self.pos_ebd_size = self.max_relative_positions
702
+ if self.position_buckets > 0:
703
+ self.pos_ebd_size = self.position_buckets
704
+
705
+ self.pos_dropout = StableDropout(config.activation_dropout)
706
+
707
+ if not self.share_att_key:
708
+ if "c2p" in self.pos_att_type:
709
+ self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
710
+ if "p2c" in self.pos_att_type:
711
+ self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)
712
+
713
+ self.dropout = StableDropout(config.attention_dropout)
714
+
715
+ def transpose_for_scores(self, x, attention_heads):
716
+ new_x_shape = x.size()[:-1] + (attention_heads, -1)
717
+ x = x.view(new_x_shape)
718
+ return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))
719
+
720
+ def forward(
721
+ self,
722
+ hidden_states,
723
+ attention_mask,
724
+ output_attentions=False,
725
+ query_states=None,
726
+ relative_pos=None,
727
+ rel_embeddings=None,
728
+ ):
729
+ """
730
+ Call the module
731
+
732
+ Args:
733
+ hidden_states (`torch.FloatTensor`):
734
+ Input states to the module, usually the output of the previous layer; these will be the Q, K and V in
735
+ *Attention(Q,K,V)*
736
+
737
+ attention_mask (`torch.BoolTensor`):
738
+ An attention mask matrix of shape [*B*, *N*, *N*], where *B* is the batch size and *N* is the maximum
+ sequence length; element [i,j] = *1* means the *i*-th token in the input can attend to the *j*-th
+ token.
741
+
742
+ output_attentions (`bool`, *optional*):
743
+ Whether return the attention matrix.
744
+
745
+ query_states (`torch.FloatTensor`, *optional*):
746
+ The *Q* state in *Attention(Q,K,V)*.
747
+
748
+ relative_pos (`torch.LongTensor`):
749
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
750
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
751
+
752
+ rel_embeddings (`torch.FloatTensor`):
753
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
754
+ \\text{max_relative_positions}\\), *hidden_size*].
755
+
756
+
757
+ """
758
+ if query_states is None:
759
+ query_states = hidden_states
760
+ query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads)
761
+ key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads)
762
+ value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads)
763
+
764
+ rel_att = None
765
+ # Take the dot product between "query" and "key" to get the raw attention scores.
766
+ scale_factor = 1
767
+ if "c2p" in self.pos_att_type:
768
+ scale_factor += 1
769
+ if "p2c" in self.pos_att_type:
770
+ scale_factor += 1
771
+ scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
772
+ attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype))
773
+ if self.relative_attention:
774
+ rel_embeddings = self.pos_dropout(rel_embeddings)
775
+ rel_att = self.disentangled_attention_bias(
776
+ query_layer, key_layer, relative_pos, rel_embeddings, scale_factor
777
+ )
778
+
779
+ if rel_att is not None:
780
+ attention_scores = attention_scores + rel_att
781
782
+ attention_scores = attention_scores.view(
783
+ -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1)
784
+ )
785
+
786
+ # bsz x height x length x dimension
787
+ attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
788
+ attention_probs = self.dropout(attention_probs)
789
+ context_layer = torch.bmm(
790
+ attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer
791
+ )
792
+ context_layer = (
793
+ context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1))
794
+ .permute(0, 2, 1, 3)
795
+ .contiguous()
796
+ )
797
+ new_context_layer_shape = context_layer.size()[:-2] + (-1,)
798
+ context_layer = context_layer.view(new_context_layer_shape)
799
+ if output_attentions:
800
+ return (context_layer, attention_probs)
801
+ else:
802
+ return context_layer
803
+
804
+ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
805
+ if relative_pos is None:
806
+ q = query_layer.size(-2)
807
+ relative_pos = build_relative_position(
808
+ q,
809
+ key_layer.size(-2),
810
+ bucket_size=self.position_buckets,
811
+ max_position=self.max_relative_positions,
812
+ device=query_layer.device,
813
+ )
814
+ if relative_pos.dim() == 2:
815
+ relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
816
+ elif relative_pos.dim() == 3:
817
+ relative_pos = relative_pos.unsqueeze(1)
818
+ # bsz x height x query x key
819
+ elif relative_pos.dim() != 4:
820
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
821
+
822
+ att_span = self.pos_ebd_size
823
+ relative_pos = relative_pos.to(device=query_layer.device, dtype=torch.long)
824
+
825
+ rel_embeddings = rel_embeddings[0 : att_span * 2, :].unsqueeze(0)
826
+ if self.share_att_key:
827
+ pos_query_layer = self.transpose_for_scores(
828
+ self.query_proj(rel_embeddings), self.num_attention_heads
829
+ ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1)
830
+ pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads).repeat(
831
+ query_layer.size(0) // self.num_attention_heads, 1, 1
832
+ )
833
+ else:
834
+ if "c2p" in self.pos_att_type:
835
+ pos_key_layer = self.transpose_for_scores(
836
+ self.pos_key_proj(rel_embeddings), self.num_attention_heads
837
+ ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) # .split(self.all_head_size, dim=-1)
838
+ if "p2c" in self.pos_att_type:
839
+ pos_query_layer = self.transpose_for_scores(
840
+ self.pos_query_proj(rel_embeddings), self.num_attention_heads
841
+ ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) # .split(self.all_head_size, dim=-1)
842
+
843
+ score = 0
844
+ # content->position
845
+ if "c2p" in self.pos_att_type:
846
+ scale = torch.sqrt(torch.tensor(pos_key_layer.size(-1), dtype=torch.float) * scale_factor)
847
+ c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2))
848
+ c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
849
+ c2p_att = torch.gather(
850
+ c2p_att,
851
+ dim=-1,
852
+ index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]),
853
+ )
854
+ score += c2p_att / scale.to(dtype=c2p_att.dtype)
855
+
856
+ # position->content
857
+ if "p2c" in self.pos_att_type:
858
+ scale = torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor)
859
+ if key_layer.size(-2) != query_layer.size(-2):
860
+ r_pos = build_relative_position(
861
+ key_layer.size(-2),
862
+ key_layer.size(-2),
863
+ bucket_size=self.position_buckets,
864
+ max_position=self.max_relative_positions,
865
+ device=query_layer.device,
866
+ )
867
+ r_pos = r_pos.unsqueeze(0)
868
+ else:
869
+ r_pos = relative_pos
870
+
871
+ p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
872
+ p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2))
873
+ p2c_att = torch.gather(
874
+ p2c_att,
875
+ dim=-1,
876
+ index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)]),
877
+ ).transpose(-1, -2)
878
+ score += p2c_att / scale.to(dtype=p2c_att.dtype)
879
+
880
+ return score
881
+
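+ # Illustrative note (not from the original module): the content-to-content scores above
+ # are divided by sqrt(head_dim * scale_factor), where scale_factor counts the enabled
+ # attention terms (1 for content-to-content, plus 1 each for "c2p" and "p2c"). With a
+ # hypothetical head_dim of 64 and pos_att_type=["c2p", "p2c"], the divisor is
+ # sqrt(64 * 3) ~= 13.86, and the c2p/p2c bias terms are scaled the same way before
+ # being added to the scores.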
882
+
883
+ class SEWDAttention(nn.Module):
884
+ def __init__(self, config):
885
+ super().__init__()
886
+ self.self = DisentangledSelfAttention(config)
887
+ self.output = SEWDSelfOutput(config)
888
+ self.config = config
889
+
890
+ def forward(
891
+ self,
892
+ hidden_states,
893
+ attention_mask,
894
+ output_attentions=False,
895
+ query_states=None,
896
+ relative_pos=None,
897
+ rel_embeddings=None,
898
+ ):
899
+ self_output = self.self(
900
+ hidden_states,
901
+ attention_mask,
902
+ output_attentions,
903
+ query_states=query_states,
904
+ relative_pos=relative_pos,
905
+ rel_embeddings=rel_embeddings,
906
+ )
907
+ if output_attentions:
908
+ self_output, att_matrix = self_output
909
+ if query_states is None:
910
+ query_states = hidden_states
911
+ attention_output = self.output(self_output, query_states)
912
+
913
+ if output_attentions:
914
+ return (attention_output, att_matrix)
915
+ else:
916
+ return attention_output
917
+
918
+
919
+ # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->SEWD
920
+ class SEWDIntermediate(nn.Module):
921
+ def __init__(self, config):
922
+ super().__init__()
923
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
924
+ if isinstance(config.hidden_act, str):
925
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
926
+ else:
927
+ self.intermediate_act_fn = config.hidden_act
928
+
929
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
930
+ hidden_states = self.dense(hidden_states)
931
+ hidden_states = self.intermediate_act_fn(hidden_states)
932
+ return hidden_states
933
+
934
+
935
+ class SEWDOutput(nn.Module):
936
+ def __init__(self, config):
937
+ super().__init__()
938
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
939
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
940
+ self.dropout = nn.Dropout(config.activation_dropout)
941
+ self.config = config
942
+
943
+ def forward(self, hidden_states, input_tensor):
944
+ hidden_states = self.dense(hidden_states)
945
+ hidden_states = self.dropout(hidden_states)
946
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
947
+ return hidden_states
948
+
949
+
950
+ class SEWDLayer(nn.Module):
951
+ def __init__(self, config):
952
+ super().__init__()
953
+ self.attention = SEWDAttention(config)
954
+ self.intermediate = SEWDIntermediate(config)
955
+ self.output = SEWDOutput(config)
956
+
957
+ def forward(
958
+ self,
959
+ hidden_states,
960
+ attention_mask,
961
+ query_states=None,
962
+ relative_pos=None,
963
+ rel_embeddings=None,
964
+ output_attentions=False,
965
+ ):
966
+ attention_output = self.attention(
967
+ hidden_states,
968
+ attention_mask,
969
+ output_attentions=output_attentions,
970
+ query_states=query_states,
971
+ relative_pos=relative_pos,
972
+ rel_embeddings=rel_embeddings,
973
+ )
974
+ if output_attentions:
975
+ attention_output, att_matrix = attention_output
976
+ intermediate_output = self.intermediate(attention_output)
977
+ layer_output = self.output(intermediate_output, attention_output)
978
+ if output_attentions:
979
+ return (layer_output, att_matrix)
980
+ else:
981
+ return layer_output
982
+
983
+
984
+ class ConvLayer(nn.Module):
985
+ def __init__(self, config):
986
+ super().__init__()
987
+ kernel_size = getattr(config, "conv_kernel_size", 3)
988
+ groups = getattr(config, "conv_groups", 1)
989
+ self.conv_act = getattr(config, "conv_act", "tanh")
990
+ self.conv = nn.Conv1d(
991
+ config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
992
+ )
993
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
994
+ self.dropout = StableDropout(config.hidden_dropout_prob)
995
+ self.config = config
996
+
997
+ def forward(self, hidden_states, residual_states, input_mask):
998
+ out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
999
+ rmask = (1 - input_mask).bool()
1000
+ out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
1001
+ out = ACT2FN[self.conv_act](self.dropout(out))
1002
+
1003
+ layer_norm_input = residual_states + out
1004
+ output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
1005
+
1006
+ if input_mask is None:
1007
+ output_states = output
1008
+ else:
1009
+ if input_mask.dim() != layer_norm_input.dim():
1010
+ if input_mask.dim() == 4:
1011
+ input_mask = input_mask.squeeze(1).squeeze(1)
1012
+ input_mask = input_mask.unsqueeze(2)
1013
+
1014
+ input_mask = input_mask.to(output.dtype)
1015
+ output_states = output * input_mask
1016
+
1017
+ return output_states
1018
+
1019
+
1020
+ class SEWDTransformerEncoder(nn.Module):
1021
+ """Modified BertEncoder with relative position bias support"""
1022
+
1023
+ def __init__(self, config):
1024
+ super().__init__()
1025
+
1026
+ self.layer = nn.ModuleList([SEWDLayer(config) for _ in range(config.num_hidden_layers)])
1027
+ self.relative_attention = getattr(config, "relative_attention", False)
1028
+
1029
+ if self.relative_attention:
1030
+ self.max_relative_positions = getattr(config, "max_relative_positions", -1)
1031
+ if self.max_relative_positions < 1:
1032
+ self.max_relative_positions = config.max_position_embeddings
1033
+
1034
+ self.position_buckets = getattr(config, "position_buckets", -1)
1035
+ pos_ebd_size = self.max_relative_positions * 2
1036
+
1037
+ if self.position_buckets > 0:
1038
+ pos_ebd_size = self.position_buckets * 2
1039
+
1040
+ self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
1041
+
1042
+ self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
1043
+
1044
+ if "layer_norm" in self.norm_rel_ebd:
1045
+ self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
1046
+
1047
+ self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None
1048
+ self.gradient_checkpointing = False
1049
+
1050
+ def get_rel_embedding(self):
1051
+ rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
1052
+ if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
1053
+ rel_embeddings = self.LayerNorm(rel_embeddings)
1054
+ return rel_embeddings
1055
+
1056
+ def get_attention_mask(self, attention_mask):
1057
+ if attention_mask.dim() <= 2:
1058
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
1059
+ attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
1060
+ elif attention_mask.dim() == 3:
1061
+ attention_mask = attention_mask.unsqueeze(1)
1062
+
1063
+ return attention_mask
1064
+
1065
+ def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
1066
+ if self.relative_attention and relative_pos is None:
1067
+ q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
1068
+ relative_pos = build_relative_position(
1069
+ q,
1070
+ hidden_states.size(-2),
1071
+ bucket_size=self.position_buckets,
1072
+ max_position=self.max_relative_positions,
1073
+ device=hidden_states.device,
1074
+ )
1075
+ return relative_pos
1076
+
1077
+ def forward(
1078
+ self,
1079
+ hidden_states,
1080
+ attention_mask,
1081
+ output_hidden_states=True,
1082
+ output_attentions=False,
1083
+ query_states=None,
1084
+ relative_pos=None,
1085
+ return_dict=True,
1086
+ ):
1087
+ if attention_mask.dim() <= 2:
1088
+ input_mask = attention_mask
1089
+ else:
1090
+ input_mask = attention_mask.sum(-2) > 0
1091
+ attention_mask = self.get_attention_mask(attention_mask)
1092
+ relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
1093
+
1094
+ all_hidden_states = () if output_hidden_states else None
1095
+ all_attentions = () if output_attentions else None
1096
+
1097
+ if isinstance(hidden_states, Sequence):
1098
+ next_kv = hidden_states[0]
1099
+ else:
1100
+ next_kv = hidden_states
1101
+ rel_embeddings = self.get_rel_embedding()
1102
+ output_states = next_kv
1103
+ for i, layer_module in enumerate(self.layer):
1104
+ if output_hidden_states:
1105
+ all_hidden_states = all_hidden_states + (output_states,)
1106
+
1107
+ if self.gradient_checkpointing and self.training:
1108
+ output_states = self._gradient_checkpointing_func(
1109
+ layer_module.__call__,
1110
+ next_kv,
1111
+ attention_mask,
1112
+ query_states,
1113
+ relative_pos,
1114
+ rel_embeddings,
1115
+ output_attentions,
1116
+ )
1117
+ else:
1118
+ output_states = layer_module(
1119
+ next_kv,
1120
+ attention_mask,
1121
+ query_states=query_states,
1122
+ relative_pos=relative_pos,
1123
+ rel_embeddings=rel_embeddings,
1124
+ output_attentions=output_attentions,
1125
+ )
1126
+
1127
+ if output_attentions:
1128
+ output_states, att_m = output_states
1129
+
1130
+ if i == 0 and self.conv is not None:
1131
+ output_states = self.conv(hidden_states, output_states, input_mask)
1132
+
1133
+ if query_states is not None:
1134
+ query_states = output_states
1135
+ if isinstance(hidden_states, Sequence):
1136
+ next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
1137
+ else:
1138
+ next_kv = output_states
1139
+
1140
+ if output_attentions:
1141
+ all_attentions = all_attentions + (att_m,)
1142
+
1143
+ if output_hidden_states:
1144
+ all_hidden_states = all_hidden_states + (output_states,)
1145
+
1146
+ if not return_dict:
1147
+ return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
1148
+ return BaseModelOutput(
1149
+ last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
1150
+ )
1151
+
1152
+
1153
+ class SEWDEncoder(nn.Module):
1154
+ def __init__(self, config):
1155
+ super().__init__()
1156
+ self.config = config
1157
+ self.pos_conv_embed = SEWDPositionalConvEmbedding(config)
1158
+ self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
1159
+ self.encoder = SEWDTransformerEncoder(config)
1160
+ self.upsample = SEWDUpsampling(config)
1161
+ self.gradient_checkpointing = False
1162
+
1163
+ def forward(
1164
+ self,
1165
+ hidden_states: torch.tensor,
1166
+ attention_mask: Optional[torch.Tensor] = None,
1167
+ output_attentions: bool = False,
1168
+ output_hidden_states: bool = False,
1169
+ return_dict: bool = True,
1170
+ ):
1171
+ max_encoder_length = hidden_states.shape[1] // self.config.squeeze_factor
1172
+ if attention_mask is None:
1173
+ attention_mask = torch.ones(
1174
+ (hidden_states.shape[0], max_encoder_length), dtype=torch.long, device=hidden_states.device
1175
+ )
1176
+ else:
1177
+ # make sure padded tokens output 0
1178
+ expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
1179
+ hidden_states[~expand_attention_mask.bool()] = 0.0
1180
+
1181
+ input_lengths = (attention_mask.long()).sum(-1)
1182
+ # apply pooling formula to get real output_lengths
1183
+ output_lengths = input_lengths // self.config.squeeze_factor
1184
+ attention_ids = (
1185
+ torch.arange(0, max_encoder_length, device=output_lengths.device)
1186
+ .view(1, -1)
1187
+ .expand(output_lengths.shape[0], -1)
1188
+ )
1189
+ attention_mask = (attention_ids < output_lengths.view(-1, 1)).long()
1190
+
1191
+ n_input_timesteps = hidden_states.shape[1]
1192
+
1193
+ hidden_states = hidden_states.transpose(1, 2)
1194
+ position_embeddings = self.pos_conv_embed(hidden_states)
1195
+ pooled_hidden_states = self.pool(hidden_states)
1196
+ min_length = min(position_embeddings.size(-1), pooled_hidden_states.size(-1))
1197
+ hidden_states = pooled_hidden_states[..., :min_length] + position_embeddings[..., :min_length]
1198
+ hidden_states = hidden_states.transpose(1, 2)
1199
+
1200
+ encoder_outputs = self.encoder(hidden_states, attention_mask, output_hidden_states, output_attentions)
1201
+
1202
+ hidden_states = self.upsample(encoder_outputs.last_hidden_state)
1203
+ if hidden_states.shape[1] < n_input_timesteps:
1204
+ hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, n_input_timesteps - hidden_states.shape[1]))
1205
+
1206
+ if not return_dict:
1207
+ return tuple(
1208
+ v for v in [hidden_states, encoder_outputs.hidden_states, encoder_outputs.attentions] if v is not None
1209
+ )
1210
+ return BaseModelOutput(
1211
+ last_hidden_state=hidden_states,
1212
+ hidden_states=encoder_outputs.hidden_states,
1213
+ attentions=encoder_outputs.attentions,
1214
+ )
1215
+
1216
+
1217
+ class SEWDPreTrainedModel(PreTrainedModel):
1218
+ """
1219
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
1220
+ models.
1221
+ """
1222
+
1223
+ config_class = SEWDConfig
1224
+ base_model_prefix = "sew-d"
1225
+ main_input_name = "input_values"
1226
+ supports_gradient_checkpointing = True
1227
+
1228
+ def _init_weights(self, module):
1229
+ """Initialize the weights"""
1230
+ if isinstance(module, SEWDPositionalConvEmbedding):
1231
+ nn.init.normal_(
1232
+ module.conv.weight,
1233
+ mean=0,
1234
+ std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
1235
+ )
1236
+ nn.init.constant_(module.conv.bias, 0)
1237
+ elif isinstance(module, nn.Linear):
1238
+ # Slightly different from the TF version which uses truncated_normal for initialization
1239
+ # cf https://github.com/pytorch/pytorch/pull/5617
1240
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1241
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
1242
+ module.bias.data.zero_()
1243
+ module.weight.data.fill_(1.0)
1244
+ elif isinstance(module, nn.Conv1d):
1245
+ if is_deepspeed_zero3_enabled():
1246
+ import deepspeed
1247
+
1248
+ if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
1249
+ with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
1250
+ nn.init.kaiming_normal_(module.weight.data)
1251
+ else:
1252
+ with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
1253
+ nn.init.kaiming_normal_(module.weight.data)
1254
+ else:
1255
+ nn.init.kaiming_normal_(module.weight.data)
1256
+ elif isinstance(module, nn.Embedding):
1257
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1258
+ if module.padding_idx is not None:
1259
+ module.weight.data[module.padding_idx].zero_()
1260
+
1261
+ if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
1262
+ module.bias.data.zero_()
1263
+
1264
+ def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
1265
+ """
1266
+ Computes the output length of the convolutional layers
1267
+ """
1268
+
1269
+ def _conv_out_length(input_length, kernel_size, stride):
1270
+ # 1D convolutional layer output length formula taken
1271
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
1272
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
1273
+
1274
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
1275
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
1276
+
1277
+ return input_lengths
1278
+
1279
+ def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
1280
+ output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
1281
+ batch_size = attention_mask.shape[0]
1282
+
1283
+ attention_mask = torch.zeros(
1284
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
1285
+ )
1286
+ # these two operations make sure that all values before the output length indices are attended to
1287
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
1288
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
1289
+ return attention_mask
1290
+
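+ # Illustrative note (not from the original module): each conv layer shrinks the time
+ # axis to floor((length - kernel) / stride) + 1. For a hypothetical stack with kernels
+ # (10, 3, 3) and strides (5, 2, 2), 16000 input samples become
+ #     (16000 - 10) // 5 + 1 = 3199
+ #     (3199 - 3) // 2 + 1 = 1599
+ #     (1599 - 3) // 2 + 1 = 799
+ # feature frames, which is the length the reduced attention mask above is built for.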
1291
+
1292
+ SEWD_START_DOCSTRING = r"""
1293
+ SEW-D was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech
1294
+ Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger,
1295
+ Yoav Artzi.
1296
+
1297
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1298
+ library implements for all its models (such as downloading or saving, etc.).
1299
+
1300
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
1301
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
1302
+ behavior.
1303
+
1304
+ Parameters:
1305
+ config ([`SEWDConfig`]): Model configuration class with all the parameters of the model.
1306
+ Initializing with a config file does not load the weights associated with the model, only the
1307
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1308
+ """
1309
+
1310
+
1311
+ SEWD_INPUTS_DOCSTRING = r"""
1312
+ Args:
1313
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
1314
+ Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
1315
+ into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
1316
+ soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
1317
+ conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
1318
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1319
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
1320
+ 1]`:
1321
+
1322
+ - 1 for tokens that are **not masked**,
1323
+ - 0 for tokens that are **masked**.
1324
+
1325
+ [What are attention masks?](../glossary#attention-mask)
1326
+
1327
+ output_attentions (`bool`, *optional*):
1328
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1329
+ tensors for more detail.
1330
+ output_hidden_states (`bool`, *optional*):
1331
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1332
+ more detail.
1333
+ return_dict (`bool`, *optional*):
1334
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1335
+ """
1336
+
1337
+
1338
+ @add_start_docstrings(
1339
+ "The bare SEW-D Model transformer outputting raw hidden-states without any specific head on top.",
1340
+ SEWD_START_DOCSTRING,
1341
+ )
1342
+ # Copied from transformers.models.sew.modeling_sew.SEWModel with SEW->SEWD, layer_norm_eps->feature_layer_norm_eps
1343
+ class SEWDModel(SEWDPreTrainedModel):
1344
+ def __init__(self, config: SEWDConfig):
1345
+ super().__init__(config)
1346
+ self.config = config
1347
+ self.feature_extractor = SEWDFeatureEncoder(config)
1348
+ self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
1349
+
1350
+ self.project_features = config.conv_dim[-1] != config.hidden_size
1351
+ if self.project_features:
1352
+ self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
1353
+ self.feature_dropout = nn.Dropout(config.feat_proj_dropout)
1354
+
1355
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
1356
+ self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
1357
+
1358
+ self.encoder = SEWDEncoder(config)
1359
+
1360
+ # Initialize weights and apply final processing
1361
+ self.post_init()
1362
+
1363
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
1364
+ def _mask_hidden_states(
1365
+ self,
1366
+ hidden_states: torch.FloatTensor,
1367
+ mask_time_indices: Optional[torch.FloatTensor] = None,
1368
+ attention_mask: Optional[torch.LongTensor] = None,
1369
+ ):
1370
+ """
1371
+ Masks extracted features along time axis and/or along feature axis according to
1372
+ [SpecAugment](https://arxiv.org/abs/1904.08779).
1373
+ """
1374
+
1375
+ # `config.apply_spec_augment` can set masking to False
1376
+ if not getattr(self.config, "apply_spec_augment", True):
1377
+ return hidden_states
1378
+
1379
+ # generate indices & apply SpecAugment along time axis
1380
+ batch_size, sequence_length, hidden_size = hidden_states.size()
1381
+
1382
+ if mask_time_indices is not None:
1383
+ # apply SpecAugment along time axis with given mask_time_indices
1384
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1385
+ elif self.config.mask_time_prob > 0 and self.training:
1386
+ mask_time_indices = _compute_mask_indices(
1387
+ (batch_size, sequence_length),
1388
+ mask_prob=self.config.mask_time_prob,
1389
+ mask_length=self.config.mask_time_length,
1390
+ attention_mask=attention_mask,
1391
+ min_masks=self.config.mask_time_min_masks,
1392
+ )
1393
+ mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
1394
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1395
+
1396
+ if self.config.mask_feature_prob > 0 and self.training:
1397
+ # generate indices & apply SpecAugment along feature axis
1398
+ mask_feature_indices = _compute_mask_indices(
1399
+ (batch_size, hidden_size),
1400
+ mask_prob=self.config.mask_feature_prob,
1401
+ mask_length=self.config.mask_feature_length,
1402
+ min_masks=self.config.mask_feature_min_masks,
1403
+ )
1404
+ mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
1405
+ mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
1406
+ hidden_states[mask_feature_indices] = 0
1407
+
1408
+ return hidden_states
1409
+
1410
+ @add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
1411
+ @add_code_sample_docstrings(
1412
+ checkpoint=_CHECKPOINT_FOR_DOC,
1413
+ output_type=BaseModelOutput,
1414
+ config_class=_CONFIG_FOR_DOC,
1415
+ modality="audio",
1416
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
1417
+ )
1418
+ def forward(
1419
+ self,
1420
+ input_values: Optional[torch.Tensor],
1421
+ attention_mask: Optional[torch.Tensor] = None,
1422
+ mask_time_indices: Optional[torch.FloatTensor] = None,
1423
+ output_attentions: Optional[bool] = None,
1424
+ output_hidden_states: Optional[bool] = None,
1425
+ return_dict: Optional[bool] = None,
1426
+ ) -> Union[Tuple, BaseModelOutput]:
1427
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1428
+ output_hidden_states = (
1429
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1430
+ )
1431
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1432
+
1433
+ extract_features = self.feature_extractor(input_values)
1434
+ extract_features = extract_features.transpose(1, 2)
1435
+ extract_features = self.layer_norm(extract_features)
1436
+
1437
+ if self.project_features:
1438
+ extract_features = self.feature_projection(extract_features)
1439
+ hidden_states = self.feature_dropout(extract_features)
1440
+
1441
+ if attention_mask is not None:
1442
+ # compute reduced attention_mask corresponding to feature vectors
1443
+ attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1444
+
1445
+ hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
1446
+
1447
+ encoder_outputs = self.encoder(
1448
+ hidden_states,
1449
+ attention_mask=attention_mask,
1450
+ output_attentions=output_attentions,
1451
+ output_hidden_states=output_hidden_states,
1452
+ return_dict=return_dict,
1453
+ )
1454
+
1455
+ hidden_states = encoder_outputs[0]
1456
+
1457
+ if not return_dict:
1458
+ return (hidden_states,) + encoder_outputs[1:]
1459
+
1460
+ return BaseModelOutput(
1461
+ last_hidden_state=hidden_states,
1462
+ hidden_states=encoder_outputs.hidden_states,
1463
+ attentions=encoder_outputs.attentions,
1464
+ )
1465
+
1466
+
1467
+ @add_start_docstrings(
1468
+ """SEW-D Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
1469
+ SEWD_START_DOCSTRING,
1470
+ )
1471
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->SEWD, wav2vec2->sew_d, WAV2VEC2->SEWD
1472
+ class SEWDForCTC(SEWDPreTrainedModel):
1473
+ def __init__(self, config, target_lang: Optional[str] = None):
1474
+ super().__init__(config)
1475
+
1476
+ self.sew_d = SEWDModel(config)
1477
+ self.dropout = nn.Dropout(config.final_dropout)
1478
+
1479
+ self.target_lang = target_lang
1480
+
1481
+ if config.vocab_size is None:
1482
+ raise ValueError(
1483
+ f"You are trying to instantiate {self.__class__} with a configuration that "
1484
+ "does not define the vocabulary size of the language model head. Please "
1485
+ "instantiate the model as follows: `SEWDForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
1486
+ "or define `vocab_size` of your model's configuration."
1487
+ )
1488
+ output_hidden_size = (
1489
+ config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
1490
+ )
1491
+ self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
1492
+
1493
+ # Initialize weights and apply final processing
1494
+ self.post_init()
1495
+
1496
+ def tie_weights(self):
1497
+ """
1498
+ This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
1499
+ passing `target_lang=...` to `from_pretrained(...)`.
1500
+
1501
+ This method is **not** supposed to be called by the user and is prone to be changed in the future.
1502
+ """
1503
+
1504
+ # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
1505
+ # correctly load adapter layers for SEWD so that we do not have to introduce a new API to
1506
+ # [`PreTrainedModel`]. While slightly hacky, SEWD never has to tie input and output embeddings, so that it is
1507
+ # ok to repurpose this function here.
1508
+ target_lang = self.target_lang
1509
+
1510
+ if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
1511
+ raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
1512
+ elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
1513
+ logger.info("By default `target_lang` is set to 'eng'.")
1514
+ elif target_lang is not None:
1515
+ self.load_adapter(target_lang, force_load=True)
1516
+
1517
+ def freeze_feature_extractor(self):
1518
+ """
1519
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1520
+ not be updated during training.
1521
+ """
1522
+ warnings.warn(
1523
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
1524
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
1525
+ FutureWarning,
1526
+ )
1527
+ self.freeze_feature_encoder()
1528
+
1529
+ def freeze_feature_encoder(self):
1530
+ """
1531
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1532
+ not be updated during training.
1533
+ """
1534
+ self.sew_d.feature_extractor._freeze_parameters()
1535
+
1536
+ def freeze_base_model(self):
1537
+ """
1538
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1539
+ be updated during training. Only the classification head will be updated.
1540
+ """
1541
+ for param in self.sew_d.parameters():
1542
+ param.requires_grad = False
1543
+
1544
+ @add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
1545
+ @add_code_sample_docstrings(
1546
+ checkpoint=_CHECKPOINT_FOR_DOC,
1547
+ output_type=CausalLMOutput,
1548
+ config_class=_CONFIG_FOR_DOC,
1549
+ expected_output=_CTC_EXPECTED_OUTPUT,
1550
+ expected_loss=_CTC_EXPECTED_LOSS,
1551
+ )
1552
+ def forward(
1553
+ self,
1554
+ input_values: Optional[torch.Tensor],
1555
+ attention_mask: Optional[torch.Tensor] = None,
1556
+ output_attentions: Optional[bool] = None,
1557
+ output_hidden_states: Optional[bool] = None,
1558
+ return_dict: Optional[bool] = None,
1559
+ labels: Optional[torch.Tensor] = None,
1560
+ ) -> Union[Tuple, CausalLMOutput]:
1561
+ r"""
1562
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
1563
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller than or equal to
1564
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
1565
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
1566
+ config.vocab_size - 1]`.
1567
+ """
1568
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1569
+
1570
+ if labels is not None and labels.max() >= self.config.vocab_size:
1571
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
1572
+
1573
+ outputs = self.sew_d(
1574
+ input_values,
1575
+ attention_mask=attention_mask,
1576
+ output_attentions=output_attentions,
1577
+ output_hidden_states=output_hidden_states,
1578
+ return_dict=return_dict,
1579
+ )
1580
+
1581
+ hidden_states = outputs[0]
1582
+ hidden_states = self.dropout(hidden_states)
1583
+
1584
+ logits = self.lm_head(hidden_states)
1585
+
1586
+ loss = None
1587
+ if labels is not None:
1588
+ # retrieve loss input_lengths from attention_mask
1589
+ attention_mask = (
1590
+ attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
1591
+ )
1592
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
1593
+
1594
+ # assuming that padded tokens are filled with -100
1595
+ # when not being attended to
1596
+ labels_mask = labels >= 0
1597
+ target_lengths = labels_mask.sum(-1)
1598
+ flattened_targets = labels.masked_select(labels_mask)
1599
+
1600
+ # ctc_loss doesn't support fp16
1601
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
1602
+
1603
+ with torch.backends.cudnn.flags(enabled=False):
1604
+ loss = nn.functional.ctc_loss(
1605
+ log_probs,
1606
+ flattened_targets,
1607
+ input_lengths,
1608
+ target_lengths,
1609
+ blank=self.config.pad_token_id,
1610
+ reduction=self.config.ctc_loss_reduction,
1611
+ zero_infinity=self.config.ctc_zero_infinity,
1612
+ )
1613
+
1614
+ if not return_dict:
1615
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1616
+ return ((loss,) + output) if loss is not None else output
1617
+
1618
+ return CausalLMOutput(
1619
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
1620
+ )
1621
+
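+ # Illustrative sketch (hypothetical helper, not from the original module): how padded
+ # CTC labels (filled with -100) are turned into flattened targets and per-sample target
+ # lengths, mirroring the loss preparation in SEWDForCTC.forward. Values are made up.
+ def _example_ctc_label_prep():
+     labels = torch.tensor([[5, 8, 2, -100, -100], [7, 1, -100, -100, -100]])
+     labels_mask = labels >= 0
+     target_lengths = labels_mask.sum(-1)  # tensor([3, 2])
+     flattened_targets = labels.masked_select(labels_mask)  # tensor([5, 8, 2, 7, 1])
+     return flattened_targets, target_lengths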
1622
+
1623
+ @add_start_docstrings(
1624
+ """
1625
+ SEWD Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB
1626
+ Keyword Spotting.
1627
+ """,
1628
+ SEWD_START_DOCSTRING,
1629
+ )
1630
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->SEWD, wav2vec2->sew_d, WAV2VEC2->SEWD
1631
+ class SEWDForSequenceClassification(SEWDPreTrainedModel):
1632
+ def __init__(self, config):
1633
+ super().__init__(config)
1634
+
1635
+ if hasattr(config, "add_adapter") and config.add_adapter:
1636
+ raise ValueError(
1637
+ "Sequence classification does not support the use of SEWD adapters (config.add_adapter=True)"
1638
+ )
1639
+ self.sew_d = SEWDModel(config)
1640
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1641
+ if config.use_weighted_layer_sum:
1642
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1643
+ self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
1644
+ self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
1645
+
1646
+ # Initialize weights and apply final processing
1647
+ self.post_init()
1648
+
1649
+ def freeze_feature_extractor(self):
1650
+ """
1651
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1652
+ not be updated during training.
1653
+ """
1654
+ warnings.warn(
1655
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
1656
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
1657
+ FutureWarning,
1658
+ )
1659
+ self.freeze_feature_encoder()
1660
+
1661
+ def freeze_feature_encoder(self):
1662
+ """
1663
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1664
+ not be updated during training.
1665
+ """
1666
+ self.sew_d.feature_extractor._freeze_parameters()
1667
+
1668
+ def freeze_base_model(self):
1669
+ """
1670
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1671
+ be updated during training. Only the classification head will be updated.
1672
+ """
1673
+ for param in self.sew_d.parameters():
1674
+ param.requires_grad = False
1675
+
1676
+ @add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
1677
+ @add_code_sample_docstrings(
1678
+ checkpoint=_SEQ_CLASS_CHECKPOINT,
1679
+ output_type=SequenceClassifierOutput,
1680
+ config_class=_CONFIG_FOR_DOC,
1681
+ modality="audio",
1682
+ expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
1683
+ expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
1684
+ )
1685
+ def forward(
1686
+ self,
1687
+ input_values: Optional[torch.Tensor],
1688
+ attention_mask: Optional[torch.Tensor] = None,
1689
+ output_attentions: Optional[bool] = None,
1690
+ output_hidden_states: Optional[bool] = None,
1691
+ return_dict: Optional[bool] = None,
1692
+ labels: Optional[torch.Tensor] = None,
1693
+ ) -> Union[Tuple, SequenceClassifierOutput]:
1694
+ r"""
1695
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1696
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1697
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1698
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1699
+ """
1700
+
1701
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1702
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
1703
+
1704
+ outputs = self.sew_d(
1705
+ input_values,
1706
+ attention_mask=attention_mask,
1707
+ output_attentions=output_attentions,
1708
+ output_hidden_states=output_hidden_states,
1709
+ return_dict=return_dict,
1710
+ )
1711
+
1712
+ if self.config.use_weighted_layer_sum:
1713
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1714
+ hidden_states = torch.stack(hidden_states, dim=1)
1715
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1716
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1717
+ else:
1718
+ hidden_states = outputs[0]
1719
+
1720
+ hidden_states = self.projector(hidden_states)
1721
+ if attention_mask is None:
1722
+ pooled_output = hidden_states.mean(dim=1)
1723
+ else:
1724
+ padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1725
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
1726
+ hidden_states[~expand_padding_mask] = 0.0
1727
+ pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
1728
+
1729
+ logits = self.classifier(pooled_output)
1730
+
1731
+ loss = None
1732
+ if labels is not None:
1733
+ loss_fct = CrossEntropyLoss()
1734
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1735
+
1736
+ if not return_dict:
1737
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1738
+ return ((loss,) + output) if loss is not None else output
1739
+
1740
+ return SequenceClassifierOutput(
1741
+ loss=loss,
1742
+ logits=logits,
1743
+ hidden_states=outputs.hidden_states,
1744
+ attentions=outputs.attentions,
1745
+ )
1746
+
1747
+
1748
+ __all__ = ["SEWDForCTC", "SEWDForSequenceClassification", "SEWDModel", "SEWDPreTrainedModel"]
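Note: the CTC forward pass above takes raw waveform `input_values` and, for training, `labels` padded with -100. A minimal inference/training sketch, assuming the `asapp/sew-d-tiny-100k-ft-ls100h` checkpoint and its bundled Wav2Vec2-style processor are available (names are assumptions, not part of this diff):

import torch
from datasets import load_dataset
from transformers import AutoProcessor, SEWDForCTC

model_id = "asapp/sew-d-tiny-100k-ft-ls100h"   # assumed checkpoint name
processor = AutoProcessor.from_pretrained(model_id)
model = SEWDForCTC.from_pretrained(model_id)

ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits             # (batch, frames, vocab_size)

pred_ids = torch.argmax(logits, dim=-1)         # greedy CTC decoding
print(processor.batch_decode(pred_ids))

# For training, tokenize the reference transcript; padded label positions should be -100,
# as the `labels` docstring above requires (no padding needed for a single example).
labels = processor(text=ds[0]["text"], return_tensors="pt").input_ids
loss = model(**inputs, labels=labels).loss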
docs/transformers/build/lib/transformers/models/shieldgemma2/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_shieldgemma2 import *
22
+ from .modeling_shieldgemma2 import *
23
+ from .processing_shieldgemma2 import *
24
+ else:
25
+ import sys
26
+
27
+ _file = globals()["__file__"]
28
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/shieldgemma2/configuration_shieldgemma2.py ADDED
@@ -0,0 +1,120 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from ...configuration_utils import PretrainedConfig
18
+ from ...utils import logging
19
+ from ..auto import CONFIG_MAPPING, AutoConfig
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class ShieldGemma2Config(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`ShieldGemma2ForImageClassification`]. It is used to instantiate a
28
+ ShieldGemma2ForImageClassification according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of the shieldgemma-2-4b-it.
30
+
31
+ e.g. [google/shieldgemma-2-4b-it](https://huggingface.co/google/shieldgemma-2-4b-it)
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+ Args:
37
+ text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
38
+ The config object of the text backbone.
39
+ vision_config (`Union[AutoConfig, dict]`, *optional*):
40
+ Custom vision config or dict.
41
+ mm_tokens_per_image (`int`, *optional*, defaults to 256):
42
+ The number of tokens per image embedding.
43
+ boi_token_index (`int`, *optional*, defaults to 255999):
44
+ The begin-of-image token index to wrap the image prompt.
45
+ eoi_token_index (`int`, *optional*, defaults to 256000):
46
+ The end-of-image token index to wrap the image prompt.
47
+ image_token_index (`int`, *optional*, defaults to 262144):
48
+ The image token index to encode the image prompt.
49
+ initializer_range (`float`, *optional*, defaults to 0.02):
50
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
51
+
52
+
53
+ Example:
54
+
55
+ ```python
56
+ >>> from transformers import ShieldGemma2ForImageClassification, ShieldGemma2Config, SiglipVisionConfig, Gemma3TextConfig
57
+
58
+ >>> # Initializing a Siglip-like vision config
59
+ >>> vision_config = SiglipVisionConfig()
60
+
61
+ >>> # Initializing a Gemma3 text config for the language backbone
62
+ >>> text_config = Gemma3TextConfig()
63
+
64
+ >>> # Initializing a ShieldGemma2 gemma-3-4b style configuration
65
+ >>> configuration = ShieldGemma2Config(vision_config=vision_config, text_config=text_config)
66
+
67
+ >>> # Initializing a model from the gemma-3-4b style configuration
68
+ >>> model = ShieldGemma2ForImageClassification(configuration)
69
+
70
+ >>> # Accessing the model configuration
71
+ >>> configuration = model.config
72
+ ```"""
73
+
74
+ model_type = "shieldgemma2"
75
+ attribute_map = {
76
+ "image_token_id": "image_token_index",
77
+ "boi_token_id": "boi_token_index",
78
+ "eoi_token_id": "eoi_token_index",
79
+ }
80
+ sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
81
+
82
+ def __init__(
83
+ self,
84
+ text_config=None,
85
+ vision_config=None,
86
+ mm_tokens_per_image: int = 256,
87
+ boi_token_index: int = 255_999,
88
+ eoi_token_index: int = 256_000,
89
+ image_token_index: int = 262_144,
90
+ initializer_range: float = 0.02,
91
+ **kwargs,
92
+ ):
93
+ if isinstance(vision_config, dict):
94
+ vision_config["model_type"] = (
95
+ vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
96
+ )
97
+ vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
98
+ elif vision_config is None:
99
+ vision_config = CONFIG_MAPPING["siglip_vision_model"]()
100
+
101
+ self.vision_config = vision_config
102
+
103
+ if isinstance(text_config, dict):
104
+ text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma3_text"
105
+ text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
106
+ elif text_config is None:
107
+ text_config = CONFIG_MAPPING["gemma3_text"]()
108
+
109
+ self.text_config = text_config
110
+ self.vision_config = vision_config
111
+ self.mm_tokens_per_image = mm_tokens_per_image
112
+ self.boi_token_index = boi_token_index
113
+ self.eoi_token_index = eoi_token_index
114
+ self.image_token_index = image_token_index
115
+ self.initializer_range = initializer_range
116
+
117
+ super().__init__(**kwargs)
118
+
119
+
120
+ __all__ = ["ShieldGemma2Config"]
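The `__init__` above accepts each sub-config as a config object, a plain dict, or `None` (falling back to the `gemma3_text` and `siglip_vision_model` defaults). A small sketch with illustrative values (not the released model's sizes):

from transformers import ShieldGemma2Config

# Both sub-configs given as dicts; "model_type" is filled in with the defaults noted above.
config = ShieldGemma2Config(
    text_config={"hidden_size": 256, "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 1},
    vision_config={"hidden_size": 128, "num_hidden_layers": 2, "num_attention_heads": 4, "image_size": 224, "patch_size": 16},
)

print(type(config.text_config).__name__)    # Gemma3TextConfig
print(type(config.vision_config).__name__)  # SiglipVisionConfig
print(config.boi_token_id)                  # 255999, resolved through the attribute_map alias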
docs/transformers/build/lib/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py ADDED
@@ -0,0 +1,470 @@
1
+ r"""Utility to convert Gemma models from Orbax to HF Transformers checkpoint.
2
+
3
+ python -m transformers.models.shieldgemma2.convert_shieldgemma2_weights_orbax_to_hf \
4
+ --tokenizer_path="$HOME/gemma3/tokenizer/gemma3_cleaned_262144_v2.spiece.model" \
5
+ --checkpoint_path_gemma="$HOME/gemma3/gemma3_4b_pt_orbax/" \
6
+ --checkpoint_path_shieldgemma="$HOME/shieldgemma2/shieldgemma-2_4b_orbax/" \
7
+ --output_path="$HOME/shieldgemma2/shieldgemma2_4b_pt_safetensors/" \
8
+ --precision='bfloat16'
9
+ """
10
+
11
+ import dataclasses
12
+ from collections.abc import Iterator, Mapping, Sequence
13
+ from typing import Any
14
+
15
+ import accelerate
16
+ import numpy as np
17
+ import torch
18
+ import tree
19
+ from absl import app, flags, logging
20
+ from orbax import checkpoint as obc
21
+
22
+ from ...image_utils import PILImageResampling
23
+ from ..gemma import GemmaTokenizerFast
24
+ from ..gemma3 import Gemma3ImageProcessor, Gemma3TextConfig
25
+ from ..siglip import SiglipVisionConfig
26
+ from . import ShieldGemma2Config, ShieldGemma2ForImageClassification, ShieldGemma2Processor
27
+
28
+
29
+ # ==== Internal Constants and Classes ====
30
+
31
+ _CHAT_TEMPLATE = """{{ bos_token }}
32
+ {%- for message in messages -%}
33
+ {%- if message['content'] is iterable -%}
34
+ {%- for item in message['content'] -%}
35
+ {{ '<start_of_turn>user\n' }}
36
+ {%- if item['type'] == 'image' -%}
37
+ {{ '<start_of_image>' }}
38
+ {%- elif item['type'] == 'text' -%}
39
+ {{ 'You are a policy expert trying to help determine whether a generated or real image is in' }}
40
+ {{ ' violation of the safety policy below. Your decision must very clearly follow the specific'}}
41
+ {{ ' language used in the safety policy and should not include any interpretation outside of the' }}
42
+ {{ ' safety policy.\n\n' }}
43
+ {{ '<BEGIN_SAFETY_POLICY>\n' }}
44
+ {{ item['text'] + '\n' }}
45
+ {{ '</END_SAFETY_POLICY>\n\n' }}
46
+ {{ 'Does the image violate the above policy? Your Answer must start with \\'Yes\\' or \\'No\\'.' }}
47
+ {{ '<end_of_turn>\n' }}
48
+ {%- endif -%}
49
+ {%- endfor -%}
50
+ {{'<start_of_turn>model\n'}}
51
+ {%- else -%}
52
+ {{ raise_exception("Conversation messages must contain iterable content containing images and policy definitions in text.") }}
53
+ {%- endif -%}
54
+ {%- endfor -%}
55
+ """
56
+
57
+ _DTYPES = {
58
+ "float32": torch.float32,
59
+ "bfloat16": torch.bfloat16,
60
+ "float16": torch.float16,
61
+ }
62
+
63
+ _SHIELDGEMMA2_POLICIES: Mapping[str, str] = {
64
+ "dangerous": (
65
+ "No Dangerous Content: The image shall not contain content that"
66
+ " facilitates or encourages activities that could cause real-world harm"
67
+ " (e.g., building firearms and explosive devices, promotion of"
68
+ " terrorism, instructions for suicide)."
69
+ ),
70
+ "sexual": (
71
+ "No Sexually Explicit content: The image shall not contain content that"
72
+ " depicts explicit or graphic sexual acts (e.g., pornography, erotic"
73
+ " nudity, depictions of rape or sexual assault)."
74
+ ),
75
+ "violence": (
76
+ "No Violence/Gore content: The image shall not contain content that"
77
+ " depicts shocking, sensational, or gratuitous violence (e.g.,"
78
+ " excessive blood and gore, gratuitous violence against animals,"
79
+ " extreme injury or moment of death)."
80
+ ),
81
+ }
82
+
83
+ _SIGLIP_BASE = "SigLiPFromPatches_0/siglip_encoder"
84
+ _SIGLIP_EMBEDDING = "SigLiPFromPatches_0/siglip_encoder/embedding"
85
+ _SIGLIP_TRANSFORMER_ENCODER_BLOCK = "SigLiPFromPatches_0/siglip_encoder/Transformer/encoderblock_"
86
+ _SIGLIP_TRANSFORMER_ENCODER_BLOCK_LEN = len(_SIGLIP_TRANSFORMER_ENCODER_BLOCK)
87
+ _SIGLIP_TRANSFORMER_ENCODER_NORM = "SigLiPFromPatches_0/siglip_encoder/Transformer/encoder_norm"
88
+
89
+ _TRANSFORMER_DECODER_BLOCK = "transformer/layer_"
90
+ _TRANSFORMER_DECODER_BLOCK_LEN = len(_TRANSFORMER_DECODER_BLOCK)
91
+ _TRANSFORMER_EMBEDDER = "transformer/embedder"
92
+ _TRANSFORMER_FINAL_NORM = "transformer/final_norm"
93
+ _TRANSFORMER_POST_TRAINING_PREFIX = "rlx_networks/policy_network/"
94
+ _TRANSFORMER_POST_TRAINING_PREFIX_LEN = len(_TRANSFORMER_POST_TRAINING_PREFIX)
95
+
96
+ # ==== Flags ====
97
+
98
+ _GEMMA_CHECKPOINT_PATH = flags.DEFINE_string(
99
+ name="checkpoint_path_gemma",
100
+ default=None,
101
+ help="Path to the Orbax checkpoint containing the vision weights.",
102
+ required=True,
103
+ )
104
+
105
+ _SHIELDGEMMA_CHECKPOINT_PATH = flags.DEFINE_string(
106
+ name="checkpoint_path_shieldgemma",
107
+ default=None,
108
+ help="Path to the Orbax checkpoint containing the language model weights.",
109
+ required=True,
110
+ )
111
+
112
+ OUTPUT_PATH = flags.DEFINE_string(
113
+ name="output_path",
114
+ default=None,
115
+ help="Path to store the HF checkpoint.",
116
+ required=True,
117
+ )
118
+
119
+ PRECISION = flags.DEFINE_enum(
120
+ name="precision",
121
+ default=None,
122
+ help="The floating point precision (aka dtype) of the model.",
123
+ enum_values=set(_DTYPES.keys()),
124
+ required=True,
125
+ )
126
+
127
+ TOKENIZER_PATH = flags.DEFINE_string(
128
+ name="tokenizer_path",
129
+ default=None,
130
+ help="Path to the SentencePiece model file.",
131
+ required=True,
132
+ )
133
+
134
+
135
+ def convert_siglip_weight(
136
+ config: SiglipVisionConfig,
137
+ paths: Sequence[str],
138
+ weights: np.ndarray,
139
+ ) -> tuple[str, np.ndarray]:
140
+ path, prop = paths
141
+ normalized_path: str = ""
142
+ updated_weights: np.ndarray = None
143
+
144
+ if path == _SIGLIP_BASE:
145
+ normalized_path = "vision_tower.vision_model.embeddings.position_embedding.weight"
146
+ updated_weights = weights.reshape(-1, config.hidden_size)
147
+ elif path == _SIGLIP_EMBEDDING:
148
+ if prop == "kernel":
149
+ normalized_path = "vision_tower.vision_model.embeddings.patch_embedding.weight"
150
+ updated_weights = weights.transpose(3, 2, 0, 1)
151
+ elif prop == "bias":
152
+ normalized_path = "vision_tower.vision_model.embeddings.patch_embedding.bias"
153
+ updated_weights = weights
154
+ else:
155
+ raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.")
156
+ elif path.startswith(_SIGLIP_TRANSFORMER_ENCODER_BLOCK):
157
+ encoder_block_path = path[_SIGLIP_TRANSFORMER_ENCODER_BLOCK_LEN:]
158
+ next_path_seperator_idx = encoder_block_path.find("/")
159
+ layer_idx = encoder_block_path[:next_path_seperator_idx]
160
+ encoder_block_path = encoder_block_path[next_path_seperator_idx:]
161
+ normalized_path = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
162
+
163
+ if encoder_block_path.startswith("/LayerNorm"):
164
+ normalized_path += ".layer_norm1" if path.endswith("_0") else ".layer_norm2"
165
+
166
+ if prop == "scale":
167
+ normalized_path += ".weight"
168
+ updated_weights = weights.transpose()
169
+ elif prop == "bias":
170
+ normalized_path += ".bias"
171
+ updated_weights = weights
172
+ else:
173
+ raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `scale`.")
174
+ elif encoder_block_path.startswith("/MlpBlock_0"):
175
+ normalized_path += ".mlp.fc1" if "/Dense_0" in encoder_block_path else ".mlp.fc2"
176
+
177
+ if prop == "kernel":
178
+ normalized_path += ".weight"
179
+ updated_weights = weights.transpose()
180
+ elif prop == "bias":
181
+ normalized_path += ".bias"
182
+ updated_weights = weights
183
+ else:
184
+ raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.")
185
+ elif encoder_block_path.startswith("/MultiHeadDotProductAttention_0"):
186
+ if encoder_block_path.endswith("/key"):
187
+ normalized_path += ".self_attn.k_proj"
188
+ elif encoder_block_path.endswith("/out"):
189
+ normalized_path += ".self_attn.out_proj"
190
+ elif encoder_block_path.endswith("/query"):
191
+ normalized_path += ".self_attn.q_proj"
192
+ elif encoder_block_path.endswith("/value"):
193
+ normalized_path += ".self_attn.v_proj"
194
+ else:
195
+ raise ValueError(f"Unexpected path `{path}` in SigLIP Transformer MultiHeadDotProductAttention_0.")
196
+
197
+ if prop == "bias":
198
+ normalized_path += ".bias"
199
+ updated_weights = weights.reshape(-1, config.hidden_size).reshape(-1)
200
+ elif prop == "kernel":
201
+ normalized_path += ".weight"
202
+ updated_weights = weights.reshape(-1, config.hidden_size).transpose()
203
+ else:
204
+ raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.")
205
+ else:
206
+ raise ValueError(f"Unexpected path `{path}` in SigLIP Transformer Encoder Block.")
207
+ elif path == _SIGLIP_TRANSFORMER_ENCODER_NORM:
208
+ if prop == "scale":
209
+ normalized_path = "vision_tower.vision_model.post_layernorm.weight"
210
+ updated_weights = weights.transpose()
211
+ elif prop == "bias":
212
+ normalized_path = "vision_tower.vision_model.post_layernorm.bias"
213
+ updated_weights = weights
214
+ else:
215
+ raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `scale`.")
216
+ else:
217
+ raise ValueError(f"Unexpected path `{path}`.")
218
+
219
+ if "vision" in normalized_path:
220
+ print(normalized_path)
221
+ return normalized_path, updated_weights
222
+
223
+
224
+ def convert_transformer_weights(
225
+ config: Gemma3TextConfig,
226
+ paths: Sequence[str],
227
+ weights: np.ndarray,
228
+ ) -> Iterator[tuple[str, np.ndarray]]:
229
+ path, prop = paths
230
+
231
+ if path.startswith(_TRANSFORMER_POST_TRAINING_PREFIX):
232
+ path = path[_TRANSFORMER_POST_TRAINING_PREFIX_LEN:]
233
+
234
+ converted_paths: list[str] = []
235
+ converted_weights: list[Any] = []
236
+
237
+ attn_head_dim = config.num_attention_heads * config.head_dim
238
+ kv_head_dim = config.num_key_value_heads * config.head_dim
239
+
240
+ if path == _TRANSFORMER_EMBEDDER:
241
+ if prop == "input_embedding":
242
+ # Tied to language_model.lm_head.weight, assigned at the end.
243
+ converted_paths = ["language_model.model.embed_tokens.weight"]
244
+ # Gemma3 model doesn't have image soft token in input and output embeddings, resize to avoid bugs we had with Mllama
245
+ pre_expansion_embeddings = weights
246
+ mu = np.mean(pre_expansion_embeddings, axis=0)
247
+ sigma = np.cov(pre_expansion_embeddings, rowvar=False, bias=True)
248
+ new_embeddings = np.random.multivariate_normal(mu, 1e-5 * sigma, size=64)
249
+ weights = np.vstack([pre_expansion_embeddings, new_embeddings])
250
+ converted_weights = [weights]
251
+ else:
252
+ raise ValueError(f"Unexpected member, {prop}, in Embedder.")
253
+ elif path.startswith(f"{_TRANSFORMER_EMBEDDER}/mm"):
254
+ if path.endswith("/mm_input_projection"):
255
+ converted_paths = ["multi_modal_projector.mm_input_projection_weight"]
256
+ converted_weights = [weights]
257
+ elif path.endswith("/mm_soft_embedding_norm"):
258
+ converted_paths = ["multi_modal_projector.mm_soft_emb_norm.weight"]
259
+ converted_weights = [weights]
260
+ else:
261
+ raise ValueError(f"Unexpected subpath, `{path}`, in Embedder.")
262
+ elif path == _TRANSFORMER_FINAL_NORM:
263
+ converted_paths = ["language_model.model.norm.weight"]
264
+ converted_weights = [weights]
265
+ elif path.startswith(_TRANSFORMER_DECODER_BLOCK):
266
+ decoder_block_path = path[_TRANSFORMER_DECODER_BLOCK_LEN:]
267
+ next_path_seperator_idx = decoder_block_path.find("/")
268
+ layer_idx = decoder_block_path[:next_path_seperator_idx]
269
+ decoder_block_path = decoder_block_path[next_path_seperator_idx:]
270
+
271
+ base_path = f"language_model.model.layers.{layer_idx}"
272
+
273
+ if path.endswith("attn/attn_vec_einsum"):
274
+ converted_paths = [f"{base_path}.self_attn.o_proj.weight"]
275
+ converted_weights = [weights.transpose(2, 0, 1).reshape(config.hidden_size, attn_head_dim)]
276
+ elif path.endswith("attn/_key_norm"):
277
+ converted_paths = [f"{base_path}.self_attn.k_norm.weight"]
278
+ converted_weights = [weights]
279
+ elif path.endswith("attn/kv_einsum"):
280
+ converted_paths = [
281
+ f"{base_path}.self_attn.k_proj.weight",
282
+ f"{base_path}.self_attn.v_proj.weight",
283
+ ]
284
+ k_proj_weights, v_proj_weights = weights
285
+ converted_weights = [
286
+ k_proj_weights.transpose(0, 2, 1).reshape(kv_head_dim, config.hidden_size),
287
+ v_proj_weights.transpose(0, 2, 1).reshape(kv_head_dim, config.hidden_size),
288
+ ]
289
+ elif path.endswith("attn/q_einsum"):
290
+ converted_paths = [f"{base_path}.self_attn.q_proj.weight"]
291
+ converted_weights = [weights.transpose(0, 2, 1).reshape(attn_head_dim, config.hidden_size)]
292
+ elif path.endswith("attn/_query_norm"):
293
+ converted_paths = [f"{base_path}.self_attn.q_norm.weight"]
294
+ converted_weights = [weights]
295
+ elif path.endswith("mlp/gating_einsum"):
296
+ converted_paths = [
297
+ f"{base_path}.mlp.gate_proj.weight",
298
+ f"{base_path}.mlp.up_proj.weight",
299
+ ]
300
+ gate_proj_weight, up_proj_weight = weights
301
+ converted_weights = [gate_proj_weight, up_proj_weight]
302
+ elif path.endswith("mlp/linear"):
303
+ converted_paths = [f"{base_path}.mlp.down_proj.weight"]
304
+ converted_weights = [weights.transpose()]
305
+ elif path.endswith("post_attention_norm"):
306
+ converted_paths = [f"{base_path}.post_attention_layernorm.weight"]
307
+ converted_weights = [weights]
308
+ elif path.endswith("post_ffw_norm"):
309
+ converted_paths = [f"{base_path}.post_feedforward_layernorm.weight"]
310
+ converted_weights = [weights]
311
+ elif path.endswith("pre_attention_norm"):
312
+ converted_paths = [f"{base_path}.input_layernorm.weight"]
313
+ converted_weights = [weights]
314
+ elif path.endswith("pre_ffw_norm"):
315
+ converted_paths = [f"{base_path}.pre_feedforward_layernorm.weight"]
316
+ converted_weights = [weights]
317
+ else:
318
+ raise ValueError(f"Unexpected path `{path}` in Decoder Block.")
319
+ else:
320
+ raise ValueError(f"Unexpected path `{path}`.")
321
+
322
+ if (cpl := len(converted_paths)) != (cwl := len(converted_weights)):
323
+ raise ValueError(
324
+ "The `converted_paths` and `converted_weights` should be the same "
325
+ f"length. Got {cpl} and {cwl}, respectively, for {path}."
326
+ )
327
+
328
+ return zip(converted_paths, converted_weights)
329
+
330
+
331
+ def transpose_reshape(x: torch.Tensor) -> torch.Tensor:
332
+ x = x.transpose(1, 2)
333
+ return x.reshape(x.shape[0] * x.shape[1], x.shape[2]).contiguous()
334
+
335
+
336
+ @dataclasses.dataclass(frozen=True)
337
+ class ConversionResult:
338
+ state_tree: dict[str, torch.Tensor]
339
+ config: ShieldGemma2Config
340
+
341
+
342
+ def convert(
343
+ shieldgemma_checkpoint_path: str,
344
+ gemma_checkpoint_path: str,
345
+ config: ShieldGemma2Config,
346
+ target_dtype: torch.dtype,
347
+ ) -> ConversionResult:
348
+ """Loads Orbax checkpoint from `input_path` and converts it to HF tree."""
349
+ checkpointer = obc.PyTreeCheckpointer()
350
+
351
+ sg2_ckpt = checkpointer.restore(shieldgemma_checkpoint_path)
352
+ g3_ckpt = checkpointer.restore(gemma_checkpoint_path)
353
+
354
+ hf_tree: dict[str, torch.Tensor] = {}
355
+
356
+ def update_tree(path: str, weights: np.ndarray) -> None:
357
+ torch_tensor = torch.from_numpy(weights.astype("float32")).type(target_dtype)
358
+ logging.info(
359
+ "%s converted shape=%s with dtype=%s",
360
+ path,
361
+ weights.shape,
362
+ torch_tensor.dtype,
363
+ )
364
+ hf_tree[f"model.{path}"] = torch_tensor
365
+
366
+ for paths, value in tree.flatten_with_path(g3_ckpt):
367
+ if paths[0].startswith("SigLiPFromPatches_"):
368
+ path, weights = convert_siglip_weight(config=config.vision_config, paths=paths, weights=value)
369
+ update_tree(path, weights)
370
+
371
+ for paths, value in tree.flatten_with_path(sg2_ckpt):
372
+ for path, weights in convert_transformer_weights(config=config.text_config, paths=paths, weights=value):
373
+ update_tree(path, weights)
374
+
375
+ hf_tree["model.language_model.lm_head.weight"] = hf_tree["model.language_model.model.embed_tokens.weight"]
376
+
377
+ return ConversionResult(state_tree=hf_tree, config=config)
378
+
379
+
380
+ def main(*args):
381
+ del args
382
+
383
+ dtype = getattr(torch, PRECISION.value)
384
+ output_path = OUTPUT_PATH.value
385
+
386
+ tokenizer = GemmaTokenizerFast(
387
+ TOKENIZER_PATH.value,
388
+ extra_special_tokens={
389
+ "image_token": "<image_soft_token>", # Should be ID=262_144
390
+ "boi_token": "<start_of_image>", # Should be ID=255_999
391
+ "eoi_token": "<end_of_image>", # Should be ID=256_000
392
+ },
393
+ )
394
+
395
+ yes_token_index, no_token_index = torch.tensor(tokenizer(["Yes", "No"])["input_ids"])[:, 1].numpy()
396
+
397
+ config = ShieldGemma2Config(
398
+ yes_token_index=int(yes_token_index),
399
+ no_token_index=int(no_token_index),
400
+ text_config=Gemma3TextConfig(
401
+ vocab_size=262_208,
402
+ hidden_size=2560,
403
+ intermediate_size=2560 * 8 // 2,
404
+ num_attention_heads=8,
405
+ head_dim=256,
406
+ num_hidden_layers=34,
407
+ num_key_value_heads=4,
408
+ sliding_window=1024,
409
+ rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only
410
+ rope_theta=1_000_000,
411
+ rope_local_base_freq=10_000,
412
+ attn_logit_softcapping=None,
413
+ query_pre_attn_scalar=256,
414
+ max_position_embeddings=8192,
415
+ ),
416
+ vision_config={
417
+ "hidden_size": 1152,
418
+ "intermediate_size": 4304,
419
+ "num_hidden_layers": 27,
420
+ "num_attention_heads": 16,
421
+ "num_channels": 3,
422
+ "image_size": 896,
423
+ "patch_size": 14,
424
+ "hidden_act": "gelu_pytorch_tanh",
425
+ "layer_norm_eps": 1e-6,
426
+ "attention_dropout": 0.0,
427
+ "vision_use_head": False,
428
+ },
429
+ )
430
+
431
+ config.save_pretrained(output_path)
432
+
433
+ image_processor = Gemma3ImageProcessor(
434
+ image_seq_length=256,
435
+ image_mean=(0.5,) * 3,
436
+ image_std=(0.5,) * 3,
437
+ size={"height": 896, "width": 896},
438
+ resample=PILImageResampling.BILINEAR,
439
+ )
440
+ processor = ShieldGemma2Processor(
441
+ image_processor=image_processor,
442
+ tokenizer=tokenizer,
443
+ policy_definitions=_SHIELDGEMMA2_POLICIES,
444
+ )
445
+ tokenizer.chat_template = _CHAT_TEMPLATE
446
+ processor.chat_template = _CHAT_TEMPLATE
447
+
448
+ processor.save_pretrained(output_path)
449
+ logging.info("Saved Shieldgemma2Processor to %s", output_path)
450
+ del processor
451
+ del tokenizer
452
+
453
+ logging.info("Converting Shieldgemma2 @ %s", dtype)
454
+ result = convert(_SHIELDGEMMA_CHECKPOINT_PATH.value, _GEMMA_CHECKPOINT_PATH.value, config, dtype)
455
+ logging.info("Converted Shieldgemma2 state tree from Orbax to Hugging Face.")
456
+
457
+ with accelerate.init_empty_weights():
458
+ model = ShieldGemma2ForImageClassification(config=config)
459
+
460
+ model.load_state_dict(result.state_tree, assign=True, strict=True)
461
+ model.config.torch_dtype = dtype
462
+ logging.info("Loaded Shieldgemma2 in Hugging Face Transformers.")
463
+ model.save_pretrained(output_path, safe_serialization=True)
464
+ logging.info("Saved Shieldgemma2 to SafeTensors in %s", output_path)
465
+ del model
466
+ del result
467
+
468
+
469
+ if __name__ == "__main__":
470
+ app.run(main)
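The `_TRANSFORMER_EMBEDDER` branch in `convert_transformer_weights` above grows the input embedding table by 64 rows drawn from a Gaussian fitted to the existing rows, so the new image soft tokens start close to the embedding distribution. A standalone numpy sketch of that step on a toy table (shapes are illustrative only):

import numpy as np

rng = np.random.default_rng(0)
pre_expansion = rng.normal(size=(1000, 8)).astype(np.float32)      # toy (vocab, hidden) table

mu = np.mean(pre_expansion, axis=0)                                # per-dimension mean
sigma = np.cov(pre_expansion, rowvar=False, bias=True)             # (hidden, hidden) covariance
new_rows = np.random.multivariate_normal(mu, 1e-5 * sigma, size=64)
expanded = np.vstack([pre_expansion, new_rows])

assert expanded.shape == (1064, 8)                                  # 64 extra rows appended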
docs/transformers/build/lib/transformers/models/shieldgemma2/modeling_shieldgemma2.py ADDED
@@ -0,0 +1,220 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ from dataclasses import dataclass
17
+ from typing import List, Optional, Union
18
+
19
+ import torch
20
+ import torch.utils.checkpoint
21
+
22
+ from ...cache_utils import Cache
23
+ from ...modeling_outputs import ImageClassifierOutputWithNoAttention
24
+ from ...modeling_utils import PreTrainedModel
25
+ from ...utils import (
26
+ add_start_docstrings_to_model_forward,
27
+ logging,
28
+ )
29
+ from ..auto import AutoModelForImageTextToText
30
+ from .configuration_shieldgemma2 import ShieldGemma2Config
31
+
32
+
33
+ _CHECKPOINT_FOR_DOC = "google/shieldgemma-2-4b-it"
34
+ _CONFIG_FOR_DOC = "ShieldGemma2Config"
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+ SHIELDGEMMA2_INPUTS_DOCSTRING = r"""
39
+ Args:
40
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
41
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
42
+ it.
43
+
44
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
45
+ [`PreTrainedTokenizer.__call__`] for details.
46
+
47
+ [What are input IDs?](../glossary#input-ids)
48
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
49
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
50
+
51
+ - 1 for tokens that are **not masked**,
52
+ - 0 for tokens that are **masked**.
53
+
54
+ [What are attention masks?](../glossary#attention-mask)
55
+
56
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
57
+ [`PreTrainedTokenizer.__call__`] for details.
58
+
59
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
60
+ `past_key_values`).
61
+
62
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
63
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
64
+ information on the default strategy.
65
+
66
+ - 1 indicates the head is **not masked**,
67
+ - 0 indicates the head is **masked**.
68
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
69
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
70
+ config.n_positions - 1]`.
71
+
72
+ [What are position IDs?](../glossary#position-ids)
73
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
74
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
75
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
76
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
77
+
78
+ Two formats are allowed:
79
+ - a [`~cache_utils.Cache`] instance, see our
80
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
81
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
82
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
83
+ cache format.
84
+
85
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
86
+ legacy cache format will be returned.
87
+
88
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
89
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
90
+ of shape `(batch_size, sequence_length)`.
91
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
92
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
93
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
94
+ model's internal embedding lookup matrix.
95
+ use_cache (`bool`, *optional*):
96
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
97
+ `past_key_values`).
98
+ output_attentions (`bool`, *optional*):
99
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
100
+ tensors for more detail.
101
+ output_hidden_states (`bool`, *optional*):
102
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
103
+ more detail.
104
+ return_dict (`bool`, *optional*):
105
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
106
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
107
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
108
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
109
+ the complete sequence length.
110
+ """
111
+
112
+
113
+ @dataclass
114
+ class ShieldGemma2ImageClassifierOutputWithNoAttention(ImageClassifierOutputWithNoAttention):
115
+ """ShieldGemma2 classifies images as violative or not relative to a specific policy.
116
+ Args:
+ probabilities (`torch.Tensor` of shape `(batch_size, 2)`, *optional*):
+ Softmax probabilities of the `Yes` (policy-violating) and `No` tokens, respectively.
117
+ """
118
+
119
+ probabilities: Optional[torch.Tensor] = None
120
+
121
+
122
+ class ShieldGemma2ForImageClassification(PreTrainedModel):
123
+ config_class = ShieldGemma2Config
124
+
125
+ def __init__(self, config: ShieldGemma2Config):
126
+ super().__init__(config=config)
127
+ self.yes_token_index = getattr(config, "yes_token_index", 10_784)
128
+ self.no_token_index = getattr(config, "no_token_index", 3771)
129
+ self.model = AutoModelForImageTextToText.from_config(config=config)
130
+
131
+ def get_input_embeddings(self):
132
+ return self.model.language_model.get_input_embeddings()
133
+
134
+ def set_input_embeddings(self, value):
135
+ self.model.language_model.set_input_embeddings(value)
136
+
137
+ def get_output_embeddings(self):
138
+ return self.model.language_model.get_output_embeddings()
139
+
140
+ def set_output_embeddings(self, new_embeddings):
141
+ self.model.language_model.set_output_embeddings(new_embeddings)
142
+
143
+ def set_decoder(self, decoder):
144
+ self.model.language_model.set_decoder(decoder)
145
+
146
+ def get_decoder(self):
147
+ return self.model.language_model.get_decoder()
148
+
149
+ def tie_weights(self):
150
+ return self.model.language_model.tie_weights()
151
+
152
+ @add_start_docstrings_to_model_forward(SHIELDGEMMA2_INPUTS_DOCSTRING)
153
+ def forward(
154
+ self,
155
+ input_ids: Optional[torch.LongTensor] = None,
156
+ pixel_values: Optional[torch.FloatTensor] = None,
157
+ attention_mask: Optional[torch.Tensor] = None,
158
+ position_ids: Optional[torch.LongTensor] = None,
159
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
160
+ token_type_ids: Optional[torch.LongTensor] = None,
161
+ cache_position: Optional[torch.LongTensor] = None,
162
+ inputs_embeds: Optional[torch.FloatTensor] = None,
163
+ labels: Optional[torch.LongTensor] = None,
164
+ use_cache: Optional[bool] = None,
165
+ output_attentions: Optional[bool] = None,
166
+ output_hidden_states: Optional[bool] = None,
167
+ return_dict: Optional[bool] = None,
168
+ logits_to_keep: Union[int, torch.Tensor] = 0,
169
+ **lm_kwargs,
170
+ ) -> ShieldGemma2ImageClassifierOutputWithNoAttention:
171
+ """Predicts the binary probability that the image violates the specified policy.
172
+
173
+ Returns:
174
+ A `ShieldGemma2ImageClassifierOutputWithNoAttention` instance containing the logits and probabilities
175
+ associated with the model predicting the `Yes` or `No` token as the response to that prompt, captured in the
176
+ following properties.
177
+
178
+ * `logits` (`torch.Tensor` of shape `(batch_size, 2)`):
179
+ The first position along dim=1 is the logits for the `Yes` token and the second position along dim=1 is
180
+ the logits for the `No` token.
181
+ * `probabilities` (`torch.Tensor` of shape `(batch_size, 2)`):
182
+ The first position along dim=1 is the probability of predicting the `Yes` token and the second position
183
+ along dim=1 is the probability of predicting the `No` token.
184
+
185
+ ShieldGemma prompts are constructed such that predicting the `Yes` token means the content *does violate* the
186
+ policy as described. If you are only interested in the violative condition, use
187
+ `violated = outputs.probabilities[:, 0]` to extract that slice from the output tensors.
188
+
189
+ When used with the `ShieldGemma2Processor`, the `batch_size` will be equal to `len(images) * len(policies)`,
190
+ and the order within the batch will be img1_policy1, ... img1_policyN, ... imgM_policyN.
191
+ """
192
+ outputs = self.model(
193
+ input_ids=input_ids,
194
+ pixel_values=pixel_values,
195
+ attention_mask=attention_mask,
196
+ position_ids=position_ids,
197
+ past_key_values=past_key_values,
198
+ token_type_ids=token_type_ids,
199
+ cache_position=cache_position,
200
+ inputs_embeds=inputs_embeds,
201
+ labels=labels,
202
+ use_cache=use_cache,
203
+ output_attentions=output_attentions,
204
+ output_hidden_states=output_hidden_states,
205
+ return_dict=return_dict,
206
+ logits_to_keep=logits_to_keep,
207
+ **lm_kwargs,
208
+ )
209
+ logits = outputs.logits
210
+ selected_logits = logits[:, -1, [self.yes_token_index, self.no_token_index]]
211
+ probabilities = torch.softmax(selected_logits, dim=-1)
212
+ return ShieldGemma2ImageClassifierOutputWithNoAttention(
213
+ logits=selected_logits,
214
+ probabilities=probabilities,
215
+ )
216
+
217
+
218
+ __all__ = [
219
+ "ShieldGemma2ForImageClassification",
220
+ ]
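A minimal usage sketch for the classifier above, using the `google/shieldgemma-2-4b-it` checkpoint named in `_CHECKPOINT_FOR_DOC`; the image path is a placeholder:

import torch
from PIL import Image
from transformers import AutoProcessor, ShieldGemma2ForImageClassification

model_id = "google/shieldgemma-2-4b-it"
processor = AutoProcessor.from_pretrained(model_id)
model = ShieldGemma2ForImageClassification.from_pretrained(model_id)

image = Image.open("example.jpg")                       # placeholder image
inputs = processor(images=[image], return_tensors="pt")  # one prompt per default policy

with torch.no_grad():
    out = model(**inputs)

# Rows follow img1_policy1 ... img1_policyN; column 0 is P("Yes" = violates), column 1 is P("No").
for policy, probs in zip(processor.policy_definitions, out.probabilities):
    print(policy, float(probs[0]))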
docs/transformers/build/lib/transformers/models/shieldgemma2/processing_shieldgemma2.py ADDED
@@ -0,0 +1,195 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ from collections.abc import Mapping, Sequence
17
+ from typing import Optional
18
+
19
+ from ...feature_extraction_utils import BatchFeature
20
+ from ...image_utils import ImageInput
21
+ from ...processing_utils import Unpack
22
+ from ...utils import logging
23
+ from ..gemma3.processing_gemma3 import Gemma3Processor, Gemma3ProcessorKwargs
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ DEFAULT_SHIELDGEMMA2_POLICIES: Mapping[str, str] = {
29
+ "dangerous": (
30
+ "No Dangerous Content: The image shall not contain content that"
31
+ " facilitates or encourages activities that could cause real-world harm"
32
+ " (e.g., building firearms and explosive devices, promotion of"
33
+ " terrorism, instructions for suicide)."
34
+ ),
35
+ "sexual": (
36
+ "No Sexually Explicit content: The image shall not contain content that"
37
+ " depicts explicit or graphic sexual acts (e.g., pornography, erotic"
38
+ " nudity, depictions of rape or sexual assault)."
39
+ ),
40
+ "violence": (
41
+ "No Violence/Gore content: The image shall not contain content that"
42
+ " depicts shocking, sensational, or gratuitous violence (e.g.,"
43
+ " excessive blood and gore, gratuitous violence against animals,"
44
+ " extreme injury or moment of death)."
45
+ ),
46
+ }
47
+
48
+
49
+ class ShieldGemma2ProcessorKwargs(Gemma3ProcessorKwargs, total=False):
50
+ policies: Optional[Sequence[str]]
51
+ custom_policies: Optional[Mapping[str, str]]
52
+ _defaults = {
53
+ "text_kwargs": {
54
+ "padding": True,
55
+ },
56
+ "images_kwargs": {
57
+ "do_pan_and_scan": False,
58
+ },
59
+ }
60
+
61
+
62
+ class ShieldGemma2Processor(Gemma3Processor):
63
+ def __init__(
64
+ self, image_processor, tokenizer, chat_template=None, image_seq_length=256, policy_definitions=None, **kwargs
65
+ ):
66
+ """A processor for the ShieldGemma 2 model.
67
+
68
+ Args:
69
+ image_processor: The image processor to use, typically a `Gemma3ImageProcessorFast` instance.
70
+ tokenizer: The tokenizer to use, typically a `GemmaTokenizerFast` instance.
71
+ chat_template: The chat template to use with this processor. Typically, this is unset as the processor
72
+ configuration on Hugging Face Hub includes this value already.
73
+ image_seq_length: The number of soft tokens per image. Typically, this is unset as the processor
74
+ configuration on Hugging Face Hub includes this value already.
75
+ policy_definitions: A mapping from policy name to its text description, used as the default set of policies to
76
+ classify images against. The policy descriptions are included in the text of the prompts generated by
77
+ this processor. Typically, this is unset as the processor configuration on Hugging Face Hub includes
78
+ the base policies ShieldGemma was trained on.
79
+ """
80
+ super().__init__(image_processor, tokenizer, chat_template, image_seq_length, **kwargs)
81
+ if policy_definitions is None:
82
+ self.policy_definitions = DEFAULT_SHIELDGEMMA2_POLICIES
83
+ else:
84
+ self.policy_definitions = policy_definitions
85
+
86
+ def __call__(
87
+ self,
88
+ images: ImageInput = None,
89
+ text=None,
90
+ videos=None,
91
+ audio=None,
92
+ **kwargs: Unpack[ShieldGemma2ProcessorKwargs],
93
+ ) -> BatchFeature:
94
+ """Generates a batch of inputs from the provided images.
95
+
96
+ ShieldGemma was trained to classify image content for policy compliance using a specific prompt construction.
97
+ This processor generates a batch of such prompts from the provided images by:
98
+
99
+ 1. Creating a list of conversations, one for each `<image, policy>` pair;
100
+ 2. Converting these conversations to text using `self.apply_chat_template()`; and
101
+ 3. Encoding the conversations and images using the same techniques as `Gemma3Processor`.
102
+
103
+ Args:
104
+ images: A single image or a list of images to include in the batch.
105
+ text: Not supported.
106
+ videos: Not supported.
107
+ audio: Not supported.
108
+ kwargs: An optional dictionary of keyword arguments to configure the
109
+ processor. Possible values include:
110
+
111
+ * `custom_policies`: Additional policy definitions that augment the `self.policy_definitions` passed
112
+ into the constructor. Note that `custom_policies` that share a key with `self.policy_definitions`
113
+ will override the policy description
114
+ * `policies`: (Optional) a list of keys in the joint `self.policy_definitions | custom_policies`
115
+ dictionary of specific interest for the provided images. If empty or None, prompts will be
116
+ generated for every key in the joint dictionary.
117
+
118
+ Returns:
119
+ A `BatchFeature` containing `input_ids`, `pixel_values`, etc. where each Tensor is of shape
120
+ `(len(images) * len(policies), )`, and the order within the batch will be
121
+ img1_policy1, ... img1_policyN, ... imgM_policyN.
122
+ """
123
+ del text, videos, audio
124
+
125
+ if not images:
126
+ raise ValueError("ShieldGemma 2 needs images to classify")
127
+ elif not isinstance(images, Sequence):
128
+ images = [images]
129
+
130
+ if not self.chat_template:
131
+ raise ValueError("ShieldGemma 2 requires the use of a specific chat template")
132
+
133
+ # Disable pan and scan
134
+ images_kwargs = kwargs.setdefault("images_kwargs", {})
135
+ if images_kwargs.get("do_pan_and_scan") is True:
136
+ logger.warning_once("ShieldGemma2 does not support pan and scan.")
137
+ images_kwargs["do_pan_and_scan"] = False
138
+
139
+ # Enable padding on the batch during tokenization
140
+ text_kwargs = kwargs.setdefault("text_kwargs", {})
141
+ if "padding" not in text_kwargs:
142
+ text_kwargs["padding"] = kwargs.pop("padding", True)
143
+ text_kwargs["padding_side"] = kwargs.pop("padding_side", "left")
144
+
145
+ policy_definitions: Mapping[str, str] = {
146
+ **self.policy_definitions,
147
+ **kwargs.get("custom_policies", {}),
148
+ }
149
+
150
+ if (policies := kwargs.get("policies")) is None:
151
+ policies = list(policy_definitions.keys())
152
+
153
+ # TODO(ryanmullins): Support images from PIL or URLs.
154
+ messages = []
155
+ expanded_images = []
156
+ for img in images:
157
+ for policy in policies:
158
+ messages.append(
159
+ [
160
+ {
161
+ "role": "user",
162
+ "content": [
163
+ {"type": "image"},
164
+ {"type": "text", "text": policy_definitions[policy]},
165
+ ],
166
+ }
167
+ ]
168
+ )
169
+ expanded_images.append([img])
170
+
171
+ text = self.apply_chat_template(messages, tokenize=False)
172
+ return super().__call__(images=expanded_images, text=text, **kwargs)
173
+
174
+ def batch_decode(self, *args, **kwargs):
175
+ """
176
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
177
+ refer to the docstring of this method for more information.
178
+ """
179
+ return self.tokenizer.batch_decode(*args, **kwargs)
180
+
181
+ def decode(self, *args, **kwargs):
182
+ """
183
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
184
+ the docstring of this method for more information.
185
+ """
186
+ return self.tokenizer.decode(*args, **kwargs)
187
+
188
+ @property
189
+ def model_input_names(self):
190
+ tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
191
+ image_processor_input_names = self.image_processor.model_input_names
192
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
193
+
194
+
195
+ __all__ = ["ShieldGemma2Processor"]
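A sketch of the `policies` and `custom_policies` keywords described in `__call__` above; the checkpoint name and image paths are placeholders:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("google/shieldgemma-2-4b-it")  # assumed checkpoint
images = [Image.open("a.jpg"), Image.open("b.jpg")]                      # placeholder images

# A custom policy augments (or overrides) the defaults passed into the constructor.
custom_policies = {"weapons": "No Weapons: The image shall not depict usable firearms."}

# Restricting to two policy keys gives batch size len(images) * len(policies) = 4,
# ordered img1_dangerous, img1_weapons, img2_dangerous, img2_weapons.
batch = processor(
    images=images,
    custom_policies=custom_policies,
    policies=["dangerous", "weapons"],
    return_tensors="pt",
)
print(batch["input_ids"].shape[0])  # 4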
docs/transformers/build/lib/transformers/models/siglip/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_siglip import *
22
+ from .image_processing_siglip import *
23
+ from .image_processing_siglip_fast import *
24
+ from .modeling_siglip import *
25
+ from .processing_siglip import *
26
+ from .tokenization_siglip import *
27
+ else:
28
+ import sys
29
+
30
+ _file = globals()["__file__"]
31
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/siglip/configuration_siglip.py ADDED
@@ -0,0 +1,269 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Siglip model configuration"""
16
+
17
+ from ...configuration_utils import PretrainedConfig
18
+ from ...utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class SiglipTextConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
27
+ Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
28
+ configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
29
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+ Args:
35
+ vocab_size (`int`, *optional*, defaults to 32000):
36
+ Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by
37
+ the `inputs_ids` passed when calling [`SiglipModel`].
38
+ hidden_size (`int`, *optional*, defaults to 768):
39
+ Dimensionality of the encoder layers and the pooler layer.
40
+ intermediate_size (`int`, *optional*, defaults to 3072):
41
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
42
+ num_hidden_layers (`int`, *optional*, defaults to 12):
43
+ Number of hidden layers in the Transformer encoder.
44
+ num_attention_heads (`int`, *optional*, defaults to 12):
45
+ Number of attention heads for each attention layer in the Transformer encoder.
46
+ max_position_embeddings (`int`, *optional*, defaults to 64):
47
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
48
+ just in case (e.g., 512 or 1024 or 2048).
49
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
50
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
51
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
52
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
53
+ The epsilon used by the layer normalization layers.
54
+ attention_dropout (`float`, *optional*, defaults to 0.0):
55
+ The dropout ratio for the attention probabilities.
56
+ pad_token_id (`int`, *optional*, defaults to 1):
57
+ The id of the padding token in the vocabulary.
58
+ bos_token_id (`int`, *optional*, defaults to 49406):
59
+ The id of the beginning-of-sequence token in the vocabulary.
60
+ eos_token_id (`int`, *optional*, defaults to 49407):
61
+ The id of the end-of-sequence token in the vocabulary.
62
+ projection_size (`int`, *optional*, defaults to `hidden_size`):
63
+ The size of the projection head.
64
+
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import SiglipTextConfig, SiglipTextModel
69
+
70
+ >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
71
+ >>> configuration = SiglipTextConfig()
72
+
73
+ >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
74
+ >>> model = SiglipTextModel(configuration)
75
+
76
+ >>> # Accessing the model configuration
77
+ >>> configuration = model.config
78
+ ```"""
79
+
80
+ model_type = "siglip_text_model"
81
+ base_config_key = "text_config"
82
+
83
+ def __init__(
84
+ self,
85
+ vocab_size=32000,
86
+ hidden_size=768,
87
+ intermediate_size=3072,
88
+ num_hidden_layers=12,
89
+ num_attention_heads=12,
90
+ max_position_embeddings=64,
91
+ hidden_act="gelu_pytorch_tanh",
92
+ layer_norm_eps=1e-6,
93
+ attention_dropout=0.0,
94
+ # This differs from `CLIPTokenizer`'s default and from openai/siglip
95
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
96
+ pad_token_id=1,
97
+ bos_token_id=49406,
98
+ eos_token_id=49407,
99
+ projection_size=None,
100
+ **kwargs,
101
+ ):
102
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
103
+
104
+ self.vocab_size = vocab_size
105
+ self.hidden_size = hidden_size
106
+ self.intermediate_size = intermediate_size
107
+ self.num_hidden_layers = num_hidden_layers
108
+ self.num_attention_heads = num_attention_heads
109
+ self.max_position_embeddings = max_position_embeddings
110
+ self.layer_norm_eps = layer_norm_eps
111
+ self.hidden_act = hidden_act
112
+ self.attention_dropout = attention_dropout
113
+ self.projection_size = projection_size if projection_size is not None else hidden_size
114
+
115
+
116
+ class SiglipVisionConfig(PretrainedConfig):
117
+ r"""
118
+ This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
119
+ Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
120
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
121
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
122
+
123
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
124
+ documentation from [`PretrainedConfig`] for more information.
125
+
126
+ Args:
127
+ hidden_size (`int`, *optional*, defaults to 768):
128
+ Dimensionality of the encoder layers and the pooler layer.
129
+ intermediate_size (`int`, *optional*, defaults to 3072):
130
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
131
+ num_hidden_layers (`int`, *optional*, defaults to 12):
132
+ Number of hidden layers in the Transformer encoder.
133
+ num_attention_heads (`int`, *optional*, defaults to 12):
134
+ Number of attention heads for each attention layer in the Transformer encoder.
135
+ num_channels (`int`, *optional*, defaults to 3):
136
+ Number of channels in the input images.
137
+ image_size (`int`, *optional*, defaults to 224):
138
+ The size (resolution) of each image.
139
+ patch_size (`int`, *optional*, defaults to 16):
140
+ The size (resolution) of each patch.
141
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
142
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
143
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
144
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
145
+ The epsilon used by the layer normalization layers.
146
+ attention_dropout (`float`, *optional*, defaults to 0.0):
147
+ The dropout ratio for the attention probabilities.
148
+
149
+ Example:
150
+
151
+ ```python
152
+ >>> from transformers import SiglipVisionConfig, SiglipVisionModel
153
+
154
+ >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
155
+ >>> configuration = SiglipVisionConfig()
156
+
157
+ >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
158
+ >>> model = SiglipVisionModel(configuration)
159
+
160
+ >>> # Accessing the model configuration
161
+ >>> configuration = model.config
162
+ ```"""
163
+
164
+ model_type = "siglip_vision_model"
165
+ base_config_key = "vision_config"
166
+
167
+ def __init__(
168
+ self,
169
+ hidden_size=768,
170
+ intermediate_size=3072,
171
+ num_hidden_layers=12,
172
+ num_attention_heads=12,
173
+ num_channels=3,
174
+ image_size=224,
175
+ patch_size=16,
176
+ hidden_act="gelu_pytorch_tanh",
177
+ layer_norm_eps=1e-6,
178
+ attention_dropout=0.0,
179
+ **kwargs,
180
+ ):
181
+ super().__init__(**kwargs)
182
+
183
+ self.hidden_size = hidden_size
184
+ self.intermediate_size = intermediate_size
185
+ self.num_hidden_layers = num_hidden_layers
186
+ self.num_attention_heads = num_attention_heads
187
+ self.num_channels = num_channels
188
+ self.patch_size = patch_size
189
+ self.image_size = image_size
190
+ self.attention_dropout = attention_dropout
191
+ self.layer_norm_eps = layer_norm_eps
192
+ self.hidden_act = hidden_act
193
+
194
+
195
+ class SiglipConfig(PretrainedConfig):
196
+ r"""
197
+ [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
198
+ instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
199
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
200
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
201
+
202
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
203
+ documentation from [`PretrainedConfig`] for more information.
204
+
205
+ Args:
206
+ text_config (`dict`, *optional*):
207
+ Dictionary of configuration options used to initialize [`SiglipTextConfig`].
208
+ vision_config (`dict`, *optional*):
209
+ Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
210
+ kwargs (*optional*):
211
+ Dictionary of keyword arguments.
212
+
213
+ Example:
214
+
215
+ ```python
216
+ >>> from transformers import SiglipConfig, SiglipModel
217
+
218
+ >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
219
+ >>> configuration = SiglipConfig()
220
+
221
+ >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
222
+ >>> model = SiglipModel(configuration)
223
+
224
+ >>> # Accessing the model configuration
225
+ >>> configuration = model.config
226
+
227
+ >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
228
+ >>> from transformers import SiglipTextConfig, SiglipVisionConfig
229
+
230
+ >>> # Initializing a SiglipText and SiglipVision configuration
231
+ >>> config_text = SiglipTextConfig()
232
+ >>> config_vision = SiglipVisionConfig()
233
+
234
+ >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
235
+ ```"""
236
+
237
+ model_type = "siglip"
238
+ sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig}
239
+
240
+ def __init__(self, text_config=None, vision_config=None, **kwargs):
241
+ super().__init__(**kwargs)
242
+
243
+ if text_config is None:
244
+ text_config = {}
245
+ logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
246
+
247
+ if vision_config is None:
248
+ vision_config = {}
249
+ logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
250
+
251
+ self.text_config = SiglipTextConfig(**text_config)
252
+ self.vision_config = SiglipVisionConfig(**vision_config)
253
+
254
+ self.initializer_factor = 1.0
255
+
256
+ @classmethod
257
+ def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
258
+ r"""
259
+ Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
260
+ model configuration.
261
+
262
+ Returns:
263
+ [`SiglipConfig`]: An instance of a configuration object
264
+ """
265
+
266
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
267
+
268
+
269
+ __all__ = ["SiglipConfig", "SiglipTextConfig", "SiglipVisionConfig"]
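
As a quick illustration of how the sub-configs above compose, here is a minimal sketch (not part of the diff; the override values are arbitrary):

```python
from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig

# Override a few sub-config fields via plain dicts (values are illustrative only).
config = SiglipConfig(
    text_config={"hidden_size": 768, "max_position_embeddings": 64},
    vision_config={"image_size": 384, "patch_size": 16},
)
print(config.vision_config.image_size)  # 384

# Equivalent construction from explicit sub-config objects.
config = SiglipConfig.from_text_vision_configs(
    SiglipTextConfig(vocab_size=32000),
    SiglipVisionConfig(image_size=384),
)
print(config.text_config.projection_size)  # falls back to hidden_size (768) when unset
```

The conversion script below relies on the same fallback: when the text and vision towers have different hidden sizes (the `giant-opt` case), it sets `projection_size` on the text config so both towers project into a shared embedding dimension.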
docs/transformers/build/lib/transformers/models/siglip/convert_siglip_to_hf.py ADDED
@@ -0,0 +1,533 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert SigLIP checkpoints from the original repository.
16
+
17
+ URL: https://github.com/google-research/big_vision/tree/main
18
+ """
19
+
20
+ import argparse
21
+ import collections
22
+ import os
23
+ from typing import Tuple
24
+
25
+ import numpy as np
26
+ import requests
27
+ import torch
28
+ from huggingface_hub import hf_hub_download
29
+ from numpy import load
30
+ from PIL import Image
31
+
32
+ from transformers import (
33
+ GemmaTokenizerFast,
34
+ SiglipConfig,
35
+ SiglipImageProcessor,
36
+ SiglipModel,
37
+ SiglipProcessor,
38
+ SiglipTokenizer,
39
+ )
40
+ from transformers.utils import logging
41
+
42
+
43
+ logging.set_verbosity_info()
44
+ logger = logging.get_logger(__name__)
45
+
46
+
47
+ MODEL_CONFIGS = {
48
+ "base": {
49
+ "hidden_size": 768,
50
+ "intermediate_size": 3072,
51
+ "num_hidden_layers": 12,
52
+ "num_attention_heads": 12,
53
+ },
54
+ "large": {
55
+ "hidden_size": 1024,
56
+ "intermediate_size": 4096,
57
+ "num_hidden_layers": 24,
58
+ "num_attention_heads": 16,
59
+ },
60
+ "giant-opt": {
61
+ "hidden_size": 1536,
62
+ "intermediate_size": 6144,
63
+ "num_hidden_layers": 40,
64
+ "num_attention_heads": 16,
65
+ },
66
+ "so400m": {
67
+ "hidden_size": 1152,
68
+ "intermediate_size": 4304,
69
+ "num_hidden_layers": 27,
70
+ "num_attention_heads": 16,
71
+ },
72
+ }
73
+
74
+ model_name_to_checkpoint = {
75
+ # base checkpoints
76
+ "siglip-base-patch16-224": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_224_63724782.npz",
77
+ "siglip-base-patch16-256": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_256_60500360.npz",
78
+ "siglip-base-patch16-384": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_384_68578854.npz",
79
+ "siglip-base-patch16-512": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_512_68580893.npz",
80
+ # large checkpoints
81
+ "siglip-large-patch16-256": "/Users/nielsrogge/Documents/SigLIP/webli_en_l16_256_60552751.npz",
82
+ "siglip-large-patch16-384": "/Users/nielsrogge/Documents/SigLIP/webli_en_l16_384_63634585.npz",
83
+ # multilingual checkpoint
84
+ "siglip-base-patch16-256-i18n": "/Users/nielsrogge/Documents/SigLIP/webli_i18n_b16_256_66117334.npz",
85
+ # so400m checkpoints
86
+ "siglip-so400m-patch14-384": "/Users/nielsrogge/Documents/SigLIP/webli_en_so400m_384_58765454.npz",
87
+ # ----------------- v2 -----------------
88
+ # base checkpoints
89
+ "siglip2-base-patch32-256": "gv-hf/siglip2/siglip2_b32_256.npz",
90
+ "siglip2-base-patch16-224": "gv-hf/siglip2/siglip2_b16_224.npz",
91
+ "siglip2-base-patch16-256": "gv-hf/siglip2/siglip2_b16_256.npz",
92
+ "siglip2-base-patch16-384": "gv-hf/siglip2/siglip2_b16_384.npz",
93
+ "siglip2-base-patch16-512": "gv-hf/siglip2/siglip2_b16_512.npz",
94
+ # large checkpoints
95
+ "siglip2-large-patch16-256": "gv-hf/siglip2/siglip2_l16_256.npz",
96
+ "siglip2-large-patch16-384": "gv-hf/siglip2/siglip2_l16_384.npz",
97
+ "siglip2-large-patch16-512": "gv-hf/siglip2/siglip2_l16_512.npz",
98
+ # giant opt checkpoints
99
+ "siglip2-giant-opt-patch16-256": "gv-hf/siglip2/siglip2_g-opt16_256.npz",
100
+ "siglip2-giant-opt-patch16-384": "gv-hf/siglip2/siglip2_g-opt16_384.npz",
101
+ # so400m checkpoints
102
+ "siglip2-so400m-patch14-224": "gv-hf/siglip2/siglip2_so400m14_224.npz",
103
+ "siglip2-so400m-patch14-384": "gv-hf/siglip2/siglip2_so400m14_384.npz",
104
+ "siglip2-so400m-patch16-256": "gv-hf/siglip2/siglip2_so400m16_256.npz",
105
+ "siglip2-so400m-patch16-384": "gv-hf/siglip2/siglip2_so400m16_384.npz",
106
+ "siglip2-so400m-patch16-512": "gv-hf/siglip2/siglip2_so400m16_512.npz",
107
+ }
108
+
109
+ # ------------------------------------------------------------------------------------------------------
110
+ # CONFIG
111
+ # ------------------------------------------------------------------------------------------------------
112
+
113
+
114
+ def get_image_size_from_model_name(model_name: str) -> int:
115
+ if "-i18n" not in model_name:
116
+ size = model_name.split("-")[-1]
117
+ else:
118
+ size = model_name.split("-")[-2]
119
+ return int(size)
120
+
121
+
122
+ def get_patch_size_from_model_name(model_name: str) -> int:
123
+ patch_str = [x for x in model_name.split("-") if "patch" in x][0]
124
+ return int(patch_str[-2:])
125
+
126
+
127
+ def get_vocab_size_from_model_name(model_name: str) -> int:
128
+ if "siglip2" in model_name:
129
+ vocab_size = 256000
130
+ elif "-i18n" in model_name:
131
+ vocab_size = 250000
132
+ else:
133
+ vocab_size = 32000
134
+ return vocab_size
135
+
136
+
137
+ def get_vocab_file_from_model_name(model_name: str) -> str:
138
+ # get vocab file
139
+ if "i18n" in model_name:
140
+ vocab_file = "/Users/nielsrogge/Documents/SigLIP/multilingual_vocab/sentencepiece.model"
141
+ else:
142
+ vocab_file = "/Users/nielsrogge/Documents/SigLIP/english_vocab/sentencepiece.model"
143
+ return vocab_file
144
+
145
+
146
+ def get_text_and_vision_vit_variants(model_name: str) -> Tuple[str, str]:
147
+ variant = model_name.split("-")[1] if "giant-opt" not in model_name else "giant-opt"
148
+ return {
149
+ "base": ("base", "base"),
150
+ "large": ("large", "large"),
151
+ "so400m": ("so400m", "so400m"),
152
+ # g-opt siglip2 is not symmetric
153
+ "giant-opt": ("so400m", "giant-opt"),
154
+ }[variant]
155
+
156
+
157
+ def get_siglip_config(model_name):
158
+ text_variant, vision_variant = get_text_and_vision_vit_variants(model_name)
159
+ text_config = MODEL_CONFIGS[text_variant].copy()
160
+ vision_config = MODEL_CONFIGS[vision_variant].copy()
161
+
162
+ text_config["vocab_size"] = get_vocab_size_from_model_name(model_name)
163
+ vision_config["image_size"] = get_image_size_from_model_name(model_name)
164
+ vision_config["patch_size"] = get_patch_size_from_model_name(model_name)
165
+
166
+ if text_config["hidden_size"] != vision_config["hidden_size"]:
167
+ text_config["projection_size"] = vision_config["hidden_size"]
168
+
169
+ return SiglipConfig(text_config=text_config, vision_config=vision_config)
170
+
171
+
172
+ # ------------------------------------------------------------------------------------------------------
173
+ # PROCESSING
174
+ # ------------------------------------------------------------------------------------------------------
175
+
176
+
177
+ def get_tokenizer(model_name: str) -> GemmaTokenizerFast:
178
+ if "siglip2" in model_name:
179
+ tokenizer = GemmaTokenizerFast.from_pretrained(
180
+ "google/gemma-2-9b-it",
181
+ add_bos_token=False,
182
+ add_eos_token=True,
183
+ padding_side="right",
184
+ do_lower_case=True,
185
+ # important: make tokenizer NOT return attention_mask since original one doesn't require it
186
+ model_input_names=["input_ids"],
187
+ )
188
+ else:
189
+ # for siglip v1
190
+ vocab_file = get_vocab_file_from_model_name(model_name)
191
+ # important: make tokenizer not return attention_mask since original one doesn't require it
192
+ tokenizer = SiglipTokenizer(vocab_file=vocab_file, model_input_names=["input_ids"])
193
+ return tokenizer
194
+
195
+
196
+ def get_image_processor(model_name: str) -> SiglipImageProcessor:
197
+ image_size = get_image_size_from_model_name(model_name)
198
+ size = {"height": image_size, "width": image_size}
199
+ if "siglip2" in model_name:
200
+ image_processor = SiglipImageProcessor(size=size, resample=2) # bilinear resampling
201
+ else:
202
+ image_processor = SiglipImageProcessor(size=size)
203
+ return image_processor
204
+
205
+
206
+ # ------------------------------------------------------------------------------------------------------
207
+ # CONVERT FUNCTIONS
208
+ # ------------------------------------------------------------------------------------------------------
209
+
210
+
211
+ def split_encoderblock_layers(state_dict: dict) -> dict:
212
+ """
213
+ Split the encoderblock weight into layers. In some cases they are concatenated in
214
+ the original checkpoints.
215
+ """
216
+ # Make shallow copy
217
+ state_dict = state_dict.copy()
218
+ # Split encoderblock weight into layers
219
+ keys = list(state_dict.keys())
220
+ for key in keys:
221
+ if "/encoderblock/" in key:
222
+ weight = state_dict.pop(key)
223
+ for i, weight_i in enumerate(weight):
224
+ new_name = key.replace("encoderblock", f"encoderblock_{i}")
225
+ state_dict[new_name] = weight_i
226
+ return state_dict
227
+
228
+
229
+ def create_rename_keys(config):
230
+ rename_keys = []
231
+ # fmt: off
232
+
233
+ # vision encoder
234
+
235
+ rename_keys.append(("params/img/embedding/kernel", "vision_model.embeddings.patch_embedding.weight"))
236
+ rename_keys.append(("params/img/embedding/bias", "vision_model.embeddings.patch_embedding.bias"))
237
+ rename_keys.append(("params/img/pos_embedding", "vision_model.embeddings.position_embedding.weight"))
238
+
239
+ for i in range(config.vision_config.num_hidden_layers):
240
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/scale", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
241
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
242
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/scale", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
243
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
244
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
245
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
246
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
247
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
248
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight"))
249
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias"))
250
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight"))
251
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias"))
252
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight"))
253
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias"))
254
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"vision_model.encoder.layers.{i}.self_attn.out_proj.weight"))
255
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"vision_model.encoder.layers.{i}.self_attn.out_proj.bias"))
256
+
257
+ rename_keys.append(("params/img/Transformer/encoder_norm/scale", "vision_model.post_layernorm.weight"))
258
+ rename_keys.append(("params/img/Transformer/encoder_norm/bias", "vision_model.post_layernorm.bias"))
259
+
260
+ rename_keys.append(("params/img/MAPHead_0/probe", "vision_model.head.probe"))
261
+ rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/scale", "vision_model.head.layernorm.weight"))
262
+ rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/bias", "vision_model.head.layernorm.bias"))
263
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/kernel", "vision_model.head.mlp.fc1.weight"))
264
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/bias", "vision_model.head.mlp.fc1.bias"))
265
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/kernel", "vision_model.head.mlp.fc2.weight"))
266
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/bias", "vision_model.head.mlp.fc2.bias"))
267
+ rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/kernel", "vision_model.head.attention.out_proj.weight"))
268
+ rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/bias", "vision_model.head.attention.out_proj.bias"))
269
+
270
+ # text encoder
271
+
272
+ rename_keys.append(("params/txt/Embed_0/embedding", "text_model.embeddings.token_embedding.weight"))
273
+ rename_keys.append(("params/txt/pos_embedding", "text_model.embeddings.position_embedding.weight"))
274
+
275
+ for i in range(config.text_config.num_hidden_layers):
276
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/scale", f"text_model.encoder.layers.{i}.layer_norm1.weight"))
277
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/bias", f"text_model.encoder.layers.{i}.layer_norm1.bias"))
278
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/scale", f"text_model.encoder.layers.{i}.layer_norm2.weight"))
279
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/bias", f"text_model.encoder.layers.{i}.layer_norm2.bias"))
280
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"text_model.encoder.layers.{i}.mlp.fc1.weight"))
281
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"text_model.encoder.layers.{i}.mlp.fc1.bias"))
282
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"text_model.encoder.layers.{i}.mlp.fc2.weight"))
283
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"text_model.encoder.layers.{i}.mlp.fc2.bias"))
284
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"text_model.encoder.layers.{i}.self_attn.k_proj.weight"))
285
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"text_model.encoder.layers.{i}.self_attn.k_proj.bias"))
286
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"text_model.encoder.layers.{i}.self_attn.v_proj.weight"))
287
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"text_model.encoder.layers.{i}.self_attn.v_proj.bias"))
288
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"text_model.encoder.layers.{i}.self_attn.q_proj.weight"))
289
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"text_model.encoder.layers.{i}.self_attn.q_proj.bias"))
290
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"text_model.encoder.layers.{i}.self_attn.out_proj.weight"))
291
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"text_model.encoder.layers.{i}.self_attn.out_proj.bias"))
292
+
293
+ rename_keys.append(("params/txt/Encoder_0/encoder_norm/scale", "text_model.final_layer_norm.weight"))
294
+ rename_keys.append(("params/txt/Encoder_0/encoder_norm/bias", "text_model.final_layer_norm.bias"))
295
+ rename_keys.append(("params/txt/head/kernel", "text_model.head.weight"))
296
+ rename_keys.append(("params/txt/head/bias", "text_model.head.bias"))
297
+
298
+ # learned temperature and bias
299
+ rename_keys.append(("params/t", "logit_scale"))
300
+ rename_keys.append(("params/b", "logit_bias"))
301
+
302
+ # fmt: on
303
+ return rename_keys
304
+
305
+
306
+ def rename_key(dct, old, new, config):
307
+ val = dct.pop(old)
308
+
309
+ if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "vision" in new:
310
+ val = val.reshape(-1, config.vision_config.hidden_size)
311
+ if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "text" in new:
312
+ val = val.reshape(-1, config.text_config.hidden_size)
313
+
314
+ if "patch_embedding.weight" in new:
315
+ val = val.transpose(3, 2, 0, 1)
316
+ elif new.endswith("weight") and "position_embedding" not in new and "token_embedding" not in new:
317
+ val = val.T
318
+
319
+ if "position_embedding" in new and "vision" in new:
320
+ val = val.reshape(-1, config.vision_config.hidden_size)
321
+ if "position_embedding" in new and "text" in new:
322
+ val = val.reshape(-1, config.text_config.hidden_size)
323
+
324
+ if new.endswith("bias"):
325
+ val = val.reshape(-1)
326
+
327
+ dct[new] = torch.from_numpy(val)
328
+
329
+
330
+ def read_in_q_k_v_head(state_dict, config):
331
+ # read in individual input projection layers
332
+ key_proj_weight = (
333
+ state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/kernel")
334
+ .reshape(-1, config.vision_config.hidden_size)
335
+ .T
336
+ )
337
+ key_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/bias").reshape(-1)
338
+ value_proj_weight = (
339
+ state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/kernel")
340
+ .reshape(-1, config.vision_config.hidden_size)
341
+ .T
342
+ )
343
+ value_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/bias").reshape(-1)
344
+ query_proj_weight = (
345
+ state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/kernel")
346
+ .reshape(-1, config.vision_config.hidden_size)
347
+ .T
348
+ )
349
+ query_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/bias").reshape(-1)
350
+
351
+ # next, add them to the state dict as a single matrix + vector
352
+ state_dict["vision_model.head.attention.in_proj_weight"] = torch.from_numpy(
353
+ np.concatenate([query_proj_weight, key_proj_weight, value_proj_weight], axis=0)
354
+ )
355
+ state_dict["vision_model.head.attention.in_proj_bias"] = torch.from_numpy(
356
+ np.concatenate([query_proj_bias, key_proj_bias, value_proj_bias], axis=0)
357
+ )
358
+
359
+
360
+ # We will verify our results on an image of cute cats
361
+ def prepare_img():
362
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
363
+ image = Image.open(requests.get(url, stream=True).raw)
364
+ return image
365
+
366
+
367
+ def flatten_nested_dict(params, parent_key="", sep="/"):
368
+ items = []
369
+
370
+ for k, v in params.items():
371
+ new_key = parent_key + sep + k if parent_key else k
372
+
373
+ if isinstance(v, collections.abc.MutableMapping):
374
+ items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
375
+ else:
376
+ items.append((new_key, v))
377
+ return dict(items)
378
+
379
+
380
+ @torch.no_grad()
381
+ def convert_siglip_checkpoint(model_name, pytorch_dump_folder_path, verify_logits=True, push_to_hub=False):
382
+ """
383
+ Copy/paste/tweak model's weights to our SigLIP structure.
384
+ """
385
+
386
+ # Define default SigLIP configuration
387
+ config = get_siglip_config(model_name)
388
+
389
+ # Get checkpoint
390
+ checkpoint = model_name_to_checkpoint[model_name]
391
+ if not os.path.exists(checkpoint):
392
+ org, repo_id, *filepath = checkpoint.split("/")
393
+ checkpoint = hf_hub_download(repo_id=f"{org}/{repo_id}", filename="/".join(filepath))
394
+
395
+ # Load original state dict
396
+ data = load(checkpoint)
397
+ state_dict = flatten_nested_dict(data)
398
+ state_dict = split_encoderblock_layers(state_dict)
399
+
400
+ # Remove and rename some keys
401
+ rename_keys = create_rename_keys(config)
402
+ for src, dest in rename_keys:
403
+ rename_key(state_dict, src, dest, config)
404
+
405
+ # qkv matrices of attention pooling head need special treatment
406
+ read_in_q_k_v_head(state_dict, config)
407
+
408
+ # Load HuggingFace model
409
+ model = SiglipModel(config).eval()
410
+ model.load_state_dict(state_dict)
411
+
412
+ # Create processor
413
+ image_processor = get_image_processor(model_name)
414
+ tokenizer = get_tokenizer(model_name)
415
+ processor = SiglipProcessor(image_processor=image_processor, tokenizer=tokenizer)
416
+
417
+ # Verify forward pass on dummy images and texts
418
+ url_1 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-ipod.jpg"
419
+ image_1 = Image.open(requests.get(url_1, stream=True).raw).convert("RGB")
420
+ url_2 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-blank.jpg"
421
+ image_2 = Image.open(requests.get(url_2, stream=True).raw).convert("RGB")
422
+ texts = ["an apple", "a picture of an apple"]
423
+
424
+ inputs = processor(images=[image_1, image_2], text=texts, padding="max_length", max_length=64, return_tensors="pt")
425
+ with torch.no_grad():
426
+ outputs = model(**inputs)
427
+
428
+ if verify_logits:
429
+ image_size = config.vision_config.image_size
430
+
431
+ # verify input_ids against original ones
432
+ if image_size == 224:
433
+ filename = "siglip_pixel_values.pt"
434
+ elif image_size == 256:
435
+ filename = "siglip_pixel_values_256.pt"
436
+ elif image_size == 384:
437
+ filename = "siglip_pixel_values_384.pt"
438
+ elif image_size == 512:
439
+ filename = "siglip_pixel_values_512.pt"
440
+ else:
441
+ raise ValueError("Image size not supported")
442
+
443
+ filepath = hf_hub_download(repo_id="nielsr/test-image", filename=filename, repo_type="dataset")
444
+ original_pixel_values = torch.load(filepath, weights_only=True)
445
+ filepath = hf_hub_download(repo_id="nielsr/test-image", filename="siglip_input_ids.pt", repo_type="dataset")
446
+ original_input_ids = torch.load(filepath, weights_only=True)
447
+
448
+ if "i18n" not in model_name:
449
+ assert inputs.input_ids.tolist() == original_input_ids.tolist()
450
+
451
+ print("Mean of original pixel values:", original_pixel_values.mean())
452
+ print("Mean of new pixel values:", inputs.pixel_values.mean())
453
+
454
+ # note: we're testing with original pixel values here since we don't have exact pixel values
455
+ with torch.no_grad():
456
+ outputs = model(input_ids=original_input_ids, pixel_values=original_pixel_values)
457
+ print(outputs.logits_per_image[:3, :3])
458
+
459
+ probs = torch.sigmoid(outputs.logits_per_image) # these are the probabilities
460
+ print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
461
+ print(f"{probs[0][1]:.1%} that image 0 is '{texts[1]}'")
462
+
463
+ if model_name == "siglip-base-patch16-224":
464
+ expected_slice = torch.tensor(
465
+ [[-2.9621, -2.1672], [-0.2713, 0.2910]],
466
+ )
467
+ elif model_name == "siglip-base-patch16-256":
468
+ expected_slice = torch.tensor(
469
+ [[-3.1146, -1.9894], [-0.7312, 0.6387]],
470
+ )
471
+ elif model_name == "siglip-base-patch16-384":
472
+ expected_slice = torch.tensor(
473
+ [[-2.8098, -2.1891], [-0.4242, 0.4102]],
474
+ )
475
+ elif model_name == "siglip-base-patch16-512":
476
+ expected_slice = torch.tensor(
477
+ [[-2.7899, -2.2668], [-0.4295, -0.0735]],
478
+ )
479
+ elif model_name == "siglip-large-patch16-256":
480
+ expected_slice = torch.tensor(
481
+ [[-1.5827, -0.5801], [-0.9153, 0.1363]],
482
+ )
483
+ elif model_name == "siglip-large-patch16-384":
484
+ expected_slice = torch.tensor(
485
+ [[-2.1523, -0.2899], [-0.2959, 0.7884]],
486
+ )
487
+ elif model_name == "siglip-so400m-patch14-384":
488
+ expected_slice = torch.tensor([[-1.2441, -0.6649], [-0.7060, 0.7374]])
489
+ elif model_name == "siglip-base-patch16-256-i18n":
490
+ expected_slice = torch.tensor(
491
+ [[-0.9064, 0.1073], [-0.0299, 0.5304]],
492
+ )
493
+
494
+ assert torch.allclose(outputs.logits_per_image[:3, :3], expected_slice, atol=1e-4)
495
+ print("Looks ok!")
496
+
497
+ if pytorch_dump_folder_path is not None:
498
+ pytorch_dump_folder_path = os.path.join(pytorch_dump_folder_path, model_name)
499
+ os.makedirs(pytorch_dump_folder_path, exist_ok=True)
500
+ print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
501
+ model.save_pretrained(pytorch_dump_folder_path)
502
+ print(f"Saving processor to {pytorch_dump_folder_path}")
503
+ processor.save_pretrained(pytorch_dump_folder_path)
504
+
505
+ if push_to_hub:
506
+ model.push_to_hub(f"s0225/{model_name}", private=True)
507
+ processor.push_to_hub(f"s0225/{model_name}", private=True)
508
+
509
+
510
+ if __name__ == "__main__":
511
+ parser = argparse.ArgumentParser()
512
+ # Required parameters
513
+ parser.add_argument(
514
+ "--model_name",
515
+ default="siglip-base-patch16-224",
516
+ type=str,
517
+ choices=model_name_to_checkpoint.keys(),
518
+ help="Name of the model you'd like to convert.",
519
+ )
520
+ parser.add_argument(
521
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
522
+ )
523
+ parser.add_argument(
524
+ "--verify_logits",
525
+ action="store_true",
526
+ help="Whether to verify logits against the original implementation.",
527
+ )
528
+ parser.add_argument(
529
+ "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
530
+ )
531
+
532
+ args = parser.parse_args()
533
+ convert_siglip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub)
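
Once the converter above has written a checkpoint to `--pytorch_dump_folder_path`, the result loads back like any other 🤗 Transformers model. A minimal sketch, assuming a hypothetical local output directory:

```python
import requests
import torch
from PIL import Image

from transformers import SiglipModel, SiglipProcessor

# Hypothetical directory produced by convert_siglip_checkpoint above.
local_dir = "converted/siglip-base-patch16-224"

model = SiglipModel.from_pretrained(local_dir).eval()
processor = SiglipProcessor.from_pretrained(local_dir)

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
inputs = processor(
    images=image,
    text=["a photo of two cats", "a photo of a dog"],
    padding="max_length",
    max_length=64,
    return_tensors="pt",
)
with torch.no_grad():
    outputs = model(**inputs)

# SigLIP scores image-text pairs with a sigmoid rather than a softmax.
print(torch.sigmoid(outputs.logits_per_image))
```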
docs/transformers/build/lib/transformers/models/siglip/image_processing_siglip.py ADDED
@@ -0,0 +1,244 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for SigLIP."""
16
+
17
+ from typing import Dict, List, Optional, Union
18
+
19
+ from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
20
+ from ...image_transforms import (
21
+ convert_to_rgb,
22
+ resize,
23
+ to_channel_dimension_format,
24
+ )
25
+ from ...image_utils import (
26
+ IMAGENET_STANDARD_MEAN,
27
+ IMAGENET_STANDARD_STD,
28
+ ChannelDimension,
29
+ ImageInput,
30
+ PILImageResampling,
31
+ infer_channel_dimension_format,
32
+ is_scaled_image,
33
+ make_flat_list_of_images,
34
+ to_numpy_array,
35
+ valid_images,
36
+ validate_preprocess_arguments,
37
+ )
38
+ from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
39
+
40
+
41
+ logger = logging.get_logger(__name__)
42
+
43
+
44
+ if is_vision_available():
45
+ import PIL
46
+
47
+
48
+ class SiglipImageProcessor(BaseImageProcessor):
49
+ r"""
50
+ Constructs a SigLIP image processor.
51
+
52
+ Args:
53
+ do_resize (`bool`, *optional*, defaults to `True`):
54
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
55
+ `do_resize` in the `preprocess` method.
56
+ size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
57
+ Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
58
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
59
+ Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
60
+ do_rescale (`bool`, *optional*, defaults to `True`):
61
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
62
+ the `preprocess` method.
63
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
64
+ Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
65
+ method.
66
+ do_normalize (`bool`, *optional*, defaults to `True`):
67
+ Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
68
+ `do_normalize` in the `preprocess` method.
69
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
70
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
71
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
72
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
73
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
74
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
75
+ Can be overridden by the `image_std` parameter in the `preprocess` method.
76
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
77
+ Whether to convert the image to RGB.
78
+ """
79
+
80
+ model_input_names = ["pixel_values"]
81
+
82
+ def __init__(
83
+ self,
84
+ do_resize: bool = True,
85
+ size: Dict[str, int] = None,
86
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
87
+ do_rescale: bool = True,
88
+ rescale_factor: Union[int, float] = 1 / 255,
89
+ do_normalize: bool = True,
90
+ image_mean: Optional[Union[float, List[float]]] = None,
91
+ image_std: Optional[Union[float, List[float]]] = None,
92
+ do_convert_rgb: Optional[bool] = None,
93
+ **kwargs,
94
+ ) -> None:
95
+ super().__init__(**kwargs)
96
+ size = size if size is not None else {"height": 224, "width": 224}
97
+ image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
98
+ image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
99
+
100
+ self.do_resize = do_resize
101
+ self.size = size
102
+ self.resample = resample
103
+ self.do_rescale = do_rescale
104
+ self.rescale_factor = rescale_factor
105
+ self.do_normalize = do_normalize
106
+ self.image_mean = image_mean
107
+ self.image_std = image_std
108
+ self.do_convert_rgb = do_convert_rgb
109
+
110
+ @filter_out_non_signature_kwargs()
111
+ def preprocess(
112
+ self,
113
+ images: ImageInput,
114
+ do_resize: Optional[bool] = None,
115
+ size: Dict[str, int] = None,
116
+ resample: PILImageResampling = None,
117
+ do_rescale: Optional[bool] = None,
118
+ rescale_factor: Optional[float] = None,
119
+ do_normalize: Optional[bool] = None,
120
+ image_mean: Optional[Union[float, List[float]]] = None,
121
+ image_std: Optional[Union[float, List[float]]] = None,
122
+ return_tensors: Optional[Union[str, TensorType]] = None,
123
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
124
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
125
+ do_convert_rgb: Optional[bool] = None,
126
+ ) -> PIL.Image.Image:
127
+ """
128
+ Preprocess an image or batch of images.
129
+
130
+ Args:
131
+ images (`ImageInput`):
132
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
133
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
134
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
135
+ Whether to resize the image.
136
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
137
+ Size of the image after resizing.
138
+ resample (`int`, *optional*, defaults to `self.resample`):
139
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
140
+ has an effect if `do_resize` is set to `True`.
141
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
142
+ Whether to rescale the image.
143
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
144
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
145
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
146
+ Whether to normalize the image.
147
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
148
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
149
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
150
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
151
+ `True`.
152
+ return_tensors (`str` or `TensorType`, *optional*):
153
+ The type of tensors to return. Can be one of:
154
+ - Unset: Return a list of `np.ndarray`.
155
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
156
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
157
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
158
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
159
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
160
+ The channel dimension format for the output image. Can be one of:
161
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
162
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
163
+ - Unset: Use the channel dimension format of the input image.
164
+ input_data_format (`ChannelDimension` or `str`, *optional*):
165
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
166
+ from the input image. Can be one of:
167
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
168
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
169
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
170
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
171
+ Whether to convert the image to RGB.
172
+ """
173
+ do_resize = do_resize if do_resize is not None else self.do_resize
174
+ size = size if size is not None else self.size
175
+ size = get_size_dict(size, param_name="size", default_to_square=False)
176
+ resample = resample if resample is not None else self.resample
177
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
178
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
179
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
180
+ image_mean = image_mean if image_mean is not None else self.image_mean
181
+ image_std = image_std if image_std is not None else self.image_std
182
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
183
+
184
+ images = make_flat_list_of_images(images)
185
+
186
+ if not valid_images(images):
187
+ raise ValueError(
188
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
189
+ "torch.Tensor, tf.Tensor or jax.ndarray."
190
+ )
191
+ validate_preprocess_arguments(
192
+ do_rescale=do_rescale,
193
+ rescale_factor=rescale_factor,
194
+ do_normalize=do_normalize,
195
+ image_mean=image_mean,
196
+ image_std=image_std,
197
+ do_resize=do_resize,
198
+ size=size,
199
+ resample=resample,
200
+ )
201
+ if do_convert_rgb:
202
+ images = [convert_to_rgb(image) for image in images]
203
+
204
+ # All transformations expect numpy arrays.
205
+ images = [to_numpy_array(image) for image in images]
206
+
207
+ if do_rescale and is_scaled_image(images[0]):
208
+ logger.warning_once(
209
+ "It looks like you are trying to rescale already rescaled images. If the input"
210
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
211
+ )
212
+
213
+ if input_data_format is None:
214
+ # We assume that all images have the same channel dimension format.
215
+ input_data_format = infer_channel_dimension_format(images[0])
216
+
217
+ if do_resize:
218
+ height, width = size["height"], size["width"]
219
+ images = [
220
+ resize(image=image, size=(height, width), resample=resample, input_data_format=input_data_format)
221
+ for image in images
222
+ ]
223
+
224
+ if do_rescale:
225
+ images = [
226
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
227
+ for image in images
228
+ ]
229
+
230
+ if do_normalize:
231
+ images = [
232
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
233
+ for image in images
234
+ ]
235
+
236
+ images = [
237
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
238
+ ]
239
+
240
+ data = {"pixel_values": images}
241
+ return BatchFeature(data=data, tensor_type=return_tensors)
242
+
243
+
244
+ __all__ = ["SiglipImageProcessor"]
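
A short usage sketch for the processor defined above (the random image is a stand-in for real data):

```python
import numpy as np
from PIL import Image

from transformers import SiglipImageProcessor

# Random RGB image in place of real data.
image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

# Defaults: resize to 224x224 (bicubic), rescale by 1/255, normalize with mean/std 0.5.
processor = SiglipImageProcessor()
batch = processor(images=image, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```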
docs/transformers/build/lib/transformers/models/siglip/image_processing_siglip_fast.py ADDED
@@ -0,0 +1,41 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Image processor class for SigLIP."""
16
+
17
+ from ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast
18
+ from ...image_utils import (
19
+ IMAGENET_STANDARD_MEAN,
20
+ IMAGENET_STANDARD_STD,
21
+ PILImageResampling,
22
+ )
23
+ from ...utils import add_start_docstrings
24
+
25
+
26
+ @add_start_docstrings(
27
+ "Constructs a fast SigLIP image processor.",
28
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
29
+ )
30
+ class SiglipImageProcessorFast(BaseImageProcessorFast):
31
+ resample = PILImageResampling.BICUBIC
32
+ image_mean = IMAGENET_STANDARD_MEAN
33
+ image_std = IMAGENET_STANDARD_STD
34
+ size = {"height": 224, "width": 224}
35
+ default_to_square = False
36
+ do_resize = True
37
+ do_rescale = True
38
+ do_normalize = True
39
+
40
+
41
+ __all__ = ["SiglipImageProcessorFast"]
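
And a matching sketch for the fast variant, which only overrides class-level defaults and inherits its preprocessing from `BaseImageProcessorFast` (the torchvision-backed fast path is assumed to be installed):

```python
import numpy as np
from PIL import Image

from transformers import SiglipImageProcessorFast

image = Image.fromarray(np.random.randint(0, 256, (300, 400, 3), dtype=np.uint8))

# Same defaults as the slow processor above: 224x224 bicubic resize, 1/255 rescale, 0.5 mean/std.
fast_processor = SiglipImageProcessorFast()
pixel_values = fast_processor(images=image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```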