camenduru commited on
Commit
d8452e4
·
verified ·
1 Parent(s): 280c61a

thanks to fudan-generative-ai ❤

Browse files
Files changed (34) hide show
  1. .gitattributes +1 -0
  2. README.md +259 -0
  3. audio_separator/Kim_Vocal_2.onnx +3 -0
  4. audio_separator/download_checks.json +231 -0
  5. audio_separator/mdx_model_data.json +384 -0
  6. audio_separator/vr_model_data.json +137 -0
  7. cogvideox-5b-i2v-sat/transformer/1/mp_rank_00_model_states.pt +3 -0
  8. cogvideox-5b-i2v-sat/transformer/latest +1 -0
  9. cogvideox-5b-i2v-sat/vae/3d-vae.pt +3 -0
  10. face_analysis/models/1k3d68.onnx +3 -0
  11. face_analysis/models/2d106det.onnx +3 -0
  12. face_analysis/models/buffalo_l.zip +3 -0
  13. face_analysis/models/face_landmarker_v2_with_blendshapes.task +3 -0
  14. face_analysis/models/genderage.onnx +3 -0
  15. face_analysis/models/glintr100.onnx +3 -0
  16. face_analysis/models/scrfd_10g_bnkps.onnx +3 -0
  17. hallo3/latest +1 -0
  18. t5-v1_1-xxl/added_tokens.json +102 -0
  19. t5-v1_1-xxl/config.json +32 -0
  20. t5-v1_1-xxl/model-00001-of-00002.safetensors +3 -0
  21. t5-v1_1-xxl/model-00002-of-00002.safetensors +3 -0
  22. t5-v1_1-xxl/model.safetensors.index.json +226 -0
  23. t5-v1_1-xxl/special_tokens_map.json +125 -0
  24. t5-v1_1-xxl/spiece.model +3 -0
  25. t5-v1_1-xxl/tokenizer_config.json +940 -0
  26. wav2vec/wav2vec2-base-960h/.gitattributes +18 -0
  27. wav2vec/wav2vec2-base-960h/README.md +128 -0
  28. wav2vec/wav2vec2-base-960h/config.json +77 -0
  29. wav2vec/wav2vec2-base-960h/feature_extractor_config.json +8 -0
  30. wav2vec/wav2vec2-base-960h/model.safetensors +3 -0
  31. wav2vec/wav2vec2-base-960h/preprocessor_config.json +8 -0
  32. wav2vec/wav2vec2-base-960h/special_tokens_map.json +1 -0
  33. wav2vec/wav2vec2-base-960h/tokenizer_config.json +1 -0
  34. wav2vec/wav2vec2-base-960h/vocab.json +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ face_analysis/models/face_landmarker_v2_with_blendshapes.task filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+ <h1 align='center'>Hallo3: Highly Dynamic and Realistic Portrait Image Animation with Diffusion Transformer Networks</h1>
5
+
6
+ <div align='center'>
7
+ <a href='https://github.com/cuijh26' target='_blank'>Jiahao Cui</a><sup>1</sup>&emsp;
8
+ <a href='https://github.com/crystallee-ai' target='_blank'>Hui Li</a><sup>1</sup>&emsp;
9
+ <a href='https://github.com/subazinga' target='_blank'>Yun Zhan</a><sup>1</sup>&emsp;
10
+ <a href='https://github.com/NinoNeumann' target='_blank'>Hanlin Shang</a><sup>1</sup>&emsp;
11
+ <a href='https://github.com/Kaihui-Cheng' target='_blank'>Kaihui Cheng</a><sup>1</sup>&emsp;
12
+ <a href='https://github.com/mayuqi7777' target='_blank'>Yuqi Ma</a><sup>1</sup>&emsp;
13
+ <a href='https://github.com/AricGamma' target='_blank'>Shan Mu</a><sup>1</sup>&emsp;
14
+ </div>
15
+ <div align='center'>
16
+ <a href='https://hangz-nju-cuhk.github.io/' target='_blank'>Hang Zhou</a><sup>2</sup>&emsp;
17
+ <a href='https://jingdongwang2017.github.io/' target='_blank'>Jingdong Wang</a><sup>2</sup>&emsp;
18
+ <a href='https://sites.google.com/site/zhusiyucs/home' target='_blank'>Siyu Zhu</a><sup>1✉️</sup>&emsp;
19
+ </div>
20
+
21
+ <div align='center'>
22
+ <sup>1</sup>Fudan University&emsp; <sup>2</sup>Baidu Inc&emsp;
23
+ </div>
24
+
25
+ <br>
26
+ <div align='center'>
27
+ <a href='https://github.com/fudan-generative-vision/hallo3'><img src='https://img.shields.io/github/stars/fudan-generative-vision/hallo3?style=social'></a>
28
+ </div>
29
+ <br>
30
+
31
+ ## 📸 Showcase
32
+
33
+ <table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
34
+ <tr>
35
+ <td>
36
+ <video src="https://github.com/user-attachments/assets/3fc44086-bdbf-4a54-bfe3-62cfd9dfb191" width="100%" controls autoplay loop></video>
37
+ </td>
38
+ <td>
39
+ <video src="https://github.com/user-attachments/assets/ad5a87cf-b50e-48d6-af35-774e3b1713e7" width="100%" controls autoplay loop></video>
40
+ </td>
41
+ <td>
42
+ <video src="https://github.com/user-attachments/assets/78c7acc3-4fa2-447e-b77d-3462d411c81c" width="100%" controls autoplay loop></video>
43
+ </td>
44
+ </tr>
45
+ <tr>
46
+ <td>
47
+ <video src="https://github.com/user-attachments/assets/f62f2b6d-9846-40be-a976-56cc7d5a8a5b" width="100%" controls autoplay loop></video>
48
+ </td>
49
+ <td>
50
+ <video src="https://github.com/user-attachments/assets/42b6968e-c68a-4473-b773-406ccf5d90b1" width="100%" controls autoplay loop></video>
51
+ </td>
52
+ <td>
53
+ <video src="https://github.com/user-attachments/assets/015f1d6d-31a8-4454-b51a-5431d3c953c2" width="100%" controls autoplay loop></video>
54
+ </td>
55
+ </tr>
56
+ </table>
57
+
58
+ Visit our [project page](https://fudan-generative-vision.github.io/hallo3/#/) to view more cases.
59
+
60
+ ## ⚙️ Installation
61
+
62
+ - System requirement: Ubuntu 20.04/Ubuntu 22.04, Cuda 12.1
63
+ - Tested GPUs: H100
64
+
65
+ Download the codes:
66
+
67
+ ```bash
68
+ git clone https://github.com/fudan-generative-vision/hallo3
69
+ cd hallo3
70
+ ```
71
+
72
+ Create conda environment:
73
+
74
+ ```bash
75
+ conda create -n hallo python=3.10
76
+ conda activate hallo
77
+ ```
78
+
79
+ Install packages with `pip`
80
+
81
+ ```bash
82
+ pip install -r requirements.txt
83
+ ```
84
+
85
+ Besides, ffmpeg is also needed:
86
+
87
+ ```bash
88
+ apt-get install ffmpeg
89
+ ```
90
+
91
+ ### 📥 Download Pretrained Models
92
+
93
+ You can easily get all pretrained models required by inference from our [HuggingFace repo](https://huggingface.co/fudan-generative-ai/hallo3).
94
+
95
+ Using `huggingface-cli` to download the models:
96
+
97
+ ```shell
98
+ cd $ProjectRootDir
99
+ pip install huggingface-cli
100
+ huggingface-cli download fudan-generative-ai/hallo3 --local-dir ./pretrained_models
101
+ ```
102
+
103
+ Or you can download them separately from their source repo:
104
+
105
+ - [hallo3](https://huggingface.co/fudan-generative-ai/hallo3/tree/main/hallo3): Our checkpoints.
106
+ - [Cogvidex](https://github.com/THUDM/CogVideo): Cogvideox-5b-i2v pretrained model, consisting of transformer and 3d vae
107
+ - [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl): text encoder, you can download from [text_encoder](https://huggingface.co/THUDM/CogVideoX-2b/tree/main/text_encoder) and [tokenizer](https://huggingface.co/THUDM/CogVideoX-2b/tree/main/tokenizer)
108
+ - [audio_separator](https://huggingface.co/huangjackson/Kim_Vocal_2): Kim Vocal_2 MDX-Net vocal removal model.
109
+ - [wav2vec](https://huggingface.co/facebook/wav2vec2-base-960h): wav audio to vector model from [Facebook](https://huggingface.co/facebook/wav2vec2-base-960h).
110
+ - [insightface](https://github.com/deepinsight/insightface/tree/master/python-package#model-zoo): 2D and 3D Face Analysis placed into `pretrained_models/face_analysis/models/`. (_Thanks to deepinsight_)
111
+ - [face landmarker](https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task): Face detection & mesh model from [mediapipe](https://ai.google.dev/edge/mediapipe/solutions/vision/face_landmarker#models) placed into `pretrained_models/face_analysis/models`.
112
+
113
+ Finally, these pretrained models should be organized as follows:
114
+
115
+ ```text
116
+ ./pretrained_models/
117
+ |-- audio_separator/
118
+ | |-- download_checks.json
119
+ | |-- mdx_model_data.json
120
+ | |-- vr_model_data.json
121
+ | `-- Kim_Vocal_2.onnx
122
+ |-- cogvideox-5b-i2v-sat/
123
+ | |-- transformer/
124
+ | |--1/
125
+ | |-- mp_rank_00_model_states.pt
126
+ | `--latest
127
+ | `-- vae/
128
+ | |-- 3d-vae.pt
129
+ |-- face_analysis/
130
+ | `-- models/
131
+ | |-- face_landmarker_v2_with_blendshapes.task # face landmarker model from mediapipe
132
+ | |-- 1k3d68.onnx
133
+ | |-- 2d106det.onnx
134
+ | |-- genderage.onnx
135
+ | |-- glintr100.onnx
136
+ | `-- scrfd_10g_bnkps.onnx
137
+ |-- hallo3
138
+ | |--1/
139
+ | |-- mp_rank_00_model_states.pt
140
+ | `--latest
141
+ |-- t5-v1_1-xxl/
142
+ | |-- added_tokens.json
143
+ | |-- config.json
144
+ | |-- model-00001-of-00002.safetensors
145
+ | |-- model-00002-of-00002.safetensors
146
+ | |-- model.safetensors.index.json
147
+ | |-- special_tokens_map.json
148
+ | |-- spiece.model
149
+ | |-- tokenizer_config.json
150
+ |
151
+ `-- wav2vec/
152
+ `-- wav2vec2-base-960h/
153
+ |-- config.json
154
+ |-- feature_extractor_config.json
155
+ |-- model.safetensors
156
+ |-- preprocessor_config.json
157
+ |-- special_tokens_map.json
158
+ |-- tokenizer_config.json
159
+ `-- vocab.json
160
+ ```
161
+
162
+ ### 🛠️ Prepare Inference Data
163
+
164
+ Hallo3 has a few simple requirements for the input data of inference:
165
+ 1. Reference image must be 1:1 or 3:2 aspect ratio.
166
+ 2. Driving audio must be in WAV format.
167
+ 3. Audio must be in English since our training datasets are only in this language.
168
+ 4. Ensure the vocals of audio are clear; background music is acceptable.
169
+
170
+ ### 🎮 Run Inference
171
+
172
+ Simply to run the `scripts/inference_long_batch.sh`:
173
+
174
+ ```bash
175
+ bash scripts/inference_long_batch.sh ./examples/inference/input.txt ./output
176
+ ```
177
+
178
+ Animation results will be saved at `./output`. You can find more examples for inference at [examples folder](https://github.com/fudan-generative-vision/hallo3/tree/main/examples).
179
+
180
+
181
+ ## Training
182
+
183
+ #### prepare data for training
184
+ Organize your raw videos into the following directory structure:
185
+ ```text
186
+ dataset_name/
187
+ |-- videos/
188
+ | |-- 0001.mp4
189
+ | |-- 0002.mp4
190
+ | `-- 0003.mp4
191
+ |-- caption/
192
+ | |-- 0001.txt
193
+ | |-- 0002.txt
194
+ | `-- 0003.txt
195
+ ```
196
+ You can use any dataset_name, but ensure the videos directory and caption directory are named as shown above.
197
+
198
+ Next, process the videos with the following commands:
199
+ ```bash
200
+ bash scripts/data_preprocess.sh {dataset_name} {parallelism} {rank} {output_name}
201
+ ```
202
+
203
+ #### Training
204
+
205
+ Update the data meta path settings in the configuration YAML files, `configs/sft_s1.yaml` and `configs/sft_s2.yaml`:
206
+
207
+ ```yaml
208
+ #sft_s1.yaml
209
+ train_data: [
210
+ "./data/output_name.json"
211
+ ]
212
+
213
+ #sft_s2.yaml
214
+ train_data: [
215
+ "./data/output_name.json"
216
+ ]
217
+ ```
218
+
219
+ Start training with the following command:
220
+ ```bash
221
+ # stage1
222
+ bash scripts/finetune_multi_gpus_s1.sh
223
+
224
+ # stage2
225
+ bash scripts/finetune_multi_gpus_s2.sh
226
+ ```
227
+
228
+ ## 📝 Citation
229
+
230
+ If you find our work useful for your research, please consider citing the paper:
231
+
232
+ ```
233
+ @misc{cui2024hallo3,
234
+ title={Hallo3: Highly Dynamic and Realistic Portrait Image Animation with Diffusion Transformer Networks},
235
+ author={Jiahao Cui and Hui Li and Yun Zhang and Hanlin Shang and Kaihui Cheng and Yuqi Ma and Shan Mu and Hang Zhou and Jingdong Wang and Siyu Zhu},
236
+ year={2024},
237
+ eprint={2412.00733},
238
+ archivePrefix={arXiv},
239
+ primaryClass={cs.CV}
240
+ }
241
+ ```
242
+
243
+ ## ⚠️ Social Risks and Mitigations
244
+
245
+ The development of portrait image animation technologies driven by audio inputs poses social risks, such as the ethical implications of creating realistic portraits that could be misused for deepfakes. To mitigate these risks, it is crucial to establish ethical guidelines and responsible use practices. Privacy and consent concerns also arise from using individuals' images and voices. Addressing these involves transparent data usage policies, informed consent, and safeguarding privacy rights. By addressing these risks and implementing mitigations, the research aims to ensure the responsible and ethical development of this technology.
246
+
247
+ ## 🤗 Acknowledgements
248
+
249
+ This model is a fine-tuned derivative version based on the **CogVideo-5B I2V** model. CogVideo-5B is an open-source text-to-video generation model developed by the CogVideoX team. Its original code and model parameters are governed by the [CogVideo-5B LICENSE](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE).
250
+
251
+ As a derivative work of CogVideo-5B, the use, distribution, and modification of this model must comply with the license terms of CogVideo-5B.
252
+
253
+ ## 👏 Community Contributors
254
+
255
+ Thank you to all the contributors who have helped to make this project better!
256
+
257
+ <a href="https://github.com/fudan-generative-vision/hallo2/graphs/contributors">
258
+ <img src="https://contrib.rocks/image?repo=fudan-generative-vision/hallo3" />
259
+ </a>
audio_separator/Kim_Vocal_2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce74ef3b6a6024ce44211a07be9cf8bc6d87728cc852a68ab34eb8e58cde9c8b
3
+ size 66759214
audio_separator/download_checks.json ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "current_version": "UVR_Patch_10_6_23_4_27",
3
+ "current_version_ocl": "UVR_Patch_10_6_23_4_27",
4
+ "current_version_mac": "UVR_Patch_10_6_23_4_27",
5
+ "current_version_linux": "UVR_Patch_10_6_23_4_27",
6
+ "vr_download_list": {
7
+ "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth",
8
+ "VR Arch Single Model v5: 2_HP-UVR": "2_HP-UVR.pth",
9
+ "VR Arch Single Model v5: 3_HP-Vocal-UVR": "3_HP-Vocal-UVR.pth",
10
+ "VR Arch Single Model v5: 4_HP-Vocal-UVR": "4_HP-Vocal-UVR.pth",
11
+ "VR Arch Single Model v5: 5_HP-Karaoke-UVR": "5_HP-Karaoke-UVR.pth",
12
+ "VR Arch Single Model v5: 6_HP-Karaoke-UVR": "6_HP-Karaoke-UVR.pth",
13
+ "VR Arch Single Model v5: 7_HP2-UVR": "7_HP2-UVR.pth",
14
+ "VR Arch Single Model v5: 8_HP2-UVR": "8_HP2-UVR.pth",
15
+ "VR Arch Single Model v5: 9_HP2-UVR": "9_HP2-UVR.pth",
16
+ "VR Arch Single Model v5: 10_SP-UVR-2B-32000-1": "10_SP-UVR-2B-32000-1.pth",
17
+ "VR Arch Single Model v5: 11_SP-UVR-2B-32000-2": "11_SP-UVR-2B-32000-2.pth",
18
+ "VR Arch Single Model v5: 12_SP-UVR-3B-44100": "12_SP-UVR-3B-44100.pth",
19
+ "VR Arch Single Model v5: 13_SP-UVR-4B-44100-1": "13_SP-UVR-4B-44100-1.pth",
20
+ "VR Arch Single Model v5: 14_SP-UVR-4B-44100-2": "14_SP-UVR-4B-44100-2.pth",
21
+ "VR Arch Single Model v5: 15_SP-UVR-MID-44100-1": "15_SP-UVR-MID-44100-1.pth",
22
+ "VR Arch Single Model v5: 16_SP-UVR-MID-44100-2": "16_SP-UVR-MID-44100-2.pth",
23
+ "VR Arch Single Model v5: 17_HP-Wind_Inst-UVR": "17_HP-Wind_Inst-UVR.pth",
24
+ "VR Arch Single Model v5: UVR-De-Echo-Aggressive by FoxJoy": "UVR-De-Echo-Aggressive.pth",
25
+ "VR Arch Single Model v5: UVR-De-Echo-Normal by FoxJoy": "UVR-De-Echo-Normal.pth",
26
+ "VR Arch Single Model v5: UVR-DeEcho-DeReverb by FoxJoy": "UVR-DeEcho-DeReverb.pth",
27
+ "VR Arch Single Model v5: UVR-DeNoise-Lite by FoxJoy": "UVR-DeNoise-Lite.pth",
28
+ "VR Arch Single Model v5: UVR-DeNoise by FoxJoy": "UVR-DeNoise.pth",
29
+ "VR Arch Single Model v5: UVR-BVE-4B_SN-44100-1": "UVR-BVE-4B_SN-44100-1.pth",
30
+ "VR Arch Single Model v4: MGM_HIGHEND_v4": "MGM_HIGHEND_v4.pth",
31
+ "VR Arch Single Model v4: MGM_LOWEND_A_v4": "MGM_LOWEND_A_v4.pth",
32
+ "VR Arch Single Model v4: MGM_LOWEND_B_v4": "MGM_LOWEND_B_v4.pth",
33
+ "VR Arch Single Model v4: MGM_MAIN_v4": "MGM_MAIN_v4.pth"
34
+ },
35
+
36
+ "mdx_download_list": {
37
+ "MDX-Net Model: UVR-MDX-NET Inst HQ 1": "UVR-MDX-NET-Inst_HQ_1.onnx",
38
+ "MDX-Net Model: UVR-MDX-NET Inst HQ 2": "UVR-MDX-NET-Inst_HQ_2.onnx",
39
+ "MDX-Net Model: UVR-MDX-NET Inst HQ 3": "UVR-MDX-NET-Inst_HQ_3.onnx",
40
+ "MDX-Net Model: UVR-MDX-NET Inst HQ 4": "UVR-MDX-NET-Inst_HQ_4.onnx",
41
+ "MDX-Net Model: UVR-MDX-NET Main": "UVR_MDXNET_Main.onnx",
42
+ "MDX-Net Model: UVR-MDX-NET Inst Main": "UVR-MDX-NET-Inst_Main.onnx",
43
+ "MDX-Net Model: UVR-MDX-NET 1": "UVR_MDXNET_1_9703.onnx",
44
+ "MDX-Net Model: UVR-MDX-NET 2": "UVR_MDXNET_2_9682.onnx",
45
+ "MDX-Net Model: UVR-MDX-NET 3": "UVR_MDXNET_3_9662.onnx",
46
+ "MDX-Net Model: UVR-MDX-NET Inst 1": "UVR-MDX-NET-Inst_1.onnx",
47
+ "MDX-Net Model: UVR-MDX-NET Inst 2": "UVR-MDX-NET-Inst_2.onnx",
48
+ "MDX-Net Model: UVR-MDX-NET Inst 3": "UVR-MDX-NET-Inst_3.onnx",
49
+ "MDX-Net Model: UVR-MDX-NET Karaoke": "UVR_MDXNET_KARA.onnx",
50
+ "MDX-Net Model: UVR-MDX-NET Karaoke 2": "UVR_MDXNET_KARA_2.onnx",
51
+ "MDX-Net Model: UVR_MDXNET_9482": "UVR_MDXNET_9482.onnx",
52
+ "MDX-Net Model: UVR-MDX-NET Voc FT": "UVR-MDX-NET-Voc_FT.onnx",
53
+ "MDX-Net Model: Kim Vocal 1": "Kim_Vocal_1.onnx",
54
+ "MDX-Net Model: Kim Vocal 2": "Kim_Vocal_2.onnx",
55
+ "MDX-Net Model: Kim Inst": "Kim_Inst.onnx",
56
+ "MDX-Net Model: Reverb HQ By FoxJoy": "Reverb_HQ_By_FoxJoy.onnx",
57
+ "MDX-Net Model: UVR-MDX-NET Crowd HQ 1 By Aufr33": "UVR-MDX-NET_Crowd_HQ_1.onnx",
58
+ "MDX-Net Model: kuielab_a_vocals": "kuielab_a_vocals.onnx",
59
+ "MDX-Net Model: kuielab_a_other": "kuielab_a_other.onnx",
60
+ "MDX-Net Model: kuielab_a_bass": "kuielab_a_bass.onnx",
61
+ "MDX-Net Model: kuielab_a_drums": "kuielab_a_drums.onnx",
62
+ "MDX-Net Model: kuielab_b_vocals": "kuielab_b_vocals.onnx",
63
+ "MDX-Net Model: kuielab_b_other": "kuielab_b_other.onnx",
64
+ "MDX-Net Model: kuielab_b_bass": "kuielab_b_bass.onnx",
65
+ "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx"
66
+ },
67
+
68
+ "demucs_download_list":{
69
+
70
+ "Demucs v4: htdemucs_ft":{
71
+ "f7e0c4bc-ba3fe64a.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th",
72
+ "d12395a8-e57c48e6.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th",
73
+ "92cfc3b6-ef3bcb9c.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th",
74
+ "04573f0d-f3cf25b2.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th",
75
+ "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml"
76
+ },
77
+
78
+ "Demucs v4: htdemucs":{
79
+ "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th",
80
+ "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml"
81
+ },
82
+
83
+ "Demucs v4: hdemucs_mmi":{
84
+ "75fc33f5-1941ce65.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/75fc33f5-1941ce65.th",
85
+ "hdemucs_mmi.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/hdemucs_mmi.yaml"
86
+ },
87
+ "Demucs v4: htdemucs_6s":{
88
+ "5c90dfd2-34c22ccb.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/5c90dfd2-34c22ccb.th",
89
+ "htdemucs_6s.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_6s.yaml"
90
+ },
91
+ "Demucs v3: mdx":{
92
+ "0d19c1c6-0f06f20e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/0d19c1c6-0f06f20e.th",
93
+ "7ecf8ec1-70f50cc9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7ecf8ec1-70f50cc9.th",
94
+ "c511e2ab-fe698775.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/c511e2ab-fe698775.th",
95
+ "7d865c68-3d5dd56b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7d865c68-3d5dd56b.th",
96
+ "mdx.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx.yaml"
97
+ },
98
+
99
+ "Demucs v3: mdx_q":{
100
+ "6b9c2ca1-3fd82607.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/6b9c2ca1-3fd82607.th",
101
+ "b72baf4e-8778635e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/b72baf4e-8778635e.th",
102
+ "42e558d4-196e0e1b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/42e558d4-196e0e1b.th",
103
+ "305bc58f-18378783.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/305bc58f-18378783.th",
104
+ "mdx_q.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_q.yaml"
105
+ },
106
+
107
+ "Demucs v3: mdx_extra":{
108
+ "e51eebcc-c1b80bdd.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/e51eebcc-c1b80bdd.th",
109
+ "a1d90b5c-ae9d2452.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/a1d90b5c-ae9d2452.th",
110
+ "5d2d6c55-db83574e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/5d2d6c55-db83574e.th",
111
+ "cfa93e08-61801ae1.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/cfa93e08-61801ae1.th",
112
+ "mdx_extra.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_extra.yaml"
113
+ },
114
+
115
+ "Demucs v3: mdx_extra_q": {
116
+ "83fc094f-4a16d450.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/83fc094f-4a16d450.th",
117
+ "464b36d7-e5a9386e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/464b36d7-e5a9386e.th",
118
+ "14fc6a69-a89dd0ee.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/14fc6a69-a89dd0ee.th",
119
+ "7fd6ef75-a905dd85.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7fd6ef75-a905dd85.th",
120
+ "mdx_extra_q.yaml": "https://raw.githubusercontent.com/facebookresearch/demucs/main/demucs/remote/mdx_extra_q.yaml"
121
+ },
122
+
123
+ "Demucs v3: UVR Model":{
124
+ "ebf34a2db.th": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/ebf34a2db.th",
125
+ "UVR_Demucs_Model_1.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/UVR_Demucs_Model_1.yaml"
126
+ },
127
+
128
+ "Demucs v3: repro_mdx_a":{
129
+ "9a6b4851-03af0aa6.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th",
130
+ "1ef250f1-592467ce.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th",
131
+ "fa0cb7f9-100d8bf4.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th",
132
+ "902315c2-b39ce9c9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th",
133
+ "repro_mdx_a.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a.yaml"
134
+ },
135
+
136
+ "Demucs v3: repro_mdx_a_time_only":{
137
+ "9a6b4851-03af0aa6.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th",
138
+ "1ef250f1-592467ce.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th",
139
+ "repro_mdx_a_time_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_time_only.yaml"
140
+ },
141
+
142
+ "Demucs v3: repro_mdx_a_hybrid_only":{
143
+ "fa0cb7f9-100d8bf4.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th",
144
+ "902315c2-b39ce9c9.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th",
145
+ "repro_mdx_a_hybrid_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_hybrid_only.yaml"
146
+ },
147
+
148
+ "Demucs v2: demucs": {
149
+ "demucs-e07c671f.th": "https://dl.fbaipublicfiles.com/demucs/v3.0/demucs-e07c671f.th"
150
+ },
151
+
152
+ "Demucs v2: demucs_extra": {
153
+ "demucs_extra-3646af93.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_extra-3646af93.th"
154
+ },
155
+
156
+ "Demucs v2: demucs48_hq": {
157
+ "demucs48_hq-28a1282c.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs48_hq-28a1282c.th"
158
+ },
159
+
160
+ "Demucs v2: tasnet": {
161
+ "tasnet-beb46fac.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet-beb46fac.th"
162
+ },
163
+
164
+ "Demucs v2: tasnet_extra": {
165
+ "tasnet_extra-df3777b2.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet_extra-df3777b2.th"
166
+ },
167
+
168
+ "Demucs v2: demucs_unittest": {
169
+ "demucs_unittest-09ebc15f.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_unittest-09ebc15f.th"
170
+ },
171
+
172
+ "Demucs v1: demucs": {
173
+ "demucs.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs.th"
174
+ },
175
+
176
+ "Demucs v1: demucs_extra": {
177
+ "demucs_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs_extra.th"
178
+ },
179
+
180
+ "Demucs v1: light": {
181
+ "light.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light.th"
182
+ },
183
+
184
+ "Demucs v1: light_extra": {
185
+ "light_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light_extra.th"
186
+ },
187
+
188
+ "Demucs v1: tasnet": {
189
+ "tasnet.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th"
190
+ },
191
+
192
+ "Demucs v1: tasnet_extra": {
193
+ "tasnet_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet_extra.th"
194
+ }
195
+ },
196
+
197
+ "mdx_download_vip_list": {
198
+ "MDX-Net Model VIP: UVR-MDX-NET_Main_340": "UVR-MDX-NET_Main_340.onnx",
199
+ "MDX-Net Model VIP: UVR-MDX-NET_Main_390": "UVR-MDX-NET_Main_390.onnx",
200
+ "MDX-Net Model VIP: UVR-MDX-NET_Main_406": "UVR-MDX-NET_Main_406.onnx",
201
+ "MDX-Net Model VIP: UVR-MDX-NET_Main_427": "UVR-MDX-NET_Main_427.onnx",
202
+ "MDX-Net Model VIP: UVR-MDX-NET_Main_438": "UVR-MDX-NET_Main_438.onnx",
203
+ "MDX-Net Model VIP: UVR-MDX-NET_Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx",
204
+ "MDX-Net Model VIP: UVR-MDX-NET_Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx",
205
+ "MDX-Net Model VIP: UVR-MDX-NET_Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx",
206
+ "MDX-Net Model VIP: UVR-MDX-NET-Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx"
207
+ },
208
+
209
+ "mdx23_download_list": {
210
+ "MDX23C Model: MDX23C_D1581": {"MDX23C_D1581.ckpt":"model_2_stem_061321.yaml"}
211
+ },
212
+
213
+ "mdx23c_download_list": {
214
+ "MDX23C Model: MDX23C-InstVoc HQ": {"MDX23C-8KFFT-InstVoc_HQ.ckpt":"model_2_stem_full_band_8k.yaml"}
215
+ },
216
+
217
+ "roformer_download_list": {
218
+ "Roformer Model: BS-Roformer-Viperx-1297": {"model_bs_roformer_ep_317_sdr_12.9755.ckpt":"model_bs_roformer_ep_317_sdr_12.9755.yaml"},
219
+ "Roformer Model: BS-Roformer-Viperx-1296": {"model_bs_roformer_ep_368_sdr_12.9628.ckpt":"model_bs_roformer_ep_368_sdr_12.9628.yaml"},
220
+ "Roformer Model: BS-Roformer-Viperx-1053": {"model_bs_roformer_ep_937_sdr_10.5309.ckpt":"model_bs_roformer_ep_937_sdr_10.5309.yaml"},
221
+ "Roformer Model: Mel-Roformer-Viperx-1143": {"model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt":"model_mel_band_roformer_ep_3005_sdr_11.4360.yaml"}
222
+ },
223
+
224
+ "mdx23c_download_vip_list": {
225
+ "MDX23C Model VIP: MDX23C_D1581": {"MDX23C_D1581.ckpt":"model_2_stem_061321.yaml"},
226
+ "MDX23C Model VIP: MDX23C-InstVoc HQ 2": {"MDX23C-8KFFT-InstVoc_HQ_2.ckpt":"model_2_stem_full_band_8k.yaml"}
227
+ },
228
+
229
+ "vr_download_vip_list": [],
230
+ "demucs_download_vip_list": []
231
+ }
audio_separator/mdx_model_data.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals",
29
+ "is_karaoke": true
30
+ },
31
+ "398580b6d5d973af3120df54cee6759d": {
32
+ "compensate": 1.75,
33
+ "mdx_dim_f_set": 3072,
34
+ "mdx_dim_t_set": 8,
35
+ "mdx_n_fft_scale_set": 7680,
36
+ "primary_stem": "Vocals"
37
+ },
38
+ "488b3e6f8bd3717d9d7c428476be2d75": {
39
+ "compensate": 1.035,
40
+ "mdx_dim_f_set": 3072,
41
+ "mdx_dim_t_set": 8,
42
+ "mdx_n_fft_scale_set": 7680,
43
+ "primary_stem": "Instrumental"
44
+ },
45
+ "4910e7827f335048bdac11fa967772f9": {
46
+ "compensate": 1.035,
47
+ "mdx_dim_f_set": 2048,
48
+ "mdx_dim_t_set": 7,
49
+ "mdx_n_fft_scale_set": 4096,
50
+ "primary_stem": "Drums"
51
+ },
52
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
53
+ "compensate": 1.043,
54
+ "mdx_dim_f_set": 3072,
55
+ "mdx_dim_t_set": 8,
56
+ "mdx_n_fft_scale_set": 7680,
57
+ "primary_stem": "Vocals"
58
+ },
59
+ "5d343409ef0df48c7d78cce9f0106781": {
60
+ "compensate": 1.075,
61
+ "mdx_dim_f_set": 3072,
62
+ "mdx_dim_t_set": 8,
63
+ "mdx_n_fft_scale_set": 7680,
64
+ "primary_stem": "Vocals"
65
+ },
66
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
67
+ "compensate": 1.035,
68
+ "mdx_dim_f_set": 2048,
69
+ "mdx_dim_t_set": 9,
70
+ "mdx_n_fft_scale_set": 6144,
71
+ "primary_stem": "Vocals"
72
+ },
73
+ "65ab5919372a128e4167f5e01a8fda85": {
74
+ "compensate": 1.035,
75
+ "mdx_dim_f_set": 2048,
76
+ "mdx_dim_t_set": 8,
77
+ "mdx_n_fft_scale_set": 8192,
78
+ "primary_stem": "Other"
79
+ },
80
+ "6703e39f36f18aa7855ee1047765621d": {
81
+ "compensate": 1.035,
82
+ "mdx_dim_f_set": 2048,
83
+ "mdx_dim_t_set": 9,
84
+ "mdx_n_fft_scale_set": 16384,
85
+ "primary_stem": "Bass"
86
+ },
87
+ "6b31de20e84392859a3d09d43f089515": {
88
+ "compensate": 1.035,
89
+ "mdx_dim_f_set": 2048,
90
+ "mdx_dim_t_set": 8,
91
+ "mdx_n_fft_scale_set": 6144,
92
+ "primary_stem": "Vocals"
93
+ },
94
+ "867595e9de46f6ab699008295df62798": {
95
+ "compensate": 1.03,
96
+ "mdx_dim_f_set": 3072,
97
+ "mdx_dim_t_set": 8,
98
+ "mdx_n_fft_scale_set": 7680,
99
+ "primary_stem": "Vocals"
100
+ },
101
+ "a3cd63058945e777505c01d2507daf37": {
102
+ "compensate": 1.03,
103
+ "mdx_dim_f_set": 2048,
104
+ "mdx_dim_t_set": 8,
105
+ "mdx_n_fft_scale_set": 6144,
106
+ "primary_stem": "Vocals"
107
+ },
108
+ "b33d9b3950b6cbf5fe90a32608924700": {
109
+ "compensate": 1.03,
110
+ "mdx_dim_f_set": 3072,
111
+ "mdx_dim_t_set": 8,
112
+ "mdx_n_fft_scale_set": 7680,
113
+ "primary_stem": "Vocals"
114
+ },
115
+ "c3b29bdce8c4fa17ec609e16220330ab": {
116
+ "compensate": 1.035,
117
+ "mdx_dim_f_set": 2048,
118
+ "mdx_dim_t_set": 8,
119
+ "mdx_n_fft_scale_set": 16384,
120
+ "primary_stem": "Bass"
121
+ },
122
+ "ceed671467c1f64ebdfac8a2490d0d52": {
123
+ "compensate": 1.035,
124
+ "mdx_dim_f_set": 3072,
125
+ "mdx_dim_t_set": 8,
126
+ "mdx_n_fft_scale_set": 7680,
127
+ "primary_stem": "Instrumental"
128
+ },
129
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
130
+ "compensate": 1.035,
131
+ "mdx_dim_f_set": 3072,
132
+ "mdx_dim_t_set": 8,
133
+ "mdx_n_fft_scale_set": 7680,
134
+ "primary_stem": "Instrumental"
135
+ },
136
+ "d7bff498db9324db933d913388cba6be": {
137
+ "compensate": 1.035,
138
+ "mdx_dim_f_set": 2048,
139
+ "mdx_dim_t_set": 8,
140
+ "mdx_n_fft_scale_set": 6144,
141
+ "primary_stem": "Vocals"
142
+ },
143
+ "d94058f8c7f1fae4164868ae8ae66b20": {
144
+ "compensate": 1.035,
145
+ "mdx_dim_f_set": 2048,
146
+ "mdx_dim_t_set": 8,
147
+ "mdx_n_fft_scale_set": 6144,
148
+ "primary_stem": "Vocals"
149
+ },
150
+ "dc41ede5961d50f277eb846db17f5319": {
151
+ "compensate": 1.035,
152
+ "mdx_dim_f_set": 2048,
153
+ "mdx_dim_t_set": 9,
154
+ "mdx_n_fft_scale_set": 4096,
155
+ "primary_stem": "Drums"
156
+ },
157
+ "e5572e58abf111f80d8241d2e44e7fa4": {
158
+ "compensate": 1.028,
159
+ "mdx_dim_f_set": 3072,
160
+ "mdx_dim_t_set": 8,
161
+ "mdx_n_fft_scale_set": 7680,
162
+ "primary_stem": "Instrumental"
163
+ },
164
+ "e7324c873b1f615c35c1967f912db92a": {
165
+ "compensate": 1.03,
166
+ "mdx_dim_f_set": 3072,
167
+ "mdx_dim_t_set": 8,
168
+ "mdx_n_fft_scale_set": 7680,
169
+ "primary_stem": "Vocals"
170
+ },
171
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
172
+ "compensate": 1.025,
173
+ "mdx_dim_f_set": 2048,
174
+ "mdx_dim_t_set": 8,
175
+ "mdx_n_fft_scale_set": 5120,
176
+ "primary_stem": "Instrumental"
177
+ },
178
+ "f2df6d6863d8f435436d8b561594ff49": {
179
+ "compensate": 1.035,
180
+ "mdx_dim_f_set": 3072,
181
+ "mdx_dim_t_set": 8,
182
+ "mdx_n_fft_scale_set": 7680,
183
+ "primary_stem": "Instrumental"
184
+ },
185
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
186
+ "compensate": 1.035,
187
+ "mdx_dim_f_set": 3072,
188
+ "mdx_dim_t_set": 8,
189
+ "mdx_n_fft_scale_set": 6144,
190
+ "primary_stem": "Instrumental"
191
+ },
192
+ "94ff780b977d3ca07c7a343dab2e25dd": {
193
+ "compensate": 1.039,
194
+ "mdx_dim_f_set": 3072,
195
+ "mdx_dim_t_set": 8,
196
+ "mdx_n_fft_scale_set": 6144,
197
+ "primary_stem": "Instrumental"
198
+ },
199
+ "73492b58195c3b52d34590d5474452f6": {
200
+ "compensate": 1.043,
201
+ "mdx_dim_f_set": 3072,
202
+ "mdx_dim_t_set": 8,
203
+ "mdx_n_fft_scale_set": 7680,
204
+ "primary_stem": "Vocals"
205
+ },
206
+ "970b3f9492014d18fefeedfe4773cb42": {
207
+ "compensate": 1.009,
208
+ "mdx_dim_f_set": 3072,
209
+ "mdx_dim_t_set": 8,
210
+ "mdx_n_fft_scale_set": 7680,
211
+ "primary_stem": "Vocals"
212
+ },
213
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
214
+ "compensate": 1.065,
215
+ "mdx_dim_f_set": 2048,
216
+ "mdx_dim_t_set": 8,
217
+ "mdx_n_fft_scale_set": 5120,
218
+ "primary_stem": "Instrumental",
219
+ "is_karaoke": true
220
+ },
221
+ "203f2a3955221b64df85a41af87cf8f0": {
222
+ "compensate": 1.035,
223
+ "mdx_dim_f_set": 3072,
224
+ "mdx_dim_t_set": 8,
225
+ "mdx_n_fft_scale_set": 6144,
226
+ "primary_stem": "Instrumental"
227
+ },
228
+ "291c2049608edb52648b96e27eb80e95": {
229
+ "compensate": 1.035,
230
+ "mdx_dim_f_set": 3072,
231
+ "mdx_dim_t_set": 8,
232
+ "mdx_n_fft_scale_set": 6144,
233
+ "primary_stem": "Instrumental"
234
+ },
235
+ "ead8d05dab12ec571d67549b3aab03fc": {
236
+ "compensate": 1.035,
237
+ "mdx_dim_f_set": 3072,
238
+ "mdx_dim_t_set": 8,
239
+ "mdx_n_fft_scale_set": 6144,
240
+ "primary_stem": "Instrumental"
241
+ },
242
+ "cc63408db3d80b4d85b0287d1d7c9632": {
243
+ "compensate": 1.033,
244
+ "mdx_dim_f_set": 3072,
245
+ "mdx_dim_t_set": 8,
246
+ "mdx_n_fft_scale_set": 6144,
247
+ "primary_stem": "Instrumental"
248
+ },
249
+ "cd5b2989ad863f116c855db1dfe24e39": {
250
+ "compensate": 1.035,
251
+ "mdx_dim_f_set": 3072,
252
+ "mdx_dim_t_set": 9,
253
+ "mdx_n_fft_scale_set": 6144,
254
+ "primary_stem": "Reverb"
255
+ },
256
+ "55657dd70583b0fedfba5f67df11d711": {
257
+ "compensate": 1.022,
258
+ "mdx_dim_f_set": 3072,
259
+ "mdx_dim_t_set": 8,
260
+ "mdx_n_fft_scale_set": 6144,
261
+ "primary_stem": "Instrumental"
262
+ },
263
+ "b6bccda408a436db8500083ef3491e8b": {
264
+ "compensate": 1.02,
265
+ "mdx_dim_f_set": 3072,
266
+ "mdx_dim_t_set": 8,
267
+ "mdx_n_fft_scale_set": 7680,
268
+ "primary_stem": "Instrumental"
269
+ },
270
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
271
+ "compensate": 1.026,
272
+ "mdx_dim_f_set": 2048,
273
+ "mdx_dim_t_set": 8,
274
+ "mdx_n_fft_scale_set": 5120,
275
+ "primary_stem": "Instrumental"
276
+ },
277
+ "b78da4afc6512f98e4756f5977f5c6b9": {
278
+ "compensate": 1.021,
279
+ "mdx_dim_f_set": 3072,
280
+ "mdx_dim_t_set": 8,
281
+ "mdx_n_fft_scale_set": 7680,
282
+ "primary_stem": "Instrumental"
283
+ },
284
+ "77d07b2667ddf05b9e3175941b4454a0": {
285
+ "compensate": 1.021,
286
+ "mdx_dim_f_set": 3072,
287
+ "mdx_dim_t_set": 8,
288
+ "mdx_n_fft_scale_set": 7680,
289
+ "primary_stem": "Vocals"
290
+ },
291
+ "0f2a6bc5b49d87d64728ee40e23bceb1": {
292
+ "compensate": 1.019,
293
+ "mdx_dim_f_set": 2560,
294
+ "mdx_dim_t_set": 8,
295
+ "mdx_n_fft_scale_set": 5120,
296
+ "primary_stem": "Instrumental"
297
+ },
298
+ "b02be2d198d4968a121030cf8950b492": {
299
+ "compensate": 1.020,
300
+ "mdx_dim_f_set": 2560,
301
+ "mdx_dim_t_set": 8,
302
+ "mdx_n_fft_scale_set": 5120,
303
+ "primary_stem": "No Crowd"
304
+ },
305
+ "2154254ee89b2945b97a7efed6e88820": {
306
+ "config_yaml": "model_2_stem_061321.yaml"
307
+ },
308
+ "063aadd735d58150722926dcbf5852a9": {
309
+ "config_yaml": "model_2_stem_061321.yaml"
310
+ },
311
+ "c09f714d978b41d718facfe3427e6001": {
312
+ "config_yaml": "model_2_stem_061321.yaml"
313
+ },
314
+ "fe96801369f6a148df2720f5ced88c19": {
315
+ "config_yaml": "model3.yaml"
316
+ },
317
+ "02e8b226f85fb566e5db894b9931c640": {
318
+ "config_yaml": "model2.yaml"
319
+ },
320
+ "e3de6d861635ab9c1d766149edd680d6": {
321
+ "config_yaml": "model1.yaml"
322
+ },
323
+ "3f2936c554ab73ce2e396d54636bd373": {
324
+ "config_yaml": "modelB.yaml"
325
+ },
326
+ "890d0f6f82d7574bca741a9e8bcb8168": {
327
+ "config_yaml": "modelB.yaml"
328
+ },
329
+ "63a3cb8c37c474681049be4ad1ba8815": {
330
+ "config_yaml": "modelB.yaml"
331
+ },
332
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
333
+ "config_yaml": "modelA.yaml"
334
+ },
335
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
336
+ "config_yaml": "modelA.yaml"
337
+ },
338
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
339
+ "config_yaml": "modelA.yaml"
340
+ },
341
+ "c9971a18da20911822593dc81caa8be9": {
342
+ "config_yaml": "sndfx.yaml"
343
+ },
344
+ "57d94d5ed705460d21c75a5ac829a605": {
345
+ "config_yaml": "sndfx.yaml"
346
+ },
347
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
348
+ "config_yaml": "sndfx.yaml"
349
+ },
350
+ "104081d24e37217086ce5fde09147ee1": {
351
+ "config_yaml": "model_2_stem_061321.yaml"
352
+ },
353
+ "1e6165b601539f38d0a9330f3facffeb": {
354
+ "config_yaml": "model_2_stem_061321.yaml"
355
+ },
356
+ "fe0108464ce0d8271be5ab810891bd7c": {
357
+ "config_yaml": "model_2_stem_full_band.yaml"
358
+ },
359
+ "e9b82ec90ee56c507a3a982f1555714c": {
360
+ "config_yaml": "model_2_stem_full_band_2.yaml"
361
+ },
362
+ "99b6ceaae542265a3b6d657bf9fde79f": {
363
+ "config_yaml": "model_2_stem_full_band_8k.yaml"
364
+ },
365
+ "116f6f9dabb907b53d847ed9f7a9475f": {
366
+ "config_yaml": "model_2_stem_full_band_8k.yaml"
367
+ },
368
+ "53f707017bfcbb56f5e1bfac420d6732": {
369
+ "config_yaml": "model_bs_roformer_ep_317_sdr_12.9755.yaml",
370
+ "is_roformer": true
371
+ },
372
+ "63e41acc264bf681a73aa9f7e5f606cc": {
373
+ "config_yaml": "model_mel_band_roformer_ep_3005_sdr_11.4360.yaml",
374
+ "is_roformer": true
375
+ },
376
+ "e733736763234047587931fc35322fd9": {
377
+ "config_yaml": "model_bs_roformer_ep_937_sdr_10.5309.yaml",
378
+ "is_roformer": true
379
+ },
380
+ "d789065adfd747d6f585b27b495bcdae": {
381
+ "config_yaml": "model_bs_roformer_ep_368_sdr_12.9628.yaml",
382
+ "is_roformer": true
383
+ }
384
+ }
audio_separator/vr_model_data.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0d0e6d143046b0eecc41a22e60224582": {
3
+ "vr_model_param": "3band_44100_mid",
4
+ "primary_stem": "Instrumental"
5
+ },
6
+ "18b52f873021a0af556fb4ecd552bb8e": {
7
+ "vr_model_param": "2band_32000",
8
+ "primary_stem": "Instrumental"
9
+ },
10
+ "1fc66027c82b499c7d8f55f79e64cadc": {
11
+ "vr_model_param": "2band_32000",
12
+ "primary_stem": "Instrumental"
13
+ },
14
+ "2aa34fbc01f8e6d2bf509726481e7142": {
15
+ "vr_model_param": "4band_44100",
16
+ "primary_stem": "No Piano"
17
+ },
18
+ "3e18f639b11abea7361db1a4a91c2559": {
19
+ "vr_model_param": "4band_44100",
20
+ "primary_stem": "Instrumental"
21
+ },
22
+ "570b5f50054609a17741369a35007ddd": {
23
+ "vr_model_param": "4band_v3",
24
+ "primary_stem": "Instrumental"
25
+ },
26
+ "5a6e24c1b530f2dab045a522ef89b751": {
27
+ "vr_model_param": "1band_sr44100_hl512",
28
+ "primary_stem": "Instrumental"
29
+ },
30
+ "6b5916069a49be3fe29d4397ecfd73fa": {
31
+ "vr_model_param": "3band_44100_msb2",
32
+ "primary_stem": "Instrumental",
33
+ "is_karaoke": true
34
+ },
35
+ "74b3bc5fa2b69f29baf7839b858bc679": {
36
+ "vr_model_param": "4band_44100",
37
+ "primary_stem": "Instrumental"
38
+ },
39
+ "827213b316df36b52a1f3d04fec89369": {
40
+ "vr_model_param": "4band_44100",
41
+ "primary_stem": "Instrumental"
42
+ },
43
+ "911d4048eee7223eca4ee0efb7d29256": {
44
+ "vr_model_param": "4band_44100",
45
+ "primary_stem": "Vocals"
46
+ },
47
+ "941f3f7f0b0341f12087aacdfef644b1": {
48
+ "vr_model_param": "4band_v2",
49
+ "primary_stem": "Instrumental"
50
+ },
51
+ "a02827cf69d75781a35c0e8a327f3195": {
52
+ "vr_model_param": "1band_sr33075_hl384",
53
+ "primary_stem": "Instrumental"
54
+ },
55
+ "b165fbff113c959dba5303b74c6484bc": {
56
+ "vr_model_param": "3band_44100",
57
+ "primary_stem": "Instrumental"
58
+ },
59
+ "b5f988cd3e891dca7253bf5f0f3427c7": {
60
+ "vr_model_param": "4band_44100",
61
+ "primary_stem": "Instrumental"
62
+ },
63
+ "b99c35723bc35cb11ed14a4780006a80": {
64
+ "vr_model_param": "1band_sr44100_hl1024",
65
+ "primary_stem": "Instrumental"
66
+ },
67
+ "ba02fd25b71d620eebbdb49e18e4c336": {
68
+ "vr_model_param": "3band_44100_mid",
69
+ "primary_stem": "Instrumental"
70
+ },
71
+ "c4476ef424d8cba65f38d8d04e8514e2": {
72
+ "vr_model_param": "3band_44100_msb2",
73
+ "primary_stem": "Instrumental"
74
+ },
75
+ "da2d37b8be2972e550a409bae08335aa": {
76
+ "vr_model_param": "4band_44100",
77
+ "primary_stem": "Vocals"
78
+ },
79
+ "db57205d3133e39df8e050b435a78c80": {
80
+ "vr_model_param": "4band_44100",
81
+ "primary_stem": "Instrumental"
82
+ },
83
+ "ea83b08e32ec2303456fe50659035f69": {
84
+ "vr_model_param": "4band_v3",
85
+ "primary_stem": "Instrumental"
86
+ },
87
+ "f6ea8473ff86017b5ebd586ccacf156b": {
88
+ "vr_model_param": "4band_v2_sn",
89
+ "primary_stem": "Instrumental",
90
+ "is_karaoke": true
91
+ },
92
+ "fd297a61eafc9d829033f8b987c39a3d": {
93
+ "vr_model_param": "1band_sr32000_hl512",
94
+ "primary_stem": "Instrumental"
95
+ },
96
+ "0ec76fd9e65f81d8b4fbd13af4826ed8": {
97
+ "vr_model_param": "4band_v3",
98
+ "primary_stem": "No Woodwinds"
99
+ },
100
+ "0fb9249ffe4ffc38d7b16243f394c0ff": {
101
+ "vr_model_param": "4band_v3",
102
+ "primary_stem": "No Reverb"
103
+ },
104
+ "6857b2972e1754913aad0c9a1678c753": {
105
+ "vr_model_param": "4band_v3",
106
+ "primary_stem": "No Echo",
107
+ "nout": 48,
108
+ "nout_lstm": 128
109
+ },
110
+ "f200a145434efc7dcf0cd093f517ed52": {
111
+ "vr_model_param": "4band_v3",
112
+ "primary_stem": "No Echo",
113
+ "nout": 48,
114
+ "nout_lstm": 128
115
+ },
116
+ "44c55d8b5d2e3edea98c2b2bf93071c7": {
117
+ "vr_model_param": "4band_v3",
118
+ "primary_stem": "Noise",
119
+ "nout": 48,
120
+ "nout_lstm": 128
121
+ },
122
+ "51ea8c43a6928ed3c10ef5cb2707d57b": {
123
+ "vr_model_param": "1band_sr44100_hl1024",
124
+ "primary_stem": "Noise",
125
+ "nout": 16,
126
+ "nout_lstm": 128
127
+ },
128
+ "944950a9c5963a5eb70b445d67b7068a": {
129
+ "vr_model_param": "4band_v3_sn",
130
+ "primary_stem": "Vocals",
131
+ "nout": 64,
132
+ "nout_lstm": 128,
133
+ "is_karaoke": false,
134
+ "is_bv_model": true,
135
+ "is_bv_model_rebalanced": 0.9
136
+ }
137
+ }
cogvideox-5b-i2v-sat/transformer/1/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eea368b16ea2a39a97bde4778b426711da7f971e852d737ea4da13ba57d7858f
3
+ size 11518625392
cogvideox-5b-i2v-sat/transformer/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ 1
cogvideox-5b-i2v-sat/vae/3d-vae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdf2683a98192fc35ebb1f86ff0bfd620eb0f8905efa4e8eb818af759e2bc418
3
+ size 1176149148
face_analysis/models/1k3d68.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc
3
+ size 143607619
face_analysis/models/2d106det.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf
3
+ size 5030888
face_analysis/models/buffalo_l.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b320dd731435fb5864ba5aa1ed493ec3f9e471ef84f12e46f25507a79ccad709
3
+ size 1546240
face_analysis/models/face_landmarker_v2_with_blendshapes.task ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596
face_analysis/models/genderage.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb
3
+ size 1322532
face_analysis/models/glintr100.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ab1d6435d639628a6f3e5008dd4f929edf4c4124b1a7169e1048f9fef534cdf
3
+ size 260665334
face_analysis/models/scrfd_10g_bnkps.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91
3
+ size 16923827
hallo3/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ 1
t5-v1_1-xxl/added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<extra_id_0>": 32099,
3
+ "<extra_id_10>": 32089,
4
+ "<extra_id_11>": 32088,
5
+ "<extra_id_12>": 32087,
6
+ "<extra_id_13>": 32086,
7
+ "<extra_id_14>": 32085,
8
+ "<extra_id_15>": 32084,
9
+ "<extra_id_16>": 32083,
10
+ "<extra_id_17>": 32082,
11
+ "<extra_id_18>": 32081,
12
+ "<extra_id_19>": 32080,
13
+ "<extra_id_1>": 32098,
14
+ "<extra_id_20>": 32079,
15
+ "<extra_id_21>": 32078,
16
+ "<extra_id_22>": 32077,
17
+ "<extra_id_23>": 32076,
18
+ "<extra_id_24>": 32075,
19
+ "<extra_id_25>": 32074,
20
+ "<extra_id_26>": 32073,
21
+ "<extra_id_27>": 32072,
22
+ "<extra_id_28>": 32071,
23
+ "<extra_id_29>": 32070,
24
+ "<extra_id_2>": 32097,
25
+ "<extra_id_30>": 32069,
26
+ "<extra_id_31>": 32068,
27
+ "<extra_id_32>": 32067,
28
+ "<extra_id_33>": 32066,
29
+ "<extra_id_34>": 32065,
30
+ "<extra_id_35>": 32064,
31
+ "<extra_id_36>": 32063,
32
+ "<extra_id_37>": 32062,
33
+ "<extra_id_38>": 32061,
34
+ "<extra_id_39>": 32060,
35
+ "<extra_id_3>": 32096,
36
+ "<extra_id_40>": 32059,
37
+ "<extra_id_41>": 32058,
38
+ "<extra_id_42>": 32057,
39
+ "<extra_id_43>": 32056,
40
+ "<extra_id_44>": 32055,
41
+ "<extra_id_45>": 32054,
42
+ "<extra_id_46>": 32053,
43
+ "<extra_id_47>": 32052,
44
+ "<extra_id_48>": 32051,
45
+ "<extra_id_49>": 32050,
46
+ "<extra_id_4>": 32095,
47
+ "<extra_id_50>": 32049,
48
+ "<extra_id_51>": 32048,
49
+ "<extra_id_52>": 32047,
50
+ "<extra_id_53>": 32046,
51
+ "<extra_id_54>": 32045,
52
+ "<extra_id_55>": 32044,
53
+ "<extra_id_56>": 32043,
54
+ "<extra_id_57>": 32042,
55
+ "<extra_id_58>": 32041,
56
+ "<extra_id_59>": 32040,
57
+ "<extra_id_5>": 32094,
58
+ "<extra_id_60>": 32039,
59
+ "<extra_id_61>": 32038,
60
+ "<extra_id_62>": 32037,
61
+ "<extra_id_63>": 32036,
62
+ "<extra_id_64>": 32035,
63
+ "<extra_id_65>": 32034,
64
+ "<extra_id_66>": 32033,
65
+ "<extra_id_67>": 32032,
66
+ "<extra_id_68>": 32031,
67
+ "<extra_id_69>": 32030,
68
+ "<extra_id_6>": 32093,
69
+ "<extra_id_70>": 32029,
70
+ "<extra_id_71>": 32028,
71
+ "<extra_id_72>": 32027,
72
+ "<extra_id_73>": 32026,
73
+ "<extra_id_74>": 32025,
74
+ "<extra_id_75>": 32024,
75
+ "<extra_id_76>": 32023,
76
+ "<extra_id_77>": 32022,
77
+ "<extra_id_78>": 32021,
78
+ "<extra_id_79>": 32020,
79
+ "<extra_id_7>": 32092,
80
+ "<extra_id_80>": 32019,
81
+ "<extra_id_81>": 32018,
82
+ "<extra_id_82>": 32017,
83
+ "<extra_id_83>": 32016,
84
+ "<extra_id_84>": 32015,
85
+ "<extra_id_85>": 32014,
86
+ "<extra_id_86>": 32013,
87
+ "<extra_id_87>": 32012,
88
+ "<extra_id_88>": 32011,
89
+ "<extra_id_89>": 32010,
90
+ "<extra_id_8>": 32091,
91
+ "<extra_id_90>": 32009,
92
+ "<extra_id_91>": 32008,
93
+ "<extra_id_92>": 32007,
94
+ "<extra_id_93>": 32006,
95
+ "<extra_id_94>": 32005,
96
+ "<extra_id_95>": 32004,
97
+ "<extra_id_96>": 32003,
98
+ "<extra_id_97>": 32002,
99
+ "<extra_id_98>": 32001,
100
+ "<extra_id_99>": 32000,
101
+ "<extra_id_9>": 32090
102
+ }
t5-v1_1-xxl/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/t5-v1_1-xxl",
3
+ "architectures": [
4
+ "T5EncoderModel"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 10240,
8
+ "d_kv": 64,
9
+ "d_model": 4096,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "num_decoder_layers": 24,
21
+ "num_heads": 64,
22
+ "num_layers": 24,
23
+ "output_past": true,
24
+ "pad_token_id": 0,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.44.2",
30
+ "use_cache": true,
31
+ "vocab_size": 32128
32
+ }
t5-v1_1-xxl/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9162b8ae9152e7a8e3bbebc535c8692783f50aec8cd3bb8ef6a751c432dd6392
3
+ size 4994582224
t5-v1_1-xxl/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3edef29693d52402b1cc7c362f031e052f2e9482ed0c765c6351950434349b0
3
+ size 4530066360
t5-v1_1-xxl/model.safetensors.index.json ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 9524621312
4
+ },
5
+ "weight_map": {
6
+ "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
7
+ "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
8
+ "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
9
+ "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00002.safetensors",
10
+ "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
11
+ "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
12
+ "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
13
+ "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
14
+ "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
15
+ "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
16
+ "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
17
+ "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
18
+ "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
19
+ "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
20
+ "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
21
+ "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
22
+ "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
23
+ "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
24
+ "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
25
+ "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
26
+ "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
27
+ "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
28
+ "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
29
+ "encoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
30
+ "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
31
+ "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
32
+ "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
33
+ "encoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
34
+ "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
35
+ "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
36
+ "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
37
+ "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
38
+ "encoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
39
+ "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
40
+ "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
41
+ "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
42
+ "encoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
43
+ "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
44
+ "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
45
+ "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
46
+ "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
47
+ "encoder.block.12.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
48
+ "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
49
+ "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
50
+ "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
51
+ "encoder.block.12.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
52
+ "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
53
+ "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
54
+ "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
55
+ "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
56
+ "encoder.block.13.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
57
+ "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
58
+ "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
59
+ "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
60
+ "encoder.block.13.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
61
+ "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
62
+ "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
63
+ "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
64
+ "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
65
+ "encoder.block.14.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
66
+ "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
67
+ "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
68
+ "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
69
+ "encoder.block.14.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
70
+ "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
71
+ "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
72
+ "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
73
+ "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
74
+ "encoder.block.15.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
75
+ "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
76
+ "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
77
+ "encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
78
+ "encoder.block.15.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
79
+ "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
80
+ "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
81
+ "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
82
+ "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
83
+ "encoder.block.16.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
84
+ "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
85
+ "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
86
+ "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
87
+ "encoder.block.16.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
88
+ "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
89
+ "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
90
+ "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
91
+ "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
92
+ "encoder.block.17.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
93
+ "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
94
+ "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
95
+ "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
96
+ "encoder.block.17.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
97
+ "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
98
+ "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
99
+ "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
100
+ "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
101
+ "encoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
102
+ "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
103
+ "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
104
+ "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
105
+ "encoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
106
+ "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
107
+ "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
108
+ "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
109
+ "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
110
+ "encoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
111
+ "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
112
+ "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
113
+ "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
114
+ "encoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
115
+ "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
116
+ "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
117
+ "encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
118
+ "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
119
+ "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
120
+ "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
121
+ "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
122
+ "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
123
+ "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
124
+ "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
125
+ "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
126
+ "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
127
+ "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
128
+ "encoder.block.20.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
129
+ "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
130
+ "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
131
+ "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
132
+ "encoder.block.20.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
133
+ "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
134
+ "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
135
+ "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
136
+ "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
137
+ "encoder.block.21.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
138
+ "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
139
+ "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
140
+ "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
141
+ "encoder.block.21.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
142
+ "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
143
+ "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
144
+ "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
145
+ "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
146
+ "encoder.block.22.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
147
+ "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
148
+ "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
149
+ "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
150
+ "encoder.block.22.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
151
+ "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
152
+ "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
153
+ "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
154
+ "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
155
+ "encoder.block.23.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
156
+ "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
157
+ "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
158
+ "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
159
+ "encoder.block.23.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
160
+ "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
161
+ "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
162
+ "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
163
+ "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
164
+ "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
165
+ "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
166
+ "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
167
+ "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
168
+ "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
169
+ "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
170
+ "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
171
+ "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
172
+ "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
173
+ "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
174
+ "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
175
+ "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
176
+ "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
177
+ "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
178
+ "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
179
+ "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
180
+ "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
181
+ "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
182
+ "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
183
+ "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
184
+ "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
185
+ "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
186
+ "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
187
+ "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
188
+ "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
189
+ "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
190
+ "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
191
+ "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
192
+ "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
193
+ "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
194
+ "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
195
+ "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
196
+ "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
197
+ "encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
198
+ "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
199
+ "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
200
+ "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
201
+ "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
202
+ "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
203
+ "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
204
+ "encoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
205
+ "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
206
+ "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
207
+ "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
208
+ "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
209
+ "encoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
210
+ "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
211
+ "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
212
+ "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
213
+ "encoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
214
+ "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
215
+ "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
216
+ "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
217
+ "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
218
+ "encoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
219
+ "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
220
+ "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
221
+ "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
222
+ "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
223
+ "encoder.final_layer_norm.weight": "model-00002-of-00002.safetensors",
224
+ "shared.weight": "model-00001-of-00002.safetensors"
225
+ }
226
+ }
t5-v1_1-xxl/special_tokens_map.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": {
105
+ "content": "</s>",
106
+ "lstrip": false,
107
+ "normalized": false,
108
+ "rstrip": false,
109
+ "single_word": false
110
+ },
111
+ "pad_token": {
112
+ "content": "<pad>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "unk_token": {
119
+ "content": "<unk>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ }
125
+ }
t5-v1_1-xxl/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
t5-v1_1-xxl/tokenizer_config.json ADDED
@@ -0,0 +1,940 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "</s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<unk>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "32000": {
29
+ "content": "<extra_id_99>",
30
+ "lstrip": true,
31
+ "normalized": false,
32
+ "rstrip": true,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "32001": {
37
+ "content": "<extra_id_98>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": true,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "32002": {
45
+ "content": "<extra_id_97>",
46
+ "lstrip": true,
47
+ "normalized": false,
48
+ "rstrip": true,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "32003": {
53
+ "content": "<extra_id_96>",
54
+ "lstrip": true,
55
+ "normalized": false,
56
+ "rstrip": true,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "32004": {
61
+ "content": "<extra_id_95>",
62
+ "lstrip": true,
63
+ "normalized": false,
64
+ "rstrip": true,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "32005": {
69
+ "content": "<extra_id_94>",
70
+ "lstrip": true,
71
+ "normalized": false,
72
+ "rstrip": true,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "32006": {
77
+ "content": "<extra_id_93>",
78
+ "lstrip": true,
79
+ "normalized": false,
80
+ "rstrip": true,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "32007": {
85
+ "content": "<extra_id_92>",
86
+ "lstrip": true,
87
+ "normalized": false,
88
+ "rstrip": true,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "32008": {
93
+ "content": "<extra_id_91>",
94
+ "lstrip": true,
95
+ "normalized": false,
96
+ "rstrip": true,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "32009": {
101
+ "content": "<extra_id_90>",
102
+ "lstrip": true,
103
+ "normalized": false,
104
+ "rstrip": true,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "32010": {
109
+ "content": "<extra_id_89>",
110
+ "lstrip": true,
111
+ "normalized": false,
112
+ "rstrip": true,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "32011": {
117
+ "content": "<extra_id_88>",
118
+ "lstrip": true,
119
+ "normalized": false,
120
+ "rstrip": true,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "32012": {
125
+ "content": "<extra_id_87>",
126
+ "lstrip": true,
127
+ "normalized": false,
128
+ "rstrip": true,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "32013": {
133
+ "content": "<extra_id_86>",
134
+ "lstrip": true,
135
+ "normalized": false,
136
+ "rstrip": true,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "32014": {
141
+ "content": "<extra_id_85>",
142
+ "lstrip": true,
143
+ "normalized": false,
144
+ "rstrip": true,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "32015": {
149
+ "content": "<extra_id_84>",
150
+ "lstrip": true,
151
+ "normalized": false,
152
+ "rstrip": true,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "32016": {
157
+ "content": "<extra_id_83>",
158
+ "lstrip": true,
159
+ "normalized": false,
160
+ "rstrip": true,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "32017": {
165
+ "content": "<extra_id_82>",
166
+ "lstrip": true,
167
+ "normalized": false,
168
+ "rstrip": true,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "32018": {
173
+ "content": "<extra_id_81>",
174
+ "lstrip": true,
175
+ "normalized": false,
176
+ "rstrip": true,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "32019": {
181
+ "content": "<extra_id_80>",
182
+ "lstrip": true,
183
+ "normalized": false,
184
+ "rstrip": true,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "32020": {
189
+ "content": "<extra_id_79>",
190
+ "lstrip": true,
191
+ "normalized": false,
192
+ "rstrip": true,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "32021": {
197
+ "content": "<extra_id_78>",
198
+ "lstrip": true,
199
+ "normalized": false,
200
+ "rstrip": true,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "32022": {
205
+ "content": "<extra_id_77>",
206
+ "lstrip": true,
207
+ "normalized": false,
208
+ "rstrip": true,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "32023": {
213
+ "content": "<extra_id_76>",
214
+ "lstrip": true,
215
+ "normalized": false,
216
+ "rstrip": true,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "32024": {
221
+ "content": "<extra_id_75>",
222
+ "lstrip": true,
223
+ "normalized": false,
224
+ "rstrip": true,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "32025": {
229
+ "content": "<extra_id_74>",
230
+ "lstrip": true,
231
+ "normalized": false,
232
+ "rstrip": true,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "32026": {
237
+ "content": "<extra_id_73>",
238
+ "lstrip": true,
239
+ "normalized": false,
240
+ "rstrip": true,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "32027": {
245
+ "content": "<extra_id_72>",
246
+ "lstrip": true,
247
+ "normalized": false,
248
+ "rstrip": true,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "32028": {
253
+ "content": "<extra_id_71>",
254
+ "lstrip": true,
255
+ "normalized": false,
256
+ "rstrip": true,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32029": {
261
+ "content": "<extra_id_70>",
262
+ "lstrip": true,
263
+ "normalized": false,
264
+ "rstrip": true,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "32030": {
269
+ "content": "<extra_id_69>",
270
+ "lstrip": true,
271
+ "normalized": false,
272
+ "rstrip": true,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "32031": {
277
+ "content": "<extra_id_68>",
278
+ "lstrip": true,
279
+ "normalized": false,
280
+ "rstrip": true,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "32032": {
285
+ "content": "<extra_id_67>",
286
+ "lstrip": true,
287
+ "normalized": false,
288
+ "rstrip": true,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "32033": {
293
+ "content": "<extra_id_66>",
294
+ "lstrip": true,
295
+ "normalized": false,
296
+ "rstrip": true,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "32034": {
301
+ "content": "<extra_id_65>",
302
+ "lstrip": true,
303
+ "normalized": false,
304
+ "rstrip": true,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "32035": {
309
+ "content": "<extra_id_64>",
310
+ "lstrip": true,
311
+ "normalized": false,
312
+ "rstrip": true,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "32036": {
317
+ "content": "<extra_id_63>",
318
+ "lstrip": true,
319
+ "normalized": false,
320
+ "rstrip": true,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "32037": {
325
+ "content": "<extra_id_62>",
326
+ "lstrip": true,
327
+ "normalized": false,
328
+ "rstrip": true,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "32038": {
333
+ "content": "<extra_id_61>",
334
+ "lstrip": true,
335
+ "normalized": false,
336
+ "rstrip": true,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "32039": {
341
+ "content": "<extra_id_60>",
342
+ "lstrip": true,
343
+ "normalized": false,
344
+ "rstrip": true,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "32040": {
349
+ "content": "<extra_id_59>",
350
+ "lstrip": true,
351
+ "normalized": false,
352
+ "rstrip": true,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "32041": {
357
+ "content": "<extra_id_58>",
358
+ "lstrip": true,
359
+ "normalized": false,
360
+ "rstrip": true,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "32042": {
365
+ "content": "<extra_id_57>",
366
+ "lstrip": true,
367
+ "normalized": false,
368
+ "rstrip": true,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "32043": {
373
+ "content": "<extra_id_56>",
374
+ "lstrip": true,
375
+ "normalized": false,
376
+ "rstrip": true,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "32044": {
381
+ "content": "<extra_id_55>",
382
+ "lstrip": true,
383
+ "normalized": false,
384
+ "rstrip": true,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "32045": {
389
+ "content": "<extra_id_54>",
390
+ "lstrip": true,
391
+ "normalized": false,
392
+ "rstrip": true,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "32046": {
397
+ "content": "<extra_id_53>",
398
+ "lstrip": true,
399
+ "normalized": false,
400
+ "rstrip": true,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "32047": {
405
+ "content": "<extra_id_52>",
406
+ "lstrip": true,
407
+ "normalized": false,
408
+ "rstrip": true,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "32048": {
413
+ "content": "<extra_id_51>",
414
+ "lstrip": true,
415
+ "normalized": false,
416
+ "rstrip": true,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "32049": {
421
+ "content": "<extra_id_50>",
422
+ "lstrip": true,
423
+ "normalized": false,
424
+ "rstrip": true,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "32050": {
429
+ "content": "<extra_id_49>",
430
+ "lstrip": true,
431
+ "normalized": false,
432
+ "rstrip": true,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "32051": {
437
+ "content": "<extra_id_48>",
438
+ "lstrip": true,
439
+ "normalized": false,
440
+ "rstrip": true,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "32052": {
445
+ "content": "<extra_id_47>",
446
+ "lstrip": true,
447
+ "normalized": false,
448
+ "rstrip": true,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "32053": {
453
+ "content": "<extra_id_46>",
454
+ "lstrip": true,
455
+ "normalized": false,
456
+ "rstrip": true,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "32054": {
461
+ "content": "<extra_id_45>",
462
+ "lstrip": true,
463
+ "normalized": false,
464
+ "rstrip": true,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "32055": {
469
+ "content": "<extra_id_44>",
470
+ "lstrip": true,
471
+ "normalized": false,
472
+ "rstrip": true,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "32056": {
477
+ "content": "<extra_id_43>",
478
+ "lstrip": true,
479
+ "normalized": false,
480
+ "rstrip": true,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "32057": {
485
+ "content": "<extra_id_42>",
486
+ "lstrip": true,
487
+ "normalized": false,
488
+ "rstrip": true,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "32058": {
493
+ "content": "<extra_id_41>",
494
+ "lstrip": true,
495
+ "normalized": false,
496
+ "rstrip": true,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "32059": {
501
+ "content": "<extra_id_40>",
502
+ "lstrip": true,
503
+ "normalized": false,
504
+ "rstrip": true,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "32060": {
509
+ "content": "<extra_id_39>",
510
+ "lstrip": true,
511
+ "normalized": false,
512
+ "rstrip": true,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "32061": {
517
+ "content": "<extra_id_38>",
518
+ "lstrip": true,
519
+ "normalized": false,
520
+ "rstrip": true,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "32062": {
525
+ "content": "<extra_id_37>",
526
+ "lstrip": true,
527
+ "normalized": false,
528
+ "rstrip": true,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "32063": {
533
+ "content": "<extra_id_36>",
534
+ "lstrip": true,
535
+ "normalized": false,
536
+ "rstrip": true,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "32064": {
541
+ "content": "<extra_id_35>",
542
+ "lstrip": true,
543
+ "normalized": false,
544
+ "rstrip": true,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "32065": {
549
+ "content": "<extra_id_34>",
550
+ "lstrip": true,
551
+ "normalized": false,
552
+ "rstrip": true,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "32066": {
557
+ "content": "<extra_id_33>",
558
+ "lstrip": true,
559
+ "normalized": false,
560
+ "rstrip": true,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "32067": {
565
+ "content": "<extra_id_32>",
566
+ "lstrip": true,
567
+ "normalized": false,
568
+ "rstrip": true,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "32068": {
573
+ "content": "<extra_id_31>",
574
+ "lstrip": true,
575
+ "normalized": false,
576
+ "rstrip": true,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "32069": {
581
+ "content": "<extra_id_30>",
582
+ "lstrip": true,
583
+ "normalized": false,
584
+ "rstrip": true,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "32070": {
589
+ "content": "<extra_id_29>",
590
+ "lstrip": true,
591
+ "normalized": false,
592
+ "rstrip": true,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "32071": {
597
+ "content": "<extra_id_28>",
598
+ "lstrip": true,
599
+ "normalized": false,
600
+ "rstrip": true,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "32072": {
605
+ "content": "<extra_id_27>",
606
+ "lstrip": true,
607
+ "normalized": false,
608
+ "rstrip": true,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "32073": {
613
+ "content": "<extra_id_26>",
614
+ "lstrip": true,
615
+ "normalized": false,
616
+ "rstrip": true,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "32074": {
621
+ "content": "<extra_id_25>",
622
+ "lstrip": true,
623
+ "normalized": false,
624
+ "rstrip": true,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "32075": {
629
+ "content": "<extra_id_24>",
630
+ "lstrip": true,
631
+ "normalized": false,
632
+ "rstrip": true,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "32076": {
637
+ "content": "<extra_id_23>",
638
+ "lstrip": true,
639
+ "normalized": false,
640
+ "rstrip": true,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "32077": {
645
+ "content": "<extra_id_22>",
646
+ "lstrip": true,
647
+ "normalized": false,
648
+ "rstrip": true,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "32078": {
653
+ "content": "<extra_id_21>",
654
+ "lstrip": true,
655
+ "normalized": false,
656
+ "rstrip": true,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "32079": {
661
+ "content": "<extra_id_20>",
662
+ "lstrip": true,
663
+ "normalized": false,
664
+ "rstrip": true,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "32080": {
669
+ "content": "<extra_id_19>",
670
+ "lstrip": true,
671
+ "normalized": false,
672
+ "rstrip": true,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "32081": {
677
+ "content": "<extra_id_18>",
678
+ "lstrip": true,
679
+ "normalized": false,
680
+ "rstrip": true,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "32082": {
685
+ "content": "<extra_id_17>",
686
+ "lstrip": true,
687
+ "normalized": false,
688
+ "rstrip": true,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "32083": {
693
+ "content": "<extra_id_16>",
694
+ "lstrip": true,
695
+ "normalized": false,
696
+ "rstrip": true,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "32084": {
701
+ "content": "<extra_id_15>",
702
+ "lstrip": true,
703
+ "normalized": false,
704
+ "rstrip": true,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "32085": {
709
+ "content": "<extra_id_14>",
710
+ "lstrip": true,
711
+ "normalized": false,
712
+ "rstrip": true,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "32086": {
717
+ "content": "<extra_id_13>",
718
+ "lstrip": true,
719
+ "normalized": false,
720
+ "rstrip": true,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "32087": {
725
+ "content": "<extra_id_12>",
726
+ "lstrip": true,
727
+ "normalized": false,
728
+ "rstrip": true,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "32088": {
733
+ "content": "<extra_id_11>",
734
+ "lstrip": true,
735
+ "normalized": false,
736
+ "rstrip": true,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "32089": {
741
+ "content": "<extra_id_10>",
742
+ "lstrip": true,
743
+ "normalized": false,
744
+ "rstrip": true,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "32090": {
749
+ "content": "<extra_id_9>",
750
+ "lstrip": true,
751
+ "normalized": false,
752
+ "rstrip": true,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "32091": {
757
+ "content": "<extra_id_8>",
758
+ "lstrip": true,
759
+ "normalized": false,
760
+ "rstrip": true,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "32092": {
765
+ "content": "<extra_id_7>",
766
+ "lstrip": true,
767
+ "normalized": false,
768
+ "rstrip": true,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "32093": {
773
+ "content": "<extra_id_6>",
774
+ "lstrip": true,
775
+ "normalized": false,
776
+ "rstrip": true,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "32094": {
781
+ "content": "<extra_id_5>",
782
+ "lstrip": true,
783
+ "normalized": false,
784
+ "rstrip": true,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "32095": {
789
+ "content": "<extra_id_4>",
790
+ "lstrip": true,
791
+ "normalized": false,
792
+ "rstrip": true,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "32096": {
797
+ "content": "<extra_id_3>",
798
+ "lstrip": true,
799
+ "normalized": false,
800
+ "rstrip": true,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "32097": {
805
+ "content": "<extra_id_2>",
806
+ "lstrip": true,
807
+ "normalized": false,
808
+ "rstrip": true,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "32098": {
813
+ "content": "<extra_id_1>",
814
+ "lstrip": true,
815
+ "normalized": false,
816
+ "rstrip": true,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "32099": {
821
+ "content": "<extra_id_0>",
822
+ "lstrip": true,
823
+ "normalized": false,
824
+ "rstrip": true,
825
+ "single_word": false,
826
+ "special": true
827
+ }
828
+ },
829
+ "additional_special_tokens": [
830
+ "<extra_id_0>",
831
+ "<extra_id_1>",
832
+ "<extra_id_2>",
833
+ "<extra_id_3>",
834
+ "<extra_id_4>",
835
+ "<extra_id_5>",
836
+ "<extra_id_6>",
837
+ "<extra_id_7>",
838
+ "<extra_id_8>",
839
+ "<extra_id_9>",
840
+ "<extra_id_10>",
841
+ "<extra_id_11>",
842
+ "<extra_id_12>",
843
+ "<extra_id_13>",
844
+ "<extra_id_14>",
845
+ "<extra_id_15>",
846
+ "<extra_id_16>",
847
+ "<extra_id_17>",
848
+ "<extra_id_18>",
849
+ "<extra_id_19>",
850
+ "<extra_id_20>",
851
+ "<extra_id_21>",
852
+ "<extra_id_22>",
853
+ "<extra_id_23>",
854
+ "<extra_id_24>",
855
+ "<extra_id_25>",
856
+ "<extra_id_26>",
857
+ "<extra_id_27>",
858
+ "<extra_id_28>",
859
+ "<extra_id_29>",
860
+ "<extra_id_30>",
861
+ "<extra_id_31>",
862
+ "<extra_id_32>",
863
+ "<extra_id_33>",
864
+ "<extra_id_34>",
865
+ "<extra_id_35>",
866
+ "<extra_id_36>",
867
+ "<extra_id_37>",
868
+ "<extra_id_38>",
869
+ "<extra_id_39>",
870
+ "<extra_id_40>",
871
+ "<extra_id_41>",
872
+ "<extra_id_42>",
873
+ "<extra_id_43>",
874
+ "<extra_id_44>",
875
+ "<extra_id_45>",
876
+ "<extra_id_46>",
877
+ "<extra_id_47>",
878
+ "<extra_id_48>",
879
+ "<extra_id_49>",
880
+ "<extra_id_50>",
881
+ "<extra_id_51>",
882
+ "<extra_id_52>",
883
+ "<extra_id_53>",
884
+ "<extra_id_54>",
885
+ "<extra_id_55>",
886
+ "<extra_id_56>",
887
+ "<extra_id_57>",
888
+ "<extra_id_58>",
889
+ "<extra_id_59>",
890
+ "<extra_id_60>",
891
+ "<extra_id_61>",
892
+ "<extra_id_62>",
893
+ "<extra_id_63>",
894
+ "<extra_id_64>",
895
+ "<extra_id_65>",
896
+ "<extra_id_66>",
897
+ "<extra_id_67>",
898
+ "<extra_id_68>",
899
+ "<extra_id_69>",
900
+ "<extra_id_70>",
901
+ "<extra_id_71>",
902
+ "<extra_id_72>",
903
+ "<extra_id_73>",
904
+ "<extra_id_74>",
905
+ "<extra_id_75>",
906
+ "<extra_id_76>",
907
+ "<extra_id_77>",
908
+ "<extra_id_78>",
909
+ "<extra_id_79>",
910
+ "<extra_id_80>",
911
+ "<extra_id_81>",
912
+ "<extra_id_82>",
913
+ "<extra_id_83>",
914
+ "<extra_id_84>",
915
+ "<extra_id_85>",
916
+ "<extra_id_86>",
917
+ "<extra_id_87>",
918
+ "<extra_id_88>",
919
+ "<extra_id_89>",
920
+ "<extra_id_90>",
921
+ "<extra_id_91>",
922
+ "<extra_id_92>",
923
+ "<extra_id_93>",
924
+ "<extra_id_94>",
925
+ "<extra_id_95>",
926
+ "<extra_id_96>",
927
+ "<extra_id_97>",
928
+ "<extra_id_98>",
929
+ "<extra_id_99>"
930
+ ],
931
+ "clean_up_tokenization_spaces": true,
932
+ "eos_token": "</s>",
933
+ "extra_ids": 100,
934
+ "legacy": true,
935
+ "model_max_length": 226,
936
+ "pad_token": "<pad>",
937
+ "sp_model_kwargs": {},
938
+ "tokenizer_class": "T5Tokenizer",
939
+ "unk_token": "<unk>"
940
+ }
wav2vec/wav2vec2-base-960h/.gitattributes ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.pb filter=lfs diff=lfs merge=lfs -text
15
+ *.pt filter=lfs diff=lfs merge=lfs -text
16
+ *.pth filter=lfs diff=lfs merge=lfs -text
17
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
18
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
wav2vec/wav2vec2-base-960h/README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - librispeech_asr
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ - hf-asr-leaderboard
9
+ license: apache-2.0
10
+ widget:
11
+ - example_title: Librispeech sample 1
12
+ src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
13
+ - example_title: Librispeech sample 2
14
+ src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
15
+ model-index:
16
+ - name: wav2vec2-base-960h
17
+ results:
18
+ - task:
19
+ name: Automatic Speech Recognition
20
+ type: automatic-speech-recognition
21
+ dataset:
22
+ name: LibriSpeech (clean)
23
+ type: librispeech_asr
24
+ config: clean
25
+ split: test
26
+ args:
27
+ language: en
28
+ metrics:
29
+ - name: Test WER
30
+ type: wer
31
+ value: 3.4
32
+ - task:
33
+ name: Automatic Speech Recognition
34
+ type: automatic-speech-recognition
35
+ dataset:
36
+ name: LibriSpeech (other)
37
+ type: librispeech_asr
38
+ config: other
39
+ split: test
40
+ args:
41
+ language: en
42
+ metrics:
43
+ - name: Test WER
44
+ type: wer
45
+ value: 8.6
46
+ ---
47
+
48
+ # Wav2Vec2-Base-960h
49
+
50
+ [Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
51
+
52
+ The base model pretrained and fine-tuned on 960 hours of Librispeech on 16kHz sampled speech audio. When using the model
53
+ make sure that your speech input is also sampled at 16Khz.
54
+
55
+ [Paper](https://arxiv.org/abs/2006.11477)
56
+
57
+ Authors: Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
58
+
59
+ **Abstract**
60
+
61
+ We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks the speech input in the latent space and solves a contrastive task defined over a quantization of the latent representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech recognition with limited amounts of labeled data.
62
+
63
+ The original model can be found under https://github.com/pytorch/fairseq/tree/master/examples/wav2vec#wav2vec-20.
64
+
65
+
66
+ # Usage
67
+
68
+ To transcribe audio files the model can be used as a standalone acoustic model as follows:
69
+
70
+ ```python
71
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
72
+ from datasets import load_dataset
73
+ import torch
74
+
75
+ # load model and tokenizer
76
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
77
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
78
+
79
+ # load dummy dataset and read soundfiles
80
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
81
+
82
+ # tokenize
83
+ input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values # Batch size 1
84
+
85
+ # retrieve logits
86
+ logits = model(input_values).logits
87
+
88
+ # take argmax and decode
89
+ predicted_ids = torch.argmax(logits, dim=-1)
90
+ transcription = processor.batch_decode(predicted_ids)
91
+ ```
92
+
93
+ ## Evaluation
94
+
95
+ This code snippet shows how to evaluate **facebook/wav2vec2-base-960h** on LibriSpeech's "clean" and "other" test data.
96
+
97
+ ```python
98
+ from datasets import load_dataset
99
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
100
+ import torch
101
+ from jiwer import wer
102
+
103
+
104
+ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
105
+
106
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
107
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
108
+
109
+ def map_to_pred(batch):
110
+ input_values = processor(batch["audio"]["array"], return_tensors="pt", padding="longest").input_values
111
+ with torch.no_grad():
112
+ logits = model(input_values.to("cuda")).logits
113
+
114
+ predicted_ids = torch.argmax(logits, dim=-1)
115
+ transcription = processor.batch_decode(predicted_ids)
116
+ batch["transcription"] = transcription
117
+ return batch
118
+
119
+ result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["audio"])
120
+
121
+ print("WER:", wer(result["text"], result["transcription"]))
122
+ ```
123
+
124
+ *Result (WER)*:
125
+
126
+ | "clean" | "other" |
127
+ |---|---|
128
+ | 3.4 | 8.6 |
wav2vec/wav2vec2-base-960h/config.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base-960h",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "codevector_dim": 256,
11
+ "contrastive_logits_temperature": 0.1,
12
+ "conv_bias": false,
13
+ "conv_dim": [
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512
21
+ ],
22
+ "conv_kernel": [
23
+ 10,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 3,
28
+ 2,
29
+ 2
30
+ ],
31
+ "conv_stride": [
32
+ 5,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2,
38
+ 2
39
+ ],
40
+ "ctc_loss_reduction": "sum",
41
+ "ctc_zero_infinity": false,
42
+ "diversity_loss_weight": 0.1,
43
+ "do_stable_layer_norm": false,
44
+ "eos_token_id": 2,
45
+ "feat_extract_activation": "gelu",
46
+ "feat_extract_dropout": 0.0,
47
+ "feat_extract_norm": "group",
48
+ "feat_proj_dropout": 0.1,
49
+ "feat_quantizer_dropout": 0.0,
50
+ "final_dropout": 0.1,
51
+ "gradient_checkpointing": false,
52
+ "hidden_act": "gelu",
53
+ "hidden_dropout": 0.1,
54
+ "hidden_dropout_prob": 0.1,
55
+ "hidden_size": 768,
56
+ "initializer_range": 0.02,
57
+ "intermediate_size": 3072,
58
+ "layer_norm_eps": 1e-05,
59
+ "layerdrop": 0.1,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_prob": 0.05,
64
+ "model_type": "wav2vec2",
65
+ "num_attention_heads": 12,
66
+ "num_codevector_groups": 2,
67
+ "num_codevectors_per_group": 320,
68
+ "num_conv_pos_embedding_groups": 16,
69
+ "num_conv_pos_embeddings": 128,
70
+ "num_feat_extract_layers": 7,
71
+ "num_hidden_layers": 12,
72
+ "num_negatives": 100,
73
+ "pad_token_id": 0,
74
+ "proj_codevector_dim": 256,
75
+ "transformers_version": "4.7.0.dev0",
76
+ "vocab_size": 32
77
+ }
wav2vec/wav2vec2-base-960h/feature_extractor_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_dim": 1,
4
+ "padding_side": "right",
5
+ "padding_value": 0.0,
6
+ "return_attention_mask": false,
7
+ "sampling_rate": 16000
8
+ }
wav2vec/wav2vec2-base-960h/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aa76ab2243c81747a1f832954586bc566090c83a0ac167df6f31f0fa917d74a
3
+ size 377607901
wav2vec/wav2vec2-base-960h/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_size": 1,
4
+ "padding_side": "right",
5
+ "padding_value": 0.0,
6
+ "return_attention_mask": false,
7
+ "sampling_rate": 16000
8
+ }
wav2vec/wav2vec2-base-960h/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
wav2vec/wav2vec2-base-960h/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "return_attention_mask": false, "do_normalize": true}
wav2vec/wav2vec2-base-960h/vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}