liang.zhao committed · Commit 67b506b · Parent(s): 8c1c087

update model and config

Browse files:
- .gitattributes +4 -0
- config.json +2 -2
- modeling_skywork.py +29 -1
- pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin +2 -2
- pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin +2 -2
- pytorch_model-00003-of-00004.bin +3 -0
- pytorch_model-00004-of-00004.bin +3 -0
- pytorch_model.bin.index.json +0 -0
- skywork_13b_sft.sh +128 -0
.gitattributes
CHANGED
@@ -35,3 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text
 pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00001-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00002-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00003-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00004-of-00004.bin filter=lfs diff=lfs merge=lfs -text
config.json
CHANGED
@@ -33,7 +33,7 @@
   "rms_norm_eps": 1e-06,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.34.0",
   "use_cache": true,
-  "vocab_size":
+  "vocab_size": 65536
 }
modeling_skywork.py
CHANGED
@@ -179,6 +179,27 @@ class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
+class SkyworkNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        base = (self.base * self.scaling_factor) ** (self.dim / (self.dim - 2))
+        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -189,7 +210,7 @@ def rotate_half(x):
 # Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     cos = cos[position_ids].unsqueeze(1)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
-    sin = sin[position_ids].unsqueeze(1)
+    sin = sin[position_ids].unsqueeze(1)  #
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
@@ -290,6 +311,13 @@ class SkyworkAttention(nn.Module):
                 scaling_factor=scaling_factor,
                 base=self.rope_theta,
             )
+        elif scaling_type == "ntk":
+            self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                scaling_factor=scaling_factor,
+                base=self.rope_theta,
+            )
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
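For orientation, a minimal standalone sketch of the NTK-aware base rescaling the new class performs. The head_dim, rope_theta, and scaling_factor values here are illustrative assumptions, not values read from this checkpoint, and the commented rope_scaling hook assumes Llama-style config keys:

import torch

# NTK-aware rescaling, mirroring _set_cos_sin_cache in the hunk above.
dim, base, factor = 128, 10000.0, 4.0  # assumed head_dim, rope_theta, scaling_factor
ntk_base = (base * factor) ** (dim / (dim - 2))  # ~4.7e4 for these values
inv_freq = 1.0 / (ntk_base ** (torch.arange(0, dim, 2).float() / dim))
# A larger base lowers every rotary frequency, so positions past the original
# context window stay within phase ranges seen during pretraining.
print(ntk_base, inv_freq.min().item())

# Hypothetical config hook to reach the new elif branch (key names assumed):
# config.rope_scaling = {"type": "ntk", "factor": 4.0}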
pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f344f1c62a065f471de22d3a9ac6a4a4d2c1b8f98a2251080e59be55f7d77632
+size 3982977952
pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2fcff58e56b6d24abd9588ba1b17c58fbfe76cbfe1c35014c5bd59aa69f8fb7a
+size 3959875181
pytorch_model-00003-of-00004.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b903e4ba9505d009982736e9eb21f77c7151f91d0bb8d846756929f147e3eb9
+size 3966949023
pytorch_model-00004-of-00004.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:573ab5962dc857d263679e9e6e7493a0d150ec905ba5172af38cbb2eedac5f29
+size 2559125753
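To confirm a downloaded shard matches its pointer, one option is to recompute the SHA-256 and compare it against the oid above; a small sketch (file name taken from this commit, chunked reads to keep memory flat):

import hashlib

h = hashlib.sha256()
with open("pytorch_model-00003-of-00004.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
        h.update(chunk)
# oid from the LFS pointer of shard 3 above
assert h.hexdigest() == "0b903e4ba9505d009982736e9eb21f77c7151f91d0bb8d846756929f147e3eb9"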
pytorch_model.bin.index.json
CHANGED
The diff for this file is too large to render.
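Although the diff is not rendered, this file is the standard transformers shard index; a sketch of its likely shape after the 2-to-4-shard split (entries illustrative; the metadata total shown is the sum of the four shard file sizes above, and the real value may differ slightly):

# Illustrative structure only; the actual weight_map is too large to render here.
index = {
    "metadata": {"total_size": 14_468_927_909},  # sum of the four shard file sizes above
    "weight_map": {
        "model.embed_tokens.weight": "pytorch_model-00001-of-00004.bin",  # illustrative entry
        # ... one entry per tensor, each pointing at one of the four shards ...
        "lm_head.weight": "pytorch_model-00004-of-00004.bin",  # illustrative entry
    },
}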
skywork_13b_sft.sh
ADDED
@@ -0,0 +1,128 @@
+set -x
+export WANDB_API_KEY=${WANDB_API_KEY:-YOUR_WANDB_API_KEY}
+export WANDB_ENTITY=${WANDB_ENTITY:-YOUR_WANDB_ENTITY}
+export WANDB_PROJECT=${WANDB_PROJECT:-YOUR_WANDB_PROJECT}
+
+GPUS_PER_NODE=8
+NODE_RANK=$([ -z "$RANK" ] && echo -n 0 || echo -n $RANK)
+NNODES=$([ -z "$WORLD_SIZE" ] && echo -n 1 || echo -n $WORLD_SIZE)
+
+DEBUG="false"
+USE_LORA="false"
+TASK_TYPE="sft"
+
+MAX_STEP=1000
+LR=1e-4
+MAX_LENGTH=4096
+
+GLOBAL_BATCH_SIZE=32 # 8 * 4
+MICRO_BATCH_SIZE=1
+SAVE_STEP=500
+EVAL_STEP=500
+GRAD_ACC=$((${GLOBAL_BATCH_SIZE} / (${GPUS_PER_NODE} * $NNODES * ${MICRO_BATCH_SIZE}) ))
+
+FLAG=Skywork-13B-Base-sft-peaklr${LR}-steps${MAX_STEP}-gbs${GLOBAL_BATCH_SIZE}
+
+ROOT_PATH=${ROOT_PATH:-/data/user/your_name}
+MODEL_PATH=${MODEL_PATH:-SKYWORK_13B_BASE_MODEL_PATH}
+
+SFT_DATA_DIR=${SFT_DATA_DIR:-"YOUR_DATA_DIR"}
+DATA_CACHE_DIR=${DATA_CACHE_DIR:-"YOUR_DATA_CACHE_DIR"}
+
+OUTPUT_DIR=$ROOT_PATH/run_output/skywork-13b-sft-trainer/$FLAG
+LOAD_MODEL_PATH=$([ -z "$MODEL_PATH" ] && echo -n "$OUTPUT_DIR" || echo -n "$MODEL_PATH")
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --master_port 29501"
+if [[ $NNODES -gt 1 ]]; then
+
+    export NCCL_IB_HCA=mlx5
+    export NCCL_IB_TC=136
+    export NCCL_IB_SL=5
+    export NCCL_IB_GID_INDEX=3
+    export NCCL_IB_TIMEOUT=22
+    export NCCL_SOCKET_IFNAME=bond0
+    export NCCL_DEBUG=INFO
+    NODE_RANK=$RANK
+    if [ "$MASTER_ADDR" == "localhost" ] ; then MASTER_ADDR=`hostname`; fi
+
+    echo $MASTER_ADDR
+    echo $MASTER_PORT
+    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+fi
+
+if [ "$DEBUG" = "true" ]; then
+    EVAL_STEP=5
+    GLOBAL_BATCH_SIZE=8
+    GRAD_ACC=1
+
+fi
+
+DS_CONFIG=${DS_CONFIG:-train/ds_config/zero3_offload.json}
+
+LOG_ARGS="
+    --logging_steps 1 \
+    --logging_dir tensorboard/$FLAG \
+    --logging_strategy steps \
+    --logging_first_step True \
+    --report_to wandb \
+    --run_name $FLAG
+"
+
+OUTPUT_ARGS="
+    --save_strategy steps \
+    --save_total_limit 500 \
+    --save_steps $SAVE_STEP \
+    --output_dir $OUTPUT_DIR \
+    --overwrite_output_dir
+"
+
+TRAIN_ARGS="
+    --task_type $TASK_TYPE \
+    --do_train \
+    --max_seq_length $MAX_LENGTH \
+    --max_steps $MAX_STEP \
+    --lr_scheduler_type constant_with_warmup \
+    --learning_rate $LR \
+    --weight_decay 0.1 \
+    --warmup_steps 20 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.95 \
+    --gradient_accumulation_steps $GRAD_ACC \
+    --per_device_train_batch_size $MICRO_BATCH_SIZE
+"
+
+EVAL_ARGS="
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps $EVAL_STEP \
+    --per_device_eval_batch_size 1
+"
+
+INPUT_ARGS="
+    --model_name_or_path $LOAD_MODEL_PATH \
+    --tokenizer_name_or_path $LOAD_MODEL_PATH \
+    --sft_dataset_dir $SFT_DATA_DIR \
+    --data_cache_dir $DATA_CACHE_DIR
+"
+
+EXTRA_ARGS="
+    --seed 1234 \
+    --deepspeed $DS_CONFIG \
+    --gradient_checkpointing \
+    --ddp_find_unused_parameters False \
+    --preprocessing_num_workers 12 \
+    --ddp_timeout 30000 \
+    --torch_dtype bfloat16 \
+    --bf16 \
+    --load_in_kbits 16
+"
+
+mkdir -p logs/$FLAG || true
+torchrun $DISTRIBUTED_ARGS train/train.py \
+    $LOG_ARGS \
+    $OUTPUT_ARGS \
+    $TRAIN_ARGS \
+    $EVAL_ARGS \
+    $INPUT_ARGS \
+    $EXTRA_ARGS 2>&1 | tee -a logs/$FLAG/$RANK.log
+
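As a sanity check of the batch arithmetic above: with the single-node defaults (8 GPUs, micro-batch 1, global batch 32), GRAD_ACC resolves to 4, matching the "8 * 4" comment on GLOBAL_BATCH_SIZE:

# Mirrors the GRAD_ACC line in skywork_13b_sft.sh with its default values.
GPUS_PER_NODE, NNODES, MICRO_BATCH_SIZE, GLOBAL_BATCH_SIZE = 8, 1, 1, 32
grad_acc = GLOBAL_BATCH_SIZE // (GPUS_PER_NODE * NNODES * MICRO_BATCH_SIZE)
assert grad_acc == 4  # 32 = 8 GPUs x 1 micro-batch x 4 accumulation steps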