liang.zhao committed · Commit 67b506b · Parent(s): 8c1c087

update model and config

Browse files:
- .gitattributes +4 -0
- config.json +2 -2
- modeling_skywork.py +29 -1
- pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin +2 -2
- pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin +2 -2
- pytorch_model-00003-of-00004.bin +3 -0
- pytorch_model-00004-of-00004.bin +3 -0
- pytorch_model.bin.index.json +0 -0
- skywork_13b_sft.sh +128 -0
.gitattributes
CHANGED
@@ -35,3 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text
 pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00001-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00002-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00003-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00004-of-00004.bin filter=lfs diff=lfs merge=lfs -text
config.json
CHANGED
@@ -33,7 +33,7 @@
   "rms_norm_eps": 1e-06,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.34.0",
   "use_cache": true,
-  "vocab_size":
+  "vocab_size": 65536
 }
modeling_skywork.py
CHANGED
@@ -179,6 +179,27 @@ class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
+class SkyworkNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        base = (self.base * self.scaling_factor) ** (self.dim / (self.dim - 2))
+        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -189,7 +210,7 @@ def rotate_half(x):
 # Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     cos = cos[position_ids].unsqueeze(1)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
-    sin = sin[position_ids].unsqueeze(1)
+    sin = sin[position_ids].unsqueeze(1)  #
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
@@ -290,6 +311,13 @@ class SkyworkAttention(nn.Module):
                 scaling_factor=scaling_factor,
                 base=self.rope_theta,
             )
+        elif scaling_type == "ntk":
+            self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                scaling_factor=scaling_factor,
+                base=self.rope_theta,
+            )
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
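For orientation, a minimal standalone sketch of the NTK-aware base rescaling the new class performs. The head_dim, rope_theta, and scaling_factor values here are illustrative assumptions, not values read from this checkpoint, and the commented rope_scaling hook assumes Llama-style config keys:

import torch

# NTK-aware rescaling, mirroring _set_cos_sin_cache in the hunk above.
dim, base, factor = 128, 10000.0, 4.0  # assumed head_dim, rope_theta, scaling_factor
ntk_base = (base * factor) ** (dim / (dim - 2))  # ~4.7e4 for these values
inv_freq = 1.0 / (ntk_base ** (torch.arange(0, dim, 2).float() / dim))
# A larger base lowers every rotary frequency, so positions past the original
# context window stay within phase ranges seen during pretraining.
print(ntk_base, inv_freq.min().item())

# Hypothetical config hook to reach the new elif branch (key names assumed):
# config.rope_scaling = {"type": "ntk", "factor": 4.0}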
pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f344f1c62a065f471de22d3a9ac6a4a4d2c1b8f98a2251080e59be55f7d77632
+size 3982977952
pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2fcff58e56b6d24abd9588ba1b17c58fbfe76cbfe1c35014c5bd59aa69f8fb7a
+size 3959875181
pytorch_model-00003-of-00004.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b903e4ba9505d009982736e9eb21f77c7151f91d0bb8d846756929f147e3eb9
+size 3966949023
pytorch_model-00004-of-00004.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:573ab5962dc857d263679e9e6e7493a0d150ec905ba5172af38cbb2eedac5f29
+size 2559125753
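To confirm a downloaded shard matches its pointer, one option is to recompute the SHA-256 and compare it against the oid above; a small sketch (file name taken from this commit, chunked reads to keep memory flat):

import hashlib

h = hashlib.sha256()
with open("pytorch_model-00003-of-00004.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
        h.update(chunk)
# oid from the LFS pointer of shard 3 above
assert h.hexdigest() == "0b903e4ba9505d009982736e9eb21f77c7151f91d0bb8d846756929f147e3eb9"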
pytorch_model.bin.index.json
CHANGED
The diff for this file is too large to render.
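Although the diff is not rendered, this file is the standard transformers shard index; a sketch of its likely shape after the 2-to-4-shard split (entries illustrative; the metadata total shown is the sum of the four shard file sizes above, and the real value may differ slightly):

# Illustrative structure only; the actual weight_map is too large to render here.
index = {
    "metadata": {"total_size": 14_468_927_909},  # sum of the four shard file sizes above
    "weight_map": {
        "model.embed_tokens.weight": "pytorch_model-00001-of-00004.bin",  # illustrative entry
        # ... one entry per tensor, each pointing at one of the four shards ...
        "lm_head.weight": "pytorch_model-00004-of-00004.bin",  # illustrative entry
    },
}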
skywork_13b_sft.sh
ADDED
@@ -0,0 +1,128 @@
+set -x
+export WANDB_API_KEY=${WANDB_API_KEY:-YOUR_WANDB_API_KEY}
+export WANDB_ENTITY=${WANDB_ENTITY:-YOUR_WANDB_ENTITY}
+export WANDB_PROJECT=${WANDB_PROJECT:-YOUR_WANDB_PROJECT}
+
+GPUS_PER_NODE=8
+NODE_RANK=$([ -z "$RANK" ] && echo -n 0 || echo -n $RANK)
+NNODES=$([ -z "$WORLD_SIZE" ] && echo -n 1 || echo -n $WORLD_SIZE)
+
+DEBUG="false"
+USE_LORA="false"
+TASK_TYPE="sft"
+
+MAX_STEP=1000
+LR=1e-4
+MAX_LENGTH=4096
+
+GLOBAL_BATCH_SIZE=32 # 8 * 4
+MICRO_BATCH_SIZE=1
+SAVE_STEP=500
+EVAL_STEP=500
+GRAD_ACC=$((${GLOBAL_BATCH_SIZE} / (${GPUS_PER_NODE} * $NNODES * ${MICRO_BATCH_SIZE}) ))
+
+FLAG=Skywork-13B-Base-sft-peaklr${LR}-steps${MAX_STEP}-gbs${GLOBAL_BATCH_SIZE}
+
+ROOT_PATH=${ROOT_PATH:-/data/user/your_name}
+MODEL_PATH=${MODEL_PATH:-SKYWORK_13B_BASE_MODEL_PATH}
+
+SFT_DATA_DIR=${SFT_DATA_DIR:-"YOUR_DATA_DIR"}
+DATA_CACHE_DIR=${DATA_CACHE_DIR:-"YOUR_DATA_CACHE_DIR"}
+
+OUTPUT_DIR=$ROOT_PATH/run_output/skywork-13b-sft-trainer/$FLAG
+LOAD_MODEL_PATH=$([ -z "$MODEL_PATH" ] && echo -n "$OUTPUT_DIR" || echo -n "$MODEL_PATH")
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --master_port 29501"
+if [[ $NNODES -gt 1 ]]; then
+
+    export NCCL_IB_HCA=mlx5
+    export NCCL_IB_TC=136
+    export NCCL_IB_SL=5
+    export NCCL_IB_GID_INDEX=3
+    export NCCL_IB_TIMEOUT=22
+    export NCCL_SOCKET_IFNAME=bond0
+    export NCCL_DEBUG=INFO
+    NODE_RANK=$RANK
+    if [ "$MASTER_ADDR" == "localhost" ] ; then MASTER_ADDR=`hostname`; fi
+
+    echo $MASTER_ADDR
+    echo $MASTER_PORT
+    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+fi
+
+if [ "$DEBUG" = "true" ]; then
+    EVAL_STEP=5
+    GLOBAL_BATCH_SIZE=8
+    GRAD_ACC=1
+
+fi
+
+DS_CONFIG=${DS_CONFIG:-train/ds_config/zero3_offload.json}
+
+LOG_ARGS="
+    --logging_steps 1 \
+    --logging_dir tensorboard/$FLAG \
+    --logging_strategy steps \
+    --logging_first_step True \
+    --report_to wandb \
+    --run_name $FLAG
+"
+
+OUTPUT_ARGS="
+    --save_strategy steps \
+    --save_total_limit 500 \
+    --save_steps $SAVE_STEP \
+    --output_dir $OUTPUT_DIR \
+    --overwrite_output_dir
+"
+
+TRAIN_ARGS="
+    --task_type $TASK_TYPE \
+    --do_train \
+    --max_seq_length $MAX_LENGTH \
+    --max_steps $MAX_STEP \
+    --lr_scheduler_type constant_with_warmup \
+    --learning_rate $LR \
+    --weight_decay 0.1 \
+    --warmup_steps 20 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.95 \
+    --gradient_accumulation_steps $GRAD_ACC \
+    --per_device_train_batch_size $MICRO_BATCH_SIZE
+"
+
+EVAL_ARGS="
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps $EVAL_STEP \
+    --per_device_eval_batch_size 1
+"
+
+INPUT_ARGS="
+    --model_name_or_path $LOAD_MODEL_PATH \
+    --tokenizer_name_or_path $LOAD_MODEL_PATH \
+    --sft_dataset_dir $SFT_DATA_DIR \
+    --data_cache_dir $DATA_CACHE_DIR
+"
+
+EXTRA_ARGS="
+    --seed 1234 \
+    --deepspeed $DS_CONFIG \
+    --gradient_checkpointing \
+    --ddp_find_unused_parameters False \
+    --preprocessing_num_workers 12 \
+    --ddp_timeout 30000 \
+    --torch_dtype bfloat16 \
+    --bf16 \
+    --load_in_kbits 16
+"
+
+mkdir -p logs/$FLAG || true
+torchrun $DISTRIBUTED_ARGS train/train.py \
+    $LOG_ARGS \
+    $OUTPUT_ARGS \
+    $TRAIN_ARGS \
+    $EVAL_ARGS \
+    $INPUT_ARGS \
+    $EXTRA_ARGS 2>&1 | tee -a logs/$FLAG/$RANK.log
+
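As a sanity check of the batch arithmetic above: with the single-node defaults (8 GPUs, micro-batch 1, global batch 32), GRAD_ACC resolves to 4, matching the "8 * 4" comment on GLOBAL_BATCH_SIZE:

# Mirrors the GRAD_ACC line in skywork_13b_sft.sh with its default values.
GPUS_PER_NODE, NNODES, MICRO_BATCH_SIZE, GLOBAL_BATCH_SIZE = 8, 1, 1, 32
grad_acc = GLOBAL_BATCH_SIZE // (GPUS_PER_NODE * NNODES * MICRO_BATCH_SIZE)
assert grad_acc == 4  # 32 = 8 GPUs x 1 micro-batch x 4 accumulation steps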