suhmily committed
Commit 4106108 · verified · 1 Parent(s): 04f229a

Update modeling_minicpm.py

Files changed (1)
  1. modeling_minicpm.py +1 -34
modeling_minicpm.py CHANGED
@@ -144,12 +144,10 @@ def compressed_attention(
def calc_chunks_with_stride(cu_seqlen, chunk_size, kernel_stride):
    """
    Compute the chunks that require Sparse attention, with stride support.
-
    Args:
        cu_seqlen (torch.Tensor): Cumulative sequence lengths for each sample.
        chunk_size (int): Chunk size used for Sparse attention.
        kernel_stride (int): Stride size when sliding over the sequence.
-
    Returns:
        filtered_indices (torch.Tensor): Indices used to directly index into the key/value tensors.
        cu_seqlens_compressed (torch.Tensor): Cumulative sequence lengths after compression.
@@ -192,7 +190,6 @@ class CompressK(torch.nn.Module):
    def __init__(self, head_num_k, head_dim, kernel_size, kernel_stride=16):
        """
        Module for compressing key (K) representations.
-
        Args:
            head_num_k (int): Number of key attention heads.
            head_dim (int): Dimension of each attention head.
@@ -208,15 +205,12 @@ class CompressK(torch.nn.Module):
    def forward(self, k: torch.Tensor, cu_seqlens):
        """
        Forward pass for compressing the key (K) tensor.
-
        Args:
            k (torch.Tensor): Input key tensor of shape (total_seq_len, num_heads, head_dim).
            cu_seqlens (torch.Tensor): Cumulative sequence lengths for each sample in the batch, typically used for handling variable-length sequences.
-
        Returns:
            compress_k (torch.Tensor): Compressed key tensor.
            cu_seqlens_compressed (torch.Tensor): Updated cumulative sequence lengths after compression.
-
        """
        # Compute chunk-related metadata, with stride support
        filtered_k_indices, cu_seqlens_compressed = calc_chunks_with_stride(
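The two docstrings above already describe the compressed-key interface: keys are packed flash-attn style as `(total_seq_len, num_heads, head_dim)` with `cu_seqlens` marking sample boundaries. A minimal usage sketch, assuming `CompressK` can be instantiated standalone and ignoring any CUDA/Triton requirements the real kernels may have (the sizes below are made up for illustration):

```python
import torch

# Hypothetical sizes, chosen only for illustration; real values come from the model config.
head_num_k, head_dim = 2, 128
compress_k_module = CompressK(head_num_k, head_dim, kernel_size=32, kernel_stride=16)

# Two packed sequences of lengths 1024 and 512 (the varlen layout described in the docstring).
cu_seqlens = torch.tensor([0, 1024, 1536], dtype=torch.int32)
k = torch.randn(1536, head_num_k, head_dim)

# Returns the compressed keys plus the cumulative sequence lengths after compression.
compress_k_out, cu_seqlens_compressed = compress_k_module(k, cu_seqlens)
```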
@@ -535,7 +529,6 @@ def rotate_half(x):

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.
-
    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
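This hunk only touches the head of `apply_rotary_pos_emb`. For context, the body of a function with this exact signature in Hugging Face decoder models of this generation normally follows the pattern below; this is a sketch of that common pattern, not a quote of the MiniCPM file:

```python
import torch

def rotate_half(x):
    # Split the last dimension in half and swap the halves with a sign flip.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    # Gather the cos/sin rows for each position and broadcast over the head dimension.
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```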
@@ -906,7 +899,6 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
-
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
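The docstring repeated in this hunk and the two that follow describes the usual unpad → varlen flash-attention → re-pad flow. As a rough sketch of the central call, assuming the `flash_attn` 2.x package and already-unpadded tensors with precomputed `cu_seqlens` (illustrative, not the MiniCPM code itself):

```python
import torch
from flash_attn import flash_attn_varlen_func

def varlen_attention(q, k, v, cu_seqlens, max_seqlen, dropout_p=0.0):
    # q, k, v are packed as (total_tokens, num_heads, head_dim) after unpadding;
    # cu_seqlens is an int32 tensor of per-sample boundaries with length batch + 1.
    return flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_k=cu_seqlens,
        max_seqlen_q=max_seqlen,
        max_seqlen_k=max_seqlen,
        dropout_p=dropout_p,
        causal=True,  # decoder-style masking
    )
```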
@@ -1136,7 +1128,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
-
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
@@ -1364,7 +1355,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
-
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
@@ -1628,11 +1618,9 @@ MINICPM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
-
    Parameters:
        config ([`MiniCPMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1672,50 +1660,38 @@ MINICPM_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
-
            [What are attention masks?](../glossary#attention-mask)
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).
-
            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
-
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.
-
            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
              shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
              cache format.
-
            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.
-
            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
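The `past_key_values` paragraphs above are what make incremental decoding cheap: after the first forward pass, only the newest token has to be fed back in together with the returned cache. A minimal sketch of that contract using the generic `transformers` API (`MODEL_PATH` is a placeholder, and this is ordinary causal-LM usage rather than MiniCPM-specific code):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)      # MODEL_PATH: placeholder
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Prefill: run the whole prompt once and keep the returned cache.
inputs = tokenizer("The quick brown fox", return_tensors="pt")
out = model(**inputs, use_cache=True)
past_key_values = out.past_key_values

# Decode step: feed only the last sampled token, shape (batch_size, 1),
# plus the cache, exactly as the docstring describes.
next_token = out.logits[:, -1:].argmax(dim=-1)                # greedy pick, shape (1, 1)
step = model(input_ids=next_token, past_key_values=past_key_values, use_cache=True)
past_key_values = step.past_key_values                        # updated cache for the next step
```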
@@ -1744,7 +1720,6 @@ MINICPM_INPUTS_DOCSTRING = r"""
class MiniCPMModel(MiniCPMPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
-
    Args:
        config: MiniCPMConfig
    """
@@ -1971,20 +1946,14 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
        Returns:
-
        Example:
-
        ```python
        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
-
        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -2164,10 +2133,8 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
@add_start_docstrings(
    """
    The MiniCPM Model transformer with a sequence classification head on top (linear layer).
-
    [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.
-
    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
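The class docstring excerpted above explains how the sequence-classification head picks its pooling position: the last non-padding token when a `pad_token_id` is configured, otherwise the last position in each row. A sketch of that selection, following the pattern used by comparable `*ForSequenceClassification` heads in `transformers` (illustrative, not a quote of this file):

```python
import torch
from typing import Optional

def last_token_indices(input_ids: torch.Tensor, pad_token_id: Optional[int]) -> torch.Tensor:
    # Per row, the index of the token whose hidden state the classifier pools.
    if pad_token_id is None:
        # No pad token configured: fall back to the literal last position.
        return torch.full((input_ids.shape[0],), input_ids.shape[1] - 1)
    # First occurrence of the pad token minus one gives the last real token;
    # the modulo keeps fully unpadded rows pointing at the final position.
    return (torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1) % input_ids.shape[-1]

ids = torch.tensor([[5, 6, 7, 0, 0],    # 0 stands in for the pad id here
                    [8, 9, 2, 3, 4]])   # no padding at all
print(last_token_indices(ids, pad_token_id=0))  # tensor([2, 4])
```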
@@ -2280,4 +2247,4 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
-        )
+        )