Update modeling_minicpm.py
modeling_minicpm.py (+1, -34)
@@ -144,12 +144,10 @@ def compressed_attention(
 def calc_chunks_with_stride(cu_seqlen, chunk_size, kernel_stride):
     """
     Compute the chunks that require Sparse attention, with stride support.
-
     Args:
         cu_seqlen (torch.Tensor): Cumulative sequence lengths for each sample.
         chunk_size (int): Chunk size used for Sparse attention.
        kernel_stride (int): Stride size when sliding over the sequence.
-
     Returns:
         filtered_indices (torch.Tensor): Indices used to directly index into the key/value tensors.
         cu_seqlens_compressed (torch.Tensor): Cumulative sequence lengths after compression.
@@ -192,7 +190,6 @@ class CompressK(torch.nn.Module):
     def __init__(self, head_num_k, head_dim, kernel_size, kernel_stride=16):
         """
         Module for compressing key (K) representations.
-
         Args:
             head_num_k (int): Number of key attention heads.
             head_dim (int): Dimension of each attention head.
@@ -208,15 +205,12 @@ class CompressK(torch.nn.Module):
     def forward(self, k: torch.Tensor, cu_seqlens):
         """
         Forward pass for compressing the key (K) tensor.
-
         Args:
             k (torch.Tensor): Input key tensor of shape (total_seq_len, num_heads, head_dim).
             cu_seqlens (torch.Tensor): Cumulative sequence lengths for each sample in the batch, typically used for handling variable-length sequences.
-
         Returns:
             compress_k (torch.Tensor): Compressed key tensor.
             cu_seqlens_compressed (torch.Tensor): Updated cumulative sequence lengths after compression.
-
         """
         # Compute chunk-related metadata, with stride support
         filtered_k_indices, cu_seqlens_compressed = calc_chunks_with_stride(
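For context on the `cu_seqlens_compressed` values these docstrings describe, the following is a minimal, self-contained sketch (not taken from the patch) of how cumulative lengths could shrink when a window of `chunk_size` slides over each packed sequence with step `kernel_stride`. The helper name and the exact chunk-count formula are illustrative assumptions, not the repository's implementation.

```python
import torch

def compressed_cu_seqlens(cu_seqlens: torch.Tensor, chunk_size: int, kernel_stride: int) -> torch.Tensor:
    """Hypothetical helper: cumulative lengths after sliding a chunk_size window
    with stride kernel_stride over each packed sequence (illustration only)."""
    seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]                        # per-sample lengths
    n_chunks = torch.clamp((seq_lens - chunk_size) // kernel_stride + 1, min=0)
    out = torch.zeros(len(seq_lens) + 1, dtype=cu_seqlens.dtype)
    out[1:] = torch.cumsum(n_chunks, dim=0)                            # compressed cumulative lengths
    return out

# Two packed sequences of lengths 96 and 40, chunk_size=32, kernel_stride=16:
cu = torch.tensor([0, 96, 136])
print(compressed_cu_seqlens(cu, 32, 16))  # tensor([0, 5, 6])
```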
@@ -535,7 +529,6 @@ def rotate_half(x):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
-
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
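For reference, `rotate_half` and `apply_rotary_pos_emb` in this family of models typically follow the standard transformers recipe sketched below. This is a sketch of that common pattern, not a copy of the function in this file; details such as how `cos`/`sin` are indexed may differ.

```python
import torch

def rotate_half(x):
    """Rotate half of the hidden dims: (x1, x2) -> (-x2, x1)."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Apply RoPE; unsqueeze_dim broadcasts cos/sin over the head dimension."""
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```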
@@ -906,7 +899,6 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
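The "unpad the input" step mentioned in this docstring usually means dropping padded positions and switching to the variable-length (packed) Flash Attention interface. A rough sketch of that bookkeeping, mirroring the common `_get_unpad_data`-style helper (the names and exact layout here are illustrative):

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

# Flattened positions of real (non-padded) tokens across the whole batch.
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()

# Per-sample lengths and their cumulative sums, as expected by varlen attention kernels.
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

print(indices)     # tensor([0, 1, 2, 5, 6, 7, 8, 9])
print(cu_seqlens)  # tensor([0, 3, 8], dtype=torch.int32)
```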
@@ -1136,7 +1128,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1364,7 +1355,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1628,11 +1618,9 @@ MINICPM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
-
    Parameters:
        config ([`MiniCPMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1672,50 +1660,38 @@ MINICPM_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
-
            [What are attention masks?](../glossary#attention-mask)
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).
-
            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
-
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.
-
            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.
-
            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.
-
            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
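To make the `attention_mask` semantics above concrete, a toy batch (token values are arbitrary) could look like this, with 1 marking real tokens and 0 marking padding; a tokenizer call with `padding=True, return_tensors="pt"` returns exactly this pair of tensors.

```python
import torch

# Two right-padded sequences of lengths 4 and 6, padded to 6 tokens.
input_ids = torch.tensor([[ 5, 17, 42,  9,  0,  0],
                          [ 7, 31,  2, 11, 23,  8]])
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0],   # last two positions are padding
                               [1, 1, 1, 1, 1, 1]])  # no padding in this row
```

The legacy `past_key_values` format described above is simply a tuple of `config.n_layers` `(key, value)` pairs, each tensor of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.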
@@ -1744,7 +1720,6 @@ MINICPM_INPUTS_DOCSTRING = r"""
 class MiniCPMModel(MiniCPMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
-
     Args:
         config: MiniCPMConfig
     """
@@ -1971,20 +1946,14 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
        Returns:
-
        Example:
-
        ```python
        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
-
        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -2164,10 +2133,8 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
 @add_start_docstrings(
     """
     The MiniCPM Model transformer with a sequence classification head on top (linear layer).
-
     [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
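The "last token that is not a padding token" lookup described here can be pictured with the sketch below, which assumes right padding and a known `pad_token_id`; the actual implementation in the file may handle edge cases (left padding, missing pad token) differently.

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[11, 22, 33,  0,  0],
                          [44, 55, 66, 77,  0]])

# Index of the last non-padding token in each row (valid for right padding only).
sequence_lengths = (input_ids != pad_token_id).sum(dim=-1) - 1
print(sequence_lengths)  # tensor([2, 3])

# The classification head then reads the logits at these positions:
batch_idx = torch.arange(input_ids.size(0))
# pooled_logits = logits[batch_idx, sequence_lengths]
```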
@@ -2280,4 +2247,4 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )