Update modeling_minicpm.py
modeling_minicpm.py (+1, -34)
@@ -144,12 +144,10 @@ def compressed_attention(
 def calc_chunks_with_stride(cu_seqlen, chunk_size, kernel_stride):
     """
     Compute the chunks that require Sparse attention, with stride support.
-
     Args:
         cu_seqlen (torch.Tensor): Cumulative sequence lengths for each sample.
         chunk_size (int): Chunk size used for Sparse attention.
        kernel_stride (int): Stride size when sliding over the sequence.
-
     Returns:
         filtered_indices (torch.Tensor): Indices used to directly index into the key/value tensors.
         cu_seqlens_compressed (torch.Tensor): Cumulative sequence lengths after compression.
@@ -192,7 +190,6 @@ class CompressK(torch.nn.Module):
     def __init__(self, head_num_k, head_dim, kernel_size, kernel_stride=16):
         """
         Module for compressing key (K) representations.
-
         Args:
             head_num_k (int): Number of key attention heads.
             head_dim (int): Dimension of each attention head.
@@ -208,15 +205,12 @@ class CompressK(torch.nn.Module):
     def forward(self, k: torch.Tensor, cu_seqlens):
         """
         Forward pass for compressing the key (K) tensor.
-
         Args:
             k (torch.Tensor): Input key tensor of shape (total_seq_len, num_heads, head_dim).
             cu_seqlens (torch.Tensor): Cumulative sequence lengths for each sample in the batch, typically used for handling variable-length sequences.
-
         Returns:
             compress_k (torch.Tensor): Compressed key tensor.
             cu_seqlens_compressed (torch.Tensor): Updated cumulative sequence lengths after compression.
-
         """
         # Compute chunk-related metadata, with stride support
         filtered_k_indices, cu_seqlens_compressed = calc_chunks_with_stride(
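For context on the `cu_seqlens_compressed` values these docstrings describe, the following is a minimal, self-contained sketch (not taken from the patch) of how cumulative lengths could shrink when a window of `chunk_size` slides over each packed sequence with step `kernel_stride`. The helper name and the exact chunk-count formula are illustrative assumptions, not the repository's implementation.

```python
import torch

def compressed_cu_seqlens(cu_seqlens: torch.Tensor, chunk_size: int, kernel_stride: int) -> torch.Tensor:
    """Hypothetical helper: cumulative lengths after sliding a chunk_size window
    with stride kernel_stride over each packed sequence (illustration only)."""
    seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]                        # per-sample lengths
    n_chunks = torch.clamp((seq_lens - chunk_size) // kernel_stride + 1, min=0)
    out = torch.zeros(len(seq_lens) + 1, dtype=cu_seqlens.dtype)
    out[1:] = torch.cumsum(n_chunks, dim=0)                            # compressed cumulative lengths
    return out

# Two packed sequences of lengths 96 and 40, chunk_size=32, kernel_stride=16:
cu = torch.tensor([0, 96, 136])
print(compressed_cu_seqlens(cu, 32, 16))  # tensor([0, 5, 6])
```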
@@ -535,7 +529,6 @@ def rotate_half(x):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
-
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
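For reference, `rotate_half` and `apply_rotary_pos_emb` in this family of models typically follow the standard transformers recipe sketched below. This is a sketch of that common pattern, not a copy of the function in this file; details such as how `cos`/`sin` are indexed may differ.

```python
import torch

def rotate_half(x):
    """Rotate half of the hidden dims: (x1, x2) -> (-x2, x1)."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Apply RoPE; unsqueeze_dim broadcasts cos/sin over the head dimension."""
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```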
@@ -906,7 +899,6 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
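The "unpad the input" step mentioned in this docstring usually means dropping padded positions and switching to the variable-length (packed) Flash Attention interface. A rough sketch of that bookkeeping, mirroring the common `_get_unpad_data`-style helper (the names and exact layout here are illustrative):

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

# Flattened positions of real (non-padded) tokens across the whole batch.
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()

# Per-sample lengths and their cumulative sums, as expected by varlen attention kernels.
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

print(indices)     # tensor([0, 1, 2, 5, 6, 7, 8, 9])
print(cu_seqlens)  # tensor([0, 3, 8], dtype=torch.int32)
```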
@@ -1136,7 +1128,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1364,7 +1355,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1628,11 +1618,9 @@ MINICPM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
-
    Parameters:
        config ([`MiniCPMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1672,50 +1660,38 @@ MINICPM_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
-
            [What are attention masks?](../glossary#attention-mask)
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).
-
            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
-
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.
-
            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.
-
            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.
-
            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
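To make the `attention_mask` semantics above concrete, a toy batch (token values are arbitrary) could look like this, with 1 marking real tokens and 0 marking padding; a tokenizer call with `padding=True, return_tensors="pt"` returns exactly this pair of tensors.

```python
import torch

# Two right-padded sequences of lengths 4 and 6, padded to 6 tokens.
input_ids = torch.tensor([[ 5, 17, 42,  9,  0,  0],
                          [ 7, 31,  2, 11, 23,  8]])
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0],   # last two positions are padding
                               [1, 1, 1, 1, 1, 1]])  # no padding in this row
```

The legacy `past_key_values` format described above is simply a tuple of `config.n_layers` `(key, value)` pairs, each tensor of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.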
@@ -1744,7 +1720,6 @@ MINICPM_INPUTS_DOCSTRING = r"""
 class MiniCPMModel(MiniCPMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
-
     Args:
         config: MiniCPMConfig
     """
@@ -1971,20 +1946,14 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
        Returns:
-
        Example:
-
        ```python
        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
-
        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -2164,10 +2133,8 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
 @add_start_docstrings(
     """
     The MiniCPM Model transformer with a sequence classification head on top (linear layer).
-
     [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
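The "last token that is not a padding token" lookup described here can be pictured with the sketch below, which assumes right padding and a known `pad_token_id`; the actual implementation in the file may handle edge cases (left padding, missing pad token) differently.

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[11, 22, 33,  0,  0],
                          [44, 55, 66, 77,  0]])

# Index of the last non-padding token in each row (valid for right padding only).
sequence_lengths = (input_ids != pad_token_id).sum(dim=-1) - 1
print(sequence_lengths)  # tensor([2, 3])

# The classification head then reads the logits at these positions:
batch_idx = torch.arange(input_ids.size(0))
# pooled_logits = logits[batch_idx, sequence_lengths]
```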
@@ -2280,4 +2247,4 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )