#!/usr/bin/env python
# encoding: utf-8
'''
@license: (C) Copyright 2025, Hey.
@author: Hey
@email: [email protected]
@tel: 137****6540
@datetime: 2025/12/30 11:34
@project: lucaone
@file: configuration_lucaone
@desc: configuration class for LucaOne (LucaGPLM)
'''
from typing import Literal, Optional

from transformers import PretrainedConfig


class LucaGPLMConfig(PretrainedConfig):
    """Configuration for LucaOne (LucaGPLM) models, covering the backbone
    architecture as well as optional downstream task/classifier settings."""

    model_type = "lucaone"
    def __init__(
        self,
        vocab_size: int = 39,
        pad_token_id: int = 0,
        unk_token_id: int = 1,
        bos_token_id: int = 2,
        eos_token_id: int = 3,
        sep_token_id: int = 3,
        mask_token_id: int = 4,
        hidden_act: str = "gelu",
        max_position_embeddings: int = 4096,
        type_vocab_size: int = 2,
        num_hidden_layers: int = 20,
        num_attention_heads: int = 40,
        hidden_size: int = 2560,
        ffn_dim: int = 10240,
        no_position_embeddings: bool = True,
        no_token_type_embeddings: bool = False,
        alphabet: str = "gene_prot",
        token_dropout: bool = False,
        attention_probs_dropout_prob: float = 0.0,
        hidden_dropout_prob: float = 0.0,
        use_embed_layer_norm: bool = False,
        use_last_layer_norm: bool = True,
        embed_scale: float = 1.0,
        ignore_index: int = -100,
        layer_norm_eps: float = 1e-12,
        initializer_range: float = 0.02,
        task_level: Literal["seq_level", "token_level"] = "seq_level",
        task_type: Literal["embedding", "mlm", "multi_class", "binary_class", "regression", "multi_label"] = "embedding",
        classifier_num_labels: int = -1,
        classifier_dropout_prob: float = 0.1,
        classifier_pooling_type: Literal["cls", "value_attention", "context_attention", "mean"] = "value_attention",
        classifier_loss_type: Literal["binary_cross_entropy", "cross_entropy", "mse", "mae"] = "cross_entropy",
        classifier_loss_reduction: Literal["mean", "sum", "none"] = "mean",
        classifier_pos_weight: float = 1.0,
        classifier_weight: Optional[list] = None,
        tie_word_embeddings: bool = True,
        **kwargs,
    ):
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            **kwargs,
        )
        # Vocabulary and backbone architecture
        self.alphabet = alphabet
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.no_token_type_embeddings = no_token_type_embeddings
        self.no_position_embeddings = no_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.ffn_dim = ffn_dim
        # Dropout probabilities
        self.token_dropout = token_dropout
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.classifier_dropout_prob = classifier_dropout_prob
        # Loss masking, normalization, scaling, and initialization
        self.ignore_index = ignore_index
        self.use_embed_layer_norm = use_embed_layer_norm
        self.use_last_layer_norm = use_last_layer_norm
        self.embed_scale = embed_scale
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        # Special token ids and activation
        self.unk_token_id = unk_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.sep_token_id = sep_token_id
        self.mask_token_id = mask_token_id
        self.hidden_act = hidden_act
        # Downstream task and classifier head
        self.classifier_num_labels = classifier_num_labels
        self.classifier_pooling_type = classifier_pooling_type
        self.task_level = task_level
        self.task_type = task_type
        self.classifier_loss_type = classifier_loss_type
        self.classifier_loss_reduction = classifier_loss_reduction
        self.classifier_pos_weight = classifier_pos_weight
        self.classifier_weight = classifier_weight


__all__ = ["LucaGPLMConfig"]
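

# Minimal usage sketch (illustrative, not part of the released LucaOne code):
# it constructs the default config, then a hypothetical sequence-level binary
# classifier variant, and round-trips it through the standard transformers
# save_pretrained/from_pretrained serialization. The chosen task/classifier
# values below are assumptions for demonstration only.
if __name__ == "__main__":
    import tempfile

    # Default configuration, using the checkpoint-sized defaults above.
    config = LucaGPLMConfig()
    print(config.model_type, config.hidden_size, config.num_hidden_layers)

    # Hypothetical downstream setup: a sequence-level binary classifier head.
    clf_config = LucaGPLMConfig(
        task_level="seq_level",
        task_type="binary_class",
        classifier_num_labels=2,
        classifier_pooling_type="value_attention",
        classifier_loss_type="binary_cross_entropy",
    )

    # Round-trip through config.json to check the fields serialize cleanly.
    with tempfile.TemporaryDirectory() as tmp_dir:
        clf_config.save_pretrained(tmp_dir)
        reloaded = LucaGPLMConfig.from_pretrained(tmp_dir)
        assert reloaded.task_type == "binary_class"
        assert reloaded.classifier_num_labels == 2
    print(clf_config.task_type, clf_config.classifier_num_labels)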