from transformers import AutoTokenizer
import gradio as gr
import random

checkpoint = "dslim/bert-base-NER"
checkpoints = [
    checkpoint,
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "microsoft/phi-2",
    "openai/whisper-large-v3",
    "NousResearch/Nous-Hermes-2-Yi-34B",
    "bert-base-cased"
]

placeholder = "Type anything in this text box and hit Tokenize!"
sequences = [
    "The quick brown 🦊 fox jumps over the lazy 🐕 dog!",
    "How vexingly ⏩ quick daft 🦓 zebras jump?",
    "Pack my 📦 box with five dozen 🍷 liquor jugs.",
    "The five 🥊 boxing 🧙‍♂️ wizards jump quickly~",
    "While making deep ⛏️ excavations we found some quaint bronze 💍 jewelry!",
    "Whenever the 🦊 fox jumped, the 🐿️ squirrel gazed suspiciously...",
    "We promptly 🧑‍⚖️ judged antique ivory buckles for the next 🏆 prize."
]
def randomize_sequence():
    return random.choice(sequences)

# Passed uncalled: Gradio invokes a callable default on each page load,
# so every visit starts with a fresh random sequence.
sequence = randomize_sequence
def load_vocab(target_model, current_model):
    checkpoint = target_model
    if target_model == current_model:
        gr.Info(f"Tokenizer already loaded: {checkpoint}")
    else:
        load_tokenizer(checkpoint)
        gr.Info(f"Tokenizer loaded: {checkpoint}")
    # Sort the vocabulary by token id, surface the first entry (id 0) separately,
    # and join the rest into one newline-separated string for the Code viewer.
    vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
    vocab_size = len(vocab)
    token_zero = next(iter(vocab))
    vocab.pop(token_zero)
    vocab_sorted = "\n".join(vocab)
    gr.Info(f"Tokenizer vocab size: {vocab_size}")
    return checkpoint, vocab_size, token_zero, vocab_sorted
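# The four values returned above feed the "Tokenizer Data" panel on the right:
# the active checkpoint (kept in a hidden Textbox), the vocab size, token 0,
# and the newline-joined vocabulary shown in the Code block.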
def load_tokenizer(checkpoint):
    global tokenizer
    if len(checkpoint) > 0:
        try:
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        except Exception as error:
            gr.Warning("Unexpected error!")
            raise gr.Error(f"{error}")
    else:
        raise ValueError("Tokenizer cannot be empty!")
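# load_tokenizer stores the tokenizer in a module-level global so load_vocab,
# tokenize_er and de_tokenize_er can all reach it; AutoTokenizer.from_pretrained
# downloads only the tokenizer files, never the model weights.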
def tokenize_er(checkpoint, sequence):
    try:
        load_tokenizer(checkpoint)
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_id_pair = []
        if len(tokens) == len(ids):
            for i in range(len(ids)):
                token_id_pair.append([tokens[i], ids[i]])
        return token_id_pair
    except NameError:
        gr.Warning("Select Tokenizer before sequencing.")
        return [[None, None]]
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
def de_tokenize_er(checkpoint, pairs):
    try:
        load_tokenizer(checkpoint)
        tokens = []
        ids = []
        for row in pairs:
            tokens.append(row[0])
            try:
                ids.append(int(row[1]))
            except (TypeError, ValueError):
                ids.append(0)
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
        decoded_tokens = tokenizer.decode(tokens_ids)
        decoded_ids = tokenizer.decode(ids)
        return tokens_ids, decoded_tokens, decoded_ids
    except NameError:
        gr.Warning("Tokenize sequence before decoding.")
        return None, None, None
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
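# A minimal, commented-out sketch of the round trip for local debugging
# (any text checkpoint works; "bert-base-cased" is just an example):
#
#   pairs = tokenize_er("bert-base-cased", "The quick brown fox")
#   ids, decoded_tokens, decoded_ids = de_tokenize_er("bert-base-cased", pairs)
#   print(pairs, decoded_tokens, decoded_ids)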
with gr.Blocks() as frontend:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# 🔍 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a model's tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the full vocabulary can take a few seconds and the browser might stutter.")
            with gr.Row():
                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models; only the tokenizer data will be downloaded! Image models won't work here.")
            with gr.Row():
                input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
                #btn_load_vocab = gr.Button(value="Load Vocabulary")
            with gr.Row():
                gr.Markdown("\n#### 2. Sequence & Tokenize")
            with gr.Row():
                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
            with gr.Row():
                btn_tokenize = gr.Button(value="Tokenize!")
                btn_random_seq = gr.Button(value="Randomize!")
            with gr.Row():
                gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
            with gr.Row():
                token_id_pair = gr.DataFrame(col_count=(2, "fixed"), headers=["Token", "Vocabulary ID"], value=[[None, 0]], type="array", datatype=["str", "number"], height=400, interactive=True)
            with gr.Row():
                btn_decode = gr.Button(value="Decode")
                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
            with gr.Row():
                with gr.Column():
                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
                with gr.Column():
                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🎲 Tokenizer Data")
                output_checkpoint = gr.Textbox(visible=False)
                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                output_token_zero = gr.Textbox(label="Token 0", interactive=False)
                output_vocab = gr.Code(label="Vocabulary IDs")
    input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)
    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
    btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
    btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids], queue=True)
    frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)

frontend.launch()