from transformers import AutoTokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("--- LLaMA 3 토크나이저 로드 성공! ---")
print(tokenizer)
결과
(pruning) C:\Users\KHH\source\lab\EfficientML\LightweightChallenge>python test_tokenizer.py
tokenizer_config.json: 100%|██████████████████████████████████████████████████████| 50.6k/50.6k [00:00<00:00, 50.5MB/s]
C:\Users\KHH.conda\envs\pruning\lib\site-packages\huggingface_hub\file_download.py:143: UserWarning: huggingface_hub cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\KHH.cache\huggingface\hub\models--meta-llama--Meta-Llama-3-8B. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the HF_HUB_DISABLE_SYMLINKS_WARNING environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
warnings.warn(message)
tokenizer.json: 100%|█████████████████████████████████████████████████████████████| 9.09M/9.09M [00:01<00:00, 7.66MB/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████| 73.0/73.0 [00:00<00:00, 72.7kB/s]
--- LLaMA 3 토크나이저 로드 성공! ---
PreTrainedTokenizerFast(name_or_path='meta-llama/Meta-Llama-3-8B', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128004: AddedToken("<|reserved_special_token_2|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128005: AddedToken("<|reserved_special_token_3|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128006: AddedToken("<|start_header_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
...
128253: AddedToken("<|reserved_special_token_248|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128254: AddedToken("<|reserved_special_token_249|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
128255: AddedToken("<|reserved_special_token_250|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}