Transformer Core Principles and How to Use the Hugging Face Package
<My Colab>
https://colab.research.google.com/drive/1iFwetiMpnwmi5RDFVs_F6vOazqBKbpVW#scrollTo=PjR2EHeMFiQv
### Creating GPT Inputs
# 2-11) Declare the GPT tokenizer
from transformers import GPT2Tokenizer

tokenizer_gpt = GPT2Tokenizer.from_pretrained("/gdrive/My Drive/nlpbook/bbpe")
tokenizer_gpt.pad_token = "[PAD]"  # register "[PAD]" as the padding token
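A quick optional sanity check (not in the book): the id that "[PAD]" maps to is what the model-input example below uses to pad, which for this vocabulary is 0.

# Optional: inspect the padding id; 0 here, matching the padded
# positions in the input_ids output shown in 2-13.
print(tokenizer_gpt.pad_token_id)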
# 2-12) Tokenize with the GPT tokenizer
sentences = [
    "아 더빙.. 진짜 짜증나네요 목소리",
    "흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나",
    "별루 였다..",
]
tokenized_sentences = [tokenizer_gpt.tokenize(sentence) for sentence in sentences]
> tokenized_sentences
[output]:
[['ìķĦ', 'ĠëįĶë¹Ļ', '..', 'Ġì§Ħì§ľ', 'Ġì§ľì¦ĿëĤĺ', 'ëĦ¤ìļĶ', 'Ġ목ìĨĮ리'],
['íĿł',
'...',
'íı¬ìĬ¤íĦ°',
'ë³´ê³ł',
'Ġì´ĪëĶ©',
'ìĺģíĻĶ',
'ì¤Ħ',
'....',
'ìĺ¤ë²Ħ',
'ìĹ°ê¸°',
'ì¡°ì°¨',
'Ġê°Ģë³į',
'ì§Ģ',
'ĠìķĬ',
'구ëĤĺ'],
['ë³Ħ루', 'Ġìĺ', 'Ģëĭ¤', '..']]
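These tokens only look garbled: GPT's byte-level BPE represents each UTF-8 byte as a printable Unicode character (the leading Ġ stands for a space), so Korean text is stored as byte symbols rather than readable characters. They can be mapped back with the tokenizer's standard convert_tokens_to_string method:

# Recover readable text from byte-level BPE tokens; this should
# reproduce the first input sentence.
print(tokenizer_gpt.convert_tokens_to_string(tokenized_sentences[0]))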
# 2-13) Build GPT model inputs
batch_inputs = tokenizer_gpt(
    sentences,
    padding="max_length",  # pad every sequence out to max_length
    max_length=12,         # cap sequences at 12 tokens
    truncation=True,       # truncate anything longer
)
> batch_inputs.keys()
[output]: dict_keys(['input_ids', 'attention_mask'])
> batch_inputs['input_ids']
[output]: [[334, 2338, 263, 581, 4055, 464, 3808, 0, 0, 0, 0, 0],
[3693, 336, 2876, 758, 2883, 356, 806, 422, 9875, 875, 2960, 7292],
[4957, 451, 3653, 263, 0, 0, 0, 0, 0, 0, 0, 0]]
> batch_inputs['attention_mask']
[output]: [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]
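As a minimal sketch of how these inputs reach a model (not from the book; the checkpoint path below is a placeholder, and any GPT-2 checkpoint used must be compatible with this vocabulary), the same tokenizer call can return PyTorch tensors directly:

# Minimal sketch: feed the batch to a GPT-2 language model.
# "path/to/gpt2-checkpoint" is hypothetical.
import torch
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("path/to/gpt2-checkpoint")
inputs = tokenizer_gpt(
    sentences,
    padding="max_length",
    max_length=12,
    truncation=True,
    return_tensors="pt",  # PyTorch tensors instead of Python lists
)
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size=3, seq_len=12, vocab_size)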
### Creating BERT Inputs
# 2-14) Declare the BERT tokenizer
from transformers import BertTokenizer

tokenizer_bert = BertTokenizer.from_pretrained(
    "/gdrive/My Drive/nlpbook/wordpiece",
    do_lower_case=False,  # keep casing; this vocabulary was built without lowercasing
)
# 2-15) Tokenize with the BERT tokenizer
sentences = [
    "아 더빙.. 진짜 짜증나네요 목소리",
    "흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나",
    "별루 였다..",
]
tokenized_sentences = [tokenizer_bert.tokenize(sentence) for sentence in sentences]
# 2-16) Build BERT model inputs
batch_inputs = tokenizer_bert(
    sentences,
    padding="max_length",
    max_length=12,
    truncation=True,
)
> batch_inputs.keys()
[output]: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
> batch_inputs['input_ids']
[output]: [[2, 620, 2631, 16, 16, 1993, 3678, 1990, 3323, 3, 0, 0],
[2, 997, 16, 16, 16, 2609, 2045, 2796, 1981, 1189, 16, 3],
[2, 3274, 9508, 16, 16, 3, 0, 0, 0, 0, 0, 0]]
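Unlike the GPT inputs above, BERT wraps every sequence in special tokens: each row starts with 2 ([CLS] in this vocabulary) and has a 3 ([SEP]) before the 0-padding begins.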
> batch_inputs['attention_mask']
[output]: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]
> batch_inputs['token_type_ids']
[output]: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
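token_type_ids are all 0 here because each input is a single sentence. They become informative when two sentences are passed as a pair: the second segment is marked with 1s. An illustrative call (max_length of 16 chosen arbitrarily, not from the book):

# Segment ids for a sentence pair: 0s cover the first sentence
# (including its [CLS]/[SEP]), 1s cover the second.
pair_inputs = tokenizer_bert(
    "아 더빙.. 진짜 짜증나네요 목소리",
    "별루 였다..",
    padding="max_length",
    max_length=16,
    truncation=True,
)
print(pair_inputs["token_type_ids"])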