출처: Natural Language Processing with Transformers by Lewis Tunstall, Leandro von Werra, and Thomas Wolf (O’Reilly). Copyright 2022 Lewis Tunstall, Leandro von Werra, and Thomas Wolf
# Load a ';'-delimited plain-text file via the Hugging Face "csv" loading
# script; extra keyword args such as `sep` are forwarded to pandas.read_csv.
# NOTE(review): `load_dataset` is not imported in this chunk — presumably
# `from datasets import load_dataset` appears earlier; confirm.
load_dataset("csv", data_files="test.txt", sep=";")
해결 방식 (handling class imbalance):
• Randomly oversample the minority class.
• Randomly undersample the majority class.
• Gather more labeled data from the underrepresented classes.
# Character-level tokenization: treat every character (including spaces)
# as a token, build a vocabulary index, then numericalize the text.
text = "nice to meet you"
tokenized_text = list(text)  # ['n', 'i', 'c', 'e', ' ', 't', 'o', ' ', 'm', 'e', 'e', 't', ' ', 'y', 'o', 'u']
# Ids are assigned in lexicographic order of the unique characters.
vocab = sorted(set(tokenized_text))
token2idx = {ch: i for i, ch in enumerate(vocab)}
print(token2idx)  # {' ': 0, 'c': 1, 'e': 2, 'i': 3, 'm': 4, 'n': 5, 'o': 6, 't': 7, 'u': 8, 'y': 9}
# Map each character of the original text to its integer id.
input_ids = [token2idx[ch] for ch in tokenized_text]
print(input_ids)  # [5, 3, 1, 2, 0, 7, 6, 0, 4, 2, 2, 7, 0, 9, 6, 8]
# One-hot encoding of a categorical column with pandas.
df = pd.DataFrame(
    {
        "station": ["청담역", "강남구청역", "건대입구역"],
        "label": [0, 1, 2],
    }
)
# Each unique station name becomes its own 0/1 indicator column.
pd.get_dummies(df["station"])
# Word-level tokenization: split the sample sentence (defined above) on
# whitespace — the simplest alternative to character tokenization.
text.split()
# Subword (WordPiece) tokenization with a pretrained DistilBERT tokenizer.
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

text = "Tokenizing text is a task of nlp"
# Calling the tokenizer returns input ids plus an attention mask.
encoded_text = tokenizer(text)
# {'input_ids': [101, 19204, 6026, 3793, 2003, 1037, 4708, 1997, 17953, 2361, 102],
#  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Map the ids back to their subword tokens; note the [CLS]/[SEP] special
# tokens and the '##' continuation prefix marking subword pieces.
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"])
# ['[CLS]', 'token', '##izing', 'text', 'is', 'a', 'task', 'of', 'nl', '##p', '[SEP]']

# Reassemble the token pieces into a readable string.
decoded = tokenizer.convert_tokens_to_string(tokens)
print(decoded)  # [CLS] tokenizing text is a task of nlp [SEP]