Legacy
from torchtext.legacy import data, datasets
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.Field(sequential=False, batch_first=True)
train_set, test_set = datasets.IMDB.splits(TEXT, LABEL)
New
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB(root='.data', split=('train', 'test'))
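For reference, each item yielded by the new-style IMDB dataset is a (label, text) pair. A quick peek (a sketch added here, not part of the original migration code; the label encoding, an int such as 1/2 or a string such as 'neg'/'pos', depends on the torchtext version, and on older versions consuming the raw iterator here may exhaust it):
label, text = next(iter(train_iter))
print(label, text[:80])  # label encoding depends on the torchtext version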
Legacy
TEXT.build_vocab(train_set, min_freq=5)
LABEL.build_vocab(train_set)
print(TEXT.vocab.stoi)
>>> defaultdict(..., {'<unk>': 0, '<pad>': 1, 'the': 2, ... })
New
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for _, text in data_iter:
        # each sample is a (label, text) pair; only the text is needed here
        yield tokenizer(text)  # yield the tokenized text

vocab = build_vocab_from_iterator(
    iterator=yield_tokens(train_iter),  # must yield lists (or iterators) of tokens
    min_freq=5,
    specials=['<unk>'])  # register the <unk> special token
vocab.set_default_index(vocab['<unk>'])  # index returned for out-of-vocabulary (OOV) tokens
print('<unk>' in vocab, '<pad>' in vocab)
>>> True False
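As a quick sanity check on the new vocab (a sketch, not from the original code): the Vocab object is callable on a list of tokens and returns their indices, and because of set_default_index any out-of-vocabulary token falls back to the <unk> index.
print(vocab(tokenizer('this movie is great')))  # a list of ints, one index per token
print(vocab['zzzznotaword'] == vocab['<unk>'])  # OOV lookups return the <unk> index -> True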
Legacy
train_set, valid_set = train_set.split(split_ratio=0.8)
train_iter, val_iter, test_iter = data.BucketIterator.splits(
(train_set, valid_set, test_set), batch_size=32,
shuffle=True, repeat=False)
New
import torch
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.functional import to_map_style_dataset

def train_valid_split(train_iterator, split_ratio=0.8, seed=42):
    train_count = int(split_ratio * len(train_iterator))
    valid_count = len(train_iterator) - train_count
    generator = torch.Generator().manual_seed(seed)
    train_set, valid_set = random_split(
        train_iterator, lengths=[train_count, valid_count], generator=generator)
    return train_set, valid_set

# convert the iterable-style dataset to map-style so that len() can be used
train_iter = to_map_style_dataset(train_iter)
train_set, valid_set = train_valid_split(train_iter)

# text/label transforms
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)
'''
# depending on the dataset format, the pipelines can also be built like this
text_pipeline = lambda x: \
    [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
label_pipeline = lambda x: 1 if x == 'pos' else 0
'''

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # pad each batch to its longest sequence; padding_value=1 mirrors the legacy <pad> index
    # (note that no <pad> token was added to this vocab)
    text_tensor = pad_sequence(text_list, padding_value=1, batch_first=True)
    return text_tensor, label_list

train_dataloader = DataLoader(
    train_set, batch_size=64, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(
    valid_set, batch_size=64, shuffle=True, collate_fn=collate_batch)
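To confirm the pipelines and collate_batch behave as expected, a small check (an added sketch, not part of the original): text_pipeline turns a raw string into a list of vocab indices, and each batch from the DataLoader comes out already padded to the longest sequence in that batch.
print(text_pipeline('here is an example'))  # a list of ints, one per token
texts, labels = next(iter(train_dataloader))
print(texts.shape, labels.shape)  # e.g. torch.Size([64, max_len_in_batch]) and torch.Size([64])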
Legacy
def train(model, optimizer, train_iter):
    model.train()
    for batch in train_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1)  # labels start at 1 because index 0 is <unk>; shift them to 0-based
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

def evaluate(model, valid_iter):
    model.eval()
    corrects, total_loss = 0, 0
    for batch in valid_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1)  # labels start at 1 because index 0 is <unk>; shift them to 0-based
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(valid_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    print(total_loss, 100.0 * corrects, size)
    return avg_loss, avg_accuracy

for epoch in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)
    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (
        epoch, val_loss, val_accuracy))
New
def train(model, optimizer, train_iter):
    model.train()
    for x, y in train_iter:
        x, y = x.to(DEVICE), y.to(DEVICE)
        y.sub_(1)  # shift labels to 0-based, as in the legacy loop
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

def evaluate(model, valid_iter, total_valid_set_len):
    model.eval()
    corrects, total_loss = 0, 0
    for x, y in valid_iter:
        x, y = x.to(DEVICE), y.to(DEVICE)
        y.sub_(1)  # shift labels to 0-based, as in the legacy loop
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = total_valid_set_len
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    print(total_loss, 100.0 * corrects, size)
    return avg_loss, avg_accuracy

for epoch in range(1, EPOCHS+1):
    train(model, optimizer, train_dataloader)
    with torch.no_grad():
        val_loss, val_accuracy = evaluate(model, valid_dataloader, len(valid_set))
    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (
        epoch, val_loss, val_accuracy))
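The same evaluate function also works for the held-out test split; a minimal sketch (variable names assumed here, following the same map-style conversion used for the training data):
test_set = to_map_style_dataset(test_iter)
test_dataloader = DataLoader(test_set, batch_size=64, collate_fn=collate_batch)
with torch.no_grad():
    test_loss, test_accuracy = evaluate(model, test_dataloader, len(test_set))
print("test loss : %5.2f | test accuracy : %5.2f" % (test_loss, test_accuracy))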
Migration Code : https://github.com/groovallstar/pytorch_rnn_tutorial/blob/main/8_2_torchtext_migration.ipynb