This is the work produced for the 4th Korean Emotion Recognition International Challenge, hosted by the Artificial Emotion Intelligence Basic Research Lab at Chonnam National University (전남대학교 인공감정지능 기초연구실).
import pandas as pd
from tqdm.auto import tqdm
from pororo import Pororo

tqdm.pandas()  # enables progress_apply on pandas Series

# Multilingual translation model used for back-translation augmentation
mt = Pororo(task="translation", lang="multi")

def translate(text, lang):
    # Korean -> intermediate language -> Korean (back-translation)
    txt = mt(text, src='ko', tgt=lang)
    res = mt(txt, src=lang, tgt='ko')
    return res

pororo_en = train_data.copy()
pororo_ja = train_data.copy()

# Korean -> English -> Korean
pororo_en['sentence'] = pororo_en['sentence'].progress_apply(lambda x: translate(x, 'en'))
pororo_en['context'] = pororo_en['context'].progress_apply(lambda x: translate(x, 'en'))

# Korean -> Japanese -> Korean
pororo_ja['sentence'] = pororo_ja['sentence'].progress_apply(lambda x: translate(x, 'ja'))
pororo_ja['context'] = pororo_ja['context'].progress_apply(lambda x: translate(x, 'ja'))

# Merge the augmented data with the original data
train_data = pd.concat([train_data, pororo_en, pororo_ja])
train_labels = train_data[['sentence_id','label']]
import gc
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from accelerate import Accelerator, notebook_launcher
from sklearn.model_selection import StratifiedKFold
from torch.optim import AdamW
from torchmetrics import F1Score
from transformers import (AutoConfig, AutoModelForSequenceClassification, AutoTokenizer,
                          get_cosine_with_hard_restarts_schedule_with_warmup)

batch_size = 16
max_len = 64
epoch = 3
learning_rate = 2e-5
weight_decay = 0.01
warmup_rate = 0.1

checkpoint = 'klue/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

params = {
    'batch_size': batch_size,
    'max_len': max_len,
    'epoch': epoch,
    'learning_rate': learning_rate,
    'weight_decay': weight_decay,
    'warmup_rate': warmup_rate
}
class RoBERTaDataset(torch.utils.data.Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __getitem__(self, idx):  # return one tokenized example together with its label
        item = {key: value[idx].clone().detach() for key, value in self.data.items()}
        item['labels'] = torch.tensor(self.label[idx])
        return item

    def __len__(self):
        return len(self.label)
def changer(label, state=True):
    # state=True : map string labels to integer ids for training
    # state=False: map predicted ids back to string labels for submission
    res = []
    if state:
        labels = {"dysphoria": 0, "euphoria": 1, "neutral": 2, "none": 3}
        for i in label:
            res.append(labels[i])
    else:
        labels = {0: "dysphoria", 1: "euphoria", 2: "neutral"}
        for i, v in enumerate(label):
            res.append([i, labels[v]])
    return res
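For reference, a quick sketch of calling changer in both directions; the values simply follow the mappings above:

changer(["euphoria", "neutral", "dysphoria"])  # -> [1, 2, 0]
changer([1, 2, 0], state=False)                # -> [[0, 'euphoria'], [1, 'neutral'], [2, 'dysphoria']]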
F1 = F1Score(num_classes=3, average = 'micro')
class FocalLoss(nn.CrossEntropyLoss):
    ''' Focal loss for classification tasks on imbalanced datasets '''

    def __init__(self, gamma=4, alpha=None, ignore_index=-100, reduction='none'):
        # ignore_index defaults to -100 (the PyTorch convention) so that none of the
        # real class ids (0, 1, 2) are silently excluded from the loss
        super().__init__(weight=alpha, ignore_index=ignore_index, reduction='none')
        self.reduction = reduction
        self.gamma = gamma
    def forward(self, input_, target):
        cross_entropy = super().forward(input_, target)
        # Temporarily mask out ignore index to '0' for valid gather-indices input.
        # This won't contribute to the final loss, as the cross-entropy contribution
        # for these positions is already zero.
        target = target * (target != self.ignore_index).long()
        input_prob = torch.gather(F.softmax(input_, 1), 1, target.unsqueeze(1))
        loss = torch.pow(1 - input_prob, self.gamma) * cross_entropy
        return torch.mean(loss)
Loss = FocalLoss()
def seed_everything(seed):
    # Fix all random seeds for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def RoBERTa_trainer():
    accelerator = Accelerator()

    # klue/roberta-large with a 3-class classification head
    config = AutoConfig.from_pretrained(checkpoint)
    config.num_labels = 3
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])

    # The fold-specific samplers restrict the shared dataset to train/validation indices
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=params['batch_size'], sampler=train_subsampler)
    eval_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=params['batch_size'], sampler=test_subsampler)
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

    total_epoch = params['epoch']
    num_training_steps = total_epoch * len(train_dataloader)
    progress_bar = tqdm(range(num_training_steps))

    # Cosine schedule with hard restarts, warming up over the first 10% of steps
    lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(params['warmup_rate'] * num_training_steps),
        num_training_steps=num_training_steps,
    )

    for epoch in range(total_epoch):
        seed_everything(epoch)
        train_f1 = 0.0
        test_f1 = 0.0

        model.train()
        for batch_id, batch in enumerate(train_dataloader):
            res = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
            loss = Loss(res[0], batch['labels'])
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            train_f1 += F1(res.logits.cpu(), batch['labels'].data.cpu())
        print("epoch {} train_f1 {}".format(epoch + 1, train_f1 / (batch_id + 1)))

        model.eval()
        for batch_id, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                res = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
            test_f1 += F1(res.logits.cpu(), batch['labels'].data.cpu())
        print("epoch {} test_f1 {}".format(epoch + 1, test_f1 / (batch_id + 1)))
        gc.collect()

    # Save this fold's model once every process has finished
    accelerator.wait_for_everyone()
    final_model = accelerator.unwrap_model(model)
    final_model.save_pretrained('./experiments/experiment' + str(fold), save_function=accelerator.save)
# Stratified 5-fold cross-validation over the label distribution
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tokenize sentence/context pairs once and reuse the encodings for every fold
tokenized_train = tokenizer(
    list(train_data['sentence']),
    list(train_data['context']),
    return_tensors="pt",
    max_length=params['max_len'],
    padding=True,
    truncation=True,
    add_special_tokens=True
)

for fold, (train_ids, test_ids) in enumerate(kfold.split(train_data, train_labels['label'])):
    print('--------------------------------')
    print(f'FOLD {fold}')
    print('--------------------------------')
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    train_label = changer(train_labels['label'])
    train_dataset = RoBERTaDataset(tokenized_train, train_label)
    notebook_launcher(RoBERTa_trainer)
The five fold-specific models are then loaded, each one produces predictions on the test set, and the per-fold predictions are combined by hard voting.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
test_label = changer(test_data['label'].values)
df = test_data.copy()
tokenized_test = tokenizer(
    list(test_data['sentence']),
    list(test_data['context']),
    return_tensors="pt",
    max_length=params['max_len'],
    padding=True,
    truncation=True,
    add_special_tokens=True
)
test_dataset = RoBERTaDataset(tokenized_test, test_label)
dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)
for fold in range(5):
    # Load the model trained on this fold
    config = AutoConfig.from_pretrained(checkpoint)
    config.num_labels = 3
    model = AutoModelForSequenceClassification.from_pretrained('./experiments/experiment' + str(fold), num_labels=3)
    model.resize_token_embeddings(tokenizer.vocab_size)

    accelerator = Accelerator()
    model = accelerator.unwrap_model(model)
    output_pred = []
    model, dataloader = accelerator.prepare(model, dataloader)

    model.eval()
    for i, data in enumerate(tqdm(dataloader)):
        with torch.no_grad():
            outputs = model(input_ids=data['input_ids'], attention_mask=data['attention_mask'])
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)
        output_pred.append(result)

    # Store this fold's predictions as a new column of the test dataframe
    pred_answer = np.concatenate(output_pred).tolist()
    answer = changer(pred_answer, state=False)
    df_label = pd.DataFrame(answer, columns=['index', 'label'])
    df[f'label{fold}'] = df_label['label']
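The voting step itself is not shown above, so the following is a minimal sketch of the row-wise majority vote over the five per-fold columns (label0 ... label4) created in the loop; the name of the final submission column, label, is an assumption.

# Hard voting: each test example gets the class predicted by the most folds.
# Assumes the per-fold columns label0 ... label4 created above; the final
# column name 'label' is an assumption.
vote_cols = [f'label{i}' for i in range(5)]
df['label'] = df[vote_cols].mode(axis=1)[0]  # ties fall back to the first mode returned by pandas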