This post was written with reference to the Zerobase Data School (제로베이스데이터스쿨) course materials.
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
!gzip -dkv SQuAD_it-*.json.gz
!pip install datasets
!pip install transformers
from datasets import load_dataset
data_files = {"train": "./SQuAD_it-train.json", "test": "./SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset
-------------------------------------------------------------------
DatasetDict({
train: Dataset({
features: ['paragraphs', 'title'],
num_rows: 442
})
test: Dataset({
features: ['paragraphs', 'title'],
num_rows: 48
})
})
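As a quick sanity check (not in the original notes), you can peek at the nested SQuAD-style structure of the first training article; the field names below assume the standard SQuAD JSON layout.
example = squad_it_dataset["train"][0]
print(example["title"])                    # article title
print(example["paragraphs"][0]["qas"][0])  # first question/answer pair of the first paragraph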
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip
data_files = {"train": "drugsComTrain_raw.tsv", "test":"drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_dataset
-----------------------------------------------------------------
DatasetDict({
train: Dataset({
features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
num_rows: 161297
})
test: Dataset({
features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
num_rows: 53766
})
})
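Before cleaning the data it helps to eyeball a few random rows; a minimal sketch using shuffle and select:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(3))  # 3 random rows
drug_sample[:3]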
drug_dataset = drug_dataset.rename_column(  # rename the column
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset
-------------------------------------------------------------------
DatasetDict({
train: Dataset({
features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
num_rows: 161297
})
test: Dataset({
features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
num_rows: 53766
})
})
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)  # drop rows whose condition is None
drug_dataset = drug_dataset.map(lowercase_condition)  # lowercase the condition column

def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)  # keep only reviews longer than 30 words
print(drug_dataset.num_rows)
-----------------------------------------------
{'train': 138514, 'test': 46108}
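To double-check the length filter, one quick (assumed) verification is to sort by the new review_length column and look at the shortest remaining reviews:
drug_dataset["train"].sort("review_length")[:3]["review_length"]  # every value should be > 30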
import html
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True  # unescape HTML entities; batched for speed
)
new_drug_dataset
-------------------------------------------------------------------------
DatasetDict({
train: Dataset({
features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
num_rows: 138514
})
test: Dataset({
features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
num_rows: 46108
})
})
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,  # also return the tokens that overflow max_length
    )

len(drug_dataset["train"][0]["review"].split())  # number of whitespace-separated words in the first review
----------------------
141
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]  # token counts: the chunk within max_length and the overflow
--------------------------
[128, 49]
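Because return_overflowing_tokens can split one review into several chunks, mapping tokenize_and_split over the whole dataset yields more rows than inputs; a sketch of the usual way to avoid the resulting column-length mismatch is to drop the old columns during the map:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)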
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # model to use
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # load the model's tokenizer
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")  # tokenize the inputs into id tensors
inputs["input_ids"]
-----------------------------------------------------------------------------------
tensor([[ 101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172,
2607, 2026, 2878, 2166, 1012, 102],
[ 101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0,
0, 0, 0, 0, 0, 0]])
inputs["attention_mask"]  # positions padded to equalize sequence lengths are marked with 0
-------------------------------------------------------------
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
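Decoding the second row makes the padding visible (a sketch; the exact string depends on this tokenizer, which lowercases text and adds special tokens):
print(tokenizer.decode(inputs["input_ids"][1]))  # roughly: "[CLS] i hate this so much! [SEP] [PAD] [PAD] ..."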
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.config.id2label
---------------------------------------------
{0: 'NEGATIVE', 1: 'POSITIVE'}
outputs = model(**inputs)
outputs
-----------------------------------------------------------------------
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607, 1.6123],
[ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
import torch
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions
--------------------------------------------
tensor([[4.0195e-02, 9.5981e-01],
[9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)
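To turn these probabilities into labels, take the argmax and look it up in id2label shown above; a minimal sketch:
labels = [model.config.id2label[i] for i in predictions.argmax(dim=-1).tolist()]
print(labels)  # ['POSITIVE', 'NEGATIVE'] for the two inputs above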
from transformers import BertConfig, BertModel
config = BertConfig()  # default parameters
config
----------------------------------------------------
BertConfig {
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.28.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}
model = BertModel(config)
model
------------------------------------------------------
BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0-11): 12 x BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
...
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
config.hidden_size = 48  # 768 -> 48
model = BertModel(config)
model
-----------------------------------------------------------
BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 48, padding_idx=0)
(position_embeddings): Embedding(512, 48)
(token_type_embeddings): Embedding(2, 48)
(LayerNorm): LayerNorm((48,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0-11): 12 x BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=48, out_features=48, bias=True)
(key): Linear(in_features=48, out_features=48, bias=True)
(value): Linear(in_features=48, out_features=48, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=48, out_features=48, bias=True)
(LayerNorm): LayerNorm((48,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
...
(pooler): BertPooler(
(dense): Linear(in_features=48, out_features=48, bias=True)
(activation): Tanh()
)
)
model.save_pretrained("./test")
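The saved model can be restored from that directory later; a minimal sketch (model_reloaded is just an illustrative name):
model_reloaded = BertModel.from_pretrained("./test")  # reads the config and weights written by save_pretrained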
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
-----------------------------------------------
['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
sequence = "Using a Transformer network is manual, KT-12312"
tokens = tokenizer.tokenize(sequence)
print(tokens)
--------------------------------------------
['Using', 'a', 'Trans', '##former', 'network', 'is', 'manual', ',', 'K', '##T', '-', '123', '##12']
ids = tokenizer.convert_tokens_to_ids(tokens)
ids
-------------------------------------------------
[7993, 170, 13809, 23763, 2443, 1110, 9506, 117, 148, 1942, 118, 13414, 11964]
tokenizer.decode(ids)
------------------------------------------
'Using a Transformer network is manual, KT - 12312'
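Note that the manual tokenize / convert_tokens_to_ids path above skips the special tokens; calling the tokenizer directly adds them (a sketch):
encoded = tokenizer(sequence)
print(tokenizer.decode(encoded["input_ids"]))  # should start with [CLS] and end with [SEP]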
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # model checkpoint to use
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # load the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)  # load the model
sequence = "I've been waiting for a HuggingFace course my whole life."  # input text
tokens = tokenizer.tokenize(sequence)  # text -> tokens
ids = tokenizer.convert_tokens_to_ids(tokens)  # tokens -> ids
input_ids = torch.tensor([ids])
print("input IDs:\n", input_ids, end="\n\n")
outputs = model(input_ids)
print("Logits:", outputs.logits, end="\n\n")
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("Predicts:\n", predictions)
-------------------------------------------------------------------------
input IDs:
tensor([[ 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607,
2026, 2878, 2166, 1012]])
Logits: tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward0>)
Predicts:
tensor([[0.0037, 0.9963]], grad_fn=<SoftmaxBackward0>)
batched_ids = [
[200, 200, 200],
[200, 200, tokenizer.pad_token_id],
]
attention_mask = [
[1, 1, 1],
[1, 1, 0],
]
outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
-------------------------------------------------
tensor([[ 1.5694, -1.3895],
[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
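For comparison (an assumed experiment, not in the original notes), passing the same padded batch without an attention mask lets the model attend to the padding token, so the second row's logits would no longer match an un-padded run:
print(model(torch.tensor(batched_ids)).logits)  # second row differs from the masked result above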
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
raw_datasets = load_dataset("glue", "mrpc")  # load the dataset
checkpoint = "bert-base-uncased"  # model checkpoint to load
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # load the model's tokenizer
raw_datasets['train'].features
-------------------------------------------------------
{'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
'idx': Value(dtype='int32', id=None)}
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)  # tokenize
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # pads each batch dynamically
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])  # drop unneeded columns
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")  # rename column
tokenized_datasets.set_format("torch")  # return PyTorch tensors
tokenized_datasets["train"].column_names
-------------------------------------------------------------
['labels', 'input_ids', 'token_type_ids', 'attention_mask']
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator  # evaluation loader built from the validation split
)
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}
-------------------------------------------------------
{'labels': torch.Size([8]),
'input_ids': torch.Size([8, 66]),
'token_type_ids': torch.Size([8, 66]),
'attention_mask': torch.Size([8, 66])}
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
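As a quick sanity check (a sketch, not in the original notes), the model should accept one collated batch directly and return a loss plus [batch_size, num_labels] logits:
outputs = model(**batch)  # uses the batch grabbed from train_dataloader above
print(outputs.loss, outputs.logits.shape)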
from torch.optim import AdamW
from transformers import get_scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)  # optimizer needed by the scheduler and the training loop (lr=5e-5 is an assumed typical value)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)
-----------------------------------------------
1377
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when available
model.to(device)
device
----------------------------------------------
device(type='cuda')
from tqdm import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
-----------------------------------------------------------------
100%|██████████| 1377/1377 [01:06<00:00, 21.74it/s]
from datasets import load_metric
metric = load_metric("glue", "mrpc")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()
-----------------------------------------------------------
{'accuracy': 0.8578431372549019, 'f1': 0.8989547038327526}
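datasets.load_metric has since been deprecated; the same metric is available from the separate evaluate library (assuming it is installed, e.g. pip install evaluate):
import evaluate
metric = evaluate.load("glue", "mrpc")  # drop-in replacement for load_metric("glue", "mrpc")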