Hugging Face: Using Models

InSung-Na · May 8, 2023

This post was written with reference to Zerobase Data School course materials.

1. zip_dataset_load

SQuAD_it

Data download & unzip

!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

!gzip -dkv SQuAD_it-*.json.gz

Loading the data

!pip install datasets
!pip install transformers
from datasets import load_dataset

data_files = {"train": "./SQuAD_it-train.json", "test": "./SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset
-------------------------------------------------------------------
DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 48
    })
})
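The loaded JSON keeps the nested SQuAD structure. A minimal sketch for inspecting one article, assuming the standard SQuAD field layout (paragraphs / context / qas / question):

# Peek at the first training article
example = squad_it_dataset["train"][0]
print(example["title"])                    # article title
paragraph = example["paragraphs"][0]
print(paragraph["context"][:100])          # first 100 characters of the passage
print(paragraph["qas"][0]["question"])     # first question about that passage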

drugsCom_raw

Loading the data

!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip
data_files = {"train": "drugsComTrain_raw.tsv", "test":"drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_dataset
-----------------------------------------------------------------
DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

Preprocessing

drug_dataset = drug_dataset.rename_column(  # rename the column
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset
-------------------------------------------------------------------
DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

Lowercasing the condition column

def lowercase_condition(example):
  return {"condition": example["condition"].lower()}

drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)  # drop rows whose condition is None
drug_dataset = drug_dataset.map(lowercase_condition)  # lowercase the condition values

Filtering by review length

def compute_review_length(example):
  return {"review_length": len(example["review"].split())}
  
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30) # 리뷰가 너무 긴 데이터는 필터링
print(drug_dataset.num_rows)
-----------------------------------------------
{'train': 138514, 'test': 46108}
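To check what the filter kept, the dataset can be sorted by the new review_length column; a minimal sketch:

# Inspect the shortest reviews that survived the length filter
sorted_train = drug_dataset["train"].sort("review_length")
print(sorted_train["review_length"][:3])   # the three shortest lengths, all above 30
print(sorted_train["review"][0][:200])     # preview of the shortest remaining review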

Batched map

import html

new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)
new_drug_dataset
-------------------------------------------------------------------------
DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

AutoTokenizer

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_split(examples):
  return tokenizer(
      examples["review"],
      truncation=True,
      max_length=128,
      return_overflowing_tokens=True  # also return the chunks that overflow max_length
  )
len(drug_dataset["train"][0]['review'].split()) # 전체 문장을 띄어쓰기로 분할한 수
----------------------
141
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]       # max length 내부와 초과한 대상
--------------------------
[128, 49]
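Because return_overflowing_tokens can yield more chunks than input rows, mapping tokenize_and_split over the dataset needs the old columns removed so the row counts do not clash; a sketch under that assumption:

# Long reviews become several 128-token chunks, so the tokenized dataset has more rows.
# remove_columns drops the original columns to avoid a length mismatch during map.
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=drug_dataset["train"].column_names,
)
print(len(tokenized_dataset["train"]), len(drug_dataset["train"]))  # more rows after splitting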

2. Model

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # 사용할 모델
tokenizer = AutoTokenizer.from_pretrained(checkpoint)           # 모델의 tokenizer 로딩

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")  # tokenize the inputs into numeric IDs
inputs["input_ids"]
-----------------------------------------------------------------------------------
tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]])
inputs["attention_mask"]    # 문장길이를 맞추기 위해 0으로 패딩
-------------------------------------------------------------
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

Classification model

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.config.id2label
---------------------------------------------
{0: 'NEGATIVE', 1: 'POSITIVE'}
outputs = model(**inputs)
outputs
-----------------------------------------------------------------------
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Positive/negative classification of the inputs

  • The first sentence is positive
  • The second sentence is negative

import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions
--------------------------------------------
tensor([[4.0195e-02, 9.5981e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)
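The two columns of the softmax output follow model.config.id2label, so the predicted class names can be read off directly; a minimal sketch:

# Map each row's argmax to its label name
labels = torch.argmax(predictions, dim=-1)
for sentence, label_id in zip(raw_inputs, labels.tolist()):
    print(sentence, "->", model.config.id2label[label_id])  # POSITIVE for the first, NEGATIVE for the second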

Modifying the BertModel Config

Loading the config

from transformers import BertConfig, BertModel

config = BertConfig()   # default parameters
config
----------------------------------------------------
BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Applying the config to a model

model = BertModel(config)
model
------------------------------------------------------
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
...
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)

Modifying the config

config.hidden_size = 48    # 768 -> 48
model = BertModel(config)
model
-----------------------------------------------------------
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 48, padding_idx=0)
    (position_embeddings): Embedding(512, 48)
    (token_type_embeddings): Embedding(2, 48)
    (LayerNorm): LayerNorm((48,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=48, out_features=48, bias=True)
            (key): Linear(in_features=48, out_features=48, bias=True)
            (value): Linear(in_features=48, out_features=48, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=48, out_features=48, bias=True)
            (LayerNorm): LayerNorm((48,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
...
  (pooler): BertPooler(
    (dense): Linear(in_features=48, out_features=48, bias=True)
    (activation): Tanh()
  )
)

Saving the model

model.save_pretrained("./test")

3. Tokenizer

If the tokenizer is not a good fit

  • Use a tokenizer that matches the model and the environment it was trained for
  • e.g., the tokenizer was trained on dataset A but is applied to dataset B
  • Tokens it cannot recognize become '[UNK]'
  • Frequent prefixes and suffixes learned during training show up as '##' subword pieces

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)
-----------------------------------------------
['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
  • If tokenization is not a good fit, the text can be split into meaningless tokens

sequence = "Using a Transformer network is manual, KT-12312"
tokens = tokenizer.tokenize(sequence)

print(tokens)
--------------------------------------------
['Using', 'a', 'Trans', '##former', 'network', 'is', 'manual', ',', 'K', '##T', '-', '123', '##12']

Convert and Decode

ids = tokenizer.convert_tokens_to_ids(tokens)
ids
-------------------------------------------------
[7993, 170, 13809, 23763, 2443, 1110, 9506, 117, 148, 1942, 118, 13414, 11964]
tokenizer.decode(ids)
------------------------------------------
'Using a Transformer network is manual, KT - 12312'
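Note that convert_tokens_to_ids does not add the special tokens; calling the tokenizer directly does, which is usually what the model expects. A minimal sketch for comparison:

# Calling the tokenizer directly wraps the sequence with [CLS] and [SEP]
encoded = tokenizer(sequence)
print(encoded["input_ids"])                    # starts with the [CLS] id and ends with the [SEP] id
print(tokenizer.decode(encoded["input_ids"]))  # same text wrapped in [CLS] ... [SEP]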

4. Training

  • dataloader -> model -> optimizer -> loss -> training

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'          # checkpoint to use
tokenizer = AutoTokenizer.from_pretrained(checkpoint)                   # load the model's tokenizer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)  # load the model

sequence = "I've been waiting for a HuggingFace course my whole life."  # text

tokens = tokenizer.tokenize(sequence)         # text -> tokens
ids = tokenizer.convert_tokens_to_ids(tokens) # tokens -> ids

inputs_ids = torch.tensor([ids])
print("input IDs:\n", inputs_ids, end="\n\n")

outputs = model(inputs_ids)
print("Logits:", outputs.logits, end="\n\n")

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("Predicts:\n", predictions)
-------------------------------------------------------------------------
input IDs:
 tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])

Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)

Predicts:
 tensor([[0.0037, 0.9963]], grad_fn=<SoftmaxBackward0>)

Batching multiple sequences

batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
-------------------------------------------------
tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
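The second row above matches the unpadded two-token sequence because the padding position is masked out. Passing the padded IDs without an attention mask would let the model attend to the padding token and change the logits, which is why the mask is needed; a sketch for comparison:

# Compare the unpadded sequence with the padded one when no attention mask is given
print(model(torch.tensor([[200, 200]])).logits)                           # no padding
print(model(torch.tensor([[200, 200, tokenizer.pad_token_id]])).logits)   # padding, no attention mask: different logits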

Loading the data

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc") # 데이터셋 로딩
checkpoint = "bert-base-uncased"            # 불러올 모델명
tokenizer = AutoTokenizer.from_pretrained(checkpoint)   # 모델 토큰화 로딩
raw_datasets['train'].features
-------------------------------------------------------
{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

Preprocessing

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)  # tokenize

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)    # dynamic padding per batch

tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])   # drop unneeded columns
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")    # rename the label column
tokenized_datasets.set_format("torch")    # return PyTorch tensors
tokenized_datasets["train"].column_names
-------------------------------------------------------------
['labels', 'input_ids', 'token_type_ids', 'attention_mask']
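DataCollatorWithPadding pads each batch only up to that batch's longest sequence (dynamic padding); a minimal sketch of calling it directly on a few samples:

# Let the collator pad a handful of tokenized examples to a common length
samples = [tokenized_datasets["train"][i] for i in range(8)]
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})   # all tensors padded to the longest of the 8 samples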

Creating the data loaders

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}
-------------------------------------------------------
{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 66]),
 'token_type_ids': torch.Size([8, 66]),
 'attention_mask': torch.Size([8, 66])}

Modeling

  • The MRPC labels are binary (not_equivalent / equivalent), so num_labels=2

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Optimizer and scheduler

from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)  # the scheduler needs an optimizer; AdamW with an assumed lr of 5e-5

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print(num_training_steps)
-----------------------------------------------
1377

CUDA setup

  • Environment: Google Colab

import torch

is_cuda = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(is_cuda)
model.to(device)
device
----------------------------------------------
device(type='cuda')

Training the model

from tqdm import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
-----------------------------------------------------------------
100%|██████████| 1377/1377 [01:06<00:00, 21.74it/s]

Evaluating the model

from datasets import load_metric

metric = load_metric("glue", "mrpc")  # in newer versions this has moved to the evaluate library (evaluate.load)
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    
metric.compute()
-----------------------------------------------------------
{'accuracy': 0.8578431372549019, 'f1': 0.8989547038327526}
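The fine-tuned model can now score a new sentence pair with the same tokenizer; a minimal sketch (the two sentences below are made-up placeholders):

# Classify whether two sentences are paraphrases using the fine-tuned MRPC model
sentence1 = "The company reported strong quarterly earnings."
sentence2 = "Quarterly profits at the firm were strong."
inputs = tokenizer(sentence1, sentence2, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits
pred = torch.argmax(logits, dim=-1).item()
print(raw_datasets["train"].features["label"].names[pred])   # 'equivalent' or 'not_equivalent'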
