Zerobase Data Career School DAY85 GPT1~7

NAYOUNG KIM · July 11, 2023

Pipeline

from transformers import pipeline

sentiment_classifier = pipeline('sentiment-analysis')
zero_shot_classifier = pipeline('zero-shot-classification')
generator = pipeline("text-generation")
unmasker = pipeline("fill-mask")
ner = pipeline("ner", grouped_entities=True)
question_answer = pipeline("question-answering")
summarizer = pipeline("summarization")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
# 1. sentiment-analysis
classifier("I've been waiting for a HuggingFace course my whole life.")
# [{'label': 'POSITIVE', 'score': 0.9598046541213989}]

# 2. zero-shot-classification
classifier("This is a course about the Transforemrs library",
          candidate_labels=["education", "politics", "business"])
# {'sequence': 'This is a course about the Transforemrs library',
#  'labels': ['education', 'politics', 'business'],
#  'scores': [0.8416575193405151, 0.09498909115791321, 0.06335334479808807]}

# 3. text-generation
generator("In this course, we will teach you how to ")

list_ = ["In this course, we will teach you how to ", "This is a course about the Transforemrs library"]
for sentence in list_:
    print(generator(sentence, num_return_sequences=1, max_length=50))
    
generator = pipeline("text-generation", model="huggingtweets/dril")    
# [{'generated_text': 'In this course, we will teach you how to \xa0use Microsoft Office with and without Office 365 for a fun, easy-to-use interface. If you are familiar with how to use the Microsoft Office suite, our course covers the basics,'}]
# [{'generated_text': 'This is a course about the Transforemrs library. It was written by me and this was the result of an experiment I did with some very well trained people.\n\nThe Transforemrs library features a "new kind" of code'}]

# 4. fill-mask
unmasker("This course will teach you all about <mask> models", top_k=2)
# [{'score': 0.19631549715995789,
#   'token': 30412,
#   'token_str': ' mathematical',
#   'sequence': 'This course will teach you all about mathematical models'},
#  {'score': 0.044491808861494064,
#   'token': 745,
#   'token_str': ' building',
#   'sequence': 'This course will teach you all about building models'}]

# 5. ner
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
#[{'entity_group': 'PER',
#  'score': 0.9981694,
#  'word': 'Sylvain',
#  'start': 11,
#  'end': 18},
# {'entity_group': 'ORG',
#  'score': 0.9796019,
#  'word': 'Hugging Face',
#  'start': 33,
#  'end': 45},
# {'entity_group': 'LOC',
#  'score': 0.9932106,
#  'word': 'Brooklyn',
#  'start': 49,
#  'end': 57}]

# 6. question-answering
question_answer(question="Where do I work?", 
               context="My name is Sylvain and I work at Hugging Face in Brooklyn.")
# {'score': 0.638590395450592, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}               

# 7. summarization
summarizer(
    '''
    Michael Joseph Jackson (August 29, 1958 – June 25, 2009) was an American singer, 
    songwriter, dancer, and philanthropist. Known as the "King of Pop", he is regarded 
    as one of the most significant cultural figures of the 20th century. During his
    four-decade career, his contributions to music, dance, and fashion, along with his 
    publicized personal life, made him a global figure in popular culture. Jackson influenced 
    artists across many music genres; through stage and video performances, he popularized
    complicated dance moves such as the moonwalk, to which he gave the name, as well as the robot.
    '''
)               
# [{'summary_text': 'he was an american singer, songwriter, dancer, and philanthropist . he is regarded as one of the most significant cultural figures of the 20th century . his contributions to music, dance, and fashion made him a global figure in popular culture .'}]

# 8. translation
translator("Ce cours est produit par HuggingFace")
# [{'translation_text': 'This course is produced by HuggingFace'}]
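Pipelines also accept a list of inputs and return one result per item; a quick sketch reusing the sentiment pipeline from above:

# passing a list returns one {'label': ..., 'score': ...} dict per sentence
sentiment_classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
])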

Training

import torch
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
# tokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this too much!"
]

inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
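Here inputs is a dictionary-like object holding the padded input_ids and the matching attention_mask as PyTorch tensors; a quick inspection sketch (not in the original code):

# DistilBERT tokenizers return input_ids and attention_mask (no token_type_ids)
print(inputs.keys())
print(inputs["input_ids"].shape)   # (batch size, padded sequence length)
print(inputs["attention_mask"])    # 0s mark the padded positions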

# model1
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
outputs.last_hidden_state.shape  # torch.Size([2, 16, 768]) -> (batch size, sequence length, hidden size)

# model2
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.config.id2label  # {0: 'NEGATIVE', 1: 'POSITIVE'}
outputs = model(**inputs)

# prediction
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
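To turn the probabilities back into labels, the index with the highest probability can be looked up in model.config.id2label; a minimal sketch using the inputs above:

# map each row of probabilities to its most likely label
pred_ids = torch.argmax(predictions, dim=-1)
for probs, pred_id in zip(predictions, pred_ids):
    print(model.config.id2label[pred_id.item()], probs[pred_id].item())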

# save
model.save_pretrained('./test')
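The saved weights can be reloaded later with from_pretrained; saving the tokenizer into the same directory keeps the checkpoint self-contained. A small sketch reusing the './test' directory from above (the reloaded_* names are just examples):

# save the tokenizer next to the model so the directory works as a full checkpoint
tokenizer.save_pretrained('./test')

# reload both from the local directory
reloaded_tokenizer = AutoTokenizer.from_pretrained('./test')
reloaded_model = AutoModelForSequenceClassification.from_pretrained('./test')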

Bert

  • Model configuration parameters can be changed
from transformers import BertConfig, BertModel

config = BertConfig()
config.hidden_size = 48

model = BertModel(config)
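Note that a model built straight from a config like this has randomly initialized weights, so it still has to be trained; from_pretrained loads trained weights instead. A minimal sketch for comparison (variable names are just examples):

# randomly initialized weights, custom config (hidden_size=48)
random_model = BertModel(config)
print(random_model.config.hidden_size)       # 48

# trained weights with the standard BERT config
pretrained_model = BertModel.from_pretrained("bert-base-uncased")
print(pretrained_model.config.hidden_size)   # 768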

Recap

  • dataloader -> model -> optimizer -> loss -> training
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my while life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# tokenizer.decode([1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607, 2026,  2096,  2166,  1012])

input_ids = torch.tensor([ids])
print("input_ids : ", input_ids)

output = model(input_ids)
print("output : ", output.logits)
# with multiple sentences: pad the batch and mask the padded positions
batched_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id]
]

attention_mask = [
    [1,1,1],
    [1,1,0]
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
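Building the padded batch and attention mask by hand, as above, is exactly what the tokenizer does automatically with padding=True; a short sketch assuming the same tokenizer and model:

sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(batch["attention_mask"])   # 0s mark the padded positions
print(model(**batch).logits)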

Making DataLoader usage easier

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AdamW  # deprecated in newer versions; torch.optim.AdamW is the drop-in replacement
from transformers import get_scheduler
from tqdm import tqdm
from datasets import load_metric  # newer versions move this to the separate evaluate library
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # automates per-batch (dynamic) padding

tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# peek at a single batch to check the tensor shapes
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}
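Because DataCollatorWithPadding pads each batch only to the longest sequence in that batch, the second dimension of input_ids changes from batch to batch; a quick sketch to see this (not in the original code):

# dynamic padding: the padded sequence length varies per batch
for i, batch in enumerate(train_dataloader):
    print(i, batch["input_ids"].shape)
    if i == 2:
        break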

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # 2 labels because MRPC is a binary (equivalent / not equivalent) task

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print(num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

# training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
# evaluate
metric = load_metric("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    
metric.compute()
# {'accuracy': 0.8627450980392157, 'f1': 0.9054054054054055}
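Finally, the fine-tuned model can be stored with save_pretrained, just like earlier; the directory name below is only an example:

# save the fine-tuned model and tokenizer for later reuse (example path)
model.save_pretrained("./mrpc-finetuned")
tokenizer.save_pretrained("./mrpc-finetuned")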