# Loss is CE
criterion = nn.CrossEntropyLoss()
# Pick the optimizer according to args.opt
if args.opt == "adam":
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
elif args.opt == "sgd":
    optimizer = optim.SGD(net.parameters(), lr=args.lr)
elif args.opt == "adamw":
    optimizer = optim.AdamW(net.parameters(), lr=args.lr, weight_decay=5e-4)
Depending on `args.opt`, one of Adam, SGD, or AdamW (the latter with a weight decay of 5e-4) is selected. A minimal sketch of the command-line flags this section assumes follows below.
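For reference, here is a sketch of the argparse setup implied by the flags used in this section (`--opt`, `--lr`, `--cos`, `--n_epochs`); the defaults are placeholders, not values taken from the original script:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--opt', default='adam', choices=['adam', 'sgd', 'adamw'])
parser.add_argument('--lr', type=float, default=1e-3)  # base learning rate
parser.add_argument('--cos', action='store_true',
                    help='use warmup + cosine annealing instead of ReduceLROnPlateau')
parser.add_argument('--n_epochs', type=int, default=100)
args = parser.parse_args()
```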
from torch.optim import lr_scheduler

if not args.cos:
    # Reduce the LR 10x whenever the monitored loss stops improving
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3,
                                               verbose=True, min_lr=1e-8, factor=0.1)
else:
    # One warmup epoch, then cosine annealing over the remaining epochs
    scheduler_cosine = lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs - 1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=1,
                                       after_scheduler=scheduler_cosine)
lr_scheduler.ReduceLROnPlateau
: Watches a metric (mode 'min', so a loss) and multiplies the learning rate by factor=0.1 whenever the metric has not improved for more than patience=3 consecutive epochs; min_lr bounds how far the LR can drop. Its step() must be given the monitored metric, as sketched below.
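A tiny runnable demonstration of the plateau behaviour on a standalone toy optimizer (not the training setup above): fed a loss that never improves, the LR drops 10x once the patience is exhausted.

```python
import torch
from torch import nn, optim
from torch.optim import lr_scheduler

opt = optim.SGD(nn.Linear(2, 2).parameters(), lr=0.1)
sched = lr_scheduler.ReduceLROnPlateau(opt, 'min', patience=3, factor=0.1)

for epoch in range(6):
    sched.step(1.0)  # a validation loss that never improves
    print(epoch, opt.param_groups[0]['lr'])
# The LR stays at 0.1 for the first epochs, then falls to 0.01
```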
GradualWarmupScheduler
: Presumably the scheduler from the pytorch-gradual-warmup-lr package (ildoonet); it linearly ramps the learning rate up to multiplier × the base LR over total_epoch epochs (here, one epoch from args.lr to 10 × args.lr), then hands control to after_scheduler, the cosine-annealing schedule. An example follows.
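A minimal sketch of the warmup schedule, assuming `GradualWarmupScheduler` comes from the `warmup_scheduler` package (ildoonet/pytorch-gradual-warmup-lr); the linear model is a placeholder:

```python
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from warmup_scheduler import GradualWarmupScheduler  # assumed package

optimizer = optim.SGD(nn.Linear(2, 2).parameters(), lr=0.01)
cosine = lr_scheduler.CosineAnnealingLR(optimizer, T_max=9)
scheduler = GradualWarmupScheduler(optimizer, multiplier=10,
                                   total_epoch=1, after_scheduler=cosine)

for epoch in range(10):
    print(epoch, optimizer.param_groups[0]['lr'])
    scheduler.step()
# Epoch 0 starts at the base LR 0.01; the warmup ramps it to 0.1,
# after which the cosine schedule decays it back toward 0
```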
def sparse_selection():
    s = 1e-4  # sparsity regularization strength
    for m in net.modules():
        if isinstance(m, channel_selection):
            # Add the subgradient of the L1 penalty s * |indexes|
            # on top of the gradients already computed by the CE loss
            m.indexes.grad.data.add_(s * torch.sign(m.indexes.data))
modules
: Iterates over every module in the network recursively (the network itself, its children, their children, and so on).
isinstance
: Python built-in that tests whether an object is an instance of a class; here it picks out only the channel_selection layers.
add_
: PyTorch arithmetic methods ending in an underscore (add_, sub_, mul_, div_) operate in place on the tensor rather than returning a new one; here the L1 subgradient is added directly into the existing gradient tensor. See the sketch below.
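A short self-contained sketch of these three pieces on a toy network (`nn.Linear` stands in for `channel_selection`):

```python
import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))

# modules() walks the whole tree: Sequential, Linear, ReLU, Linear
for m in net.modules():
    if isinstance(m, nn.Linear):  # filter by type, as with channel_selection above
        print(type(m).__name__, tuple(m.weight.shape))

# add_ mutates its tensor in place instead of allocating a new one
t = torch.ones(3)
t.add_(0.5 * torch.sign(torch.tensor([-1.0, 0.0, 2.0])))
print(t)  # tensor([0.5000, 1.0000, 1.5000])
```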
##### Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        sparse_selection()  # inject the L1 subgradient before the update
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)  # class with the highest logit
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
    return train_loss/(batch_idx+1)  # average training loss for the epoch
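To tie the pieces together, a sketch of the epoch loop these functions imply; `test(epoch)` is a hypothetical evaluation function returning a validation loss, and only the plateau scheduler needs that metric passed to step():

```python
for epoch in range(args.n_epochs):
    train_loss = train(epoch)
    val_loss = test(epoch)        # hypothetical: evaluate and return validation loss
    if not args.cos:
        scheduler.step(val_loss)  # ReduceLROnPlateau monitors the metric
    else:
        scheduler.step()          # warmup + cosine advances unconditionally
```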