Model parallel
ResNet50 예시 (주: 제목과 달리 아래 코드는 AlexNet이 아니라 ResNet50을 두 GPU로 분할)
class ModelParallelResNet50(ResNet):
    """ResNet-50 split across two GPUs (model parallelism).

    The first half of the network is placed on cuda:0 and the second half
    on cuda:1; forward() moves the intermediate activations between the
    two devices.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): `num_classes` is read from an enclosing/global scope —
        # it must be defined before this class is instantiated; confirm.
        super(ModelParallelResNet50, self).__init__(
            Bottleneck, [3, 4, 6, 3],
            num_classes=num_classes, *args, **kwargs)
        # First half of the model assigned to cuda:0 (GPU 0)
        self.seq1 = nn.Sequential(self.conv1, self.bn1, self.relu, self.maxpool,
                                  self.layer1, self.layer2).to('cuda:0')
        # Second half of the model assigned to cuda:1 (GPU 1)
        self.seq2 = nn.Sequential(self.layer3, self.layer4, self.avgpool,).to('cuda:1')
        self.fc.to('cuda:1')  # nn.Module.to() moves parameters in place

    # Connect the two halves
    def forward(self, x):
        # Run the cuda:0 half, move activations to cuda:1, run the rest
        x = self.seq2(self.seq1(x).to('cuda:1'))
        return self.fc(x.view(x.size(0), -1))
Data parallel
데이터를 나눠 여러 GPU에 할당 후, 각 GPU에서 나온 결과(gradient)의 평균을 취하는 방법
minibatch SGD와 유사한데, 한 번에 여러 GPU에서 수행된다는 점이 다름
PyTorch에서는 DataParallel 과 DistributedDataParallel 을 제공
DataParallel
# This is all it takes — simpler than you might expect
parallel_model = torch.nn.DataParallel(model)
# Use a DistributedSampler so each process sees a distinct shard of the data
train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
shuffle = False  # the sampler already shuffles; the DataLoader must not
pin_memory = True
# BUG FIX: the original read `shuffle=Truepin_memory=pin_memory` (missing
# comma) and also passed `shuffle` a second time on the next line — both
# are syntax errors, and shuffle=True is invalid alongside a sampler.
trainloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=20,
                                          pin_memory=pin_memory,
                                          num_workers=3,
                                          shuffle=shuffle,
                                          sampler=train_sampler)
def main():
    """Spawn one training process per available GPU."""
    n_gpus = torch.cuda.device_count()
    # Each spawned process is called as main_worker(process_index, n_gpus)
    torch.multiprocessing.spawn(main_worker, nprocs=n_gpus, args=(n_gpus, ))
def main_worker(gpu, n_gpus):
    """Per-process entry point for DistributedDataParallel training.

    Args:
        gpu: this process's index, used as the local GPU rank (0..n_gpus-1).
        n_gpus: total number of GPUs / processes (the world size).
    """
    image_size = 224
    batch_size = 512
    num_worker = 8
    epochs = ...  # placeholder — set the number of epochs

    # batch_size and num_worker must be divided across the GPUs
    batch_size = int(batch_size / n_gpus)
    num_worker = int(num_worker / n_gpus)

    # Define the multi-processing communication protocol.
    # BUG FIX: the original used curly closing quotes ('nccl’, '…2568’),
    # which is a SyntaxError in Python.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='tcp://127.0.0.1:2568',
                                         world_size=n_gpus,
                                         rank=gpu)

    model = MODEL  # NOTE(review): placeholder — replace with a real nn.Module
    torch.cuda.set_device(gpu)
    model = model.cuda(gpu)
    # Distributed DataParallel: wrap the model on this process's GPU
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
from multiprocessing import Pool
def f(x):
    """Return the square of x (executed in a worker process)."""
    return x * x
if __name__ == '__main__':
    # Spin up 5 worker processes and square each input in parallel
    with Pool(5) as pool:
        squares = pool.map(f, [1, 2, 3])
        print(squares)