양자화 방식 비교

References

| 구분 | 동적 양자화 | 정적 양자화 |
| --- | --- | --- |
| 양자화 시점 | Runtime | Pre-computation |
| 적용 대상 | Weights | Weights and Activation |
| 추가 훈련 필요 여부 | 불필요 | 필요 |
| 속도 및 효율성 | 비교적 느림 | 비교적 빠르고 효율적 |
| 적용 용이성 | 간단하게 적용 가능 | 복잡하고 데이터 의존적 |

동적 양자화 CPU 추론 속도 비교

  • TL; DR
    batchsize가 작고 매우 소규모인 네트워크에는 양자화 추론이 오히려 비효율적
    충분히 큰 네트워크에 적절한 batchsize를 가져갈 때 효율적
  • 테스트 환경
    샘플 수: 10000
    feature 수: 100
    hidden layer 차원: 1024

테스트 결과1

  • Network: 4 linear blocks
| batchsize | fp32 | 8bit | ratio |
| --- | --- | --- | --- |
| 1 | 1.15495 | 1.88925 | +63.6% |
| 32 | 0.18830 | 0.13925 | -26.0% |
| 512 | 0.06203 | 0.04936 | -20.4% |

테스트 결과2

  • Network: 36 linear blocks
| batchsize | fp32 | 8bit | ratio |
| --- | --- | --- | --- |
| 1 | 22.12254 | 15.92955 | -28.0% |
| 32 | 2.12146 | 1.19603 | -43.6% |
| 512 | 0.55494 | 0.38583 | -30.5% |
  • 테스트용 소스코드
# %% [markdown]
# ## Setup

# %%
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import time
from tqdm import tqdm

# %%
class CFG:
    """Configuration constants for the benchmark."""

    # Number of synthetic samples to generate (10,000).
    sampling_size = 10 ** 4

# %% [markdown]
# ## Create data

# %%
# Synthetic inputs drawn from a standard normal distribution:
# CFG.sampling_size rows, 100 columns.
data = torch.randn(CFG.sampling_size, 100)
data

# %% [markdown]
# ## Dynamic Quant

# %%
class Linear(nn.Module):
    """A fully connected layer followed by a ReLU activation.

    Args:
        input_dim: Number of input features of the linear layer.
        hidden_dim: Number of output features of the linear layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        dense = nn.Linear(input_dim, hidden_dim)
        activation = nn.ReLU()
        # Both steps live under ``fc`` so parameter names stay ``fc.0.*``.
        self.fc = nn.Sequential(dense, activation)

    def forward(self, x):
        """Apply the linear layer and then ReLU to ``x``."""
        activated = self.fc(x)
        return activated

# define a floating point model
class Basemodel(nn.Module):
    """Floating-point network made of ``n_layers`` stacked ``Linear`` blocks.

    Args:
        input_dim: Input width of the first block.
        hidden_dim: Output width of every block, and the input width of
            every block after the first.
        n_layers: Number of blocks to stack.
    """

    def __init__(self, input_dim=100, hidden_dim=1024, n_layers=4):
        super().__init__()
        blocks = []
        in_features = input_dim
        for _ in range(n_layers):
            blocks.append(Linear(in_features, hidden_dim))
            # Every block after the first consumes the previous block's output.
            in_features = hidden_dim
        self.fc = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through the stacked blocks in order."""
        result = self.fc(x)
        return result

# Floating-point model with 4 linear blocks, and a dynamically quantized
# version of it.
base_small = Basemodel(n_layers=4)
quant_small = torch.ao.quantization.quantize_dynamic(
    model=base_small,  # the original model
    qconfig_spec={torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8,  # the target dtype for quantized weights
)

# The same pair for the 36-block model.
base_large = Basemodel(n_layers=36)
quant_large = torch.ao.quantization.quantize_dynamic(
    model=base_large,  # the original model
    qconfig_spec={torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8,  # the target dtype for quantized weights
)

# %%
# Timing tables, one row per batch size and one column per model variant.
# DQ1 holds the results for the small network, DQ2 for the large one.
DQ1, DQ2 = (
    pd.DataFrame(columns=["fp32", "quant"], index=["1", "32", "512"])
    for _ in range(2)
)

# %% [markdown]
# ## TC. DQ1

# %% [markdown]
# ### for single batch

# %%
# fp32 `base_small`, batch size 1: time one full pass over `data` and store
# the elapsed seconds (rounded to 5 decimals) in DQ1.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ1.loc[str(batch_size), "fp32"] = elapsed

# %%
# Dynamically quantized `quant_small`, batch size 1: time one full pass over
# `data` and store the elapsed seconds (rounded to 5 decimals) in DQ1.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ1.loc[str(batch_size), "quant"] = elapsed

# %% [markdown]
# ### for small batch

# %%
# fp32 `base_small`, batch size 32: time one full pass over `data` and store
# the elapsed seconds (rounded to 5 decimals) in DQ1.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ1.loc[str(batch_size), "fp32"] = elapsed

# %%
# Dynamically quantized `quant_small`, batch size 32: time one full pass over
# `data` and store the elapsed seconds (rounded to 5 decimals) in DQ1.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ1.loc[str(batch_size), "quant"] = elapsed

# %% [markdown]
# ### for large batch

# %%
# fp32 `base_small`, batch size 512: time one full pass over `data` and store
# the elapsed seconds (rounded to 5 decimals) in DQ1.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ1.loc[str(batch_size), "fp32"] = elapsed

# %%
# Dynamically quantized `quant_small`, batch size 512: time one full pass over
# `data` and store the elapsed seconds (rounded to 5 decimals) in DQ1.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ1.loc[str(batch_size), "quant"] = elapsed

# %% [markdown]
# ## TC. DQ2

# %% [markdown]
# ### for single batch

# %%
# fp32 `base_large`, batch size 1: time one full pass over `data` and store
# the elapsed seconds (rounded to 5 decimals) in DQ2.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ2.loc[str(batch_size), "fp32"] = elapsed

# %%
# Dynamically quantized `quant_large`, batch size 1: time one full pass over
# `data` and store the elapsed seconds (rounded to 5 decimals) in DQ2.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ2.loc[str(batch_size), "quant"] = elapsed

# %% [markdown]
# ### for small batch

# %%
# fp32 `base_large`, batch size 32: time one full pass over `data` and store
# the elapsed seconds (rounded to 5 decimals) in DQ2.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ2.loc[str(batch_size), "fp32"] = elapsed

# %%
# Dynamically quantized `quant_large`, batch size 32: time one full pass over
# `data` and store the elapsed seconds (rounded to 5 decimals) in DQ2.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ2.loc[str(batch_size), "quant"] = elapsed

# %% [markdown]
# ### for large batch

# %%
# fp32 `base_large`, batch size 512: time one full pass over `data` and store
# the elapsed seconds (rounded to 5 decimals) in DQ2.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ2.loc[str(batch_size), "fp32"] = elapsed

# %%
# Dynamically quantized `quant_large`, batch size 512: time one full pass over
# `data` and store the elapsed seconds (rounded to 5 decimals) in DQ2.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can be adjusted mid-run.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, so autograd tracking is disabled
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
DQ2.loc[str(batch_size), "quant"] = elapsed

# %%
# Relative change of the quantized timing versus fp32: (quant - fp32) / fp32.
# A negative value means the quantized run took less time.
DQ1["ratio"] = DQ1["quant"].sub(DQ1["fp32"]).div(DQ1["fp32"])
DQ1

# %%
# Relative change of the quantized timing versus fp32: (quant - fp32) / fp32.
# A negative value means the quantized run took less time.
DQ2["ratio"] = DQ2["quant"].sub(DQ2["fp32"]).div(DQ2["fp32"])
DQ2
profile
바로 활용 가능한 정보 공유를 목적으로 합니다

0개의 댓글