References
| 구분 | 동적 양자화 | 정적 양자화 |
|---|---|---|
| 양자화 시점 | Runtime | Pre-computation |
| 적용 대상 | Weights (사전 양자화) + Activation (추론 시점에 동적으로 양자화) | Weights and Activation (모두 사전 양자화) |
| 추가 보정(calibration) 필요 여부 | 불필요 | 필요 (대표 데이터로 calibration 수행) |
| 속도 및 효율성 | 비교적 느림 | 비교적 빠르고 효율적 |
| 적용 용이성 | 간단하게 적용 가능 | 복잡하고 데이터 의존적 |
테스트 결과1 (DQ1: small 모델, n_layers=4 / 전체 데이터 1회 추론 소요 시간(초), ratio = (8bit - fp32) / fp32)
| batchsize | fp32 | 8bit | ratio |
|---|---|---|---|
| 1 | 1.15495 | 1.88925 | +63.6% |
| 32 | 0.18830 | 0.13925 | -26.0% |
| 512 | 0.06203 | 0.04936 | -20.4% |
테스트 결과2 (DQ2: large 모델, n_layers=36 / 전체 데이터 1회 추론 소요 시간(초), ratio = (8bit - fp32) / fp32)
| batchsize | fp32 | 8bit | ratio |
|---|---|---|---|
| 1 | 22.12254 | 15.92955 | -28.0% |
| 32 | 2.12146 | 1.19603 | -43.6% |
| 512 | 0.55494 | 0.38583 | -30.5% |
# %% [markdown]
# ## Setup
# %%
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import time
from tqdm import tqdm
# %%
class CFG:
    """Notebook-wide configuration constants."""

    # Number of random rows generated for the benchmark input tensor.
    sampling_size: int = 10_000
# %% [markdown]
# ## Create data
# %%
# Synthetic benchmark input: CFG.sampling_size rows of 100 standard-normal
# features each (100 matches Basemodel's default input_dim).
data = torch.randn(CFG.sampling_size, 100)
data
# %% [markdown]
# ## Dynamic Quant
# %%
class Linear(torch.nn.Module):
    """Fully connected layer followed by a ReLU activation.

    Args:
        input_dim: Number of input features (size of the last input dimension).
        hidden_dim: Number of output features produced by the layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Kept inside nn.Sequential so parameters stay registered as
        # ``fc.0.weight`` / ``fc.0.bias``. The inner torch.nn.Linear is the
        # module type that quantize_dynamic is asked to convert in this file.
        self.fc = nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        """Return ``ReLU(fc(x))`` for the input tensor ``x``."""
        return self.fc(x)
# define a floating point model
class Basemodel(torch.nn.Module):
    """Floating-point MLP built from ``n_layers`` stacked ``Linear`` blocks.

    Args:
        input_dim: Number of input features consumed by the first block.
        hidden_dim: Width of every block's output (and of every later
            block's input).
        n_layers: Number of ``Linear`` blocks to stack.
    """

    def __init__(self, input_dim=100, hidden_dim=1024, n_layers=4):
        super().__init__()
        # Only the first block maps input_dim -> hidden_dim; every following
        # block is hidden_dim -> hidden_dim. Blocks are created in order so
        # parameter initialisation consumes the RNG in the same sequence.
        blocks = [
            Linear(input_dim if i == 0 else hidden_dim, hidden_dim)
            for i in range(n_layers)
        ]
        self.fc = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through all stacked blocks and return the result."""
        return self.fc(x)
# Small fp32 reference model (4 stacked blocks).
base_small = Basemodel(n_layers=4)
# Dynamically quantized counterpart of the small model: only torch.nn.Linear
# modules are converted, with int8 (qint8) quantized weights.
quant_small = torch.ao.quantization.quantize_dynamic(
    base_small,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
# Large fp32 reference model (36 stacked blocks).
base_large = Basemodel(n_layers=36)
# Dynamically quantized counterpart of the large model, same settings.
quant_large = torch.ao.quantization.quantize_dynamic(
    base_large,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
# %%
# Result tables for elapsed inference time in seconds. Rows are batch sizes
# (stored as strings), columns are the fp32 and the dynamically quantized model.
# dtype=float keeps the columns numeric (float64) instead of the object dtype
# an empty DataFrame gets by default, so later arithmetic stays vectorised.
# small
DQ1 = pd.DataFrame(index=["1", "32", "512"], columns=["fp32", "quant"], dtype=float)
# large
DQ2 = pd.DataFrame(index=["1", "32", "512"], columns=["fp32", "quant"], dtype=float)
# %% [markdown]
# ## TC. DQ1
# %% [markdown]
# ### for single batch
# %%
# Time one full pass of the fp32 small model over `data`, one sample per batch.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ1.loc[str(batch_size), "fp32"] = elapsed
# %%
# Time one full pass of the quantized small model over `data`, one sample per batch.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ1.loc[str(batch_size), "quant"] = elapsed
# %% [markdown]
# ### for small batch
# %%
# Time one full pass of the fp32 small model over `data` with 32 samples per batch.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ1.loc[str(batch_size), "fp32"] = elapsed
# %%
# Time one full pass of the quantized small model over `data` with 32 samples per batch.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ1.loc[str(batch_size), "quant"] = elapsed
# %% [markdown]
# ### for large batch
# %%
# Time one full pass of the fp32 small model over `data` with 512 samples per batch.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ1.loc[str(batch_size), "fp32"] = elapsed
# %%
# Time one full pass of the quantized small model over `data` with 512 samples per batch.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_small(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ1.loc[str(batch_size), "quant"] = elapsed
# %% [markdown]
# ## TC. DQ2
# %% [markdown]
# ### for single batch
# %%
# Time one full pass of the fp32 large model over `data`, one sample per batch.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ2.loc[str(batch_size), "fp32"] = elapsed
# %%
# Time one full pass of the quantized large model over `data`, one sample per batch.
batch_size = 1
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ2.loc[str(batch_size), "quant"] = elapsed
# %% [markdown]
# ### for small batch
# %%
# Time one full pass of the fp32 large model over `data` with 32 samples per batch.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ2.loc[str(batch_size), "fp32"] = elapsed
# %%
# Time one full pass of the quantized large model over `data` with 32 samples per batch.
batch_size = 32
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ2.loc[str(batch_size), "quant"] = elapsed
# %% [markdown]
# ### for large batch
# %%
# Time one full pass of the fp32 large model over `data` with 512 samples per batch.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = base_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ2.loc[str(batch_size), "fp32"] = elapsed
# %%
# Time one full pass of the quantized large model over `data` with 512 samples per batch.
batch_size = 512
# perf_counter() is a monotonic, high-resolution clock; unlike time.time() it
# cannot be skewed by system clock adjustments during the measurement.
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping
    # The DataLoader is built inside the timed region, so batching cost is
    # part of the measurement.
    for batch in DataLoader(data, batch_size=batch_size, shuffle=False):
        out = quant_large(batch)
elapsed = round(time.perf_counter() - start_time, 5)
print(f"Elapsed: {elapsed}")
# Rows of the result table are keyed by the batch size as a string.
DQ2.loc[str(batch_size), "quant"] = elapsed
# %%
# Relative change in elapsed time of the quantized small model versus fp32,
# as a fraction: negative means the quantized model was faster.
DQ1["ratio"] = DQ1["quant"].sub(DQ1["fp32"]).div(DQ1["fp32"])
DQ1
# %%
# Relative change in elapsed time of the quantized large model versus fp32,
# as a fraction: negative means the quantized model was faster.
DQ2["ratio"] = DQ2["quant"].sub(DQ2["fp32"]).div(DQ2["fp32"])
DQ2