정의
소스
# make test data
import numpy as np
import pandas as pd
# Build the benchmark frame column by column: `rows` hotels named hotel_0,
# hotel_1, ... each with a random integer price in the range 0..99.
rows = 1_000_000
hotel_price = pd.DataFrame({
    'hotel': [f'hotel_{i}' for i in range(rows)],
    # rand() yields floats in [0, 1); scaling by 100 and truncating gives 0..99.
    'price': (np.random.rand(rows) * 100).astype(np.int64),
})
hotel_price
# 검증 source
level_all_algo = []
for algo in ['loop', 'iterrows', 'itertuples', 'withapply', 'isin', 'cut', 'digitize']:
print(f'[{algo}]')
%time globals()[f'apply_level_{algo}'](df=hotel_price)
level_all_algo.append(hotel_price['level'].values)
import numpy as np
import pandas as pd
import sys
# Build the benchmark frame column by column: `rows` hotels named hotel_0,
# hotel_1, ... each with a random integer price in the range 0..99.
rows = 1_000_000
hotel_price = pd.DataFrame({
    'hotel': [f'hotel_{i}' for i in range(rows)],
    # rand() yields floats in [0, 1); scaling by 100 and truncating gives 0..99.
    'price': (np.random.rand(rows) * 100).astype(np.int64),
})
hotel_price
def apply_level(price):
    """Classify a single price into a level label.

    Args:
        price: Numeric price, expected to lie in the closed range [0, 100].

    Returns:
        'low' for 0 <= price < 30, 'medium' for 30 <= price < 70 and
        'high' for 70 <= price <= 100.

    Raises:
        ValueError: If price falls outside [0, 100] or is NaN. Without this
            explicit check no branch would match and the function would fail
            with an UnboundLocalError instead.
    """
    if 0 <= price < 30:
        return 'low'
    if 30 <= price < 70:
        return 'medium'
    if 70 <= price <= 100:
        return 'high'
    # NaN also lands here because every comparison with NaN is False.
    raise ValueError(f'price must be within [0, 100], got {price!r}')
def apply_level_loop(df: pd.DataFrame):
    """Label every row by positional indexing, one row at a time.

    Each row is fetched with ``df.iloc[pos]`` and its 'price' is passed to
    ``apply_level``. The labels are written to ``df['level']`` in place;
    nothing is returned.
    """
    df['level'] = [apply_level(df.iloc[pos]['price']) for pos in range(len(df))]
def apply_level_iterrows(df: pd.DataFrame):
    """Label every row while walking the frame with ``DataFrame.iterrows``.

    The 'price' of each yielded row is passed to ``apply_level``. The labels
    are written to ``df['level']`` in place; nothing is returned.
    """
    df['level'] = [apply_level(row['price']) for _, row in df.iterrows()]
def apply_level_itertuples(df: pd.DataFrame):
    """Label every row while walking the frame with ``DataFrame.itertuples``.

    The ``price`` attribute of each yielded tuple is passed to
    ``apply_level``. The labels are written to ``df['level']`` in place;
    nothing is returned.
    """
    df['level'] = [apply_level(record.price) for record in df.itertuples()]
def apply_level_withapply(df: pd.DataFrame):
    """Label every row through a row-wise ``DataFrame.apply`` (axis=1).

    The labels are written to ``df['level']`` in place; nothing is returned.
    """
    def _level_of(row):
        # Each row arrives as a Series; only its 'price' entry is used.
        return apply_level(row['price'])

    labelled = df.apply(_level_of, axis=1)
    df['level'] = labelled
def apply_level_isin(df: pd.DataFrame):
    """Label rows by membership of 'price' in fixed integer ranges.

    Prices equal to one of 0..29 become 'low', 30..69 'medium' and 70..100
    'high'. Rows whose price matches none of the ranges get a missing value.

    The labels are collected in a fresh column that replaces ``df['level']``
    as a whole. Writing the masks straight into an existing 'level' column
    would leave stale labels from an earlier run on unmatched rows.

    Args:
        df: Frame with a 'price' column; modified in place.
    """
    price = df['price']
    # Object-dtype column that starts out entirely missing.
    levels = pd.Series(index=df.index, dtype=object)
    ranges = (
        ('low', range(0, 30)),
        ('medium', range(30, 70)),
        ('high', range(70, 101)),
    )
    for label, members in ranges:
        # A plain boolean array selects by position, independent of the index.
        levels.loc[price.isin(members).to_numpy()] = label
    df['level'] = levels
def apply_level_cut(df: pd.DataFrame):
    """Label rows by binning 'price' with ``pd.cut``.

    The bins are left-closed: [0, 30) is 'low', [30, 70) is 'medium' and
    [70, 101) is 'high'. The binned result is written to ``df['level']`` in
    place; nothing is returned.
    """
    edges = [0, 30, 70, 101]
    names = ['low', 'medium', 'high']
    binned = pd.cut(
        x=df['price'],
        bins=edges,
        labels=names,
        right=False,
        include_lowest=True,
    )
    df['level'] = binned
def apply_level_digitize(df: pd.DataFrame):
    """Label rows by locating 'price' between thresholds with ``np.digitize``.

    ``np.digitize`` with thresholds [30, 70] yields 0 for prices below 30,
    1 for prices from 30 up to (not including) 70, and 2 otherwise. Those
    indices select from ('low', 'medium', 'high'). The labels are written to
    ``df['level']`` in place; nothing is returned.
    """
    thresholds = [30, 70]
    names = np.array(['low', 'medium', 'high'])
    bucket = np.digitize(df['price'], bins=thresholds)
    df['level'] = names[bucket]
level_all_algo = []
for algo in ['loop', 'iterrows', 'itertuples', 'withapply', 'isin', 'cut', 'digitize']:
print(f'[{algo}]')
%time globals()[f'apply_level_{algo}'](df=hotel_price)
level_all_algo.append(hotel_price['level'].values)
np.array([(level_all_algo[0] == level).all() for level in level_all_algo]).all()
f'np: {np.__version__}, pd: {pd.__version__}, python: {sys.version}'
결과
[loop]
CPU times: user 48.5 s, sys: 0 ns, total: 48.5 s
Wall time: 48.5 s
[iterrows]
CPU times: user 30 s, sys: 13.9 ms, total: 30 s
Wall time: 30 s
[itertuples]
CPU times: user 361 ms, sys: 7 µs, total: 361 ms
Wall time: 360 ms
[withapply]
CPU times: user 3.12 s, sys: 9.99 ms, total: 3.13 s
Wall time: 3.13 s
[isin]
CPU times: user 61.3 ms, sys: 14 µs, total: 61.3 ms
Wall time: 61.1 ms
[cut]
CPU times: user 21.5 ms, sys: 16 µs, total: 21.5 ms
Wall time: 24.7 ms
[digitize]
CPU times: user 41.4 ms, sys: 0 ns, total: 41.4 ms
Wall time: 41.4 ms
'np: 1.22.4, pd: 1.2.4, python: 3.8.0 (default, Dec 9 2021, 17:53:27) \n[GCC 8.4.0]'
결론
- 레퍼런스는 `digitize`가 `cut`보다 빠르다고 했지만, 직접 구현해 보니 `cut`이 더 빨랐다...
- 가장 빨랐던 `cut`과 가장 느렸던 `loop`를 비교하니 `cut`이 2000배 가까이 빨랐다😱
- pandas에서 loop를 쓰고 싶을 땐 웬만하면 loop 대신 `digitize`나 `cut`을 사용할 것!