import pandas as pd
import numpy as np
# Series with one missing value; the NaN forces the dtype up to float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
- 날짜형의 데이터는 date_range로 기본 날짜 지정
- periods로 기간 지정
# Six consecutive daily timestamps starting 2021-01-12 (freq defaults to 'D').
dates = pd.date_range('20210112', periods=6)
dates
DatetimeIndex(['2021-01-12', '2021-01-13', '2021-01-14', '2021-01-15',
'2021-01-16', '2021-01-17'],
dtype='datetime64[ns]', freq='D')
- 6행 4열의 random 변수를 만들고 칼럼 지정, index는 지정해준 기본 날짜 사용
# 6x4 frame of standard-normal random draws, row-indexed by the dates above.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
|
A |
B |
C |
D |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
df.index
DatetimeIndex(['2021-01-12', '2021-01-13', '2021-01-14', '2021-01-15',
'2021-01-16', '2021-01-17'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
df.values
array([[-0.98602928, -0.32340495, -0.8176089 , 1.86552341],
[ 0.12455269, -0.75699516, -0.48251671, -1.08097641],
[ 0.76496044, 1.06653499, -0.09664802, -0.80293701],
[ 0.64306333, -1.04904954, 0.71273275, -0.49464736],
[ 0.83387333, -0.01682556, 0.99655303, 0.77741723],
[ 0.52149815, 0.94371634, -0.52657107, 2.03675382]])
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2021-01-12 to 2021-01-17
Freq: D
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 6 non-null float64
1 B 6 non-null float64
2 C 6 non-null float64
3 D 6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes
- describe()를 사용해 통계적 개요를 확인
df.describe()
|
A |
B |
C |
D |
count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
mean |
0.316986 |
-0.022671 |
-0.035676 |
0.383522 |
std |
0.685694 |
0.872174 |
0.732311 |
1.371735 |
min |
-0.986029 |
-1.049050 |
-0.817609 |
-1.080976 |
25% |
0.223789 |
-0.648598 |
-0.515557 |
-0.725865 |
50% |
0.582281 |
-0.170115 |
-0.289582 |
0.141385 |
75% |
0.734486 |
0.703581 |
0.510388 |
1.593497 |
max |
0.833873 |
1.066535 |
0.996553 |
2.036754 |
- by로 지정된 칼럼을 기준으로 sort_values를 통해 정렬
- ascending을 사용해 내림차순이나 오름차순 정렬
# Sort rows by column 'B', largest first; returns a new frame (df itself unchanged).
df.sort_values(by='B', ascending=False)
|
A |
B |
C |
D |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
df
|
A |
B |
C |
D |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
df['A']
2021-01-12 -0.986029
2021-01-13 0.124553
2021-01-14 0.764960
2021-01-15 0.643063
2021-01-16 0.833873
2021-01-17 0.521498
Freq: D, Name: A, dtype: float64
df[0:3]
|
A |
B |
C |
D |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
df['20210113':'20210116']
|
A |
B |
C |
D |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
- df.loc을 사용해 특정 날짜의 데이터만 확인
df.loc[dates[0]]
A -0.986029
B -0.323405
C -0.817609
D 1.865523
Name: 2021-01-12 00:00:00, dtype: float64
df.loc[:,['A','B']]
|
A |
B |
2021-01-12 |
-0.986029 |
-0.323405 |
2021-01-13 |
0.124553 |
-0.756995 |
2021-01-14 |
0.764960 |
1.066535 |
2021-01-15 |
0.643063 |
-1.049050 |
2021-01-16 |
0.833873 |
-0.016826 |
2021-01-17 |
0.521498 |
0.943716 |
df.loc['20210113':'20210115', ['A','B']]
|
A |
B |
2021-01-13 |
0.124553 |
-0.756995 |
2021-01-14 |
0.764960 |
1.066535 |
2021-01-15 |
0.643063 |
-1.049050 |
- 2021년 01월 12일의 A,B 데이터만 선택해 확인
df.loc['20210112',['A','B']]
A -0.986029
B -0.323405
Name: 2021-01-12 00:00:00, dtype: float64
df.loc[dates[0],'A']
-0.9860292817860736
df.iloc[3]
A 0.643063
B -1.049050
C 0.712733
D -0.494647
Name: 2021-01-15 00:00:00, dtype: float64
df.iloc[3:5, 0:2]
|
A |
B |
2021-01-15 |
0.643063 |
-1.049050 |
2021-01-16 |
0.833873 |
-0.016826 |
df.iloc[[1,2,4],[0,2]]
|
A |
C |
2021-01-13 |
0.124553 |
-0.482517 |
2021-01-14 |
0.764960 |
-0.096648 |
2021-01-16 |
0.833873 |
0.996553 |
df.iloc[1:3, :]
|
A |
B |
C |
D |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
df.iloc[:, 1:3]
|
B |
C |
2021-01-12 |
-0.323405 |
-0.817609 |
2021-01-13 |
-0.756995 |
-0.482517 |
2021-01-14 |
1.066535 |
-0.096648 |
2021-01-15 |
-1.049050 |
0.712733 |
2021-01-16 |
-0.016826 |
0.996553 |
2021-01-17 |
0.943716 |
-0.526571 |
- df의 A가 0보다 큰 것만 확인 (df.A > 0 이므로 0은 포함되지 않음)
- 특정 조건을 만족하는 데이터만 얻을 수 있음
df[df.A > 0]
|
A |
B |
C |
D |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
df[df['A'] > 0]
|
A |
B |
C |
D |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
df[df > 0]
|
A |
B |
C |
D |
2021-01-12 |
NaN |
NaN |
NaN |
1.865523 |
2021-01-13 |
0.124553 |
NaN |
NaN |
NaN |
2021-01-14 |
0.764960 |
1.066535 |
NaN |
NaN |
2021-01-15 |
0.643063 |
NaN |
0.712733 |
NaN |
2021-01-16 |
0.833873 |
NaN |
0.996553 |
0.777417 |
2021-01-17 |
0.521498 |
0.943716 |
NaN |
2.036754 |
# Work on a copy so adding the label column does not mutate df.
df2 = df.copy()
# String column used below for isin() filtering.
df2["E"] = ['one', 'one', 'two', 'three', 'four', 'three']
df2
|
A |
B |
C |
D |
E |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
one |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
one |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
two |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
three |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
four |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
three |
df2['E'].isin(['two', 'four'])
2021-01-12 False
2021-01-13 False
2021-01-14 True
2021-01-15 False
2021-01-16 True
2021-01-17 False
Freq: D, Name: E, dtype: bool
df2[df2['E'].isin(['two', 'four'])]
|
A |
B |
C |
D |
E |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
two |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
four |
df
|
A |
B |
C |
D |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
2021-01-13 |
0.124553 |
-0.756995 |
-0.482517 |
-1.080976 |
2021-01-14 |
0.764960 |
1.066535 |
-0.096648 |
-0.802937 |
2021-01-15 |
0.643063 |
-1.049050 |
0.712733 |
-0.494647 |
2021-01-16 |
0.833873 |
-0.016826 |
0.996553 |
0.777417 |
2021-01-17 |
0.521498 |
0.943716 |
-0.526571 |
2.036754 |
- apply를 통해 cumsum을 사용해 누적합을 알 수 있음
# Column-wise cumulative sums (apply passes each column Series to np.cumsum).
df.apply(np.cumsum)
|
A |
B |
C |
D |
2021-01-12 |
-0.986029 |
-0.323405 |
-0.817609 |
1.865523 |
2021-01-13 |
-0.861477 |
-1.080400 |
-1.300126 |
0.784547 |
2021-01-14 |
-0.096516 |
-0.013865 |
-1.396774 |
-0.018390 |
2021-01-15 |
0.546547 |
-1.062915 |
-0.684041 |
-0.513037 |
2021-01-16 |
1.380421 |
-1.079740 |
0.312512 |
0.264380 |
2021-01-17 |
1.901919 |
-0.136024 |
-0.214059 |
2.301134 |
- lambda를 이용해 최대값과 최소값의 차이를 확인
# Per-column range (max - min), computed by applying a lambda to each column.
df.apply(lambda x: x.max() - x.min())
A 1.819903
B 2.115585
C 1.814162
D 3.117730
dtype: float64
DataFrame 병합하기
- concat
# First sample frame: each column X holds labels X0..X3, row index 0..3.
df1 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(4)] for col in ('A', 'B', 'C', 'D')},
    index=list(range(4)),
)
# Second sample frame, rows 4..7, mirroring df1/df3's pattern.
# FIX: the original hard-coded the 'C' column as ['C4', 'B5', 'B6', 'B7'] —
# a copy-paste typo (B values leaked into C); C now follows C4..C7 like the
# other columns, matching the A/B/D pattern and df3's construction.
df2 = pd.DataFrame({'A' : ['A4', 'A5', 'A6', 'A7'],
                    'B' : ['B4', 'B5', 'B6', 'B7'],
                    'C' : ['C4', 'C5', 'C6', 'C7'],
                    'D' : ['D4', 'D5', 'D6', 'D7']},
                   index = [4, 5, 6, 7])
# Third sample frame: labels A8..D11 with matching row index 8..11.
df3 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(8, 12)] for col in ('A', 'B', 'C', 'D')},
    index=list(range(8, 12)),
)
df1
|
A |
B |
C |
D |
0 |
A0 |
B0 |
C0 |
D0 |
1 |
A1 |
B1 |
C1 |
D1 |
2 |
A2 |
B2 |
C2 |
D2 |
3 |
A3 |
B3 |
C3 |
D3 |
df2
|
A |
B |
C |
D |
4 |
A4 |
B4 |
C4 |
D4 |
5 |
A5 |
B5 |
B5 |
D5 |
6 |
A6 |
B6 |
B6 |
D6 |
7 |
A7 |
B7 |
B7 |
D7 |
df3
|
A |
B |
C |
D |
8 |
A8 |
B8 |
C8 |
D8 |
9 |
A9 |
B9 |
C9 |
D9 |
10 |
A10 |
B10 |
C10 |
D10 |
11 |
A11 |
B11 |
C11 |
D11 |
- concat은 기본적으로 데이터를 행 방향(위아래, axis=0)으로 단순히 합치는 것
# Default concat stacks the frames vertically (axis=0), keeping each frame's own index.
result = pd.concat([df1, df2, df3])
result
|
A |
B |
C |
D |
0 |
A0 |
B0 |
C0 |
D0 |
1 |
A1 |
B1 |
C1 |
D1 |
2 |
A2 |
B2 |
C2 |
D2 |
3 |
A3 |
B3 |
C3 |
D3 |
4 |
A4 |
B4 |
C4 |
D4 |
5 |
A5 |
B5 |
B5 |
D5 |
6 |
A6 |
B6 |
B6 |
D6 |
7 |
A7 |
B7 |
B7 |
D7 |
8 |
A8 |
B8 |
C8 |
D8 |
9 |
A9 |
B9 |
C9 |
D9 |
10 |
A10 |
B10 |
C10 |
D10 |
11 |
A11 |
B11 |
C11 |
D11 |
- keys 옵션을 줘서 구분
- MultiIndex가 되어서 level을 형성
# keys= labels each input frame, producing a two-level MultiIndex (X/Y/Z on level 0).
result = pd.concat([df1, df2, df3], keys=['X', 'Y', 'Z'])
result
|
|
A |
B |
C |
D |
X |
0 |
A0 |
B0 |
C0 |
D0 |
1 |
A1 |
B1 |
C1 |
D1 |
2 |
A2 |
B2 |
C2 |
D2 |
3 |
A3 |
B3 |
C3 |
D3 |
Y |
4 |
A4 |
B4 |
C4 |
D4 |
5 |
A5 |
B5 |
B5 |
D5 |
6 |
A6 |
B6 |
B6 |
D6 |
7 |
A7 |
B7 |
B7 |
D7 |
Z |
8 |
A8 |
B8 |
C8 |
D8 |
9 |
A9 |
B9 |
C9 |
D9 |
10 |
A10 |
B10 |
C10 |
D10 |
11 |
A11 |
B11 |
C11 |
D11 |
result.index
MultiIndex([('X', 0),
('X', 1),
('X', 2),
('X', 3),
('Y', 4),
('Y', 5),
('Y', 6),
('Y', 7),
('Z', 8),
('Z', 9),
('Z', 10),
('Z', 11)],
)
result.index.get_level_values(0)
Index(['X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z'], dtype='object')
result.index.get_level_values(1)
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')
# Partial frame sharing only columns B and D with df1; its index overlaps df1 at 2 and 3.
df4 = pd.DataFrame(
    {col: [f'{col}{i}' for i in (2, 3, 6, 7)] for col in ('B', 'D', 'F')},
    index=[2, 3, 6, 7],
)
df4
|
B |
D |
F |
2 |
B2 |
D2 |
F2 |
3 |
B3 |
D3 |
F3 |
6 |
B6 |
D6 |
F6 |
7 |
B7 |
D7 |
F7 |
df1
|
A |
B |
C |
D |
0 |
A0 |
B0 |
C0 |
D0 |
1 |
A1 |
B1 |
C1 |
D1 |
2 |
A2 |
B2 |
C2 |
D2 |
3 |
A3 |
B3 |
C3 |
D3 |
- df1과 df4를 concat을 이용해서 합치기
- index를 기준으로 데이터를 합침
- 값을 가질 수 없는 곳은 NaN으로 저장
# Side-by-side concat aligned on the index; rows missing from one frame get NaN (outer join).
result = pd.concat([df1, df4], axis=1)
result
|
A |
B |
C |
D |
B |
D |
F |
0 |
A0 |
B0 |
C0 |
D0 |
NaN |
NaN |
NaN |
1 |
A1 |
B1 |
C1 |
D1 |
NaN |
NaN |
NaN |
2 |
A2 |
B2 |
C2 |
D2 |
B2 |
D2 |
F2 |
3 |
A3 |
B3 |
C3 |
D3 |
B3 |
D3 |
F3 |
6 |
NaN |
NaN |
NaN |
NaN |
B6 |
D6 |
F6 |
7 |
NaN |
NaN |
NaN |
NaN |
B7 |
D7 |
F7 |
- inner조인을 통해 공통된 index로 합치고 공통되지 않은 index는 버림
# join='inner' keeps only index labels present in BOTH frames (here 2 and 3).
result = pd.concat([df1, df4], axis=1, join='inner')
result
|
A |
B |
C |
D |
B |
D |
F |
2 |
A2 |
B2 |
C2 |
D2 |
B2 |
D2 |
F2 |
3 |
A3 |
B3 |
C3 |
D3 |
B3 |
D3 |
F3 |
# Outer concat then reindex to df1's labels: keeps exactly df1's rows, NaN where df4 lacks them.
result = pd.concat([df1, df4], axis=1).reindex(df1.index)
result
|
A |
B |
C |
D |
B |
D |
F |
0 |
A0 |
B0 |
C0 |
D0 |
NaN |
NaN |
NaN |
1 |
A1 |
B1 |
C1 |
D1 |
NaN |
NaN |
NaN |
2 |
A2 |
B2 |
C2 |
D2 |
B2 |
D2 |
F2 |
3 |
A3 |
B3 |
C3 |
D3 |
B3 |
D3 |
F3 |
- 두 데이터의 index를 무시하고 합친 후 다시 index를 부여
# Vertical concat discarding both original indexes; rows get a fresh 0..n-1 RangeIndex.
result = pd.concat([df1, df4], ignore_index=True)
result
|
A |
B |
C |
D |
F |
0 |
A0 |
B0 |
C0 |
D0 |
NaN |
1 |
A1 |
B1 |
C1 |
D1 |
NaN |
2 |
A2 |
B2 |
C2 |
D2 |
NaN |
3 |
A3 |
B3 |
C3 |
D3 |
NaN |
4 |
NaN |
B2 |
NaN |
D2 |
F2 |
5 |
NaN |
B3 |
NaN |
D3 |
F3 |
6 |
NaN |
B6 |
NaN |
D6 |
F6 |
7 |
NaN |
B7 |
NaN |
D7 |
F7 |
- merge
# Left frame for the merge demos: key 'K4' deliberately has no partner in `right`.
left = pd.DataFrame({
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
    'key': ['K0', 'K4', 'K2', 'K3'],
})
# Right frame: keys K0..K3; 'K1' in turn is absent from `left`.
right = pd.DataFrame({
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
    'key': ['K0', 'K1', 'K2', 'K3'],
})
left
|
A |
B |
key |
0 |
A0 |
B0 |
K0 |
1 |
A1 |
B1 |
K4 |
2 |
A2 |
B2 |
K2 |
3 |
A3 |
B3 |
K3 |
right
|
C |
D |
key |
0 |
C0 |
D0 |
K0 |
1 |
C1 |
D1 |
K1 |
2 |
C2 |
D2 |
K2 |
3 |
C3 |
D3 |
K3 |
- on = key를 통해 공통된 key에 대해서만 합침
# merge on 'key'; `how` defaults to 'inner', so unmatched keys (K4, K1) are dropped.
pd.merge(left, right, on='key')
|
A |
B |
key |
C |
D |
0 |
A0 |
B0 |
K0 |
C0 |
D0 |
1 |
A2 |
B2 |
K2 |
C2 |
D2 |
2 |
A3 |
B3 |
K3 |
C3 |
D3 |
# Left join: keep every row of `left`; the unmatched key K4 gets NaN for C/D.
pd.merge(left, right, how='left', on='key')
|
A |
B |
key |
C |
D |
0 |
A0 |
B0 |
K0 |
C0 |
D0 |
1 |
A1 |
B1 |
K4 |
NaN |
NaN |
2 |
A2 |
B2 |
K2 |
C2 |
D2 |
3 |
A3 |
B3 |
K3 |
C3 |
D3 |
# Right join: keep every row of `right`; the unmatched key K1 gets NaN for A/B.
pd.merge(left, right, how='right', on='key')
|
A |
B |
key |
C |
D |
0 |
A0 |
B0 |
K0 |
C0 |
D0 |
1 |
A2 |
B2 |
K2 |
C2 |
D2 |
2 |
A3 |
B3 |
K3 |
C3 |
D3 |
3 |
NaN |
NaN |
K1 |
C1 |
D1 |
# Outer join: union of keys from both frames, NaN wherever one side is missing.
pd.merge(left, right, how='outer', on='key')
|
A |
B |
key |
C |
D |
0 |
A0 |
B0 |
K0 |
C0 |
D0 |
1 |
A1 |
B1 |
K4 |
NaN |
NaN |
2 |
A2 |
B2 |
K2 |
C2 |
D2 |
3 |
A3 |
B3 |
K3 |
C3 |
D3 |
4 |
NaN |
NaN |
K1 |
C1 |
D1 |
# Inner join (explicit): intersection of keys only — same as the default.
pd.merge(left, right, how='inner', on='key')
|
A |
B |
key |
C |
D |
0 |
A0 |
B0 |
K0 |
C0 |
D0 |
1 |
A2 |
B2 |
K2 |
C2 |
D2 |
2 |
A3 |
B3 |
K3 |
C3 |
D3 |
pivot_table
import numpy as np
import pandas as pd
# Sales-funnel sample data; needs an Excel engine (e.g. openpyxl) installed at runtime.
df = pd.read_excel('./data/02. sales-funnel.xlsx')
df.head(10)
|
Account |
Name |
Rep |
Manager |
Product |
Quantity |
Price |
Status |
0 |
714466 |
Trantow-Barrows |
Craig Booker |
Debra Henley |
CPU |
1 |
30000 |
presented |
1 |
714466 |
Trantow-Barrows |
Craig Booker |
Debra Henley |
Software |
1 |
10000 |
presented |
2 |
714466 |
Trantow-Barrows |
Craig Booker |
Debra Henley |
Maintenance |
2 |
5000 |
pending |
3 |
737550 |
Fritsch, Russel and Anderson |
Craig Booker |
Debra Henley |
CPU |
1 |
35000 |
declined |
4 |
146832 |
Kiehn-Spinka |
Daniel Hilton |
Debra Henley |
CPU |
2 |
65000 |
won |
5 |
218895 |
Kulas Inc |
Daniel Hilton |
Debra Henley |
CPU |
2 |
40000 |
pending |
6 |
218895 |
Kulas Inc |
Daniel Hilton |
Debra Henley |
Software |
1 |
10000 |
presented |
7 |
412290 |
Jerde-Hilpert |
John Smith |
Debra Henley |
Maintenance |
2 |
5000 |
pending |
8 |
740150 |
Barton LLC |
John Smith |
Debra Henley |
CPU |
1 |
35000 |
declined |
9 |
141962 |
Herman LLC |
Cedric Moss |
Fred Anderson |
CPU |
2 |
65000 |
won |
# Pivot with Name as the index; numeric columns are aggregated with the default aggfunc (mean).
pd.pivot_table(df, index=['Name'])
|
Account |
Price |
Quantity |
Name |
|
|
|
Barton LLC |
740150 |
35000 |
1.000000 |
Fritsch, Russel and Anderson |
737550 |
35000 |
1.000000 |
Herman LLC |
141962 |
65000 |
2.000000 |
Jerde-Hilpert |
412290 |
5000 |
2.000000 |
Kassulke, Ondricka and Metz |
307599 |
7000 |
3.000000 |
Keeling LLC |
688981 |
100000 |
5.000000 |
Kiehn-Spinka |
146832 |
65000 |
2.000000 |
Koepp Ltd |
729833 |
35000 |
2.000000 |
Kulas Inc |
218895 |
25000 |
1.500000 |
Purdy-Kunde |
163416 |
30000 |
1.000000 |
Stokes LLC |
239344 |
7500 |
1.000000 |
Trantow-Barrows |
714466 |
15000 |
1.333333 |
Name 칼럼이 index가 되고 별도로 지정하지 않으면 숫자형 데이터 칼럼들이 남게 됨
- 중복된 Name은 하나로 합쳐지고 value들은 평균을 가지게 된다.
# Three-level MultiIndex pivot (Name/Rep/Manager); values still averaged per group.
pd.pivot_table(df, index=['Name', 'Rep', 'Manager'])
|
|
|
Account |
Price |
Quantity |
Name |
Rep |
Manager |
|
|
|
Barton LLC |
John Smith |
Debra Henley |
740150 |
35000 |
1.000000 |
Fritsch, Russel and Anderson |
Craig Booker |
Debra Henley |
737550 |
35000 |
1.000000 |
Herman LLC |
Cedric Moss |
Fred Anderson |
141962 |
65000 |
2.000000 |
Jerde-Hilpert |
John Smith |
Debra Henley |
412290 |
5000 |
2.000000 |
Kassulke, Ondricka and Metz |
Wendy Yule |
Fred Anderson |
307599 |
7000 |
3.000000 |
Keeling LLC |
Wendy Yule |
Fred Anderson |
688981 |
100000 |
5.000000 |
Kiehn-Spinka |
Daniel Hilton |
Debra Henley |
146832 |
65000 |
2.000000 |
Koepp Ltd |
Wendy Yule |
Fred Anderson |
729833 |
35000 |
2.000000 |
Kulas Inc |
Daniel Hilton |
Debra Henley |
218895 |
25000 |
1.500000 |
Purdy-Kunde |
Cedric Moss |
Fred Anderson |
163416 |
30000 |
1.000000 |
Stokes LLC |
Cedric Moss |
Fred Anderson |
239344 |
7500 |
1.000000 |
Trantow-Barrows |
Craig Booker |
Debra Henley |
714466 |
15000 |
1.333333 |
# Restrict the aggregated values to Price (mean per Manager/Rep).
pd.pivot_table(df, index=['Manager', 'Rep'], values=['Price'])
|
|
Price |
Manager |
Rep |
|
Debra Henley |
Craig Booker |
20000.000000 |
Daniel Hilton |
38333.333333 |
John Smith |
20000.000000 |
Fred Anderson |
Cedric Moss |
27500.000000 |
Wendy Yule |
44250.000000 |
- value들을 평균값대신 합계로 나타내고 싶을 때
# aggfunc=np.sum totals Price per Manager/Rep instead of averaging.
pd.pivot_table(df, index=["Manager", 'Rep'], values=['Price'], aggfunc=np.sum)
|
|
Price |
Manager |
Rep |
|
Debra Henley |
Craig Booker |
80000 |
Daniel Hilton |
115000 |
John Smith |
40000 |
Fred Anderson |
Cedric Moss |
110000 |
Wendy Yule |
177000 |
# Multi-level pivot: sum and mean of Price/Quantity per Manager/Rep/Product.
# fill_value=0 fills missing combinations; margins=True appends an 'All' grand-total row.
pd.pivot_table(df, index=["Manager", 'Rep', 'Product'],
values=['Price', 'Quantity'],
aggfunc=[np.sum, np.mean], fill_value = 0, margins=True)
|
|
|
sum |
mean |
|
|
|
Price |
Quantity |
Price |
Quantity |
Manager |
Rep |
Product |
|
|
|
|
Debra Henley |
Craig Booker |
CPU |
65000 |
2 |
32500.000000 |
1.000000 |
Maintenance |
5000 |
2 |
5000.000000 |
2.000000 |
Software |
10000 |
1 |
10000.000000 |
1.000000 |
Daniel Hilton |
CPU |
105000 |
4 |
52500.000000 |
2.000000 |
Software |
10000 |
1 |
10000.000000 |
1.000000 |
John Smith |
CPU |
35000 |
1 |
35000.000000 |
1.000000 |
Maintenance |
5000 |
2 |
5000.000000 |
2.000000 |
Fred Anderson |
Cedric Moss |
CPU |
95000 |
3 |
47500.000000 |
1.500000 |
Maintenance |
5000 |
1 |
5000.000000 |
1.000000 |
Software |
10000 |
1 |
10000.000000 |
1.000000 |
Wendy Yule |
CPU |
165000 |
7 |
82500.000000 |
3.500000 |
Maintenance |
7000 |
3 |
7000.000000 |
3.000000 |
Monitor |
5000 |
2 |
5000.000000 |
2.000000 |
All |
|
|
522000 |
30 |
30705.882353 |
1.764706 |