import pandas as pd # 설치시 pip install pandas
import numpy as np
import matplotlib.pyplot as plt
# Create a Series (np.nan marks a missing value)
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# 0 1.0
# 1 3.0
# 2 5.0
# 3 NaN
# 4 6.0
# 5 8.0
# dtype: float64
# Create a DataFrame of random normals with a daily DatetimeIndex
# and labeled columns
dates = pd.date_range('20130101', periods=6)
# DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
# '2013-01-05', '2013-01-06'],
# dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
# A B C D
# 2013-01-01 1.571507 0.160021 -0.015071 -0.118588
# 2013-01-02 -1.037697 -0.891196 0.495447 0.453095
# 2013-01-03 -1.682384 -0.026006 -0.152957 -0.212614
# 2013-01-04 -0.108757 -0.958267 0.407331 0.187037
# 2013-01-05 1.092380 2.841777 -0.125714 -0.760722
# 2013-01-06 1.638509 -0.601126 -1.043931 -1.330950
# A DataFrame can also be built from a dict of differently-typed columns;
# scalar entries ('A', 'B', 'F') are broadcast to every row
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3]*4, dtype='int32'),
'E': pd.Categorical(['test', 'train', 'test', 'train']),
'F': 'foo'})
# A B C D E F
# 0 1.0 2013-01-02 1.0 3 test foo
# 1 1.0 2013-01-02 1.0 3 train foo
# 2 1.0 2013-01-02 1.0 3 test foo
# 3 1.0 2013-01-02 1.0 3 train foo
# Per-column dtypes
df2.dtypes
# A float64
# B datetime64[ns]
# C float32
# D int32
# E category
# F object
# dtype: object
df.index
# DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
# '2013-01-05', '2013-01-06'],
# dtype='datetime64[ns]', freq='D')
df.columns
# Index(['A', 'B', 'C', 'D'], dtype='object')
# Underlying data as a plain NumPy array
df.values
# [[ 1.571507 0.160021 -0.015071 -0.118588]
# [-1.037697 -0.891196 0.495447 0.453095]
# [-1.682384 -0.026006 -0.152957 -0.212614]
# [-0.108757 -0.958267 0.407331 0.187037]
# [ 1.09238 2.841777 -0.125714 -0.760722]
# [ 1.638509 -0.601126 -1.043931 -1.33095 ]]
# Quick summary statistics per column
df.describe()
# A B C D
# count 6.000000 6.000000 6.000000 6.000000
# mean 0.245593 0.087534 -0.072482 -0.297124
# std 1.407466 1.423367 0.549378 0.651149
# min -1.682384 -0.958267 -1.043931 -1.330950
# 25% -0.805462 -0.818679 -0.146146 -0.623695
# 50% 0.491811 -0.313566 -0.070392 -0.165601
# 75% 1.451725 0.113514 0.301730 0.110631
# max 1.638509 2.841777 0.495447 0.453095
# Transpose (rows <-> columns)
df.T
# 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
# A 1.571507 -1.037697 -1.682384 -0.108757 1.092380 1.638509
# B 0.160021 -0.891196 -0.026006 -0.958267 2.841777 -0.601126
# C -0.015071 0.495447 -0.152957 0.407331 -0.125714 -1.043931
# D -0.118588 0.453095 -0.212614 0.187037 -0.760722 -1.330950
# Sort by axis labels (columns, descending).
# NOTE(review): the sample outputs below were captured from a different
# random df than the one printed above, so the numbers do not match.
df.sort_index(axis=1, ascending=False)
# D C B A
# 2013-01-01 -1.135632 -1.509059 -0.282863 0.469112
# 2013-01-02 -1.044236 0.119209 -0.173215 1.212112
# 2013-01-03 1.071804 -0.494929 -2.104569 -0.861849
# 2013-01-04 0.271860 -1.039575 -0.706771 0.721555
# 2013-01-05 -1.087401 0.276232 0.567020 -0.424972
# 2013-01-06 0.524988 -1.478427 0.113648 -0.673690
# Sort rows by the values in column 'B'
df.sort_values(by='B')
# A B C D
# 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
# 2013-01-04 0.721555 -0.706771 -1.039575 0.271860
# 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
# 2013-01-02 1.212112 -0.173215 0.119209 -1.044236
# 2013-01-06 -0.673690 0.113648 -1.478427 0.524988
# 2013-01-05 -0.424972 0.567020 0.276232 -1.087401
데이터프레임 자체가 갖고 있는 인덱싱&슬라이싱 기능을 이용할 수 있습니다.
특정 컬럼의 값들만 가져오고 싶다면 df['A'](df.A와 동일)와 같은 형태로 입력합니다. 리턴되는 값은 Series의 자료구조를 갖고 있습니다.
단, 컬럼의 이름이 간혹 df.A로 쓰면 에러가 나는 경우가 발생하기 때문에 df['A']를 추천합니다.
# df['A'] (same as df.A) returns the column as a Series; prefer df['A']
# since the attribute form breaks for some column names.
df['A']
# 2013-01-01 0.469112
# 2013-01-02 1.212112
# 2013-01-03 -0.861849
# 2013-01-04 0.721555
# 2013-01-05 -0.424972
# 2013-01-06 -0.673690
# Freq: D, Name: A, dtype: float64
type(df['A'])
# <class 'pandas.core.series.Series'>
## First three rows (positional slice)
df[0:3]
# A B C D
# 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
# 2013-01-02 1.212112 -0.173215 0.119209 -1.044236
# 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
## Rows between the given index labels (both endpoints included)
df['20130102':'20130104']
# A B C D
# 2013-01-02 1.212112 -0.173215 0.119209 -1.044236
# 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
# 2013-01-04 0.721555 -0.706771 -1.039575 0.271860
# Label-based selection with .loc: one row as a Series
df.loc[dates[0]]
# A 1.571507
# B 0.160021
# C -0.015071
# D -0.118588
# Name: 2013-01-01 00:00:00, dtype: float64
# All rows, two columns
df.loc[:,['A','B']]
# A B
# 2013-01-01 1.571507 0.160021
# 2013-01-02 -1.037697 -0.891196
# 2013-01-03 -1.682384 -0.026006
# 2013-01-04 -0.108757 -0.958267
# 2013-01-05 1.092380 2.841777
# 2013-01-06 1.638509 -0.601126
# Label slice on rows combined with a column list
df.loc['20130102':'20130104',['A','B']]
# A B
# 2013-01-02 -1.037697 -0.891196
# 2013-01-03 -1.682384 -0.026006
# 2013-01-04 -0.108757 -0.958267
df.loc['20130102',['A','B']]
# A -1.037697
# B -0.891196
# Name: 2013-01-02 00:00:00, dtype: float64
# Scalar access by label
df.loc[dates[0],'A']
# 1.571506676720408
# Position-based selection with .iloc: one row as a Series
df.iloc[3]
# A -0.108757
# B -0.958267
# C 0.407331
# D 0.187037
# Name: 2013-01-04 00:00:00, dtype: float64
# Positional row and column slices (end-exclusive, like NumPy)
df.iloc[3:5,0:2]
# A B
# 2013-01-04 -0.108757 -0.958267
# 2013-01-05 1.092380 2.841777
# Explicit position lists
df.iloc[[1,2,4],[0,2]]
# A C
# 2013-01-02 -1.037697 0.495447
# 2013-01-03 -1.682384 -0.152957
# 2013-01-05 1.092380 -0.125714
df.iloc[1:3,:]
# A B C D
# 2013-01-02 -1.037697 -0.891196 0.495447 0.453095
# 2013-01-03 -1.682384 -0.026006 -0.152957 -0.212614
df.iloc[:,1:3]
# B C
# 2013-01-01 0.160021 -0.015071
# 2013-01-02 -0.891196 0.495447
# 2013-01-03 -0.026006 -0.152957
# 2013-01-04 -0.958267 0.407331
# 2013-01-05 2.841777 -0.125714
# 2013-01-06 -0.601126 -1.043931
# Scalar access by position
df.iloc[1,1]
# -0.89119558600132898
# Fast scalar access; returns the same value as df.iloc[1,1]
df.iat[1,1]
# -0.89119558600132898
# Boolean indexing: rows where column A is positive
df[df['A'] > 0]
# A B C D
# 2013-01-01 1.571507 0.160021 -0.015071 -0.118588
# 2013-01-05 1.092380 2.841777 -0.125714 -0.760722
# 2013-01-06 1.638509 -0.601126 -1.043931 -1.330950
# Element-wise filter: cells failing the condition become NaN
df[df > 0]
# A B C D
# 2013-01-01 1.571507 0.160021 NaN NaN
# 2013-01-02 NaN NaN 0.495447 0.453095
# 2013-01-03 NaN NaN NaN NaN
# 2013-01-04 NaN NaN 0.407331 0.187037
# 2013-01-05 1.092380 2.841777 NaN NaN
# 2013-01-06 1.638509 NaN NaN NaN
isin(): 열이 list의 값들을 포함하고 있는 모든 행들을 골라낼 때 주로 사용한다.
# isin(): mainly used to select rows whose column values appear in a list.
# Use a distinct name (df3) instead of rebinding `df` — the original line
# was `df = DataFrame(...)`, which (a) raised NameError because DataFrame
# was never imported unqualified (only `pd` is), and (b) clobbered the
# 6-row date-indexed df that every example below still depends on.
df3 = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df3.isin([1, 3, 12, 'a'])
# A B
# 0 True True
# 1 False False
# 2 True False
# Copy the table
df2 = df.copy()
# Add values under a new column E
df2['E'] = ['one','one', 'two','three','four','three']
# A B C D E
# 2013-01-01 1.571507 0.160021 -0.015071 -0.118588 one
# 2013-01-02 -1.037697 -0.891196 0.495447 0.453095 one
# 2013-01-03 -1.682384 -0.026006 -0.152957 -0.212614 two
# 2013-01-04 -0.108757 -0.958267 0.407331 0.187037 three
# 2013-01-05 1.092380 2.841777 -0.125714 -0.760722 four
# 2013-01-06 1.638509 -0.601126 -1.043931 -1.330950 three
# Keep only the rows whose E value is 'two' or 'four'
df2[df2['E'].isin(['two','four'])]
# A B C D E
# 2013-01-03 -1.682384 -0.026006 -0.152957 -0.212614 two
# 2013-01-05 1.092380 2.841777 -0.125714 -0.760722 four
# A Series whose index only partially overlaps df's index;
# non-overlapping rows become NaN when assigned below
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102',periods=6))
# 2013-01-02 1
# 2013-01-03 2
# 2013-01-04 3
# 2013-01-05 4
# 2013-01-06 5
# 2013-01-07 6
# Freq: D, dtype: int64
df['F'] = s1
# Set the row labeled dates[0], column 'A' to 0
df.loc[dates[0],'A'] = 0
# Set row 0, column 1 (by position) to 0
df.iloc[0,1] = 0
# Replace the entire 'D' column
df.loc[:,'D'] = np.array([5] * len(df))
df
# A B C D F
# 2013-01-01 0.000000 0.000000 -0.015071 5 NaN
# 2013-01-02 -1.037697 -0.891196 0.495447 5 1.0
# 2013-01-03 -1.682384 -0.026006 -0.152957 5 2.0
# 2013-01-04 -0.108757 -0.958267 0.407331 5 3.0
# 2013-01-05 1.092380 2.841777 -0.125714 5 4.0
# 2013-01-06 1.638509 -0.601126 -1.043931 5 5.0
df2 = df.copy()
# Negate only the positive values (boolean-mask assignment)
df2[df2 > 0] = -df2
df2
# A B C D F
# 2013-01-01 0.000000 0.000000 -0.015071 -5 NaN
# 2013-01-02 -1.037697 -0.891196 -0.495447 -5 -1.0
# 2013-01-03 -1.682384 -0.026006 -0.152957 -5 -2.0
# 2013-01-04 -0.108757 -0.958267 -0.407331 -5 -3.0
# 2013-01-05 -1.092380 -2.841777 -0.125714 -5 -4.0
# 2013-01-06 -1.638509 -0.601126 -1.043931 -5 -5.0
여러가지 이유로 우리는 데이터를 전부 다 측정하지 못하는 경우가 종종 발생합니다.
이처럼 측정되지 못하여 비어있는 데이터를 ‘결측치’라고 합니다. - pandas 에서는 결측치를 np.nan 으로 나타냅니다.
pandas 에서는 결측치를 기본적으로 연산에서 제외시키고 있습니다.
또한 머신러닝, 딥러닝의 경우 결측치가 존재한다면, 코드가 오류나는 경우도 존재하기 때문에 항상 데이터 분석을 하기 전에는 데이터 결측치를 확인하는 습관을 가지는 것이 중요합니다.
reindex()를 통해 컬럼이나 인덱스를 추가, 삭제, 변경 등의 작업이 가능합니다. 결측 데이터를 만들기 위해 ‘E’ 컬럼을 생성합니다.
# reindex() adds/removes/realigns index labels and columns; the new
# column 'E' does not exist in df, so it is created filled with NaN
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# Fill 'E' only for the first two dates, leaving NaNs in the rest
df1.loc[dates[0]:dates[1],'E'] = 1
df1
# A B C D F E
# 2013-01-01 0.000000 0.000000 -0.015071 5 NaN 1.0
# 2013-01-02 -1.037697 -0.891196 0.495447 5 1.0 1.0
# 2013-01-03 -1.682384 -0.026006 -0.152957 5 2.0 NaN
# 2013-01-04 -0.108757 -0.958267 0.407331 5 3.0 NaN
# Drop every row containing at least one missing value
df1.dropna(how='any')
# A B C D F E
# 2013-01-02 -1.037697 -0.891196 0.495447 5 1.0 1.0
# Replace missing values with 5
df1.fillna(value=5)
# A B C D F E
# 2013-01-01 0.000000 0.000000 -0.015071 5 5.0 1.0
# 2013-01-02 -1.037697 -0.891196 0.495447 5 1.0 1.0
# 2013-01-03 -1.682384 -0.026006 -0.152957 5 2.0 5.0
# 2013-01-04 -0.108757 -0.958267 0.407331 5 3.0 5.0
# Boolean mask marking the missing cells
pd.isna(df1)
# A B C D F E
# 2013-01-01 False False False False True False
# 2013-01-02 False False False False False False
# 2013-01-03 False False False False False True
# 2013-01-04 False False False False False True
통계적 지표를 계산할 수 있습니다.
평균 구하기
# Column means (NaNs are excluded from the computation).
# NOTE(review): the sample outputs in this section were captured from
# different random data than the df above, so the numbers do not match.
df.mean()
# A -0.004474
# B -0.383981
# C -0.687758
# D 5.000000
# F 3.000000
# dtype: float64
# Row means (axis=1)
df.mean(1)
# 2013-01-01 0.872735
# 2013-01-02 1.431621
# 2013-01-03 0.707731
# 2013-01-04 1.395042
# 2013-01-05 1.883656
# 2013-01-06 1.592306
# Freq: D, dtype: float64
# Apply a function down each column (cumulative sum)
df.apply(np.cumsum)
# A B C D F
# 2013-01-01 0.000000 0.000000 -1.509059 5 NaN
# 2013-01-02 1.212112 -0.173215 -1.389850 10 1.0
# 2013-01-03 0.350263 -2.277784 -1.884779 15 3.0
# 2013-01-04 1.071818 -2.984555 -2.924354 20 6.0
# 2013-01-05 0.646846 -2.417535 -2.648122 25 10.0
# 2013-01-06 -0.026844 -2.303886 -4.126549 30 15.0
# Per-column range (max - min)
df.apply(lambda x: x.max() - x.min())
# A 2.073961
# B 2.671590
# C 1.785291
# D 0.000000
# F 4.000000
# dtype: float64
df = pd.DataFrame(np.random.randn(10, 4))
# 0 1 2 3
# 0 -0.548702 1.467327 -1.015962 -0.483075
# 1 1.637550 -1.217659 -0.291519 -1.745505
# 2 -0.263952 0.991460 -0.919069 0.266046
# 3 -0.709661 1.669052 1.037882 -1.705775
# 4 -0.919854 -0.042379 1.247642 -0.009920
# 5 0.290213 0.495767 0.362949 1.548106
# 6 -1.131345 -0.089329 0.337863 -0.945867
# 7 -0.932132 1.956030 0.017587 -0.016692
# 8 -0.575247 0.254161 -1.143704 0.215897
# 9 1.193555 -0.077118 -0.408530 -0.862495
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
# concatenate again
pd.concat(pieces)
# 0 1 2 3
# 0 -0.548702 1.467327 -1.015962 -0.483075
# 1 1.637550 -1.217659 -0.291519 -1.745505
# 2 -0.263952 0.991460 -0.919069 0.266046
# 3 -0.709661 1.669052 1.037882 -1.705775
# 4 -0.919854 -0.042379 1.247642 -0.009920
# 5 0.290213 0.495767 0.362949 1.548106
# 6 -1.131345 -0.089329 0.337863 -0.945867
# 7 -0.932132 1.956030 0.017587 -0.016692
# 8 -0.575247 0.254161 -1.143704 0.215897
# 9 1.193555 -0.077118 -0.408530 -0.862495
# Merge example 1: duplicated keys on both sides -> cartesian product
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
# key lval
# 0 foo 1
# 1 foo 2
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
# key rval
# 0 foo 4
# 1 foo 5
merged = pd.merge(left, right, on='key')
# key lval rval
# 0 foo 1 4
# 1 foo 1 5
# 2 foo 2 4
# 3 foo 2 5
# Merge example 2: unique keys -> one merged row per key
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
# key lval
# 0 foo 1
# 1 bar 2
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
# key rval
# 0 foo 4
# 1 bar 5
merged = pd.merge(left, right, on='key')
# key lval rval
# 0 foo 1 4
# 1 bar 2 5
SQL과 유사한 group by에 관련된 내용은 아래와 같은 과정을 말합니다.
아래의 예시처럼 같은 그룹의 합도 구할 수 있지만, .agg() 함수를 통해 여러가지 값을 확인할 수 있습니다.
(ex. df.groupby('A').agg(['min', 'max']))
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
# A B C D
# 0 foo one -1.202872 -0.055224
# 1 bar one -1.814470 2.395985
# 2 foo two 1.018601 1.552825
# 3 bar three -0.595447 0.166599
# 4 foo two 1.395433 0.047609
# 5 bar two -0.392670 -0.136473
# 6 foo one 0.007207 -0.561757
# 7 foo three 1.928123 -1.623033
# Group by one column and sum the numeric columns
df.groupby('A').sum()
# C D
# A
# bar -2.802588 2.42611
# foo 3.146492 -0.63958
# Group by two columns -> MultiIndex result
df.groupby(['A', 'B']).sum()
# C D
# A B
# bar one -1.814470 2.395985
# three -0.595447 0.166599
# two -0.392670 -0.136473
# foo one -1.195665 -0.616981
# three 1.928123 -1.623033
# two 2.414034 1.600434
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
# A B C D E
# 0 one A foo -0.268332 -1.378239
# 1 one B foo -1.168934 0.263587
# 2 two C foo 1.245084 0.882631
# 3 three A bar 1.339747 0.770703
# 4 one B bar 0.005996 0.501930
# 5 one C bar 0.083572 -0.151838
# 6 two A foo 1.172619 1.110582
# 7 three B foo -0.210904 -0.200479
# 8 one C foo 0.166766 0.308271
# 9 one A bar 0.516837 0.869884
# 10 two B bar -0.667602 0.584587
# 11 three C bar -0.848954 0.609278
# Pivot table: 'D' values, (A, B) row index, C as columns;
# combinations with no data become NaN
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
# C bar foo
# A B
# one A 0.516837 -0.268332
# B 0.005996 -1.168934
# C 0.083572 0.166766
# three A 1.339747 NaN
# B NaN -0.210904
# C -0.848954 NaN
# two A NaN 1.172619
# B -0.667602 NaN
# C NaN 1.245084
# NOTE(review): the I/O sample outputs below show a 1000-row frame from a
# different example df — they will not match the df defined above.
# Save to CSV
df.to_csv('foo.csv')
# Load from CSV (the old index comes back as column 'Unnamed: 0')
pd.read_csv('foo.csv')
# Unnamed: 0 A B C D
# 0 2000-01-01 0.266457 -0.399641 -0.219582 1.186860
# 1 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953
# 2 2000-01-03 -1.734933 0.530468 2.060811 -0.515536
# 3 2000-01-04 -1.555121 1.452620 0.239859 -1.156896
# 4 2000-01-05 0.578117 0.511371 0.103552 -2.428202
# 5 2000-01-06 0.478344 0.449933 -0.741620 -1.962409
# 6 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753
# .. ... ... ... ... ...
# 993 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940
# 994 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107
# 995 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740
# 996 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439
# 997 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593
# 998 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560
# 999 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368
#
# [1000 rows x 5 columns]
# Save to HDF5 (NOTE(review): appears to require the optional PyTables
# package — confirm it is installed)
df.to_hdf('foo.h5', 'df')
# Load from HDF5
pd.read_hdf('foo.h5', 'df')
# A B C D
# 2000-01-01 0.266457 -0.399641 -0.219582 1.186860
# 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953
# 2000-01-03 -1.734933 0.530468 2.060811 -0.515536
# 2000-01-04 -1.555121 1.452620 0.239859 -1.156896
# 2000-01-05 0.578117 0.511371 0.103552 -2.428202
# 2000-01-06 0.478344 0.449933 -0.741620 -1.962409
# 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753
# ... ... ... ... ...
# 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940
# 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107
# 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740
# 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439
# 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593
# 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560
# 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368
#
# [1000 rows x 4 columns]
# Save to Excel (NOTE(review): requires an Excel engine such as openpyxl)
df.to_excel('foo.xlsx', sheet_name='Sheet1')
# Load from Excel
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
# A B C D
# 2000-01-01 0.266457 -0.399641 -0.219582 1.186860
# 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953
# 2000-01-03 -1.734933 0.530468 2.060811 -0.515536
# 2000-01-04 -1.555121 1.452620 0.239859 -1.156896
# 2000-01-05 0.578117 0.511371 0.103552 -2.428202
# 2000-01-06 0.478344 0.449933 -0.741620 -1.962409
# 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753
# ... ... ... ... ...
# 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940
# 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107
# 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740
# 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439
# 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593
# 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560
# 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368
#
# [1000 rows x 4 columns]