import pandas as pd
import numpy as np
pandas
- 데이터 조작 및 분석을 위해 Python 프로그래밍 언어로 작성된 소프트웨어 라이브러리
- 특히, 수치표와 시계열을 조작하기 위한 데이터 구조와 연산을 제공
numpy
- 행렬이나 일반적으로 대규모 다차원 배열을 쉽게 처리할 수 있도록 지원하는 파이썬의 라이브러리
- 데이터 구조 외에도 수치 계산을 위해 효율적으로 구현된 기능을 제공
1. series
- index, value로 이루어져 있음.
- 한 가지 데이터 타입만 가질 수 있음.
1) series 생성
- list
pd.Series()
>>
Series([], dtype: object)
pd.Series([1, 2, 3, 4])
>>
0 1
1 2
2 3
3 4
dtype: int64
- dtype
pd.Series([1, 2, 3, 4], dtype=np.float64)
>>
0 1.0
1 2.0
2 3.0
3 4.0
dtype: float64
pd.Series([1, 2, 3, 4], dtype=str)
>>
0 1
1 2
2 3
3 4
dtype: object
pd.Series(np.array([1, 2, 3]))
>>
0 1
1 2
2 3
dtype: int32
data = pd.Series([1, 2, 3, 4, "5"])
data
>>
0 1
1 2
2 3
3 4
4 5
dtype: object
- dictionary
pd.Series({"key":"value"})
>>
key value
dtype: object
- 날짜 데이터
dates = pd.date_range("20210101", periods=6)
dates
>>
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
'2021-01-05', '2021-01-06'],
dtype='datetime64[ns]', freq='D')
2. data frame
- pd.Series()
index, value
- pd.DataFrame()
index, value, column
1) data frame 생성
dates = pd.date_range("20210101", periods=6)
data = np.random.rand(6, 4)
df = pd.DataFrame(data, index=dates, columns=["A", "B", "C", "D"])
df
>>
A B C D
2021-01-01 0.120123 0.805504 0.427877 0.896760
2021-01-02 0.460422 0.836705 0.987296 0.945870
2021-01-03 0.615842 0.455002 0.899762 0.340627
2021-01-04 0.493937 0.651344 0.540037 0.494899
2021-01-05 0.348239 0.016023 0.138826 0.214697
2021-01-06 0.677094 0.811547 0.488967 0.892920
2) data frame 정보 탐색
- head()
df.head()
>>
A B C D
2021-01-01 0.120123 0.805504 0.427877 0.896760
2021-01-02 0.460422 0.836705 0.987296 0.945870
2021-01-03 0.615842 0.455002 0.899762 0.340627
2021-01-04 0.493937 0.651344 0.540037 0.494899
2021-01-05 0.348239 0.016023 0.138826 0.214697
- tail()
df.tail()
>>
A B C D
2021-01-02 0.460422 0.836705 0.987296 0.945870
2021-01-03 0.615842 0.455002 0.899762 0.340627
2021-01-04 0.493937 0.651344 0.540037 0.494899
2021-01-05 0.348239 0.016023 0.138826 0.214697
2021-01-06 0.677094 0.811547 0.488967 0.892920
- index
df.index
>>
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
'2021-01-05', '2021-01-06'],
dtype='datetime64[ns]', freq='D')
- columns
df.columns
>>
Index(['A', 'B', 'C', 'D'], dtype='object')
- values
df.values
>>
array([[0.12012348, 0.80550413, 0.42787663, 0.89676013],
[0.46042171, 0.83670535, 0.98729637, 0.94586973],
[0.61584153, 0.45500185, 0.89976233, 0.34062741],
[0.49393717, 0.65134386, 0.54003688, 0.49489907],
[0.34823948, 0.01602301, 0.13882641, 0.21469679],
[0.67709373, 0.81154727, 0.4889665 , 0.89291963]])
- info()
df.info()
>>
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2021-01-01 to 2021-01-06
Freq: D
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 6 non-null float64
1 B 6 non-null float64
2 C 6 non-null float64
3 D 6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes
- describe()
df.describe()
>>
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.452610 0.596021 0.580461 0.630962
std 0.200123 0.318625 0.314896 0.320787
min 0.120123 0.016023 0.138826 0.214697
25% 0.376285 0.504087 0.443149 0.379195
50% 0.477179 0.728424 0.514502 0.693909
75% 0.585365 0.810036 0.809831 0.895800
max 0.677094 0.836705 0.987296 0.945870
3) data frame 정렬
- sort_values()
df.sort_values(by="B", ascending=False, inplace=True)
df
>>
A B C D
2021-01-02 0.460422 0.836705 0.987296 0.945870
2021-01-06 0.677094 0.811547 0.488967 0.892920
2021-01-01 0.120123 0.805504 0.427877 0.896760
2021-01-04 0.493937 0.651344 0.540037 0.494899
2021-01-03 0.615842 0.455002 0.899762 0.340627
2021-01-05 0.348239 0.016023 0.138826 0.214697
4) data 선택
- 컬럼명
df["A"]
type(df['A'])
>>
2021-01-02 0.460422
2021-01-06 0.677094
2021-01-01 0.120123
2021-01-04 0.493937
2021-01-03 0.615842
2021-01-05 0.348239
Name: A, dtype: float64
pandas.core.series.Series
df.A
>>
2021-01-02 0.460422
2021-01-06 0.677094
2021-01-01 0.120123
2021-01-04 0.493937
2021-01-03 0.615842
2021-01-05 0.348239
Name: A, dtype: float64
df[["A", "B"]]
>>
A B
2021-01-02 0.460422 0.836705
2021-01-06 0.677094 0.811547
2021-01-01 0.120123 0.805504
2021-01-04 0.493937 0.651344
2021-01-03 0.615842 0.455002
2021-01-05 0.348239 0.016023
- offset index
- [n:m] : n부터 m-1까지
- 인덱스나 컬럼의 이름으로 slice하는 경우는 끝을 포함합니다.
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
df[0:3]
>>
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
df['20210101': '20210104']
>>
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
- loc
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
df.loc[:, ['A', 'B']]
>>
A B
2021-01-01 0.577089 0.581611
2021-01-02 0.168578 0.426618
2021-01-03 0.537746 0.292111
2021-01-04 0.818452 0.136901
2021-01-05 0.146205 0.097107
2021-01-06 0.294613 0.780128
df.loc['20210102': '20210104', ['A','D']]
>>
A D
2021-01-02 0.168578 0.065649
2021-01-03 0.537746 0.878005
2021-01-04 0.818452 0.789349
df.loc['20210102': '20210104', 'A':'D']
>>
A B C D
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
df.loc['20210102', ['A', 'B']]
>>
A 0.168578
B 0.426618
Name: 2021-01-02 00:00:00, dtype: float64
- iloc
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
df.iloc[3]
>>
A 0.818452
B 0.136901
C 0.015909
D 0.789349
Name: 2021-01-04 00:00:00, dtype: float64
df.iloc[3, 2]
>>
0.015909149121260402
df.iloc[3:5, 0:2]
>>
A B
2021-01-04 0.818452 0.136901
2021-01-05 0.146205 0.097107
df.iloc[[1, 2, 4], [0, 2]]
>>
A C
2021-01-02 0.168578 0.373022
2021-01-03 0.537746 0.942561
2021-01-05 0.146205 0.703463
df.iloc[:, 1:3]
>>
B C
2021-01-01 0.581611 0.075132
2021-01-02 0.426618 0.373022
2021-01-03 0.292111 0.942561
2021-01-04 0.136901 0.015909
2021-01-05 0.097107 0.703463
2021-01-06 0.780128 0.262861
- 조건
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
df["A"] > 0
>>
2021-01-01 True
2021-01-02 True
2021-01-03 True
2021-01-04 True
2021-01-05 True
2021-01-06 True
Freq: D, Name: A, dtype: bool
df[df["A"] > 0]
>>
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
df[df < 0]
>>
A B C D
2021-01-01 NaN NaN NaN NaN
2021-01-02 NaN NaN NaN NaN
2021-01-03 NaN NaN NaN NaN
2021-01-04 NaN NaN NaN NaN
2021-01-05 NaN NaN NaN NaN
2021-01-06 NaN NaN NaN NaN
5) 컬럼 추가 및 삭제
- 기존 컬럼이 없으면 추가
- 기존 컬럼이 있으면 수정
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
- 컬럼
df["E"] = ["one", "two", "three", "four", "five", "six"]
df
>>
A B C D E
2021-01-01 0.577089 0.581611 0.075132 0.748304 one
2021-01-02 0.168578 0.426618 0.373022 0.065649 two
2021-01-03 0.537746 0.292111 0.942561 0.878005 three
2021-01-04 0.818452 0.136901 0.015909 0.789349 four
2021-01-05 0.146205 0.097107 0.703463 0.964361 five
2021-01-06 0.294613 0.780128 0.262861 0.666487 six
df["E"] = ["one", "two", "three", "four", "five", "seven"]
df
>>
A B C D E
2021-01-01 0.577089 0.581611 0.075132 0.748304 one
2021-01-02 0.168578 0.426618 0.373022 0.065649 two
2021-01-03 0.537746 0.292111 0.942561 0.878005 three
2021-01-04 0.818452 0.136901 0.015909 0.789349 four
2021-01-05 0.146205 0.097107 0.703463 0.964361 five
2021-01-06 0.294613 0.780128 0.262861 0.666487 seven
- del
del df["E"]
df
>>
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
- drop()
df.drop(['D'], axis=1)
>>
A B C
2021-01-01 0.577089 0.581611 0.075132
2021-01-02 0.168578 0.426618 0.373022
2021-01-03 0.537746 0.292111 0.942561
2021-01-04 0.818452 0.136901 0.015909
2021-01-05 0.146205 0.097107 0.703463
2021-01-06 0.294613 0.780128 0.262861
df.drop(['20210104'])
>>
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
6) data 확인
A B C D E
2021-01-01 0.577089 0.581611 0.075132 0.748304 one
2021-01-02 0.168578 0.426618 0.373022 0.065649 two
2021-01-03 0.537746 0.292111 0.942561 0.878005 three
2021-01-04 0.818452 0.136901 0.015909 0.789349 four
2021-01-05 0.146205 0.097107 0.703463 0.964361 five
2021-01-06 0.294613 0.780128 0.262861 0.666487 seven
- isin()
df['E'].isin(['two'])
>>
2021-01-01 False
2021-01-02 True
2021-01-03 False
2021-01-04 False
2021-01-05 False
2021-01-06 False
Freq: D, Name: E, dtype: bool
df[df['E'].isin(['two', 'five', 'three'])]
>>
A B C D E
2021-01-02 0.168578 0.426618 0.373022 0.065649 two
2021-01-03 0.537746 0.292111 0.942561 0.878005 three
2021-01-05 0.146205 0.097107 0.703463 0.964361 five
- apply()
A B C D
2021-01-01 0.577089 0.581611 0.075132 0.748304
2021-01-02 0.168578 0.426618 0.373022 0.065649
2021-01-03 0.537746 0.292111 0.942561 0.878005
2021-01-04 0.818452 0.136901 0.015909 0.789349
2021-01-05 0.146205 0.097107 0.703463 0.964361
2021-01-06 0.294613 0.780128 0.262861 0.666487
df["A"].apply("sum")
>>
2.5426834459032466
df["A"].apply("min"), df["A"].apply("max")
>>
(0.14620524983735583, 0.8184524491339409)
df[["A", "D"]].apply("sum")
>>
A 2.542683
D 4.112156
dtype: float64
df[["A", "B"]].apply(np.sum)
>>
A 2.542683
B 2.314475
dtype: float64
- lambda
def plusminus(num):
    """Label a number by its sign.

    Returns "plus" when ``num`` is greater than zero and "minus" otherwise.
    Zero is not greater than zero, so it is labelled "minus".
    """
    return "plus" if num > 0 else "minus"
df["A"].apply(plusminus)
>>
2021-01-01 plus
2021-01-02 plus
2021-01-03 plus
2021-01-04 plus
2021-01-05 plus
2021-01-06 plus
Freq: D, Name: A, dtype: object
df["A"].apply(lambda num: "plus" if num > 0 else "minus")
>>
2021-01-01 plus
2021-01-02 plus
2021-01-03 plus
2021-01-04 plus
2021-01-05 plus
2021-01-06 plus
Freq: D, Name: A, dtype: object
7) data frame 병합
left = pd.DataFrame({
"key" : ["K0", "K4", "K2", "K3"],
"A" : ["A0", "A1", "A2", "A3"],
"B" : ["B0", "B1", "B2", "B3"]
})
left
>>
key A B
0 K0 A0 B0
1 K4 A1 B1
2 K2 A2 B2
3 K3 A3 B3
right = pd.DataFrame([
{"key":"K0", "C":"C0", "D":"D0"},
{"key":"K1", "C":"C1", "D":"D1"},
{"key":"K2", "C":"C2", "D":"D2"},
{"key":"K3", "C":"C3", "D":"D3"},
])
right
>>
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
- merge()
- 두 데이터 프레임에서 컬럼이나 인덱스를 기준으로 잡고 병합하는 방법
- 기준이 되는 컬럼이나 인덱스를 키 값이라고 함.
- 기준이 되는 키 값은 두 데이터 프레임에 모두 포함되어 있어야 합니다.
pd.merge(left, right, on="key")
>>
key A B C D
0 K0 A0 B0 C0 D0
1 K2 A2 B2 C2 D2
2 K3 A3 B3 C3 D3
pd.merge(left, right, how="left", on="key")
>>
key A B C D
0 K0 A0 B0 C0 D0
1 K4 A1 B1 NaN NaN
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
pd.merge(left, right, how="right", on="key")
>>
key A B C D
0 K0 A0 B0 C0 D0
1 K1 NaN NaN C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
pd.merge(left, right, how="inner", on="key")
>>
key A B C D
0 K0 A0 B0 C0 D0
1 K2 A2 B2 C2 D2
2 K3 A3 B3 C3 D3
pd.merge(left, right, how="outer", on="key")
>>
key A B C D
0 K0 A0 B0 C0 D0
1 K4 A1 B1 NaN NaN
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
4 K1 NaN NaN C1 D1
8) index 변경
- set_index()
데이터프레임명.set_index("컬럼명", inplace=True)
9) 상관계수
- corr()
데이터프레임명.corr()