[Data Handling] Pandas :: (7) Built-in functions

연두·2021년 2월 16일
0

Python for ML

목록 보기
19/34
post-thumbnail

pandas :: Pandas Built-in functions

import pandas as pd
from pandas import Series
from pandas import DataFrame

import numpy as np

describe

  • Numeric type 데이터의 요약 정보를 보여줌
df = pd.read_csv("wages.csv")
df.head()
earn height sex race ed age
0 79571.299011 73.89 male white 16 49
1 96396.988643 66.23 female white 16 62
2 48710.666947 63.77 female white 16 33
3 80478.096153 63.22 female other 16 95
4 82089.345498 63.08 female white 17 43
df.describe()
earn height ed age
count 1379.000000 1379.000000 1379.000000 1379.000000
mean 32446.292622 66.592640 13.354605 45.328499
std 31257.070006 3.818108 2.438741 15.789715
min -98.580489 57.340000 3.000000 22.000000
25% 10538.790721 63.720000 12.000000 33.000000
50% 26877.870178 66.050000 13.000000 42.000000
75% 44506.215336 69.315000 15.000000 55.000000
max 317949.127955 77.210000 18.000000 95.000000

unique

  • series data의 유일한 값을 NumPy 배열(ndarray)로 반환
# 유일한 인종의 값 list
df.race.unique()

array(['white', 'other', 'hispanic', 'black'], dtype=object)

# dict type으로 index
np.array(dict(enumerate(df["race"].unique())))

array({0: 'white', 1: 'other', 2: 'hispanic', 3: 'black'}, dtype=object)

# label index 값과 label값 각각 추출
value = list(map(int, np.array(list(enumerate(df["race"].unique())))[:, 0].tolist()))
key = np.array(list(enumerate(df["race"].unique())), dtype=str)[:, 1].tolist()

value, key

([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])

# label str -> index 값으로 변환
df["race"].replace(to_replace=key, value=value, inplace=True) 
df["race"]

0 0
1 0
2 0
3 1
4 0
..
1374 0
1375 0
1376 0
1377 0
1378 0
Name: race, Length: 1379, dtype: int64

# 성별에 대해서도 동일하게 적용
value = list(map(int, np.array(list(enumerate(df["sex"].unique())))[:, 0].tolist()))
key = np.array(list(enumerate(df["sex"].unique())), dtype=str)[:, 1].tolist()

value, key

([0, 1], ['male', 'female'])

df["sex"].replace(to_replace=key, value=value, inplace=True)
df.head(5)
earn height sex race ed age
0 79571.299011 73.89 0 0 16 49
1 96396.988643 66.23 1 0 16 62
2 48710.666947 63.77 1 0 16 33
3 80478.096153 63.22 1 1 16 95
4 82089.345498 63.08 1 0 17 43

sum

  • 기본적인 column 또는 row 값의 연산을 지원
  • sub, mean, min, max, count, median, mad, var 등
df.sum(axis=0)  #column별

earn 4.474344e+07
height 9.183125e+04
sex 8.590000e+02
race 5.610000e+02
ed 1.841600e+04
age 6.250800e+04
dtype: float64

df.sum(axis=1)  #row별

0 79710.189011
1 96542.218643
2 48824.436947
3 80654.316153
4 82213.425498
...
1374 30290.060363
1375 25019.829514
1376 13824.311312
1377 95563.664410
1378 9686.681857
Length: 1379, dtype: float64

isnull

  • 각 값이 NaN (null)인지 여부를 같은 크기의 boolean (True/False) 값으로 반환함
df.isnull()
earn height sex race ed age
0 False False False False False False
1 False False False False False False
2 False False False False False False
3 False False False False False False
4 False False False False False False
... ... ... ... ... ... ...
1374 False False False False False False
1375 False False False False False False
1376 False False False False False False
1377 False False False False False False
1378 False False False False False False

1379 rows × 6 columns

df.isnull().sum()  #null인 값의 합

earn 0
height 0
sex 0
race 0
ed 0
age 0
dtype: int64

sort_values

  • column값을 기준으로 데이터를 sorting
df.sort_values(["age", "earn"], ascending=True).head(10)
earn height sex race ed age
1038 -56.321979 67.81 0 2 10 22
800 -27.876819 72.29 0 0 12 22
963 -25.655260 68.90 0 0 12 22
1105 988.565070 64.71 1 0 12 22
801 1000.221504 64.09 1 0 12 22
862 1002.023843 66.59 1 0 12 22
933 1007.994941 68.26 1 0 12 22
988 1578.542814 64.53 0 0 12 22
522 1955.168187 69.87 1 3 12 22
765 2581.870402 64.79 1 0 12 22

Correlation & Covariance

  • 상관계수와 공분산을 구하는 함수
  • corr, cov, corrwith
df.age.corr(df.earn)

0.07400349177836055

df.age.cov(df.earn)

36523.6992104089

df.corrwith(df.earn)

earn 1.000000
height 0.291600
sex -0.337328
race -0.063977
ed 0.350374
age 0.074003
dtype: float64

df.corr()
earn height sex race ed age
earn 1.000000 0.291600 -0.337328 -0.063977 0.350374 0.074003
height 0.291600 1.000000 -0.703672 -0.045974 0.114047 -0.133727
sex -0.337328 -0.703672 1.000000 0.000858 -0.061747 0.070036
race -0.063977 -0.045974 0.000858 1.000000 -0.049487 -0.056879
ed 0.350374 0.114047 -0.061747 -0.049487 1.000000 -0.129802
age 0.074003 -0.133727 0.070036 -0.056879 -0.129802 1.000000


https://www.boostcourse.org/ai222/lecture/23822/

0개의 댓글