내가 보려고 만든 Pandas Summary

Kyeongmin·2022년 2월 16일

객체

DataFrame

DataTable 전체 (2차원 Array, Matrix)

Series

DataFrame 중 하나의 Column (1차원 Array, List)

import pandas as pd

personData = {"name":["do","re","mi"],
        "height":[167,178,190],
        "weight":[64,70,88]}
        
df = pd.DataFrame(personData, index=personData["name"])

Slicing

주요 Slicing 방법

loc[]
index label 기반 Slicing

df.loc["do"] # index "do" 에 대한 값 출력
df.loc[:"re"] # 처음부터 index "re" 에 대한 값까지 출력

df.loc["do","height"] # index "do"의 "height" 값 출력
df.loc["do",["height","weight"]] # index "do"의 "height","weight" 값 출력

iloc[]
index number 기반 Slicing

df.iloc[1] # 2번째 index 값, 즉 "re" 에 대한 값 출력
df.iloc[:2] # 처음부터 3번째 이전 index 값까지 출력

df.iloc[:2,1] # 처음부터 3번째 이전 index 값의 2번째 컬럼(height) 값 출력
df.iloc[:2,:] # 처음부터 3번째 이전 index 값의 전체 컬럼 값 출력

df[ ]
column name, index number 기반 Slicing

df["height"] # "height" 컬럼 값 출력
df["height"][:2] # "height" 컬럼의 3번째 이전 index 값까지 출력

부가적인 방법

head / tail
앞, 뒤의 일부 행 추출

df.head(2) # 앞에서 2번째까지의 행 출력
df.tail(2) # 뒤에서 2번째까지의 행 출력
# head, tail 모두 기본값은 5이다.

T
transpose, 행과 열을 바꾼다.

df.T
# 전체 Data의 컬럼을 알아보고자 할 때, df.head().T 연산을 사용하면 쉽다.

Drop

drop()
Index/Column Label 기반 Drop 함수

df.drop("ko", axis=0) # index "ko" 행을 삭제한 값 반환
df.drop("height", axis=1) # column "height" 열을 삭제한 값 반환
# axis=0(행), axis=1(열)

del
실제 데이터를 지우기 때문에 많이 사용하지 않는다.
```
del df["height"] # df의 "height" column 삭제
```

Series/DataFrame Operation

Add

# '+' 연산자를 이용한 방법
s1 + s2
# add 함수를 이용한 방법
s1.add(s2)
s1.add(s2, fill_value=0)
s1.add(df, axis=0)
# index label 기준으로 연산 수행, 한쪽이라도 index에 대한 값이 존재하지 않는 경우 NaN 반환

map
Series의 각 원소에 함수 적용

sexStr = {0:"male", 1:"female"}

df.sex.map(sexStr) # 0→male, 1→female로 변환
df.age.map(lambda x: x//10) # age를 10으로 나눈 몫 반환
df.salary.map(lambda x: x*12) # salary * 12 값 반환

df.sex.replace([0,1],["male","female"]) # 값을 변환할 때 replace를 사용하기도 함
# replace(target List, conversion List)

apply
DataFrame에 적용 시, Series 단위 함수 적용

df_info = df[["age","salary"]]

df_info.apply(sum) # age/salary column의 sum(합계) 값 반환
df_info.apply(lambda x: pd.Series([x.min(), x.max()], index=["min","max"]))
# age/salary column의 min/max 값 반환

applymap
element 단위 함수 적용 (map 함수와 유사)

df.applymap(lambda x: -x).head() # 각 Element에 (-) 부호 적용

부가적인 내장 함수

df.sex.unique() # "sex" column의 unique 값 반환
df.isnull().sum() # isnull 함수는 null(NaN) 값 여부 반환, sum을 해주면 null 값 개수 확인 가능
df.sort_values(["age","salary"], ascending=True) # 특정 column 기준으로 정렬

df.corr() # 전체 column 간 상관계수 반환
df.cov() # 전체 column 간 공분산 반환
df.corrwith(df.age) # 특정 column ↔︎ 전체 column 간 상관계수 반환

Groupby

Groupby의 연산 순서 Split → Apply → Combine

petalGrouped = iris.groupby("petal_code")["petal width (cm)"]
# 그룹은 petal_code, 값은 petal width
sepalGrouped = iris.groupby(["slength_code","swidth_code"])
# 2개의 컬럼으로 그룹 형성 (Multi Index, Hierarchical Index)

# ※ 아래는 Hierarchical Index인 경우 적용 가능한 연산들
sepalGrouped["petal_code"].sum().unstack() # Index 하나를 컬럼으로 변환 (unstack ↔︎ stack)
sepalGrouped["petal_code"].sum().swaplevel() # Multi Index 순서 Swap
sepalGrouped["petal_code"].sum().sort_index(level=1) # level1 Index 기준으로 정렬
sepalGrouped["petal_code"].sum().sum(level=1) # level1 Index 기준으로 Sum 연산

Split 연산

grouped = df.groupby("Teams")
for name, group in grouped:
    print(name, group) # name-str, group-dataframe
# Groupby의 반환 값은 Split 형태의 Group들이다.

sepalGrouped.get_group("Russia") # Group Label로 해당 Group 불러옴

Apply 연산 유형
Aggregation(통계), Transformation(변환), Filtration(필터) 3가지로 나뉜다.

# Aggregation
sepalGrouped.agg(np.sum)
sepalGrouped.agg([np.sum, np.median, len])
sepalGrouped.agg({'petal length (cm)':sum,
                    'petal width (cm)':[sum, 'count', 'first', 'last']})
# Group 대상으로 연산 실행, 해당 연산을 Group에 반영

# Transformation
score = lambda x : (max(x))
sepalGrouped.transform(score)
# Group 대상으로 연산 실행, 해당 연산을 Element에 반영

# Filtration
sepalGrouped.filter(lambda x : len(x) < 10)
# Boolean 조건으로만 입력 가능
# Group 대상으로 연산 실행, 해당 연산에 해당되는 Group의 Element만 출력

Pivot Table

df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
...                          "bar", "bar", "bar", "bar"],
...                    "B": ["one", "one", "one", "two", "two",
...                          "one", "one", "two", "two"],
...                    "C": ["small", "large", "large", "small",
...                          "small", "large", "small", "small",
...                          "large"],
...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})

pd.pivot_table(df, values="D", index=["A","B"], columns="C", aggfunc=[np.average, len], fill_value=0)

Cross Tab

a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
b = np.array(["one", "one", "one", "two", "one", "one",
...               "one", "two", "two", "two", "one"], dtype=object)
c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
...               "shiny", "dull", "shiny", "shiny", "shiny"],
...              dtype=object)

pd.crosstab(a, [b,c])

Merge / Concat

df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'tak'],
...                     'value': [1, 2, 3, 5]})
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'tic'],
...                     'value': [5, 6, 7, 8]})

df1.merge(df2, left_on='lkey', right_on='rkey', how='inner', suffixes=('_left','_right'))

pd.concat([df1, df2], ignore_index=True, axis=1)

str Module

strSeries = df_str["A"]
strSeries.str[:2]
strSeries.str[:2].str.upper()
strSeries.str[:2].str.replace("fo","kk")

strSeries.str[:2].str.isdigit()
strSeries.str.zfill(5)

strSeries.str.contains('[a-z]{2}', regex=True)
strSeries.str.match('[a-z]{2}')
strSeries.str.split('o', expand=True)
strSeries.str.get_dummies()

Kyeongmin

개발자가 되고 싶은 공장장이🛠

이전 포스트

[TIL] 220214-220220

다음 포스트