# Quick structural overview of `df`
# (assumed to be a pandas DataFrame loaded earlier in the file — TODO confirm).
df.columns
# Number of columns.
len(df.columns)
df.info()
# Compare the total row count with the row count after dropping duplicate rows,
# then report the number of rows flagged as duplicates.
print(len(df))
print(len(df.drop_duplicates()))
print(f"Duplicated Rows: {df.duplicated().sum()}")
# Descriptive statistics; .T transposes the table and .round(0) rounds to 0 decimals.
df.describe()
df.describe().T
df.describe().round(0).T
import numpy as np
# Mean vs. median on two small lists that differ only in the last value (5 vs. 100).
np.mean([1, 2, 3, 4, 5])
# Result: 3.0
np.median([1, 2, 3, 4, 5])
# Result: 3.0
# With 100 as the last value the mean rises to 22.0 while the median stays at 3.0.
np.mean([1, 2, 3, 4, 100])
# Result: 22.0
np.median([1, 2, 3, 4, 100])
# Result: 3.0
# Descriptive statistics over all columns (include='all'),
# then the same table rounded to 0 decimals and transposed.
df.describe(include='all')
df.describe(include='all').round(0).T
# Number of distinct values per column.
df.nunique()
# Data info: one row per column of df with its distinct-value count,
# missing-value count and dtype, sorted by distinct-value count (descending).
pd.DataFrame({
'Unique' : df.nunique(),
'Null' : df.isna().sum(),
'Type' : df.dtypes
}).sort_values(by='Unique', ascending=False)
DataFrame을 특정 컬럼 값 기준으로 정렬: sort_values()
pd.DataFrame().sort_values(by='정렬 Column 명', ascending=True 또는 False)
# 오름차순: ascending=True
# 내림차순: ascending=False
DataFrame 일부 data drop(삭제)
pd.DataFrame().drop([Col1, Col2, ...],
axis=1,
inplace=True)
inplace 파라미터는 DataFrame에 생긴 변동사항을 원본 DataFrame에 바로 반영할 것인지의 여부를 정하는 것으로, False가 디폴트 (False이면 원본은 그대로 두고 변경된 새 DataFrame을 반환)
# Remove three columns from df; axis=1 targets columns and inplace=True modifies df itself.
df.drop(['StandardHours', 'Over18', 'EmployeeCount'], axis=1, inplace=True)
# Columns are a mix of int and object dtypes
# View only the object-dtype columns, then only the int64-dtype columns.
df.select_dtypes(include='object')
df.select_dtypes(include='int64')
# Counting per group: size() counts rows in each group, while count() and
# agg('count') count the non-null values of the selected column in each group.
# NOTE(review): 'Col1' and 'Col' look like placeholder column names — confirm they exist in df.
df.groupby('Col1').size()
df.groupby('Col1')['Col'].count()
df.groupby('Col1')['Col'].agg('count')
# NOTE(review): tmp0 is not defined in the visible part of the file —
# presumably a DataFrame created earlier; verify.
tmp0.groupby('Department')['Department'].agg('count')
# Group sizes per (Department, JobRole), then the same sorted from largest to smallest.
tmp0.groupby(['Department', 'JobRole']).size()
tmp0.groupby(['Department', 'JobRole']).size().sort_values(ascending=False)
# Same with four grouping keys; reset_index(name='Emp Count') turns the result
# into a DataFrame whose size column is named 'Emp Count'.
tmp0.groupby(['Department', 'JobRole', 'JobLevel', 'EducationField']).size().sort_values(ascending=False)
tmp0.groupby(['Department', 'JobRole', 'JobLevel', 'EducationField']).size().sort_values(ascending=False).reset_index(name='Emp Count')
# Take a copy of five columns of df so later work on tmp1 does not affect df.
tmp1 = df[['Department', 'JobRole', 'JobLevel', 'Age', 'Attrition']].copy()
tmp1
# Count, minimum and maximum of Age for each (Department, JobRole) group.
tmp1.groupby(['Department', 'JobRole'])['Age'].agg(['count', 'min', 'max'])
# Mean Age with (Department, Attrition) as the row index and JobLevel as the columns.
pd.pivot_table(tmp1,
index=['Department', 'Attrition'],
columns='JobLevel',
values='Age',
aggfunc='mean'
)
# Same pivot with the index levels ordered (Attrition, Department).
# NOTE(review): fill_value='' replaces missing means with empty strings, presumably
# for display; this likely leaves str and float values mixed in those columns — confirm intended.
pt1 = pd.pivot_table(tmp1,
index=['Attrition', 'Department'],
columns='JobLevel',
values='Age',
aggfunc='mean',
fill_value=''
)
pt1
pt1.index
# Mean Age pivot with a three-level row index (Attrition, Department, JobRole)
# and JobLevel as the columns; missing means are filled with ''.
pt2 = pd.pivot_table(tmp1,
index=['Attrition', 'Department', 'JobRole'],
columns='JobLevel',
values='Age',
aggfunc='mean',
fill_value=''
)
pt2
pt2.index
# 0: Attrition, 1: Department, 2: JobRole
pt2.index.names
pt2.index.names[-1]
pt2.index.names[1]
pt2.index.names[2]
# unstack() moves an index level into the columns:
# level=-1 is the last index level (JobRole), level=0 the first (Attrition).
pt2.unstack(level=-1)
pt2.unstack(level=0)
# stack() with no argument moves the innermost column level back into the index.
pt2.unstack(level=0).stack()
# stack(level=0) moves the outermost column level instead
# (presumably JobLevel here — TODO confirm against the displayed columns).
st1 = pt2.unstack(level=0).stack(level=0)
st1.columns