파이썬 데이터 핸들링 기본
1. Getting & Knowing Data
# Section 1 walkthrough: basic inspection of a DataFrame.
# NOTE(review): `df` is used here before the read_csv call further down, so it
# is presumably loaded earlier (outside this excerpt) — confirm.
df.head()
print(df.shape)
df.columns
df.columns[5]  # label of the 6th column (positional index 5)
df.iloc[:, 5].dtype  # dtype of that same column
df.index
df.iloc[2, 5]  # value at 3rd row, 6th column

# Rebind `df` to the CSV at DriveUrl, decoded as EUC-KR.
df = pd.read_csv(DriveUrl, encoding='euc-kr')
df.tail(3)

# Column labels split by dtype. The attribute is `.columns`; `.column` would be
# treated as a lookup of a column literally named "column" and raise
# AttributeError.
df.select_dtypes(exclude=object).columns
df.select_dtypes(include=object).columns

df.isnull().sum()  # missing-value count per column
df.info()
df.describe()
df['거주인구']

# Interquartile range (Q3 - Q1) of the '평균속도' column.
df['평균속도'].quantile(0.75) - df['평균속도'].quantile(0.25)

df['읍면동명'].nunique()
df['읍면동명'].unique()
2. Filtering & Sorting
# Section 2 walkthrough: boolean filtering, sorting and in-place updates.
df.loc[df['quantity'] == 3].head().reset_index(drop=True)
df2 = df[['quantity', 'item_price']]

# Drop the first character of item_price and parse the remainder as float.
# NOTE(review): the result is discarded here, yet the lines below read
# `df.new_price` — presumably this was meant to be stored as df['new_price'];
# confirm against the original notebook.
df['item_price'].str[1:].astype('float')

df[(df.new_price <= 9) & (df.item_name == 'Chicken Salad Bowl')].head()
df.sort_values('new_price').reset_index(drop=True).head()
df.loc[df.item_name.str.contains('Chips')].head()
df.iloc[:, ::2]  # every second column, starting with the first

# Sort key is 'new_price', the column every other statement in this section
# uses; the misspelled 'new_proce' would raise KeyError.
df.sort_values('new_price', ascending=False).reset_index(drop=True)

# NOTE(review): this line was cut off after 'item_name in the source; the call
# is closed here with no further arguments — confirm nothing else was lost.
df.loc[(df.item_name == 'Steak Salad') | (df.item_name == 'Bowl')].drop_duplicates('item_name')

df.loc[df.new_price >= df.new_price.mean()]
df.loc[df.item_name == 'Izze', 'item_name'] = 'Fizzy Lizzy'  # rename in place
df.loc[~df.choice_description.str.contains('Vegetables')]
df[df.item_name.str.startswith('N')]
df.loc[df.new_price.isin(lst)]  # `lst` is defined outside this excerpt
3. Grouping
# Section 3 walkthrough: value counts and group-wise aggregation.

# Occurrences of each host_name, ordered by index label.
df['host_name'].value_counts().sort_index()

# Group sizes per (neighbourhood_group, neighbourhood), keys kept as columns.
df.groupby(by=['neighbourhood_group', 'neighbourhood'], as_index=False).size()

# Mean / variance / max / min of price within each neighbourhood_group.
df.groupby('neighbourhood_group').price.aggregate(['mean', 'var', 'max', 'min'])

# Mean price per (neighbourhood, neighbourhood_group), pivoted so the second
# key becomes columns; missing combinations are filled with -999.
(
    df.groupby(['neighbourhood', 'neighbourhood_group'])['price']
    .mean()
    .unstack()
    .fillna(-999)
)

# Divide every row of Ans by its own row sum, writing the result back in place.
Ans.loc[:, :] = Ans.values / (
    Ans.sum(axis=1).values.reshape(-1, 1)
)
4. Apply, Map
map은 Series에만, apply는 Series와 DataFrame 모두에 적용 가능
# Lookup table from an Income_Category label to its single-letter code.
dic = {
    'Unknown': 'N',
    'Less than $40K': 'a',
    '$40K - $60K': 'b',
    '$60K - $80K': 'c',
    '$80K - $120K': 'd',
    '$120K +': 'e',
}
# Translate each Income_Category label through `dic`; a label that is not a
# key of `dic` raises KeyError.
df['newIncome'] = df['Income_Category'].map(lambda label: dic[label])
def changeCategory(x):
    """Return the single-letter code for an income-category label.

    Args:
        x: The category label to translate.

    Returns:
        'N' for 'Unknown', 'a' through 'e' for the five income brackets in
        ascending order, and None for any value that matches none of them.
    """
    if x == 'Unknown':
        return 'N'
    elif x == 'Less than $40K':
        return 'a'
    elif x == '$40K - $60K':
        return 'b'
    elif x == '$60K - $80K':
        return 'c'
    elif x == '$80K - $120K':
        return 'd'
    elif x == '$120K +':
        return 'e'
    # Unrecognised labels fall through; make the None result explicit.
    return None
# Same recoding as the dict-based map, done through the function with apply.
df['newIncome'] = df.Income_Category.apply(changeCategory)

# Flag each Education_Level as 1 when it contains 'Graduate', else 0, then
# count the flags. value_counts must be called; without the parentheses the
# expression only yields the bound method.
df.Education_Level.map(lambda x: 1 if 'Graduate' in x else 0).value_counts()

# Vectorised form of the same 1/0 flag, returned as a NumPy array.
np.where(df.Education_Level.str.contains('Graduate'), 1, 0)