기본 함수 및 기능 1개만 가지고도 간단하게 처리할 수 있는 문제 : )
# Load the Iris dataset from the UCI repository.
# pandas is imported here because this loader needs `pd` and no earlier
# import is visible in the file.
import pandas as pd

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']
# header=None: the first line is read as data; `columns` supplies the names.
iris = pd.read_csv(url, header=None, names=columns)
# Load the Iris dataset and expose it as `df` for the exercises below.
# (This cell was collapsed onto one line behind a leading `#`, so none of
# its statements executed; they are restored here.)
import pandas as pd

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']
# header=None: the first line is read as data; `columns` supplies the names.
iris = pd.read_csv(url, header=None, names=columns)

# NOTE: `df` is an alias of `iris`, not a copy - columns added to `df`
# later also appear on `iris`.
df = iris
df
+)
# Alternative: load the same dataset from the seaborn library.
import pandas as pd
import seaborn as sns

df = sns.load_dataset('iris')

# seaborn's copy uses snake_case column names and bare species labels
# ('setosa', ...). Rename both to the UCI naming so the cells below, which
# index 'Sepal Length' and compare against 'Iris-setosa', work with either
# loader.
df = df.rename(columns={
    'sepal_length': 'Sepal Length',
    'sepal_width': 'Sepal Width',
    'petal_length': 'Petal Length',
    'petal_width': 'Petal Width',
    'species': 'Species',
})
df['Species'] = 'Iris-' + df['Species'].astype(str)
df
# Preview the first rows (5 is head()'s default).
df.head(n=5)
# Print the column dtypes and non-null counts.
df.info()
# Summary statistics of the DataFrame.
df.describe()
# Select a single column as a Series.
df.loc[:, 'Sepal Length']
# Keep only the rows whose species is Iris-setosa and show the first three.
is_setosa = df['Species'].eq('Iris-setosa')
dfsetosa = df.loc[is_setosa]
dfsetosa.head(3)
# Mean sepal length per species, as a one-column DataFrame.
df.groupby('Species')['Sepal Length'].mean().to_frame()
# Add a derived column: sepal length plus sepal width, row by row.
df['Sepal Sum'] = df['Sepal Length'].add(df['Sepal Width'])
df
# Rows ordered by petal length, longest first.
df.sort_values(by='Petal Length', ascending=False)
# Non-null count of every column within each species.
df.groupby('Species').count()
# Number of rows per species.
df.value_counts(subset='Species')
두 개 이상의 함수 및 기능을 사용하거나 데이터 분석하는 관점에서 접근해야 하는 문제
# Overall means of sepal and petal length, shown side by side in a
# single-row table labelled 'value'.
sepal_length_mean = df['Sepal Length'].mean()
petal_length_mean = df['Petal Length'].mean()
mean_data = pd.DataFrame(
    [[sepal_length_mean, petal_length_mean]],
    index=['value'],
    columns=['Sepal Length Mean', 'Petal Length Mean'],
)
mean_data
문제) 각 품종별 'Sepal Length'와 'Petal Length'의 평균을 피벗 테이블로 만들어 보세요.
aggfunc =
# Pivot table: one row per species, mean of the two length columns.
pivot_table = pd.pivot_table(
    df,
    values=['Sepal Length', 'Petal Length'],
    index='Species',
    aggfunc='mean',
)
print(pivot_table)
# Same table built by hand: compute each per-species mean separately, then
# join the two one-column frames side by side on the species index.
species_groups = df.groupby('Species')
sepal_mean = species_groups['Sepal Length'].mean().to_frame()
petal_mean = species_groups['Petal Length'].mean().to_frame()
mean_df = pd.concat([sepal_mean, petal_mean], axis=1)
mean_df
import numpy as np


def _with_missing_sepal_width(frame, n_missing=10):
    """Return a copy of *frame* with 'Sepal Width' set to NaN in random rows.

    Args:
        frame: DataFrame containing a 'Sepal Width' column.
        n_missing: Number of distinct row labels to blank out.

    Returns:
        A new DataFrame; *frame* itself is not modified.

    Raises:
        ValueError: If *n_missing* exceeds the number of rows (raised by
            ``np.random.choice`` when sampling without replacement).
    """
    damaged = frame.copy()
    # replace=False: the default samples with replacement, which can pick the
    # same row twice and leave fewer than `n_missing` NaNs.
    rows = np.random.choice(damaged.index, n_missing, replace=False)
    damaged.loc[rows, 'Sepal Width'] = np.nan
    return damaged


# Randomly add 10 missing values.
iris_with_nan = _with_missing_sepal_width(iris)
dfn = iris_with_nan

# Missing-value handling 1 (drop the affected rows).
print(dfn.isnull().sum())
# List form of `subset` also works on pandas versions that predate
# single-label support.
dfn = dfn.dropna(subset=['Sepal Width'])
print(dfn.isnull().sum())

# Randomly add 10 missing values again, starting from the clean data.
iris_with_nan = _with_missing_sepal_width(iris)
dfn = iris_with_nan

# Missing-value handling 2 (replace with 0).
print(dfn.isnull().sum())
dfn = dfn.fillna(0)
print(dfn.isnull().sum())
# Add a derived column: sepal length divided by sepal width.
df['Sepal Ratio'] = df['Sepal Length'].div(df['Sepal Width'])
df
# Label each flower 'Large' when its sepal length is at least 5.0,
# otherwise 'Small'.
is_large = df['Sepal Length'].ge(5.0)
df['Sepal Size'] = is_large.map({True: 'Large', False: 'Small'}).tolist()
df
# Compute each statistic separately, then combine them.
# One single-column frame per (column, statistic) pair, named
# '<column> Sum' / '<column> Mean' / '<column> Std', joined on species.
by_species = df.groupby('Species')
stat_frames = []
for column in ('Sepal Length', 'Sepal Width'):
    for stat in ('sum', 'mean', 'std'):
        label = f'{column} {stat.capitalize()}'
        stat_frames.append(by_species[column].agg(stat).to_frame(name=label))

describe_df = pd.concat(stat_frames, axis=1)
describe_df
# Get all the statistics in one go with the agg method.
stats = ['sum', 'mean', 'std']
describedf = df.groupby('Species').agg(
    dict.fromkeys(['Sepal Length', 'Sepal Width'], stats)
)
describedf
# Petal length + width, computed only for rows with sepal length above 5.0
# and sepal width at most 3.5.
is_long = df['Sepal Length'].gt(5.0)
is_narrow = df['Sepal Width'].le(3.5)
con = df.loc[is_long & is_narrow]
# The sum is indexed by the filtered rows only; assigning it to `df` aligns
# on the index, so rows outside `con` end up as NaN.
df['Petal Sum'] = con['Petal Length'].add(con['Petal Width'])
df
import matplotlib.pyplot as plt

# Split by species: [Iris-setosa, Iris-versicolor, Iris-virginica]
setosa = df[df['Species'] == 'Iris-setosa']
versicolor = df[df['Species'] == 'Iris-versicolor']
virginica = df[df['Species'] == 'Iris-virginica']

# Scatter plot of sepal length against sepal width, one colour per species.
for subset, colour, label in (
    (setosa, 'black', 'Setosa'),
    (versicolor, 'blue', 'Versicolor'),
    (virginica, 'red', 'Virginica'),
):
    plt.scatter(subset['Sepal Length'], subset['Sepal Width'],
                color=colour, label=label)

plt.title('Sepal Length : Width by Species')
plt.xlabel('Sepal Length')  # was misspelled 'Setal Length'
plt.ylabel('Sepal Width')
plt.legend()
plt.show()
# Same scatter plot using seaborn's scatterplot.
# Passing hue='<column>' draws each unique value of that column differently.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.scatterplot(
    data=df,
    x='Sepal Length',
    y='Sepal Width',
    hue='Species',
    palette='rocket',
)
plt.title('Sepal Length : Width by Species')
plt.xlabel('Sepal Length')  # was misspelled 'Sepal Lenth'
plt.ylabel('Sepal Width')
plt.legend(title='Species')
plt.show()
# Colour assigned to each species.
color = {
    'Iris-setosa': 'black',
    'Iris-versicolor': 'blue',
    'Iris-virginica': 'red',
}

# Histogram of sepal length: one series per species on the same axes.
for species_name, species_rows in df.groupby('Species'):
    plt.hist(
        species_rows['Sepal Length'],
        label=species_name,
        color=color[species_name],
    )

plt.title('Distribution of Sepal Length')
plt.xlabel('Sepal Length')
plt.ylabel('Frequency')
plt.legend(title='Species')
plt.show()
# Box plot of petal length, one box per species.
df.boxplot(column=['Petal Length'], by='Species')
plt.title('Petal Length Distribution by Species')
plt.xlabel('Species')
plt.ylabel('Petal Length')
plt.show()