# Check the distributions of the continuous variables.
continuous = train[['Rating', 'Reviews', 'Size', 'Installs', 'Price']]
continuous.hist(figsize=(10, 10), bins=30)
plt.show()

# Pairwise relationship plots for df.
sns.pairplot(df)

# Annotated heatmap of the correlation matrix of df.
plt.figure(figsize=(15, 15))
sns.heatmap(
    data=df.corr(),
    cmap='RdYlBu_r',
    linewidths=.5,
    fmt='.2f',
    annot=True,
)
# The darker the color, the stronger the correlation.
# One row of three panels, coloured by the KMeans cluster label of each row.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 7), sharex=True)
for panel, (x_col, y_col) in zip(
    axes, [('income', 'score'), ('score', 'age'), ('income', 'age')]
):
    sns.scatterplot(ax=panel, data=df, x=x_col, y=y_col, hue=df['KMeans_labels'])

# Separate figure: Installs on the x axis against Rating on the y axis.
plt.figure(figsize=(10, 5))
plt.scatter(continuous['Installs'], continuous['Rating'])
def remove_outlier(df, col, factor=1.5):
    """Count the entries of ``df[col]`` that lie outside the IQR fences.

    Despite its name, this function does not remove anything: ``df`` is left
    untouched and only the number of outlying entries is returned.

    A value counts as an outlier when it is below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame holding the column to inspect.
        col: Label of a numeric column in ``df``.
        factor: Multiplier applied to the IQR to place the fences. The
            default of 1.5 is the value that used to be hard-coded.

    Returns:
        The number of index entries whose value lies outside the fences.
    """
    data = df[col]
    # nanpercentile skips missing values. Plain np.percentile returns NaN as
    # soon as one NaN is present, which turns both fences into NaN, makes
    # every comparison below False and silently reports zero outliers.
    q25, q75 = np.nanpercentile(data.values, [25, 75])
    margin = (q75 - q25) * factor
    low, high = q25 - margin, q75 + margin
    outlier_index = data[(data < low) | (data > high)].index
    return len(outlier_index)
# Distinct values of the 'Category' column as a plain Python list.
b = train['Category'].unique().tolist()

# Replace missing 'low_Genres' values with an empty string. The result is
# assigned back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form acts on an intermediate object and does
# not update `train` when pandas Copy-on-Write is active.
train['low_Genres'] = train['low_Genres'].fillna('')

# NOTE: DataFrame.drop returns a new frame by default; these results are not
# assigned, so `df` itself keeps the 'one' and 'two' columns.
df.drop('two', axis=1)
df.drop(['one', 'two'], axis=1)

# Encode gender as integers; assigned back for the same reason as above.
df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1})

# One-hot encode the categorical columns.
cat_col = ['gender', 'martial', 'city', 'occup', 'pc1', 'pc2', 'pc3']
df = pd.get_dummies(df, columns=cat_col)