# Kaggle Link: https://www.kaggle.com/code/yoontaeklee/spacex-falcon-9-firsts-stage-landing-prediction
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# Allows us to split out data into training and testing data
from sklearn.model_selection import train_test_split
# Allows us to test parameters of classification algorithms and find the best one
from sklearn.model_selection import GridSearchCV
# Logistic Regression classification algorithm
from sklearn.linear_model import LogisticRegression
# Support Vector Machine classification algorithm
from sklearn.svm import SVC
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier
# Confusion matrix computation for model evaluation
from sklearn.metrics import confusion_matrix
# Load the SpaceX launch records dataset and inspect it.
df = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv")
df.head(10)
df.describe()
# Report the percentage of missing (NaN) values in each column.
# BUG FIX: the message construction was split across lines without
# continuation and the loop body was unindented — a syntax error.
for column in df.columns:
    pct_nan = 100 * df[column].isnull().sum() / df[column].shape[0]
    print('column: {:>10}\t Percent of NaN Value: {:.2f}%'.format(column, pct_nan))
# Visualize missingness as a matrix and as per-column bars.
msno.matrix(df=df.iloc[:, :], figsize=(7, 7), color=(0.3, 0.3, 0.3))
msno.bar(df=df.iloc[:, :], figsize=(7, 7), color=(0.3, 0.3, 0.3))
# Count each raw landing outcome string (e.g. 'True ASDS', 'False Ocean').
landing_outcomes = df.Outcome.value_counts()
print(landing_outcomes)
for i, outcome in enumerate(landing_outcomes.keys()):
    print(i, outcome)
# The outcomes at these positions are unsuccessful landings.
# NOTE(review): the indices depend on value_counts() ordering — verify
# against the printout above before trusting the labels.
bad_outcomes = set(landing_outcomes.keys()[[1, 3, 5, 6, 7]])
# Binary target: 0 = failed landing, 1 = successful landing.
# BUG FIX: the original loop body was unindented (syntax error); a
# comprehension expresses the same mapping directly.
landing_class = [0 if outcome in bad_outcomes else 1 for outcome in df['Outcome']]
df['Class'] = landing_class
df[['Class']].head(10)
# Payload mass vs. flight number, colored by landing success.
sns.catplot(data=df, x="FlightNumber", y="PayloadMass", hue="Class", aspect=5)
plt.xlabel("Flight Number", fontsize=22)
plt.ylabel("Pay load Mass (kg)", fontsize=22)
plt.show()

# Launch site vs. flight number, colored by landing success.
sns.catplot(data=df, x="FlightNumber", y="LaunchSite", hue="Class", aspect=5)
plt.xlabel("Flight Number", fontsize=22)
plt.ylabel("Launch Site", fontsize=22)
plt.show()

# Launch site vs. payload mass, colored by landing success.
sns.catplot(data=df, x="PayloadMass", y="LaunchSite", hue="Class", aspect=5)
plt.xlabel("Payload Mass (KG)", fontsize=22)
plt.ylabel("Launch Site", fontsize=22)
plt.show()

# Mean success rate per orbit type.
df.groupby("Orbit").mean()["Class"].plot(kind='bar')
plt.xlabel("Orbit Type", fontsize=20)
plt.ylabel("Success Rate", fontsize=20)
plt.show()
# ![](https://velog.velcdn.com/images/hsjunior1/post/0d70e847-5fa5-44b2-85f7-9d05ac3f74cd/image.png)
# Orbit type vs. flight number, colored by landing success.
sns.catplot(data=df, x="FlightNumber", y="Orbit", hue="Class", aspect=5)
plt.xlabel("Flight Number", fontsize=20)
plt.ylabel("Orbit Type", fontsize=20)
plt.show()
# ![](https://velog.velcdn.com/images/hsjunior1/post/ca320a63-f5f5-49e3-bc9e-98e3399cb33c/image.png)
# Orbit type vs. payload mass, colored by landing success.
sns.catplot(data=df, x="PayloadMass", y="Orbit", hue="Class", aspect=5)
plt.ylabel("Orbit type", fontsize=20)
plt.xlabel("Pay load (Kg)")
plt.show()
# ![](https://velog.velcdn.com/images/hsjunior1/post/e1b154d1-c09f-4e8a-9221-afd004c137c5/image.png)
def Extract_year(dates):
    """Return the year component ('YYYY') of each ISO-formatted date string.

    Parameters
    ----------
    dates : iterable of str
        Date strings of the form 'YYYY-MM-DD'.

    Returns
    -------
    list of str
        The year substring of each date, in order.
    """
    # BUG FIX: the original had `years.append(i,split("-")[0]` — a comma
    # instead of a dot and an unbalanced paren (syntax error). It also
    # ignored its argument, iterated the global `df`, and appended into a
    # module-level list, so repeated calls duplicated results. This
    # version is pure and uses its parameter.
    return [d.split("-")[0] for d in dates]
# Plot the yearly launch-success trend.
# BUG FIX: the sns.lineplot call was split across lines without
# parenthesis continuation in the original dump (syntax error).
df1 = pd.DataFrame(Extract_year(df["Date"]), columns=["year"])
df1["Class"] = df["Class"]
sns.lineplot(x=np.unique(Extract_year(df["Date"])),
             y=df1.groupby('year')['Class'].mean())
plt.xlabel("Year", fontsize=20)
plt.ylabel("Success Rate", fontsize=20)
plt.show()
# Select the modelling features.
features = df[['FlightNumber', 'PayloadMass', 'Orbit', 'LaunchSite', 'Flights',
               'GridFins', 'Reused', 'Legs', 'LandingPad', 'Block',
               'ReusedCount', 'Serial']]
features.head()
# One-hot encode the categorical columns.
features_one_hot = pd.get_dummies(features, columns=['Orbit', 'LaunchSite', 'LandingPad', 'Serial'])
features_one_hot.head()
# BUG FIX: astype returns a NEW frame — the original discarded the result,
# leaving the dtypes unchanged.
features_one_hot = features_one_hot.astype('float64')
Y = df['Class'].to_numpy()
Y
# Standardize features to zero mean / unit variance.
transform = preprocessing.StandardScaler()
X = transform.fit_transform(features_one_hot)
# Hold out 20% of the data for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
print('Train set: ', X_train.shape, Y_train.shape)
print('Test set: ', X_test.shape, Y_test.shape)
# Logistic Regression tuned with a 10-fold cross-validated grid search.
parameters = {'C': [0.01, 0.1, 1], 'penalty': ['l2'], 'solver': ['lbfgs']}
lr = LogisticRegression()
logreg_cv = GridSearchCV(lr, parameters, cv=10)
logreg_cv.fit(X_train, Y_train)
print('best parameters : ', logreg_cv.best_params_)
print('accuracy : ', logreg_cv.best_score_)
print('test set accuracy : ', logreg_cv.score(X_test, Y_test))
yhat = logreg_cv.predict(X_test)
# BUG FIX: `plot_confusion_matrix` was never defined or imported (and was
# removed from sklearn in 1.2); draw the confusion matrix explicitly.
cm = confusion_matrix(Y_test, yhat)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# Support Vector Machine tuned with a 10-fold cross-validated grid search.
# BUG FIX: the original kernel tuple listed 'rbf' twice, wasting a full
# grid slice (10 CV fits per duplicated C/gamma combination).
parameters = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
              'C': np.logspace(-3, 3, 5),
              'gamma': np.logspace(-3, 3, 5)}
svm = SVC()
svm_cv = GridSearchCV(svm, parameters, cv=10)
svm_cv.fit(X_train, Y_train)
print("best parameters : ", svm_cv.best_params_)
print("accuracy : ", svm_cv.best_score_)
print("test set accuracy : ", svm_cv.score(X_test, Y_test))
yhat = svm_cv.predict(X_test)
# BUG FIX: `plot_confusion_matrix` was never defined or imported; draw the
# confusion matrix explicitly.
cm = confusion_matrix(Y_test, yhat)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# Decision Tree tuned with a 10-fold cross-validated grid search.
# BUG FIX: max_features='auto' was removed in scikit-learn 1.3 (and for
# classifiers it was merely an alias of 'sqrt', so ['auto','sqrt'] also
# searched the same option twice). Search 'sqrt' and 'log2' instead.
parameters = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [2 * n for n in range(1, 10)],
              'max_features': ['sqrt', 'log2'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
tree = DecisionTreeClassifier()
tree_cv = GridSearchCV(tree, parameters, cv=10)
tree_cv.fit(X_train, Y_train)
print("best parameters : ", tree_cv.best_params_)
print("accuracy : ", tree_cv.best_score_)
print("test set accuracy : ", tree_cv.score(X_test, Y_test))
yhat = tree_cv.predict(X_test)
# BUG FIX: `plot_confusion_matrix` was never defined or imported; draw the
# confusion matrix explicitly.
cm = confusion_matrix(Y_test, yhat)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# K Nearest Neighbors tuned with a 10-fold cross-validated grid search.
parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              # p=1 Manhattan distance, p=2 Euclidean distance
              'p': [1, 2]}
KNN = KNeighborsClassifier()
knn_cv = GridSearchCV(KNN, parameters, cv=10)
knn_cv.fit(X_train, Y_train)
print("best parameters : ", knn_cv.best_params_)
print("accuracy : ", knn_cv.best_score_)
print("test set accuracy : ", knn_cv.score(X_test, Y_test))
yhat = knn_cv.predict(X_test)
# BUG FIX: `plot_confusion_matrix` was never defined or imported; draw the
# confusion matrix explicitly.
cm = confusion_matrix(Y_test, yhat)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()