250507 TIL

수이·2025년 5월 7일

Data_6기 tableau 내일배움캠프 머신러닝

🟡 TIL

목록 보기

54/60

개인스터디

통계 + 머신러닝 개인과제 해설

🔗 개인과제

필수 1. pandas 응용 🟢

풀이

import pandas as pd

# Load the course dataset from the local project directory.
csv_path = r"C:\Users\1234\PycharmProjects\sparta_python\statistics\statistics.csv"
df = pd.read_csv(csv_path)

# Per category: how many 'Customer ID' entries there are and the total
# purchase amount.
category_aggs = {
    'Customer ID': 'count',
    'Purchase Amount (USD)': 'sum',
}
grouped = df.groupby('Category').agg(category_aggs)

# Turn the 'Category' index back into a regular column.
df2 = grouped.reset_index()

df2

필수 2. pandas 응용 🟢

풀이

import pandas as pd

# Running (cumulative) total of the purchase amount, in row order.
amount_col = 'Purchase Amount (USD)'
running_col = 'Purchase Amount (USD)_누적'
df[running_col] = df[amount_col].expanding().sum()

# Show the running total next to the original amount for comparison.
df[[running_col, amount_col]]

필수 3. 기초통계 🟢

풀이

import pandas as pd

# Mean and median review rating per gender, rounded to two decimals.
rating_by_gender = df.groupby('Gender')['Review Rating']
rating_by_gender.agg(['mean', 'median']).round(2)
# Interpretation (values observed in the author's run)
# Female: mean 3.74, median 3.7 / Male: mean 3.75, median 3.8
# The two genders show similar rating distributions.

튜터님은 reset_index()까지 추가로 넣으셨는데, 넣지 않아도 상관없다고 하심!

필수 4. 통계적 가설검정 🟢

풀이

import pandas as pd
import scipy.stats as stats

# Hypotheses
# H0: the mean review rating does not differ between the two genders
# H1: the mean review rating differs between the two genders

# Split the data by gender
mask_m = df['Gender'].eq('Male')
mask_f = df['Gender'].eq('Female')
male, female = df[mask_m], df[mask_f]

# Sanity check of the two subsets (display is the notebook helper)
display(male, female)

# Independent two-sample t-test assuming equal variances.
# Female is passed first, so the sign of t is (female mean - male mean).
female_ratings = female['Review Rating']
male_ratings = male['Review Rating']
t, p_value = stats.ttest_ind(female_ratings, male_ratings, equal_var=True)
print(f"t : {t:.4f}")
print(f"p-value : {p_value:.4f}")

# Interpretation (values observed in the author's run)
# t : -0.5097 , p-value : 0.6103
# t < 0, so the male mean rating is the larger one
# p-value > 0.05, so the difference is not statistically significant
# Conclusion: we fail to reject (i.e. retain) the null hypothesis
# H0: no difference in mean rating between genders > retained
# H1: a difference in mean rating between genders > not supported

튜터님은 아예 컬럼 선택으로 Review Rating만 가져오셨는데, 이렇게 해도 문제 없을 듯!

# Keep only the rating column (as a one-column DataFrame) in each frame.
# NOTE(review): m_df / f_df are presumably the tutor's male / female
# subsets; they are not defined in this post.
rating_only = ['Review Rating']
m_df = m_df[rating_only]
f_df = f_df[rating_only]

필수 5. 통계적 가설검정 🟢

풀이

import pandas as pd
from scipy.stats import chi2_contingency

# Hypotheses
# H0: Color and Season are unrelated (independent)
# H1: Color and Season are related

# Inspect the raw counts per (Color, Season) pair
df.groupby(['Color', 'Season'])['Customer ID'].count().reset_index()

# Contingency table: rows = Color, columns = Season, cells = row counts.
result = pd.crosstab(df['Color'], df['Season'])  # ✅ key step

# Sanity check
result

# Chi-square test of independence on the contingency table
chi2, p, dof, expected = chi2_contingency(result)
print(f"카이제곱 통계량: {chi2:.3f}, p-value : {p:.3f}, 자유도: {dof}")

# Interpretation (values observed in the author's run)
# chi-square statistic: 64.651, p-value: 0.719, degrees of freedom: 72
# p-value > 0.05, so the result is not statistically significant
# Conclusion: we fail to reject (i.e. retain) the null hypothesis
# H0: Color and Season are unrelated > retained
# H1: Color and Season are related > not supported
# The data give no evidence that the two variables are dependent

5️⃣ 해설

빈도표 만드는 다른 방법 !

# Count rows per (Color, Season) pair, then pivot Season into columns;
# combinations that never occur are filled with 0.
pair_counts = df.groupby(['Color', 'Season']).size()
frequency_table = pair_counts.unstack(fill_value=0)

필수 6. 머신러닝 🟢

풀이

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# The feature is reshaped to 2-D (one column) for scikit-learn;
# the target stays 1-D.
X = np.array([10, 20, 30, 40, 60, 100]).reshape(-1, 1)  # ad budget
Y = np.array([50, 60, 70, 80, 90, 120])                 # daily sales

# Declare and fit the model in one step (fit returns the estimator itself)
model = LinearRegression().fit(X, Y)

# Slope and intercept of the fitted line
coef, intercept = model.coef_[0], model.intercept_
print(f"회귀식 : y = {coef:.3f}x + {intercept:.3f}") # y = 0.756x + 45.562

# Predict sales for an ad budget of 1000 (units of 10,000 KRW) by
# evaluating the regression equation by hand.
# NOTE: 1000 lies far outside the fitted budget range (10-100), so this
# is an extrapolation.
budget = 1000

sales = intercept + budget * coef
print(f"매출 : {sales:.0f}만원") # 802 (x 10,000 KRW)

# Interpretation
# Ad budget and daily sales move together (positive slope)
# Each extra 10,000 KRW of ad budget adds about 0.756 x 10,000 KRW of daily sales
# With an ad budget of 1000, the predicted sales are about 802 (x 10,000 KRW)
# So ad spending helps increase sales

6️⃣ 해설

# Prediction
# predict takes a 2-D array (samples x features), hence the nested [[1000]].
# The original line was missing its final closing parenthesis.
predicted_sales = model.predict(np.array([[1000]]))

예측을 나는 수기로 했고, 튜터님은 predict 사용

도전 1. 머신러닝 🟢

풀이

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Missing-value check
df.isna().sum() # none

df.shape # (3900, 19)

# Feature selection
feature_cols = ['Review Rating', 'Age', 'Previous Purchases']
X = df[feature_cols]

# Encode the target labels as integers
encoder = LabelEncoder()
y = encoder.fit_transform(df['Discount Applied'])

# 70/30 train/test split, stratified on the target, with a fixed seed
split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Sanity check of the split sizes
print(X_train.shape) # (2730, 3)
print(y_test.shape) # (1170,)

# Train the random forest (fit returns the estimator itself)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the test features
y_pred_rf = rf_model.predict(X_test)

# Evaluate: compute the metrics first, then print them
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("🌳 Random Forest 🌳")
print("Accuracy : ", rf_accuracy)
print("\n Confusion Matrix : \n", rf_confusion)
print("\n Classfication Report : \n", rf_report)

# Interpretation (values observed in the author's run)
# Accuracy is about 51.1% > not a well-performing model
# 0 (No): precision 0.56, recall 0.65 > acceptable for the No class
# 1 (Yes): precision 0.41, recall 0.32 > poor for the Yes class
# The model fails to properly identify customers who actually got a discount

7️⃣ 해설

# Accuracy of the fitted classifier on (X_test, y_test), obtained directly
# from score() without storing the predictions in a separate variable.
accuracy = rf_model.score(X_test, y_test)

예측값은 따로 저장하지 않고 정확도만 계산하심

도전 2. 머신러닝 🟢

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

# Feature selection
X = df[['Age', 'Purchase Amount (USD)', 'Review Rating']]

# Encode the target labels as integers.
# LabelEncoder sorts the labels, so integer i stands for encoder.classes_[i].
encoder = LabelEncoder()
y = encoder.fit_transform(df['Subscription Status'])

# 70/30 train/test split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

# Sanity check of the split sizes
print(X_train.shape) # (2730, 3)
print(y_test.shape) # (1170,)

# Train the logistic regression
logi_model = LogisticRegression()
logi_model.fit(X_train, y_train)

# Predicted classes and per-class probabilities on the test features
y_pred_logi = logi_model.predict(X_test)
y_proba_logi = logi_model.predict_proba(X_test)

# New customer: [age, purchase amount, review rating]
new_customer = [[30, 50, 4.0]]
# The model was fitted on a DataFrame, so pass the new row with the same
# column names; a bare list triggers scikit-learn's feature-name warning.
new_customer_df = pd.DataFrame(new_customer, columns=X.columns)

# Class probabilities for the new customer
proba = logi_model.predict_proba(new_customer_df)
print(f"🧑‍🦲 고객정보 \n고객 나이 : {new_customer[0][0]} / 구매 금액 : {new_customer[0][1]} / 리뷰 평점 : {new_customer[0][2]}")
# The predict_proba columns follow the encoded class order, so take each
# label from encoder.classes_ rather than hard-coding a position.
# Observed in the author's run: 0.73 for column 0, 0.27 for column 1.
for label, class_proba in zip(encoder.classes_, proba[0]):
    print(f"- Subscription Status = {label} 확률 : {class_proba * 100:.2f}%")

# AUC uses the probability of the positive class (encoded as 1)
auc_score = roc_auc_score(y_test, y_proba_logi[:, 1])


# Evaluation
print("\n🌹 Logistic Regression 🌹")
print("Confusion Matrix :\n", confusion_matrix(y_test,y_pred_logi))
print("\n Accuracy : ", accuracy_score(y_test, y_pred_logi))
print("\n Classification Report :\n ", classification_report(y_test, y_pred_logi))
print("\n ROC - AUC score : ", auc_score)

# Interpretation (values observed in the author's run)
# The new customer gets about 73% for encoder.classes_[0] and about 27%
# for encoder.classes_[1].
# NOTE(review): if the column holds 'No'/'Yes', class 0 is 'No', so ~73% is
# the probability of NOT being subscribed - confirm with encoder.classes_.
# Accuracy is about 73%, but the model reportedly did not identify a single
# customer of the other (presumably minority, class 1) group, so it is not
# a good model.
# ROC-AUC is about 0.5, i.e. close to random guessing.