#############################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
#############################################
주어진 데이터를 바탕으로 회귀분석하시오.
a. 회귀식 세우기
b. 독립변수의 회귀계수 의미를 해석할 것.
* 이때 독립변수는 Avg. Area Income, Avg. Area House Age, Avg. Area Number of Rooms, Area Population 만 사용할 것.
* 종속변수는 Price
종속변수
독립변수
######### Run this cell as-is; do not modify it. ############
# Read the USA housing CSV from the GitHub URL below into `df`.
df = pd.read_csv('https://raw.githubusercontent.com/ericoh929/Machine-Learning/main/USA_Housing.csv')
# NOTE(review): self-assignment; it leaves the 'Price' column unchanged.
df['Price'] = df['Price']
# Bare expression; presumably here so the notebook displays the frame as the cell output.
df
#######################################################
Avg. Area Income | Avg. Area House Age | Avg. Area Number of Rooms | Avg. Area Number of Bedrooms | Area Population | Price | Address | |
---|---|---|---|---|---|---|---|
0 | 79545.458574 | 5.682861 | 7.009188 | 4.09 | 23086.800503 | 1.059034e+06 | 208 Michael Ferry Apt. 674\nLaurabury, NE 3701... |
1 | 79248.642455 | 6.002900 | 6.730821 | 3.09 | 40173.072174 | 1.505891e+06 | 188 Johnson Views Suite 079\nLake Kathleen, CA... |
2 | 61287.067179 | 5.865890 | 8.512727 | 5.13 | 36882.159400 | 1.058988e+06 | 9127 Elizabeth Stravenue\nDanieltown, WI 06482... |
3 | 63345.240046 | 7.188236 | 5.586729 | 3.26 | 34310.242831 | 1.260617e+06 | USS Barnett\nFPO AP 44820 |
4 | 59982.197226 | 5.040555 | 7.839388 | 4.23 | 26354.109472 | 6.309435e+05 | USNS Raymond\nFPO AE 09386 |
... | ... | ... | ... | ... | ... | ... | ... |
4995 | 60567.944140 | 7.830362 | 6.137356 | 3.46 | 22837.361035 | 1.060194e+06 | USNS Williams\nFPO AP 30153-7653 |
4996 | 78491.275435 | 6.999135 | 6.576763 | 4.02 | 25616.115489 | 1.482618e+06 | PSC 9258, Box 8489\nAPO AA 42991-3352 |
4997 | 63390.686886 | 7.250591 | 4.805081 | 2.13 | 33266.145490 | 1.030730e+06 | 4215 Tracy Garden Suite 076\nJoshualand, VA 01... |
4998 | 68001.331235 | 5.534388 | 7.130144 | 5.44 | 42625.620156 | 1.198657e+06 | USS Wallace\nFPO AE 73316 |
4999 | 65510.581804 | 5.992305 | 6.792336 | 4.07 | 46501.283803 | 1.298950e+06 | 37778 George Ridges Apt. 509\nEast Holly, NV 2... |
5000 rows × 7 columns
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
##############################################################
# Make the column names usable as formula identifiers:
# spaces become underscores and periods are removed.
df.columns = [name.replace(' ', '_').replace('.', '') for name in df.columns]

# Ordinary least squares: Price explained by the four requested predictors.
ols_predictors = [
    'Avg_Area_Income',
    'Avg_Area_House_Age',
    'Avg_Area_Number_of_Rooms',
    'Area_Population',
]
ols_formula = 'Price ~ ' + ' + '.join(ols_predictors)
model = smf.ols(ols_formula, data=df).fit()
model.summary()
##############################################################
Dep. Variable: | Price | R-squared: | 0.918 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.918 |
Method: | Least Squares | F-statistic: | 1.398e+04 |
Date: | Fri, 16 Jun 2023 | Prob (F-statistic): | 0.00 |
Time: | 07:46:15 | Log-Likelihood: | -64714. |
No. Observations: | 5000 | AIC: | 1.294e+05 |
Df Residuals: | 4995 | BIC: | 1.295e+05 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -2.638e+06 | 1.72e+04 | -153.726 | 0.000 | -2.67e+06 | -2.6e+06 |
Avg_Area_Income | 21.5827 | 0.134 | 160.743 | 0.000 | 21.320 | 21.846 |
Avg_Area_House_Age | 1.657e+05 | 1443.404 | 114.769 | 0.000 | 1.63e+05 | 1.68e+05 |
Avg_Area_Number_of_Rooms | 1.216e+05 | 1422.608 | 85.476 | 0.000 | 1.19e+05 | 1.24e+05 |
Area_Population | 15.1961 | 0.144 | 105.388 | 0.000 | 14.913 | 15.479 |
Omnibus: | 5.310 | Durbin-Watson: | 2.006 |
---|---|---|---|
Prob(Omnibus): | 0.070 | Jarque-Bera (JB): | 4.742 |
Skew: | 0.011 | Prob(JB): | 0.0934 |
Kurtosis: | 2.851 | Cond. No. | 9.40e+05 |
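a. $\widehat{\text{Price}} = -2.638\times10^{6} + 21.5827\cdot\text{Avg\_Area\_Income} + 1.657\times10^{5}\cdot\text{Avg\_Area\_House\_Age} + 1.216\times10^{5}\cdot\text{Avg\_Area\_Number\_of\_Rooms} + 15.1961\cdot\text{Area\_Population}$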
b. (아래 해석은 모두 다른 독립변수가 일정하다고 가정한 것이다.)
: 해당지역 평균수입이 1달러 증가하면, 집값은 21.5827 달러 증가한다.
: 해당지역 평균건물연령이 1년 증가하면, 집값은 약 165,700 (1.657e+05) 달러 증가한다.
: 해당지역 평균방의 갯수가 1개 증가하면, 집값은 약 121,600 (1.216e+05) 달러 증가한다.
: 해당지역 인구수가 1명 증가하면, 집값은 15.1961 달러 증가한다.
주어진 데이터를 바탕으로 회귀분석하시오.
a. 회귀식 세우기
b. 독립변수의 회귀계수 의미를 로짓변환 관점에서 해석할 것.
* 이때 독립변수는 Pclass, Sex, Age 만 사용할 것. (Sex 는 male : 0, female : 1 로 바꿔서 회귀분석 할 것.)
* 종속변수는 Survived
종속변수
독립변수
######### Run this cell as-is; do not modify it. ############
# Read the Titanic training CSV from the GitHub URL below into `df`
# (this rebinds the `df` name used earlier in the file).
df = pd.read_csv('https://raw.githubusercontent.com/ericoh929/Machine-Learning/main/titanic_train.csv')
# Bare expression; presumably here so the notebook displays the frame as the cell output.
df
######################################################
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
#####################################################
# Encode Sex as a numeric 0/1 dummy (male -> 0, female -> 1) in a single pass.
# pd.to_numeric makes the numeric dtype explicit instead of relying on
# replace() silently downcasting the object column, which pandas 2.2
# deprecates. A column left as object dtype would presumably be handled as a
# categorical term by the formula rather than as a single numeric predictor.
# Re-running this cell is safe: already-encoded 0/1 values pass through as-is.
df['Sex'] = pd.to_numeric(df['Sex'].replace({'male': 0, 'female': 1}))

# Logistic regression of survival on age, passenger class and sex.
model = smf.logit('Survived ~ Age + Pclass + Sex', data=df).fit()
print(model.summary())
#####################################################
Optimization terminated successfully.
Current function value: 0.453285
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: Survived No. Observations: 714
Model: Logit Df Residuals: 710
Method: MLE Df Model: 3
Date: Fri, 16 Jun 2023 Pseudo R-squ.: 0.3289
Time: 07:55:04 Log-Likelihood: -323.65
converged: True LL-Null: -482.26
Covariance Type: nonrobust LLR p-value: 1.860e-68
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.5339 0.456 5.554 0.000 1.640 3.428
Age -0.0369 0.008 -4.841 0.000 -0.052 -0.022
Pclass -1.2885 0.139 -9.253 0.000 -1.561 -1.016
Sex 2.5221 0.207 12.168 0.000 2.116 2.928
==============================================================================
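a. $\text{logit}(p) = \ln\dfrac{p}{1-p} = 2.5339 - 0.0369\cdot\text{Age} - 1.2885\cdot\text{Pclass} + 2.5221\cdot\text{Sex}$ (여기서 $p$ 는 생존 확률)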
b. : 승객 나이가 1세 증가하면, 생존할 확률의 로짓값이 0.0369 만큼 감소한다.
: 객실 등급이 1단계 낮아지면, 생존할 확률의 로짓값이 1.2885 만큼 감소한다.
: 여성은 남성에 비해, 생존할 확률의 로짓값이 2.5221 만큼 증가한다.
주어진 데이터를 바탕으로 푸아송 회귀분석하시오.
주어진 데이터는 아이들이 놀이터에서 놀때 하루동안 잃어버리는 물건의 수를 독립변수들과 함께 정리해 놓은 것이다. 이때 독립변수의 값은 알려져있지만, 각 변수의 명은 미지수로만 알려져있다.
a. 모델에 대한 가설검정을 유의수준 0.05에서 실시한 후, 무의미하다고 판단되는 회귀계수를 찾으시오.
b. 독립변수가 1단위 증가할 때, 종속변수 (하루동안 잃어버리는 물건 수) 가 감소하도록 하는 독립변수를 찾으시오.
**주의사항**
* 데이터가 두개가 있는데, 이를 합쳐서 활용해야함.
* 이때 독립변수는 f0~f15 를 모두 사용할 것.
* 종속변수는 loss (하루동안 잃어버리는 물건 수)
######### Run this cell as-is; do not modify it. ############
# Read the two partial datasets from the GitHub URLs below.
df1 = pd.read_csv('https://raw.githubusercontent.com/ericoh929/Machine-Learning/main/f_data1.csv') # dataset 1
df2 = pd.read_csv('https://raw.githubusercontent.com/ericoh929/Machine-Learning/main/f_data2.csv') # dataset 2
######################################################
# Stack the two partial datasets row-wise into a single frame.
df = pd.concat((df1, df2))

# Target: the 'loss' column. Predictors: every column except the
# 'Unnamed: 0' column and the target itself.
y = df['loss']
X = df.drop(columns=['Unnamed: 0', 'loss'])
X
f0 | f1 | f2 | f3 | f4 | f5 | f6 | f7 | f8 | f9 | f10 | f11 | f12 | f13 | f14 | f15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.002350 | 59 | 0.766739 | -1.350460 | 42.27270 | 16.68570 | 30.35990 | 1.267300 | 0.392007 | 1.09101 | 1.96874 | 1.874640 | 117.286 | 6.716200 | 0.985656 | 0.973428 |
1 | 0.784462 | 145 | -0.463845 | -0.530421 | 27324.90000 | 3.47545 | 160.49800 | 0.828007 | 3.735860 | 1.28138 | -2.73947 | -0.529506 | 157.670 | 0.696384 | 1.441230 | 0.159056 |
2 | 0.317816 | 19 | -0.432571 | -0.382644 | 1383.26000 | 19.71290 | 31.10260 | -0.515354 | 34.430800 | 1.24210 | 2.90180 | -0.960340 | 118.590 | 7.696420 | 1.488760 | 0.387277 |
3 | 0.210753 | 17 | -0.616454 | 0.946362 | -119.25300 | 4.08235 | 185.25700 | 1.383310 | -47.521400 | 1.09130 | -1.51200 | -1.292340 | 125.461 | 7.343230 | -3.092390 | 0.713795 |
4 | 0.439671 | 20 | 0.968126 | -0.092546 | 74.30200 | 12.30650 | 72.18600 | -0.233964 | 24.399100 | 1.10151 | 1.77348 | -0.546781 | 147.186 | 17.394300 | 0.964678 | 0.964894 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 0.923980 | 6 | 0.663212 | -0.055120 | 6.61768 | 1.26619 | 40.44790 | 0.852375 | 25.522400 | 1.12294 | -1.35367 | 0.518291 | 122.352 | 5.136830 | 1.377340 | 0.473856 |
99996 | 0.243556 | 7 | -0.557062 | 1.333470 | -54.88610 | 17.58310 | 212.96400 | -1.204750 | -128.385000 | 1.19334 | 0.47586 | 0.874639 | 122.067 | 6.227890 | 1.129860 | 0.144292 |
99997 | 0.046023 | 53 | 0.462863 | 0.704034 | 2062.94000 | 14.58160 | 11.42710 | -0.509812 | 80.818000 | 1.20383 | -2.01340 | -1.154930 | 157.777 | 1.685930 | 1.064220 | 0.285237 |
99998 | 0.977330 | 12 | -1.002880 | 0.576377 | 4741.16000 | 11.10090 | 3.81546 | 0.616191 | 118.902000 | 1.50424 | -1.67184 | 0.907019 | 159.786 | 10.562000 | -3.321150 | 0.285740 |
99999 | 0.244233 | 51 | 0.729304 | -0.702592 | 197.46200 | 19.02400 | 42.33370 | 0.465181 | -115.398000 | 1.13675 | 0.27846 | -0.575401 | 161.328 | 4.127680 | -2.012820 | 0.570032 |
250000 rows × 16 columns
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
# Build the right-hand side of the model formula by joining every predictor
# column name with ' + '. str.join places the separator only between names,
# so the last column no longer has to be hard-coded as 'f15'.
col_list = ' + '.join(X.columns)
col_list
'f0 + f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 + f12 + f13 + f14 + f15'
# Poisson regression of loss on every predictor named in col_list.
# The recorded run of this cell reported "converged: False" with the solver's
# default iteration cap, so allow more iterations before giving up.
model = smf.poisson('loss ~ {}'.format(col_list), data=df).fit(maxiter=200)

# Coefficients and p-values from a non-converged fit are not reliable, so make
# the situation explicit instead of leaving it buried in the summary table.
if not model.mle_retvals['converged']:
    print('WARNING: Poisson fit did not converge; interpret the summary with caution.')

model.summary()
Dep. Variable: | loss | No. Observations: | 250000 |
---|---|---|---|
Model: | Poisson | Df Residuals: | 249983 |
Method: | MLE | Df Model: | 16 |
Date: | Fri, 16 Jun 2023 | Pseudo R-squ.: | 0.001300 |
Time: | 07:54:45 | Log-Likelihood: | -1.4258e+06 |
converged: | False | LL-Null: | -1.4276e+06 |
Covariance Type: | nonrobust | LLR p-value: | 0.000 |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 1.6683 | 0.012 | 138.780 | 0.000 | 1.645 | 1.692 |
f0 | 0.0320 | 0.002 | 12.862 | 0.000 | 0.027 | 0.037 |
f1 | 0.0003 | 1.79e-05 | 18.047 | 0.000 | 0.000 | 0.000 |
f2 | -0.0039 | 0.001 | -6.764 | 0.000 | -0.005 | -0.003 |
f3 | -0.0281 | 0.001 | -29.021 | 0.000 | -0.030 | -0.026 |
f4 | 1.145e-06 | 1.25e-07 | 9.147 | 0.000 | 9e-07 | 1.39e-06 |
f5 | 0.0019 | 0.000 | 13.799 | 0.000 | 0.002 | 0.002 |
f6 | 3.595e-05 | 4.16e-06 | 8.644 | 0.000 | 2.78e-05 | 4.41e-05 |
f7 | 0.0011 | 0.001 | 1.218 | 0.223 | -0.001 | 0.003 |
f8 | 9.781e-05 | 7.69e-06 | 12.718 | 0.000 | 8.27e-05 | 0.000 |
f9 | 0.0744 | 0.008 | 9.716 | 0.000 | 0.059 | 0.089 |
f10 | -0.0013 | 0.000 | -4.225 | 0.000 | -0.002 | -0.001 |
f11 | -0.0081 | 0.001 | -9.884 | 0.000 | -0.010 | -0.006 |
f12 | 0.0005 | 5.31e-05 | 8.773 | 0.000 | 0.000 | 0.001 |
f13 | 0.0054 | 0.000 | 39.122 | 0.000 | 0.005 | 0.006 |
f14 | -0.0027 | 0.000 | -5.745 | 0.000 | -0.004 | -0.002 |
f15 | -0.0028 | 0.001 | -2.275 | 0.023 | -0.005 | -0.000 |
a. f7
b. f2, f3, f10, f11, f14, f15