먼저, 위 데이터를 가지고 기본적인 연산을 수행해보자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.set_printoptions(precision=5, suppress=True) # print floats with at most 5 decimal places, without scientific notation
# NOTE(review): the filename literal below looks mis-encoded (mojibake) in this
# copy of the file; it presumably should read
# '성_및_연령별_1인가구__시군구_20230315141048.csv' -- confirm against the file on disk.
filename = '์ฑ_๋ฐ_์ฐ๋ น๋ณ_1์ธ๊ฐ๊ตฌ__์๊ตฐ๊ตฌ_20230315141048.csv'
# The CSV is decoded as cp949 and converted to a NumPy array.
np_data = pd.read_csv(filename,encoding='cp949').to_numpy()
print(np_data)
# Recorded output; each row is (region, category, year, count, count).
# The two count columns are used further down as the men's and women's series.
# [['부산광역시' '합계' 2015 164617 197132]
# ['부산광역시' '합계' 2016 168035 204377]
# ['부산광역시' '합계' 2017 176932 211967]
# ['부산광역시' '합계' 2018 183579 220829]
# ['부산광역시' '합계' 2019 191796 231431]
# ['부산광역시' '합계' 2020 206311 248896]
# ['부산광역시' '합계' 2021 222040 265322]]
# Drop the two leading text columns and keep everything from the year column
# onward, converted to a 64-bit integer array.
sub_data = np.asarray(np_data[:, 2:], dtype=np.int64)
print(sub_data)
# Recorded output (year, count, count):
# [[  2015 164617 197132]
#  [  2016 168035 204377]
#  [  2017 176932 211967]
#  [  2018 183579 220829]
#  [  2019 191796 231431]
#  [  2020 206311 248896]
#  [  2021 222040 265322]]
# Average of each count column over all rows:
# column 1 feeds the men's mean, column 2 the women's mean.
man_mean = sub_data[:, 1].mean()
woman_mean = sub_data[:, 2].mean()
print(man_mean)    # 187615.7142857143
print(woman_mean)  # 225707.7142857143
# Per-year deviation of each group's count from that group's own mean.
# Despite the `_std` suffix these are deviations (value - mean), not standard
# deviations; the names are kept unchanged.
#
# Fixes relative to the earlier version:
#   * the counts live in `sub_data`; `np_data[:, 1]` is the text category
#     column and `np_data[:, 2]` is the year, so subtracting a mean from them
#     is either an error or meaningless;
#   * the women's deviation must be taken from `woman_mean`, not `man_mean`.
man_std = sub_data[:, 1] - man_mean
woman_std = sub_data[:, 2] - woman_mean
print(man_std)
# [-22998.71429 -19580.71429 -10683.71429  -4036.71429   4180.28571
#   18695.28571  34424.28571]
print(woman_std)
# [-28575.71429 -21330.71429 -13740.71429  -4878.71429   5723.28571
#   23188.28571  39614.28571]
# Pearson correlation matrix of the two count series. Each row of the stacked
# array is one variable (row 0: column 1 of sub_data, row 1: column 2).
corr = np.corrcoef(np.vstack((sub_data[:, 1], sub_data[:, 2])))
print(corr)
# [[1.     0.9987]
#  [0.9987 1.    ]]
상관계수가 0.9987이므로 여자 1인가구수 증가 --> 남자 1인 가구수도 증가한다고 볼 수 있을 것이다.
.
# Row-wise total of the count columns (everything after the year column).
year_sum = sub_data[:, 1:].sum(axis=1)
print(year_sum)
# [361749 372412 388899 404408 423227 455207 487362]

# Share of each year's total contributed by each count column.
man_per = sub_data[:, 1] / year_sum
woman_per = sub_data[:, 2] / year_sum
print(man_per)    # [0.45506 0.45121 0.45496 0.45395 0.45318 0.45322 0.4556 ]
print(woman_per)  # [0.54494 0.54879 0.54504 0.54605 0.54682 0.54678 0.5444 ]
.
.
.
.
: 두 개 그룹의 평균값 비교
: stats.ttest_ind(a,b)
from scipy import stats
print(sub_data)
# Recorded output (year, count, count):
# [[  2015 164617 197132]
#  [  2016 168035 204377]
#  [  2017 176932 211967]
#  [  2018 183579 220829]
#  [  2019 191796 231431]
#  [  2020 206311 248896]
#  [  2021 222040 265322]]

# Column 1 is used as the men's sample, column 2 as the women's sample.
man, woman = sub_data[:, 1], sub_data[:, 2]

# Independent two-sample t-test. equal_var=True is SciPy's default and is
# spelled out here to make the equal-variance assumption visible.
stats.ttest_ind(man, woman, equal_var=True)
p-value = 0.0086 < 0.05 이므로, "귀무가설 기각"
» 즉, 두 집단의 평균의 차이는 통계적으로 유의하다고 볼 수 있다. (p-value < 0.05)
.
.
.
๋จ, t-test ๋ฅผ ์ํํ๊ธฐ ์ , 2๊ฐ์ง ์กฐ๊ฑด์ ํ์ธํด์ผํ๋ค.
๊ฐ ์ํ์ ๋ชจ์ง๋จ์ normal distribution์ ๋ฐ๋ฅธ๋ค.
(ํ์ค์ ๊ท๋ถํฌ ๊ฒ์ : shapiro, anderson, kstest, q-qplot )
๊ฐ ์ํ์ ๋ชจ์ง๋จ์ ๋ถ์ฐ์ ๊ฐ๋ค.
(๋ฑ๋ถ์ฐ ๊ฒ์ : barlet, levene )
1. shapiro ๊ฒ์
# Shapiro-Wilk normality check, run once per sample. The accompanying notes
# read a p-value above 0.05 as "the population can be treated as normal".
# Each call is a bare expression, so its result is only visible when the
# call is evaluated interactively (one call per cell).
stats.shapiro(man)
stats.shapiro(woman)
man : p-value = 0.74499 > 0.05 ์ด๋ฏ๋ก, "๊ท๋ฌด๊ฐ์ค ์ฑํ"
woman : p-value = 0.66956 > 0.05 ์ด๋ฏ๋ก, "๊ท๋ฌด๊ฐ์ค ์ฑํ"
ยป ์ฆ, man๊ณผ woman ์ํ์ ๋ชจ์ง๋จ์ ์ ๊ท๋ถํฌ๋ฅผ ๋ฐ๋ฅธ๋ค๊ณ ๋ณผ ์ ์๋ค.
.
.
2. anderson 검정
# Anderson-Darling test of the men's sample against a normal distribution.
# dist='norm' is SciPy's default and is written out for clarity.
stats.anderson(man, dist='norm')
.
.
3. kstest 검정
: goodness of fit (선택된 분포와 일치하는지 검정)
# Kolmogorov-Smirnov goodness-of-fit test of the men's sample against a
# normal distribution.
#
# `stats.norm.cdf` with no extra arguments is the *standard* normal N(0, 1).
# The counts are on the order of 1e5, so comparing the raw values with N(0, 1)
# always rejects and says nothing about normality. Passing the sample mean and
# sample standard deviation as loc/scale compares the data with a normal
# distribution of matching location and spread instead.
#
# NOTE: loc/scale are estimated from the same sample, which makes the
# reported p-value conservative (the Lilliefors variant corrects for this).
stats.kstest(man, stats.norm.cdf, args=(man.mean(), man.std(ddof=1)))
p-value < 0.05 이므로, "귀무가설 기각"
(단, 이는 표준화하지 않은 원자료를 표준정규분포 N(0, 1)과 비교했기 때문이다. 표본의 평균과 표준편차를 반영해 검정하면 귀무가설은 기각되지 않는다.)
.
.
4. Q-Q plot
stats.probplot
qqplot 은 분위수대조도로 불리며, 정규모집단 가정을 하는 방법 중 하나이며 수집 데이터를 표준정규분포의 분위수와 비교하여 그리는 그래프이다.
모집단이 정규성을 따른다면, 직선의 형태로 그려지게 된다.
# Q-Q plot of the men's sample against normal quantiles, drawn on a fresh axes.
# plt.subplots() returns (figure, axes); only the axes object is needed here.
axe = plt.subplots()[1]
stats.probplot(man, dist='norm', plot=axe)
⌨️ 등분산 검정
1. bartlett 검정
# Bartlett's test for equal variances between the men's and women's samples.
# The accompanying notes read a p-value above 0.05 as "variances are equal".
stats.bartlett(man, woman)
» p-value = 0.6949 > 0.05 이므로, "귀무가설 채택"
.
.
2. levene 검정
# Levene's test for equal variances between the two samples.
# center='median' is SciPy's default and is written out for clarity.
stats.levene(man, woman, center='median')
» p-value = 0.6811 > 0.05 이므로, "귀무가설 채택"