1번

import pandas as pd
df = pd.read_csv("data/employee_performance.csv")
# 사용자 코딩
df.고객만족도 = df.고객만족도.fillna(df.고객만족도.mean())
df = df.dropna(subset=['근속연수'])
print(df.고객만족도.quantile(0.75))
print(df.groupby('부서').연봉.mean().sort_values(ascending=False).iloc[1])
8.0 => 8
74690.18823529412 => 74690
2번
코드를 입력하세요
3번

import pandas as pd
import numpy as np
from scipy import stats
df = pd.read_csv("data/bcc.csv")
#1번
df['log'] = np.log(df['Resistin'])
group1 = df[df['Classification']==1]['log']
group2 = df[df['Classification']==2]['log']
var1 = group1.var()
var2 = group2.var()
dof1 = len(group1)-1
dof2 = len(group2)-1
# print(dof1, dof2)
f_stat = var2/var1
print(round(f_stat,3))
#2번
n1=len(group1)
n2=len(group2)
pooled_var = ((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2)
print(round(pooled_var,3))
#3번 - 2가지 방식
mean1 = group1.mean()
mean2 = group2.mean()
t_stat = (mean1-mean2) / np.sqrt(pooled_var*(1/n1 + 1/n2))
p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df = n1+n2-2))
print(round(p_value,3))
ttest_result = stats.ttest_ind(group1, group2, equal_var=True)
print(ttest_result)