<빅데이터 분석> 11. 상관분석과 회귀분석

정지인·2025년 6월 6일

✅ Section 01. 상관분석 기초 통계를 이용한 상관계수 도출

# Training data: each row is [apartment size, electricity usage].
train=[[25, 100], [52, 256], [38, 152], [32, 140], [25, 150]]

# Split the rows into the first column (x) and the second column (y).
x=[i[0] for i in train]
y=[j[1] for j in train]
# Bare expression: displays both lists when run as a notebook cell.
x,y

✅ 기초 통계 함수 구현하기

def mean(x):
  """Return the arithmetic mean of the values in x.

  Raises ZeroDivisionError when x is empty.
  """
  total = sum(x)
  count = len(x)
  return total / count
mean(x), mean(y)

✅ 개별 값과 평균의 차

def d_mean(x):
  """Return the deviation of every element of x from the mean of x."""
  center = mean(x)
  deviations = []
  for value in x:
    deviations.append(value - center)
  return deviations

d_mean(x), d_mean(y)

x1=d_mean(x)
mean(x1)

✅ 내적

def dot(x, y):
  """Return the dot product of x and y.

  Elements are paired positionally with zip, so any surplus elements of the
  longer sequence are ignored.
  """
  pairwise_products = (left * right for left, right in zip(x, y))
  return sum(pairwise_products)
dot(x, y)

✅ 제곱의 합

def sum_of_squares(v):
  """Return the sum of the squared elements of v (the dot product of v with itself)."""
  squared_total = dot(v, v)
  return squared_total
sum_of_squares(x), sum_of_squares(y)

✅ 분산

def variance(x):
  """Return the sample variance of x.

  The sum of squared deviations from the mean is divided by n - 1, so a
  single-element input raises ZeroDivisionError.
  """
  degrees_of_freedom = len(x) - 1
  deviations = d_mean(x)
  return sum_of_squares(deviations) / degrees_of_freedom
variance(x)

✅ 표준편차

def standard_deviation(x):
  """Return the sample standard deviation of x (square root of variance(x))."""
  spread_squared = variance(x)
  return spread_squared ** 0.5

standard_deviation(x)

✅ <여기서 잠깐> math 라이브러리의 sqrt 함수로 표준편차 함수 구현하기

import math
def standard_deviation(x):
  """Return the sample standard deviation of x, computed with math.sqrt."""
  spread_squared = variance(x)
  return math.sqrt(spread_squared)
standard_deviation(x)

✅ 공분산

def covariance(x, y):
  """Return the sample covariance of x and y.

  The dot product of the two deviation vectors is divided by len(x) - 1.
  """
  degrees_of_freedom = len(x) - 1
  cross_deviation = dot(d_mean(x), d_mean(y))
  return cross_deviation / degrees_of_freedom

covariance(x, y)

✅ 상관계수

def correlation(x, y):
  """Return the correlation coefficient of x and y.

  The covariance is divided by the product of the two standard deviations.
  Returns 0 when either standard deviation is not strictly positive.
  """
  spread_x = standard_deviation(x)
  spread_y = standard_deviation(y)
  both_vary = spread_x > 0 and spread_y > 0
  if not both_vary:
    # No variation in at least one variable: report 0 instead of dividing.
    return 0
  return covariance(x, y) / (spread_x * spread_y)

correlation(x, y)

✅ 넘파이 함수로 기초 통계 구하기

import numpy as np
x1=np.array(x)
# ndarray.var()/std() default to ddof=0 (population statistics). ddof=1 is
# passed so the values match the n-1 sample variance / standard deviation
# implemented by variance() and standard_deviation() above, and np.cov below.
x1.mean(), x1.var(ddof=1), x1.std(ddof=1)

# Full 2x2 covariance and correlation matrices of x and y.
np.cov(x1,y), np.corrcoef(x1,y)

# The off-diagonal entry [0][1] is the x-y covariance / correlation itself.
np.cov(x1,y)[0][1], np.corrcoef(x1,y)[0][1]

✅ Section 02. 회귀분석1 최소자승법을 이용한 회귀계수 도출

1. 회귀계수 구하기

def OLS(x,y):
  """Return [alpha, beta] of the least-squares line y = alpha + beta * x.

  beta is covariance(x, y) / variance(x); alpha is mean(y) - beta * mean(x).
  """
  slope = covariance(x, y) / variance(x)
  intercept = mean(y) - slope * mean(x)
  return [intercept, slope]

OLS(x,y)

def OLS_fit(x,y):
  """Return [alpha, beta] of the least-squares line, derived from the correlation.

  beta is correlation(x, y) * standard_deviation(y) / standard_deviation(x);
  alpha is mean(y) - beta * mean(x).
  """
  r = correlation(x, y)
  spread_y = standard_deviation(y)
  spread_x = standard_deviation(x)
  slope = (r * spread_y) / spread_x
  intercept = mean(y) - slope * mean(x)
  return [intercept, slope]

OLS_fit(x,y)

✅ 예측값 구하기

def predict(alpha, beta,train, test):
  """Predict the response for every row of test with the line alpha + beta * x.

  The supplied alpha and beta are used as given (previously they were
  silently overwritten by a refit on train). When either is None, the
  coefficients are fitted from train with OLS_fit.

  Args:
    alpha: Intercept of the regression line, or None to fit from train.
    beta: Slope of the regression line, or None to fit from train.
    train: Rows of [x, y]; only read when the coefficients must be fitted.
    test: Rows whose first element is the x value to predict for.

  Returns:
    A list holding one predicted value per row of test.
  """
  if alpha is None or beta is None:
    x = [row[0] for row in train]  # predictor column (apartment size)
    y = [row[1] for row in train]  # response column (electricity usage)
    alpha, beta = OLS_fit(x, y)
  return [alpha + beta * row[0] for row in test]

# Same five [apartment size, electricity usage] rows as at the top of the file.
train=[[25,100],[52,256],[38,152],[32,140],[25,150]]
# Fit the intercept and slope on the module-level x and y lists built earlier.
alpha,beta=OLS_fit(x,y)
pr=predict(alpha, beta, train, train) # store the in-sample predictions in pr
print(pr) # print the predicted values

✅ <여기서 잠깐> matplotlib 라이브러리를 이용한 산점도 그래프 표시

!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# Import matplotlib and select the NanumGothic font so Korean labels render.
import matplotlib.pyplot as plt
plt.rc('font', family='NanumGothic')
plt.title('아파트 평수에 따른 전기 사용량')# chart title
plt.scatter(x, y, c='red') # scatter plot of the observed (x, y) points, drawn in red
plt.plot(x,pr) # line plot of the predicted values pr against x
plt.xlabel('아파트 평형') # x-axis label
plt.ylabel('전기사용량') # y-axis label
plt.show()

✅ SSE 구하기

def SSE(alpha, beta, train, test):
  """Return the sum of squared errors of the line alpha + beta * x on test.

  Each row of test is read as [x, actual]. The train argument is accepted
  but not used.
  """
  total = 0
  for row in test:
    fitted = alpha + beta * row[0]
    residual = row[1] - fitted  # actual value minus predicted value
    total += residual ** 2
  return total

SSE(alpha, beta, train, train)

✅ SST 구하기

def SST(alpha, beta, train, test):
  """Return the total sum of squares of test around the training mean of y.

  Args:
    alpha: Unused; kept so the signature parallels SSE and R_squared.
    beta: Unused; kept so the signature parallels SSE and R_squared.
    train: Rows of [x, y]; the mean of the y column is the reference value.
    test: Rows of [x, y] whose y values are compared with that mean.

  Returns:
    The sum over test of (y - mean of training y) squared.

  Raises:
    ZeroDivisionError: If train is empty (raised by mean).
  """
  # The training mean does not change per row, so compute it once up front
  # instead of once for every test row.
  y_mean = mean([row[1] for row in train])
  sst = 0
  for row in test:
    sst += (row[1] - y_mean) ** 2
  return sst

SST(alpha, beta, train, train)

✅ 결정계수 구하기

def R_squared(alpha, beta, train, test):
  """Return the coefficient of determination, 1 - SSE / SST, for test."""
  unexplained = SSE(alpha, beta, train, test)
  total = SST(alpha, beta, train, test)
  return 1.0 - (unexplained / total)

R_squared(alpha, beta, train, train)

# Rebuild the training columns so this cell can run on its own.
train=[[25, 100], [52, 256], [38, 152], [32, 140], [25, 150]]
x=[i[0] for i in train]
y=[j[1] for j in train]
# Cross-check the hand-written regression against statsmodels.
import statsmodels.api as sms
# NOTE(review): add_constant presumably adds the intercept column to x -
# confirm against the statsmodels documentation.
_X=sms.add_constant(x)
model=sms.OLS(y, _X).fit()
print(model.summary())

✅ 예측력 구하기

test=[[45,183],[40,175],[55,203],[28,152],[42,198]]
test

✅ 종속변수 값 예측하기

def predict(alpha, beta, train, test):
  """Predict the response for every row of test with the line alpha + beta * x.

  The supplied alpha and beta are used as given (previously they were
  silently overwritten by a refit on train). When either is None, the
  coefficients are fitted from train with OLS_fit.

  Args:
    alpha: Intercept of the regression line, or None to fit from train.
    beta: Slope of the regression line, or None to fit from train.
    train: Rows of [x, y]; only read when the coefficients must be fitted.
    test: Rows whose first element is the x value to predict for.

  Returns:
    A list holding one predicted value per row of test.
  """
  if alpha is None or beta is None:
    x = [row[0] for row in train]
    y = [row[1] for row in train]
    alpha, beta = OLS_fit(x, y)
  return [alpha + beta * row[0] for row in test]

predict(alpha, beta, train, test)

✅ 예측 결과 평가하기

actual=[j[1] for j in test]
predicted=predict(alpha, beta, train, test)
actual, predicted

from math import sqrt
def RMSE(actual, predicted):
  """Return the root mean squared error between actual and predicted.

  Args:
    actual: Sequence of observed values.
    predicted: Sequence of predicted values, indexed in step with actual.

  Returns:
    The square root of the mean of the squared prediction errors.

  Raises:
    ZeroDivisionError: If actual is empty.
    IndexError: If predicted is shorter than actual.
  """
  sum_error = 0.0
  for i, observed in enumerate(actual):
    prediction_error = predicted[i] - observed
    sum_error += prediction_error ** 2
  # Take the mean once, after the loop. It used to be assigned inside the
  # loop, which recomputed it per element and left it unbound for empty input.
  mean_error = sum_error / float(len(actual))
  return sqrt(mean_error)

RMSE(actual, predicted)

✅ Section 03. 회귀분석2 경사하강법을 이용한 회귀계수 도출

✅ 회귀계수 구하기

# Ten [x, y] rows; the first five are used for training, the last five for testing.
dataset=[[25,100],[52,256],[38,152],[32,140],[25,150],[45,183],[40,175],[55,203],[28,152],[42,198]]
train=dataset[:5]
test=dataset[5:]
print('학습 데이터(train): {}, 테스트 데이터(test): {}'.format(train, test))

coef=[0.0 for i in range(len(train[0]))] # initialise one coefficient per column of a training row to 0.0
# Bare expression: displays the initial coefficients when run as a notebook cell.
coef

def predict(row, coef):
  """Return the linear prediction for row given the coefficients coef.

  coef[0] is the intercept and coef[k + 1] multiplies row[k]. The last
  element of row is skipped (the callers in this file store the target there).
  """
  n_features = len(row) - 1
  estimate = coef[0]  # start from the intercept
  for k in range(n_features):
    estimate += coef[k + 1] * row[k]  # coefficient * feature value
  return estimate

def coefficients_sgd(train, l_rate, n_epoch):
  """Estimate linear-regression coefficients by stochastic gradient descent.

  Args:
    train: Training rows; the last element of each row is the target and the
      preceding elements are the features.
    l_rate: Learning rate applied to every update.
    n_epoch: Number of passes over the training rows.

  Returns:
    A tuple (coef, sse). coef[0] is the intercept and coef[i + 1] the
    coefficient of row[i]; sse is the sum of squared errors accumulated
    during the last epoch (0 when n_epoch is 0).
  """
  coef = [0.0 for _ in range(len(train[0]))]  # intercept and coefficients start at 0
  sse = 0  # defined up front so the return value exists even when n_epoch is 0
  for epoch in range(n_epoch):
    sse = 0  # restart the squared-error total for this epoch
    for row in train:
      yhat = predict(row, coef)  # prediction with the current coefficients
      error = yhat - row[-1]  # prediction minus target
      sse += error**2
      coef[0] = coef[0] - l_rate * error  # update the intercept
      for i in range(len(row)-1):
        coef[i+1] = coef[i+1] - l_rate * error * row[i]
  # Return only after every epoch has run. The return used to sit inside the
  # row loop, so the function exited after the first row of the first epoch.
  return coef, sse

import math
def coefficients_sgd(train, l_rate, n_epoch):
  """Estimate linear-regression coefficients by stochastic gradient descent.

  Returns a tuple (coef, rmse): coef[0] is the intercept and coef[i + 1] the
  coefficient of row[i]; rmse is the square root of the last epoch's summed
  squared errors divided by len(train).
  """
  weights = [0.0] * len(train[0])  # initial coefficient values
  for _ in range(n_epoch):
    sum_error = 0
    for row in train:
      residual = row[-1] - predict(row, weights)  # target minus prediction
      sum_error += residual ** 2
      step = l_rate * residual
      weights[0] += step  # intercept update
      for idx, feature in enumerate(row[:-1], start=1):
        weights[idx] += step * feature
  return weights, math.sqrt(sum_error / len(train))

# Hyper-parameters: learning rate and number of passes over the training rows.
l_rate=0.0001
n_epoch=10
# NOTE: the most recent coefficients_sgd definition returns a 2-tuple
# (coefficient list, error value), so `coef` holds that whole tuple here.
coef=coefficients_sgd(train, l_rate, n_epoch)
# Bare expression: displays the result when run as a notebook cell.
coef

✅ 예측값 구하고 결과 평가하기

# Ten [x, y] rows; the first five are used for training, the last five for testing.
dataset=[[25,100],[52,256],[38,152],[32,140],[25,150],[45,183],[40,175],[55,203],[28,152],[42,198]]
train=dataset[:5]
test=dataset[5:]
actual=[j[1] for j in test]  # observed y values of the test rows
l_rate=0.0001
n_epoch=10
# Run the fit once and unpack the two-element coefficient list. The fit used
# to be run twice just to read element 0 and element 1 of the same result.
alpha, beta = coefficients_sgd(train, l_rate, n_epoch)[0]

# Bare expression: displays the fitted intercept and slope in a notebook cell.
alpha,beta

def predicted(train, test, alpha, beta):
  """Return alpha + beta * x for the first element x of every row of test.

  Args:
    train: Unused; kept so existing callers can pass it. It is no longer
      iterated, so its contents cannot cause an error here.
    test: Rows whose first element is the x value to predict for.
    alpha: Intercept of the regression line.
    beta: Slope of the regression line.

  Returns:
    A list holding one predicted value per row of test.
  """
  return [alpha + beta * row[0] for row in test]

pred=predicted(train, test, alpha, beta)
pred

from math import sqrt

def RMSE(actual, predicted):
  """Return the root mean squared error between actual and predicted.

  predicted is indexed in step with actual; an empty actual raises
  ZeroDivisionError.
  """
  squared_total = 0.0
  for idx, observed in enumerate(actual):
    gap = predicted[idx] - observed
    squared_total += gap ** 2
  return sqrt(squared_total / float(len(actual)))

RMSE(actual, pred)

✅ 종합 및 심화

def sgd1(sq_error, sq_error_grad, x, y, theta_0, l_rate_0):
  """Minimise a summed per-point error by per-point gradient steps.

  Args:
    sq_error: Callable (xi, yi, theta) returning the error of one point.
    sq_error_grad: Callable (xi, yi, theta) returning the gradient of that
      error as a sequence aligned with theta.
    x: Input values, paired positionally with y.
    y: Target values.
    theta_0: Starting parameter vector.
    l_rate_0: Starting learning rate; restored after every improvement.

  Returns:
    The theta that produced the lowest summed error among the values
    evaluated at the start of each pass.
  """
  data=list(zip(x, y))
  theta=theta_0
  l_rate=l_rate_0
  min_value=float("inf")
  iterations=0  # consecutive passes without improvement
  # Stop once five consecutive passes fail to lower the summed error.
  while iterations < 5:
    # Summed error of the current theta, measured before this pass's updates.
    value=sum(sq_error(xi, yi, theta) for xi, yi in data)
    if value < min_value:
      # New best: remember it and reset the stall counter and learning rate.
      # NOTE(review): min_theta is only bound in this branch; if the first
      # value is not below infinity (e.g. NaN) the final return would fail.
      min_theta, min_value=theta, value
      iterations=0
      l_rate=l_rate_0
    else:
      # No improvement: count the stall and shrink the step size by 10%.
      iterations+=1
      l_rate*=0.9
    # One gradient step per data point, in the fixed order of data.
    for xi, yi in data:
      gradient_i=sq_error_grad(xi, yi, theta)
      theta=vector_subtract(theta, scalar_multiply(l_rate, gradient_i))
  return min_theta

def sq_error(xi, yi,theta):
  """Return the squared error of the line theta = (alpha, beta) at the point (xi, yi)."""
  intercept, slope = theta
  fitted = slope * xi + intercept
  residual = yi - fitted
  return residual ** 2

def sq_error_grad(xi, yi, theta):
  """Return the gradient of the squared error at (xi, yi) as [d/d alpha, d/d beta]."""
  intercept, slope = theta
  residual = yi - (slope * xi + intercept)
  d_intercept = -2 * residual
  return [d_intercept, d_intercept * xi]

def vector_subtract(v, w):
  """Return the element-wise difference v - w as a list.

  Elements are paired with zip, so the result is as long as the shorter input.
  """
  difference = []
  for left, right in zip(v, w):
    difference.append(left - right)
  return difference

def scalar_multiply(c, v):
  """Return a list with every element of v multiplied by the scalar c."""
  scaled = []
  for component in v:
    scaled.append(c * component)
  return scaled

# Ten [x, y] rows; the first five are used for training, the last five for testing.
dataset=[[25,100],[52,256],[38,152],[32,140],[25,150],[45,183],[40,175],[55,203],[28,152],[42,198]]
train=dataset[:5]
test=dataset[5:]
x=[i[0] for i in train]
y=[j[1] for j in train]
l_rate_0=0.001  # starting learning rate
theta_0=[0,0]  # starting [alpha, beta]
# Bare expression: displays the best [alpha, beta] found when run as a notebook cell.
sgd1(sq_error, sq_error_grad, x, y, theta_0, l_rate_0)

정지인

멋쟁이사자 13기 백엔드

이전 포스트

<빅데이터 분석> 10. 기본-소비 데이터 분석과 승객 생존율 예측

다음 포스트

<빅데이터 분석> 11. 상관분석과 회귀분석

✅ Section 01. 상관분석 기초 통계를 이용한 상관계수 도출

✅ 기초 통계 함수 구현하기

✅ 개별 값과 평균의 차

✅ 내적

✅ 제곱의 합

✅ 분산

✅ 표준편차

✅ <여기서 잠깐> math 라이브러리의 sqrt 함수로 표준편차 함수 구현하기

✅ 공분산

✅ 상관계수

✅ 넘파이 함수로 기초 통계 구하기

✅ Section 02. 회귀분석1 최소자승법을 이용한 회귀계수 도출

1. 회귀계수 구하기

✅ 예측값 구하기

✅ <여기서 잠깐> matplotlib 라이브러리를 이용한 산점도 그래프 표시

✅ SSE 구하기

✅ SST 구하기

✅ 결정계수 구하기

✅ 예측력 구하기

✅ 종속변수 값 예측하기

✅ 예측 결과 평가하기

✅ Section 03. 회귀분석2 경사하강법을 이용한 회귀계수 도출

✅ 회귀계수 구하기

✅ 예측값 구하고 결과 평가하기

✅ 종합 및 심화

<빅데이터 분석> 10. 기본-소비 데이터 분석과 승객 생존율 예측

<빅데이터분석> NumPy, 머신러닝, 신경망 요약 정리

0개의 댓글