220825 Day3

유예지·2022년 8월 25일

결측치

-값이 없는 것, 비어있는 것(빈칸)
-표시: NA (R에서 결측치는 NA로 표시된다)

-결측치가 있으면 R은 그 값을 NA로 인식하며, mean()·sum() 같은 함수에 그대로 넣으면 결과가 NA로 나오거나 분석 결과가 왜곡될 수 있다
-결측치가 있는지, 있다면 몇개가 있는지, 그것을 어떻게 처리할 것인지 "미리 확인해야" 데이터 값을 구할 수 있다

-is.na() : 결측치 확인; TRUE, FALSE로 구분
-table() : 결측치 빈도(개수) 출력

-na.omit() : 모든 변수에 결측치를 없애고 데이터 추출
-na가 있는 행을 지운다 -> 행 내에서 na가 하나만 있어도 행 자체가 없어짐
-자주 사용은 하지 않는 것이 좋음

-na.rm : NA를 지운다(remove); 함수가 아니라 mean(), sum() 등의 함수에 넣는 인자(argument)
na.rm = TRUE or FALSE ; TRUE인 경우 결측치가 제외됨

# dplyr provides `%>%` and filter(), which the statements below rely on;
# attach it here so the script runs in a fresh session.
library(dplyr)

# Example data frame with one missing value (NA) in each column.
df <- data.frame(
  sex = c("M", "F", NA, "M", "F"),
  score = c(5, 4, 3, 4, NA)
)
df

# is.na() flags every cell as TRUE (missing) or FALSE;
# table() then counts how many of each there are.
is.na(df)
table(is.na(df))
table(is.na(df$sex))
table(is.na(df$score))

df %>% filter(is.na(score))    # keep only the rows whose score is missing
df %>% filter(!is.na(score))   # keep only the rows whose score is present
df %>% filter(!is.na(sex))

# Mean of score after dropping the rows with a missing score.
df_nomiss <- df %>% filter(!is.na(score))
mean(df_nomiss$score)

# Rows in which neither score nor sex is missing.
df %>% filter(!is.na(score) & !is.na(sex))

# na.omit() drops every row that contains at least one NA.
na.omit(df)

# Mean with the missing values excluded; TRUE is spelled out because
# the shorthand `T` is an ordinary variable that can be reassigned.
mean(df$score, na.rm = TRUE)

# Exercise: missing values in the built-in airquality data set.

airquality

# 1. How many NA values are there in total?
table(is.na(airquality))

# 2. Which column holds how many NA values?
# is.na() on a data frame yields a logical matrix; colSums() counts the
# TRUE cells of every column at once instead of one table() call per column.
colSums(is.na(airquality))

# summary() lets both questions above be checked in a single call.
summary(airquality)

# 3. What is the mean ozone value? NA values are excluded from the mean.
mean(airquality$Ozone, na.rm = TRUE)

NA를 처리하는 방법
1.NA를 빼고 처리한다
2.NA에 같은 열에서 NA를 제외한 나머지 값으로 처리한 평균 값을 넣고 처리한다
(많이 쓰는 방법)
3.NA의 바로 위의 행에 있는 값과 같은 값을 NA에 넣고 처리한다 (2번 보다는 합리적인 방법)

이상치

-outlier, extraordinary
-오류값은 아니고, 이상한 값

# Example data in which sex == 3 and score > 5 are treated as outliers.
outlier <- data.frame(
  sex = c(1, 2, 1, 3, 2, 1),
  score = c(5, 4, 3, 4, 2, 6)
)
outlier
table(outlier)

# Frequency of each value, per column.
table(outlier$sex)
table(outlier$score)

# Replace the sex value 3 with NA.
outlier$sex <- replace(outlier$sex, outlier$sex == 3, NA)
outlier

# Replace every score greater than 5 with NA.
outlier$score <- replace(outlier$score, outlier$score > 5, NA)
outlier

-이상치에 NA를 할당한 후 na.rm = TRUE 인자를 사용해서 평균값을 구하면 이상치를 제외한 평균값을 알 수 있다

* 성별에 따른 점수 평균

# Method 1: drop the rows with a missing sex or score, then average per sex.
outlier %>%
  filter(!is.na(sex) & !is.na(score)) %>%
  group_by(sex) %>%
  summarise(mean_s = mean(score))

# Method 2: na.omit() removes every row containing an NA before grouping.
na.omit(outlier)
na.omit(outlier) %>%
  group_by(sex) %>%
  summarise(평균 = mean(score))

# Method 3: drop only the rows with a missing sex and let mean() skip
# missing scores. TRUE is spelled out because `T` can be reassigned.
outlier %>%
  filter(!is.na(sex)) %>%
  group_by(sex) %>%
  summarise(평균 = mean(score, na.rm = TRUE))

# Method 4: keep only the rows whose sex is 1 or 2, then average per sex.
outlier %>%
  filter(sex == 1 | sex == 2) %>%
  group_by(sex) %>%
  summarise(mean_score = mean(score, na.rm = TRUE))

* 상자 그림(상자 수염 그림) 통계치 출력

-형태 : boxplot()
-이상치를 잡아내기 위한 것(이상치 파악)
-수염 바깥에 있는 것이 이상치

library(ggplot2)
mpg

# Assigning to `mpg` stores a two-row copy under that name in the workspace.
mpg <- head(mpg, 2)
mpg

# rm() deletes the workspace object named mpg.
rm(mpg)
# NOTE(review): `mpg` presumably resolves to the ggplot2 data set again here.
mpg


# Same experiment with `df`: keep only its first three rows.
df <- head(df, 3)
df

# rm() deletes the workspace object named df.
rm(df)
# NOTE(review): with the workspace copy gone, `df` presumably resolves to
# the stats::df() function - confirm.
df

# drv: drive type
mpg[, "drv"]   # the result is a tibble
mpg$drv        # the result is a vector

unique(mpg$drv)   # show each distinct value once
table(mpg$drv)    # the same check is possible with table()

# Goal: mean of hwy (highway fuel economy) per drv, with outliers excluded.
# Which values are outliers? Identify them with boxplot().
# The type of the input (numeric, character, data frame, ...) matters:
# drv is a character column, so boxplot() raises an error for it. try()
# reports that error without aborting the rest of the script.
try(boxplot(mpg$drv))
boxplot(mpg$hwy)

mpg %>% select(hwy) %>% arrange(desc(hwy)) %>% head()

# Take the whisker ends from boxplot.stats() ($stats[1] is the lower
# whisker, $stats[5] the upper one) instead of hard-coding cut-offs. The
# previous bounds (hwy > 12, hwy < 42) did not match the whiskers: they
# dropped values lying exactly on the lower whisker and kept a value above
# the upper one.
hwy_whiskers <- boxplot.stats(mpg$hwy)$stats[c(1, 5)]
mpg %>%
  select(drv, hwy) %>%
  filter(hwy >= hwy_whiskers[1] & hwy <= hwy_whiskers[2]) %>%
  group_by(drv) %>%
  summarise(평균 = mean(hwy))

ggplot2 : 그래픽 패키지 (시각화)

library(ggplot2)

# Box plot of hwy drawn with ggplot2 on the mpg data, outlined in sky blue.
ggplot(mpg, aes(y = hwy)) +
  geom_boxplot(colour = "skyblue")

# The same data drawn with base graphics (plain-looking output).
boxplot(mpg$hwy)

# One box per class, each filled with its own colour.
ggplot(mpg, aes(class, hwy, fill = class)) +
  geom_boxplot()

1. 평면세팅 : ggplot(data= , aes(x = , y = ))

-plot() : 그림을 그리는 함수
-aes() : aesthetic(미적 속성); 데이터의 변수를 x축, y축, 색상 등 시각적 속성에 연결(매핑)하는 함수
-geom_function() : 어떤 그래프를 그릴지 정하는 함수, 포토샵에서의 레이어와 같은
-geom_function()의 괄호 안에 : position(x, y), color(색상), fill(채우기), shape(모양), linetype(선 형태), size(크기) 등

# Plotting plane only, with no geom layer yet: displ on x, hwy on y.
# displ is the engine displacement column of mpg (see ggplot2's mpg docs).
ggplot(data = mpg, mapping = aes(x = displ, y = hwy))

# geom_point() adds a scatter plot layer; here with blue points of size 2.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "blue", size = 2)

# xlim() / ylim() set the range shown on the x and y axes.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "blue", size = 2) +
  xlim(3, 6) +
  ylim(10, 30)

2. 도형 선택 : + geom_point()

mpg

# displ is dbl (numeric, continuous); drv is chr and is treated as a
# categorical variable. drv holds three values (4, f, r), so the points are
# drawn in three colours.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_point()

# class is chr just like drv; it holds seven values, so the points are
# drawn in seven colours.
ggplot(mpg, aes(displ, hwy, colour = class)) +
  geom_point()

# cty is int. Because the colour variable is numeric (continuous), the
# point colours form a gradient.
ggplot(mpg, aes(displ, hwy, colour = cty)) +
  geom_point()

# geom_point() can therefore show three variables at once (x, y, colour).

ggplot(mpg, aes(y = hwy, colour = class)) +
  geom_boxplot()
ggplot(mpg, aes(y = hwy, fill = class)) +
  geom_boxplot()
# geom_boxplot() can draw with a single positional variable (x or y),
# but that variable has to be continuous (e.g. int).
# Always check how many variables there are and what type each one has
# (chr, int, dbl, ...).

* 응용

# Reference plot: colour mapped to drv inside aes().
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) + geom_point()

# Outside aes(), `drv` is looked up as an ordinary R object rather than as a
# column of mpg, so this call raises an error and no plot is drawn. try()
# reports the error without aborting the rest of the script.
try(ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(color = drv))

# A fixed colour given outside aes(): every point is drawn in red.
ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(color = "red")

# Inside aes(), "blue" is treated as a data value to be mapped through the
# colour scale, not as a colour name, so the points do not come out blue.
ggplot(mpg, aes(x = displ, y = hwy, color = "blue")) + geom_point()

# A fixed colour set in the layer overrides the mapping from ggplot():
# every point is drawn in blue.
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) + geom_point(color = "blue")

# Same result as the reference plot: color = drv was an aesthetic mapping,
# so it has to stay wrapped in aes() when it is moved into the layer.
ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = drv))

# aes() can be given to geom_point() as well, partly or entirely.
ggplot(mpg, aes(x = displ)) + geom_point(aes(y = hwy, color = drv))
ggplot(mpg) + geom_point(aes(x = displ, y = hwy, color = drv))

* __::glimpse() : 데이터 프레임의 구조를 볼 때 한 줄로 정렬하여 보는 것

# Two ways to inspect the structure of mpg: base str() and dplyr's glimpse().
str(object = mpg)
dplyr::glimpse(x = mpg)

코드를 재사용하기 쉽게 설정하기

# Written out in full: the same plane is repeated for every plot.
ggplot(data = mpg, mapping = aes(displ, hwy)) +
  geom_point()
ggplot(data = mpg, mapping = aes(displ, hwy)) +
  geom_smooth(method = lm)

# Store the shared plane once in `g` and add a different layer each time.
g <- ggplot(data = mpg, mapping = aes(displ, hwy))
g + geom_point()
g + geom_smooth(method = lm)

# A layer can be stored in a variable and reused in the same way.
f1 <- geom_point()
g + f1

3. 테마 : theme_~~~()

-점의 색깔을 데이터 값 별로 달리하여 보는 이유 :
전체 데이터에서 각각의 값을 분별하여 세부적으로 파악할 수 있다

# theme_economist() and theme_wsj() used below come from the ggthemes
# package, which must be attached for those calls to be found.
library(ggthemes)

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) + geom_point()

# Adding shape on top of colour separates the groups even more clearly.
ggplot(mpg, aes(x = displ, y = hwy, color = drv, shape = drv)) + geom_point()

# Scatter plot and smoothed curves shown together, separated by colour.
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) + geom_point() + geom_smooth()

# Scatter plot and straight fitted lines shown together, separated by colour.
# method = "lm": linear model, which draws a straight line.
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm")

# Dark plot background.
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_dark()

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()

# The same content with themes meant to be easier on the eye.
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_economist()

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_wsj()

* as.__() : 데이터의 형태 바꾸기

-as.character() : 문자형으로 바꾸기
-as.numeric() : 숫자형으로 바꾸기
-as.factor() : 범주형으로 바꾸기

유예지

이전 포스트

220824 Day2

다음 포스트