[R 기초] ggplot2 라이브러리를 이용한 데이터 시각화

2020. 10. 18. 16:21 · 노트/R : 통계

library(dplyr)
library(ggplot2)

# Draw a box plot of highway mileage, draw it again to print the `stats`
# element of its return value, and print the median.
boxplot(mpg[["hwy"]])
boxplot(mpg[["hwy"]])[["stats"]]
median(mpg[["hwy"]])

# For practice, create missing values: anything above 37 or below 12
# becomes NA. Then count missing vs. non-missing entries.
mpg$hwy <- ifelse(mpg$hwy > 37 | mpg$hwy < 12, NA, mpg$hwy)
table(is.na(mpg$hwy))

# Group by drive type (`drv`) and compute the mean of `hwy` per group.
# Missing values are excluded; `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))

# Set up the plot background: data and axis mappings only, no layers yet
ggplot(mpg, aes(x = displ, y = hwy))

# Scatter plot of `hwy` against `displ`, with the x axis limited to 3-6
# and the y axis limited to 10-30
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  xlim(3, 6) +
  ylim(10, 30)

# Mean highway mileage per drive type, stored for plotting.
# `hwy` contains NA after the outlier step earlier in the script, so without
# `na.rm = TRUE` any drive type with a missing value gets an NA mean and its
# bar is dropped from the chart.
df_mpg <- mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy, na.rm = TRUE))
df_mpg

# Bar chart of the per-drive-type means
ggplot(data = df_mpg, aes(x = drv, y = mean_hwy)) +
  geom_col()

# Print the `economics` dataset, then plot `unemploy` against `date` as points
economics
ggplot(economics, aes(date, unemploy)) +
  geom_point()

 

 

SPSS 실습 데이터 다운로드

rap0d.github.io/assets/file/r/191023/Koweps_hpc10_2015_beta1.zip

 

 

SPSS 파일 불러오기

# Install the reader packages only when they are missing, so re-running the
# script does not reinstall them every time.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(foreign) # load SPSS files
library(dplyr)   # preprocessing
library(ggplot2) # visualization
library(readxl)  # Excel files

# Read the SPSS file as a data frame, then work on a copy so the raw data
# stays untouched.
raw_welfare <- read.spss(
  file = "Data/Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)
welfare <- raw_welfare

 

str(welfare)  # display the structure of the data frame (it converts nothing)
View(welfare) # inspect the data in a separate window

dim(welfare) # dimensions: number of rows and columns
# Console output recorded by the author (kept as a comment so the script
# still parses):
#> [1] 16664   957

# Give the survey columns used below readable names (new_name = old_name)
welfare <- rename(welfare,
                  sex = h10_g3,           # sex
                  birth = h10_g4,
                  marriage = h10_g10,
                  religion = h10_g11,
                  code_job = h10_eco9,
                  income = p1002_8aq1,
                  code_region = h10_reg7) # region code

class(welfare$sex)
table(welfare$sex)
# Console output recorded by the author (kept as a comment so the script
# still parses):
#>    1    2
#> 7578 9086
# Outlier handling: treat the code 9 as a missing value.
# Uses `<-` for assignment, consistent with the rest of the script.
welfare$sex <- ifelse(welfare$sex == 9, NA, welfare$sex)
table(is.na(welfare$sex))

# Relabel: 1 -> "male", any other non-missing value -> "female"
welfare$sex <- ifelse(welfare$sex == 1, "male", "female")
table(welfare$sex)
qplot(welfare$sex)

 

# Check the type and summary statistics of income, then plot its
# distribution with the x axis limited to 0-1000
class(welfare$income)
summary(welfare$income)
qplot(welfare$income) +
  xlim(0, 1000)

 

 

summary(welfare$income)

# Outlier handling: recode the values 0 and 9999 as missing
welfare$income <- ifelse(welfare$income %in% c(0, 9999), NA, welfare$income)

table(is.na(welfare$income))

# Mean income per sex, computed over rows whose income is not missing
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mi = mean(income))

ggplot(sex_income, aes(x = sex, y = mi)) +
  geom_col()

summary(welfare$birth)
table(is.na(welfare$birth))

# Recode 9999 as missing
welfare$birth <- ifelse(welfare$birth == 9999, NA, welfare$birth)

# Derive age as 2015 - birth + 1
welfare$age <- 2015 - welfare$birth + 1

summary(welfare$age)
qplot(welfare$age)

 

# Mean income for each age, computed over rows whose income is not missing
age_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mi = mean(income))

head(age_income)

# Line chart of mean income by age
ggplot(age_income, aes(x = age, y = mi)) +
  geom_line()

# Mean monthly income by age group (young / middle / old), through to the plot

# Age group: below 30 -> "young", 30 to 59 -> "middle", otherwise "old"
welfare <- welfare %>%
  mutate(
    ageg = ifelse(age < 30, "young",
                  ifelse(age <= 59, "middle", "old"))
  )

table(welfare$ageg)
qplot(welfare$ageg)

# Mean income per age group, computed over rows whose income is not missing
ageg_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mi = mean(income))
ageg_income

# Bar chart with the default ordering of the x axis
ggplot(ageg_income, aes(x = ageg, y = mi)) +
  geom_col()

# Same chart with the bars ordered young -> middle -> old
ggplot(ageg_income, aes(x = ageg, y = mi)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Does the income gap between the sexes differ by age group?
# Mean income per (age group, sex) over rows whose income is not missing.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mi = mean(income))
sex_income

# Side-by-side bars per sex, with age groups ordered young -> middle -> old
ggplot(sex_income, aes(x = ageg, y = mi, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Table of mean income by age and sex,
# computed over rows whose income is not missing
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mi = mean(income))
head(sex_age)

# One line per sex showing mean income across ages
ggplot(sex_age, aes(x = age, y = mi, col = sex)) +
  geom_line()

 

 

Excel 실습 데이터 다운로드 

rap0d.github.io/assets/file/r/191023/Koweps_Codebook.xlsx

 

library(readxl)
# Read the second sheet of the codebook, taking column names from its
# first row. `TRUE` is spelled out because `T` can be reassigned.
list_job <- read_excel(
  "Data/Koweps_Codebook.xlsx",
  sheet = 2,
  col_names = TRUE
)
list_job

 

str(welfare)
welfare$code_job

# Merge the codebook columns into `welfare`, matching rows on `code_job`.
# The join key must be passed as `by`; `left_join()` has no `id` argument, so
# `id = "code_job"` was never used as the key.
# NOTE(review): assumes the codebook sheet has a `code_job` column - confirm.
welfare <- left_join(welfare, list_job, by = "code_job")
welfare$job
welfare$code_job

# For rows whose code_job is not missing, show the code_job and job columns
# of the first 20 rows
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(20)

# Mean income per job, using only rows where both job and income are known
job_income <- welfare %>%
  filter(!is.na(job), !is.na(income)) %>%
  group_by(job) %>%
  summarise(mi = mean(income))
head(job_income)

# Extract the top 10 jobs by mean income.
# Sort in descending order first: an ascending `arrange(mi)` followed by
# `head(10)` keeps the 10 lowest-paid jobs instead.
top10 <- job_income %>%
  arrange(desc(mi)) %>%
  head(10)

# Horizontal bar chart. After `coord_flip()`, ordering the jobs by `mi`
# places the highest mean income at the top.
ggplot(data = top10, aes(x = reorder(job, mi), y = mi)) +
  geom_col() +
  coord_flip()