[분류] C50, rpart 라이브러리를 활용한 의사결정 트리

2020. 11. 22. 20:17 | 노트/R : 통계

 

 

데이터 

credit.csv
0.09MB

 

 

데이터 불러오기 

# Load the credit data set. stringsAsFactors = TRUE matters on R >= 4.0, where
# read.csv() keeps text columns as character; C5.0() only accepts a factor
# outcome.
credit <- read.csv(
  "C:/Users/LG/Downloads/dataset_for_ml/dataset_for_ml/credit.csv",
  stringsAsFactors = TRUE
)

# Quick look at the loan amounts and at the class balance of the target.
summary(credit$amount)

table(credit$default)

# Hold-out split: 900 randomly chosen rows for training, the rest for testing.
set.seed(123)
train_sample <- sample(nrow(credit), 900)

str(train_sample)
credit_train <- credit[train_sample, ]

# The test rows are exactly the rows NOT drawn for training. Drawing a second
# independent sample would place most test rows in the training set as well
# and inflate the measured accuracy.
test_sample <- setdiff(seq_len(nrow(credit)), train_sample)
credit_test <- credit[test_sample, ]

# Compare the class proportions of the target in the two splits.
prop.table(table(credit_test$default))

prop.table(table(credit_train$default))

# C50 provides the C5.0 decision tree algorithm; gmodels provides CrossTable().
# Install C50 only when it is missing instead of re-installing on every run.
if (!requireNamespace("C50", quietly = TRUE)) {
  install.packages("C50")
}
library(C50)
library(gmodels)

# Train on every column except the target. Selecting by name avoids relying on
# "default" sitting at a fixed column position (the 17th column here).
predictor_cols <- setdiff(names(credit_train), "default")

# Build the model; C5.0() requires the outcome to be a factor.
credit_model <- C5.0(
  credit_train[predictor_cols],
  as.factor(credit_train$default)
)

summary(credit_model)

# Predict the class of each held-out row.
credit_pred <- predict(credit_model, credit_test)

credit_pred

# Confusion matrix of actual vs. predicted classes (cell proportions only).
CrossTable(credit_test$default, credit_pred,
  prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual", "predicted")
)

 

# Boosting: build several decision trees and let them vote on each prediction,
# i.e. combine many weak models to obtain a stronger one.

# Same predictors as the single-tree model, selected by name rather than by
# column position; trials = 10 requests 10 boosting iterations.
boost_predictor_cols <- setdiff(names(credit_train), "default")
credit_boost10 <- C5.0(
  credit_train[boost_predictor_cols],
  as.factor(credit_train$default),
  trials = 10
)

credit_boost_pred10 <- predict(credit_boost10, credit_test)

# Confusion matrix of actual vs. predicted classes for the boosted model.
CrossTable(credit_test$default, credit_boost_pred10,
  prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual", "predicted")
)

 


 

데이터 

 

www.kaggle.com/c/titanic/data?select=train.csv

 

Titanic: Machine Learning from Disaster

Start here! Predict survival on the Titanic and get familiar with ML basics

www.kaggle.com

 

# Load the Kaggle Titanic training and test sets.
train <- read.csv("C:/Users/LG/Documents/Data/train.csv")
test <- read.csv("C:/Users/LG/Documents/Data/test.csv")

str(train)
str(test)

# Install the packages below only when they are missing, instead of
# re-installing them on every run of the script.
required_pkgs <- c("readr", "rpart", "rpart.plot")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

library(readr)
library(rpart)
library(rpart.plot)
library(dplyr)
library(ggplot2)

# Keep the labels aside and drop the column from train so that train and test
# have the same columns before they are stacked.
Survived <- train$Survived
train$Survived <- NULL

# Stack train and test so the preprocessing is applied to both at once.
dataset <- bind_rows(train, test)

dim(dataset) # 1309 / 11

str(dataset)

summary(dataset)
# Count missing values per column.
colSums(is.na(dataset))

 

데이터 전처리 

# Impute ONLY the missing fares (PassengerId 1044 in this data) with the median
# fare. Assigning the median to the whole column would replace every
# passenger's fare with a single constant.
fare_missing <- is.na(dataset$Fare)
dataset$Fare[fare_missing] <- median(dataset$Fare, na.rm = TRUE)
# Replace missing ages with the median age. The median is computed once and
# assigned through a logical mask, rather than being recomputed per element.
summary(dataset$Age)

age_median <- median(dataset$Age, na.rm = TRUE)
dataset$Age[is.na(dataset$Age)] <- age_median

summary(dataset$Age)

# Distribution of the embarkation port, as counts and as shares of the
# non-empty values.
table(dataset$Embarked)

sum(dataset$Embarked != "")

table(dataset$Embarked) / sum(dataset$Embarked != "")

# Passengers whose Embarked value is the empty string.
embarked_missing <- dataset$Embarked == ""
dataset$PassengerId[embarked_missing] # 62 830

# Treat those passengers as having embarked at "S". The rows are selected by
# the mask itself, so this does not depend on PassengerId matching the row
# position.
dataset$Embarked[embarked_missing] <- "S"
nrow(dataset) # number of rows: 1309
dim(dataset) # rows and columns: 1309 11

# Share of passengers without a recorded cabin.
cabin_known <- dataset$Cabin != ""
1 - sum(cabin_known) / nrow(dataset) # 0.7746371

# Keep only the first character of each cabin code.
dataset$Cabin <- substr(dataset$Cabin, start = 1, stop = 1)

table(dataset$Cabin)

# Passengers with no cabin recorded get the placeholder code "H".
cabin_blank <- !nzchar(dataset$Cabin)
dataset$Cabin[cabin_blank] <- "H"

str(dataset)

# Convert the categorical columns to factors.
factor_vars <- c("PassengerId", "Pclass", "Sex", "Embarked", "Cabin")

dataset[factor_vars] <- lapply(dataset[factor_vars], as.factor)

str(dataset) # confirm the columns are now factors

# Split the combined data back into its two parts. The training rows come
# first and there is one saved label per training row, so the boundary is
# taken from the data instead of hard-coding 891 / 1309.
n_train <- length(Survived)
train_cleaned <- dataset[seq_len(n_train), ]
test_cleaned <- dataset[-seq_len(n_train), ]
train_cleaned$Survived <- Survived

# Fit a classification tree for Survived from passenger class, sex,
# embarkation port and cabin code.
DT <- rpart(
  formula = Survived ~ Pclass + Sex + Embarked + Cabin,
  data = train_cleaned,
  method = "class"
)

summary(DT)

 

예측

# Predict the class label for each test passenger.
predict_dt <- predict(DT, test_cleaned, type = "class")

# Build the submission table. Both columns are factors at this point and
# write.csv() quotes factor columns, so convert them to plain integers
# (via as.character, so the labels are used and not the internal level codes).
res <- data.frame(
  PassengerId = as.integer(as.character(test_cleaned$PassengerId)),
  Survived = as.integer(as.character(predict_dt))
)

write.csv(res, file = "result.csv", row.names = FALSE)