[분류] KNN 알고리즘을 이용한 데이터 분류하기

2020. 10. 24. 19:52 | 노트/R : 통계

중고차 데이터 

usedcars.csv
0.00MB

 

 

 

# Load the used-car data set; text columns stay character vectors instead of
# being converted to factors.
usedcars <- read.csv(
  file = "C:\\Users\\LG\\Downloads\\dataset_for_ml\\dataset_for_ml\\usedcars.csv",
  stringsAsFactors = FALSE
)

# Column types and a preview of the values.
str(usedcars)

# Summary statistics for a single column, then for two columns at once.
summary(usedcars$year)

summary(usedcars[c("price", "mileage")])

# Spread of price: minimum/maximum, then the distance between them.
range(usedcars$price)

diff(range(usedcars$price))

# Interquartile range (Q3 - Q1) of price. IQR() needs the full vector:
# feeding it range(...) would pass only the two extremes and yield half of
# the total range rather than the interquartile range.
IQR(usedcars$price)

# Deciles of price (0%, 10%, ..., 100%).
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.1))

# Boxplots of the two numeric columns.
boxplot(
  usedcars$price,
  ylab = "price($)",
  main = "Car prices"
)

boxplot(
  usedcars$mileage,
  ylab = "odometer",
  main = "Car mileage"
)

# Histograms of the same two columns.
hist(
  usedcars$price,
  xlab = "price($)",
  main = "Car prices"
)

hist(
  usedcars$mileage,
  xlab = "odometer",
  main = "Car mileage"
)

# Variance and standard deviation of price; the trailing comments are the
# outputs recorded by the author.
var(x = usedcars$price) # [1] 9749892
sd(x = usedcars$price) # [1] 3122.482

# One-way frequency tables for year and model.
table(usedcars$year)

table(usedcars$model)

# Colour frequencies, kept so they can be turned into percentages below.
c_table <- table(usedcars$color)

# Share of each colour as a percentage, rounded to one decimal place.
round(100 * prop.table(c_table), digits = 1)

 

일변량 통계: 변수 하나의 특성을 조사하는 통계
이변량 통계: 두 변수 사이의 관계를 나타내는 통계
다변량 통계: 두 개 이상의 변수 사이의 관계를 나타내는 통계
산포도(산점도, scatter plot): 이변량 데이터의 관계를 시각화하는 그래프

 

# Scatterplot: mileage on the x axis, price on the y axis.
plot(usedcars$mileage, usedcars$price)

# Author's reading of the plot: a strong negative relationship, i.e. the
# higher the mileage, the lower the price.

# Flag each car whose colour is one of the four listed values.
usedcars$conservative <- usedcars$color %in% c(
  "Black",
  "Gray",
  "Silver",
  "White"
)

# Count of TRUE/FALSE flags.
table(usedcars$conservative)

# Install gmodels only when it is missing, so re-running the script does not
# re-download the package (or fail when offline), then attach it.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(gmodels)

 

 

데이터

 

wisc_bc_data.csv
0.12MB

 

 

 

 

# Load the data set; text columns stay character vectors.
wbcd <- read.csv(
  file = "C:\\Users\\LG\\Downloads\\dataset_for_ml\\dataset_for_ml\\wisc_bc_data.csv",
  stringsAsFactors = FALSE
)
str(wbcd)

# Drop the first column (presumably a record identifier -- confirm against
# the str() output above) and inspect the result.
wbcd <- wbcd[-1]
str(wbcd)

# Class counts before recoding.
table(wbcd$diagnosis)

# Recode diagnosis as a factor: "B" becomes "Benign", "M" becomes
# "Malignant".
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Class counts after recoding.
table(wbcd$diagnosis)

# Summary statistics of three of the numeric features.
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])

정규화 

# Min-max rescale a numeric vector to the [0, 1] interval.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are ignored when finding the minimum and
#          maximum and stay NA in the result. With the default FALSE, any NA
#          in `x` makes every element of the result NA.
#
# Returns a vector of the same length as `x`. A vector whose values are all
# equal is mapped to zeros instead of NaN (0 / 0), and an empty vector is
# returned unchanged.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(x)
  }

  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]

  # Constant input: dividing by a zero span would produce NaN for every
  # element, so return the zero offsets directly.
  if (!is.na(span) && span == 0) {
    return(x - bounds[1])
  }

  (x - bounds[1]) / span
}

# Quick check of normalize() on a small vector.
normalize(c(1, 2, 3, 4, 5))

# Apply normalize() to columns 2 through 31 of wbcd, one column at a time,
# and collect the rescaled columns in a data frame.
wbcd_n <- as.data.frame(
  lapply(wbcd[2:31], normalize)
)

class(wbcd_n) # [1] "data.frame"

summary(wbcd_n$area_mean)

# Split by row position: rows 1-469 for training, rows 470-569 for testing.
wbcd_train <- wbcd_n[seq_len(469), ]
wbcd_test <- wbcd_n[seq(470, 569), ]

# Labels are taken from column 1 of wbcd for the same row ranges.
wbcd_train_labels <- wbcd[[1]][seq_len(469)]
wbcd_test_labels <- wbcd[[1]][seq(470, 569)]

KNN 알고리즘 

library(class)

# k-nearest-neighbour classification of the test rows, using the training
# rows and their labels with k = 21.
wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)

# Print the predicted labels.
wbcd_test_pred

# Cross-tabulate the stored test labels against the predictions, with the
# chi-square contribution of each cell turned off.
library(gmodels)
CrossTable(
  x = wbcd_test_labels,
  y = wbcd_test_pred,
  prop.chisq = FALSE
)

 

정규화(min-max): 최솟값과 최댓값을 기준으로 모든 값을 0~1 범위로 변환함
vs 표준화(z 점수): 미리 정해진 최솟값·최댓값이 없어서 극단값이 중심 방향으로 축소되지 않음

z 점수 표준화 
종양이 매우 커지면 측정값이 극단적인 이상치가 될 수 있으므로, 
거리 계산에서 이상치에 더 큰 가중치가 실리도록 두는 것이 합리적이라고 판단 

 

# Z-score standardise every column except the first one (the first column is
# dropped before scaling), centring and scaling each remaining column.
wbcd_z <- as.data.frame(
  scale(x = wbcd[-1], center = TRUE, scale = TRUE)
)

summary(wbcd_z$area_mean)
# The mean is 0 because the column has been standardised.