[머신러닝] 랜덤포레스트를 이용한 은행 마케팅 (deposit 예측)

2020. 5. 19. 11:43 · 노트/Python : 프로그래밍

 

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn import tree 
from sklearn import metrics 
%matplotlib inline

 

데이터 

 

# UCI Bank Marketing data set 

archive.ics.uci.edu/ml/datasets.php

# Load the UCI Bank Marketing data set.
# Source: https://archive.ics.uci.edu/ml/datasets.php
path = "C:\\Users\\student\\Desktop\\DY\\★ 데이터\\307.bankmarketing\\"
csv_file = path + "bank.csv"
bank = pd.read_csv(csv_file)
bank.head()
# age              : numeric column
# job ... poutcome : categorical columns (per the original note)
# deposit          : yes / no, the value to predict

 

결측값 확인 

# Number of missing values in each column.
missing_mask = bank.isnull()
missing_mask.sum()
>>> 
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

# bank.describe()  #기술통계   

 

데이터 EDA

# Box plot of the age column.
sns.boxplot(x=bank["age"])

# Histogram of age with 100 bins.
# (A stray zero-width space after the closing parenthesis made this line a
# syntax error; it has been removed.)
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm
# against the installed version before switching to histplot/displot.
sns.distplot(bank.age, bins=100)

# Box plot of the duration column.
sns.boxplot(x=bank["duration"])

 

데이터 전처리 

# Preprocessing (categorical conversion) is done on a copy, so the raw
# `bank` frame stays untouched.
bankData = bank.copy()
bankData.info()
bankData.columns

# Number of rows per job.
bankData["job"].value_counts()
>>> 
management       2566
blue-collar      1944
technician       1823
admin.           1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: job, dtype: int64

# Per-job row counts, restricted to rows whose deposit is "yes".
# The same counts are computed two ways: groupby/size and value_counts.
deposit_yes_rows = bankData[bankData["deposit"] == "yes"]
deposit_yes_rows.groupby("job").size()
deposit_yes_rows["job"].value_counts()

>>> 
job
admin.            631
blue-collar       708
entrepreneur      123
housemaid         109
management       1301
retired           516
self-employed     187
services          369
student           269
technician        840
unemployed        202
unknown            34
dtype: int64

# Distinct job labels (set order is arbitrary, so the print order may vary).
jobs = list(set(bankData.job.values))
jobs

# Same per-job "yes" counts as above, computed with an explicit loop.
is_yes = bankData.deposit == "yes"
for j in jobs:
    yes_in_job = bankData[is_yes & (bankData.job == j)]
    print("{:15}:{:5}".format(j, len(yes_in_job)))

>>> 
unknown        :   34
entrepreneur   :  123
retired        :  516
housemaid      :  109
management     : 1301
self-employed  :  187
technician     :  840
unemployed     :  202
student        :  269
admin.         :  631
blue-collar    :  708
services       :  369
# Collapse job into coarser groups: white-collar / pink-collar / other.
# Job labels not listed here are left unchanged.
#
# The raw label is "admin." (with a trailing period), so it has to be spelled
# exactly that way; "admin" matches nothing and silently leaves those rows
# ungrouped.
# NOTE: outputs pasted later in this post were produced before this fix, so
# figures that depend on the number of job categories may differ slightly.
job_groups = {
    'management': "white-collar",
    'admin.': "white-collar",
    'services': "pink-collar",
    'housemaid': "pink-collar",
    'retired': "other",
    'student': "other",
    'unemployed': "other",
    'unknown': "other",
}
bankData['job'] = bankData['job'].replace(job_groups)
                                         
----------------------------------------------------------------------------
# Row counts per poutcome value, before merging categories.
bankData["poutcome"].value_counts()
>>> 
unknown    8326
failure    1228
success    1071
other       537
Name: poutcome, dtype: int64

# Fold the "other" outcome into "unknown", then re-check the counts.
bankData['poutcome'] = bankData['poutcome'].replace({'other': 'unknown'})
bankData["poutcome"].value_counts()
>>> 
unknown    8863
failure    1228
success    1071
Name: poutcome, dtype: int64

---------------------------------------------------------------------------
# Inspect the contact column before removing it.
contact_column = bankData['contact']
contact_column.value_counts()
>>> 
cellular     8042
unknown      2346
telephone     774
Name: contact, dtype: int64

# Remove the contact column; inplace=True mutates bankData itself instead of
# returning a new frame.
bankData.drop(columns='contact', inplace=True)
--------------------------------------------------------------------------
# Encode default as 0/1 in a new default_cat column and drop the text column.
# The mapping step was missing: the pasted output is named default_cat and
# later code reads bankData.default_cat, but nothing ever created it.
# Assumes default holds 'yes'/'no' like housing and deposit - TODO confirm.
bankData['default_cat'] = bankData['default'].map({'no': 0, 'yes': 1})
bankData.drop('default', axis=1, inplace=True)
bankData['default_cat'].value_counts()
>>> 
0    10994
1      168
Name: default_cat, dtype: int64

# Housing loan taken or not -> 0/1 in housing_cat; the text column is dropped.
bankData['housing_cat'] = bankData["housing"].map({'no': 0, "yes": 1})
bankData.drop("housing", axis=1, inplace=True)

# Same encoding for loan. This step was missing, although loan_cat is read
# on the next line and again when filtering the dummy-encoded frame.
# Assumes loan holds 'yes'/'no' like housing - TODO confirm.
bankData['loan_cat'] = bankData['loan'].map({'no': 0, 'yes': 1})
bankData.drop('loan', axis=1, inplace=True)

bankData.loan_cat.value_counts()
>>> 
0    9702
1    1460
Name: loan_cat, dtype: int64

-----------------------------------------------------------------------
# day and month are removed from the frame.
bankData.drop(columns=['day', 'month'], inplace=True)

# Encode the target as deposit_cat (no -> 0, yes -> 1) and drop the text column.
deposit_codes = {'no': 0, "yes": 1}
bankData["deposit_cat"] = bankData["deposit"].map(deposit_codes)
bankData.drop(columns="deposit", inplace=True)

bankData["deposit_cat"].value_counts()

>>> 
0    5873
1    5289
Name: deposit_cat, dtype: int64

# Rows with pdays == -1, i.e. not contacted (8324 in the recorded run).
not_contacted = bankData["pdays"] == -1
len(bankData[not_contacted])
>>> 8324 

# Largest pdays value: 854 days since the contact in the recorded run.
bankData.pdays.max()
>>> 854 

# Replace the -1 marker with a large value (10000) so its reciprocal below
# is close to zero.
bankData["pdays"] = bankData["pdays"].replace(-1, 10000)

# recent_pdays = 1 / pdays. The original np.where(...) had identical "true"
# and "false" branches, so it reduces to this plain reciprocal.
# The HIGHER recent_pdays is, the more recent the contact (pdays is smaller).
bankData['recent_pdays'] = 1 / bankData["pdays"]

# The raw pdays column is no longer needed.
bankData.drop('pdays', axis=1, inplace=True)

 

데이터 인코딩 

# Create dummy (one-hot) variables for the categorical columns.
# Each prefix is the column name itself. The education prefix used to be
# ' education' with a leading space, which put that space at the start of
# every generated education column name.
categorical_cols = ['job', 'marital', 'education', 'poutcome']
bankWitDummies = pd.get_dummies(data=bankData,
                                columns=categorical_cols,
                                prefix=categorical_cols)

bankWitDummies.shape  # (11162, 28) in the originally recorded run
bankWitDummies.describe()

# Scatter plot of age against balance.
bankWitDummies.plot(kind="scatter", x="age", y="balance")

# Histogram of duration.
bankWitDummies.plot(kind="hist", x="poutcome_success", y="duration")

# Summary statistics for the rows with deposit_cat == 1.
has_deposit = bankData.deposit_cat == 1
bankWitDummies[has_deposit].describe()

# Number of customers with all three flags set: deposit, loan and housing loan.
all_three = has_deposit & (bankWitDummies.loan_cat) & (bankWitDummies.housing_cat)
len(bankWitDummies[all_three])
              
>>> 265

# 265 명은 저축, 개인대출, 주택대출 모두 있음 

# Number of customers with deposit_cat == 1 (term deposit) and default_cat == 1.
deposit_and_default = (bankData.deposit_cat == 1) & (bankData.default_cat == 1)
len(bankWitDummies[deposit_and_default])

>>> 52 
# 52명 
# Bar plot of deposit_cat by job (term-deposit subscription rate per job).
plt.figure(figsize=(10, 6))
sns.barplot(data=bankData, x="job", y="deposit_cat")

# Correlation matrix of the dummy-encoded frame.
bankwd = bankWitDummies
corr = bankwd.corr()
corr

# Heatmap of the correlation matrix, centred at 0 with a diverging palette.
plt.figure(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=cmap, linewidths=.5, square=True, center=0)

# Correlation of every other column with deposit_cat (its own entry removed).
deposit_corr_series = corr['deposit_cat'].drop('deposit_cat')
corr_deposit = pd.DataFrame(deposit_corr_series)

# Sort by correlation with deposit_cat, largest first.
corr_deposit.sort_values(by="deposit_cat", ascending=False)

 

 

모델 생성 

# Input features: every column except the target. axis is passed by keyword
# because recent pandas no longer accepts it positionally in drop().
dropDeposit = bankwd.drop("deposit_cat", axis=1)  # input variables
label = bankwd.deposit_cat  # output variable
dropDeposit

# NOTE(review): train_size=0.2 trains on only 20% of the rows and tests on
# the other 80%. Presumably test_size=0.2 was intended - confirm. It is left
# unchanged here because the scores recorded below are consistent with this
# split.
dataTrain, dataTest, labelTrain, labelTest = train_test_split(
    dropDeposit, label, train_size=0.2, random_state=42)

# Depth-2 decision tree. This step was missing, although the classifier repr
# pasted below shows max_depth=2 and the following score lines read df2.
df2 = tree.DecisionTreeClassifier(max_depth=2)
df2.fit(dataTrain, labelTrain)
                                                             
>>> 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
# Accuracy of the current df2 tree on the training and test splits.
df2_sc_train = df2.score(dataTrain, labelTrain)
df2_sc_test = df2.score(dataTest, labelTest)

for caption, value in (("트레이닝 스코어:", df2_sc_train),
                       ("테스트 스코어:", df2_sc_test)):
    print(caption, value)

>>> 
트레이닝 스코어: 0.7670250896057348
테스트 스코어: 0.7530795072788354


# Decision tree limited to depth 3 (fit returns the fitted estimator itself).
df2 = tree.DecisionTreeClassifier(max_depth=3).fit(dataTrain, labelTrain)

# Accuracy on the training and test splits.
df2_sc_train = df2.score(dataTrain, labelTrain)
df2_sc_test = df2.score(dataTest, labelTest)

for caption, value in (("트레이닝 스코어:", df2_sc_train),
                       ("테스트 스코어:", df2_sc_test)):
    print(caption, value)
>>> 
트레이닝 스코어: 0.7732974910394266
테스트 스코어: 0.7559910414333707



# Decision tree with no depth limit (all default settings).
df2 = tree.DecisionTreeClassifier().fit(dataTrain, labelTrain)

# Accuracy on the training and test splits.
df2_sc_train = df2.score(dataTrain, labelTrain)
df2_sc_test = df2.score(dataTest, labelTest)

for caption, value in (("트레이닝 스코어:", df2_sc_train),
                       ("테스트 스코어:", df2_sc_test)):
    print(caption, value)

>>> 
트레이닝 스코어: 1.0
테스트 스코어: 0.7189249720044792


from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
model = forest.fit(dataTrain, labelTrain)

# Accuracy on the training and test splits, to three decimals.
forest_train_acc = forest.score(dataTrain, labelTrain)
forest_test_acc = forest.score(dataTest, labelTest)
print("트레이닝 스코어: {:.3f}".format(forest_train_acc))
print("테스트: {:.3f}".format(forest_test_acc))

>>> 
트레이닝 스코어: 1.000
테스트: 0.799

 

시각화 

import os
# Make the Graphviz executables discoverable (hard-coded Windows install path).
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'


import graphviz

# Write the fitted decision tree to bank.dot before reading it back.
# This step was missing: the file was opened below without ever being
# produced, even though export_graphviz is imported at the top of the file.
# NOTE(review): assumes df2 is the tree meant to be drawn - confirm.
export_graphviz(df2, out_file="bank.dot",
                feature_names=list(dataTrain.columns),
                class_names=["no", "yes"],  # deposit_cat 0 -> no, 1 -> yes
                filled=True)

with open("bank.dot", encoding="UTF-8") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))

# Also save the graph as an image (bank.png).
graphviz.Source(dot_graph).render('bank', format="png")

(트리의 깊이가 너무 깊어서 화면에서는 잘림) 이미지 파일로는 저장됨

 

특성중요도 

# Keep Korean text in plots from rendering as broken glyphs.
import matplotlib
from matplotlib import font_manager, rc
import platform

if platform.system() == "Windows":
    # Look up the family name of the font file and make it the default font.
    font_props = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf")
    font_name = font_props.get_name()
    rc('font', family=font_name)


def plot_feature_importances_cancer(model):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``. The
            importances are read from this argument; previously the
            parameter was ignored and the global ``forest`` was always used.

    Bars are labelled with the column names of the module-level ``dataTest``
    frame, so ``model`` must have been fitted on data with the same columns.
    """
    plt.figure(figsize=(15, 5))
    n_features = dataTest.shape[1]
    positions = np.arange(n_features)
    plt.barh(positions, model.feature_importances_, align="center")
    # Tick and axis labels are drawn in white.
    # NOTE(review): presumably for a dark notebook theme - confirm.
    plt.yticks(positions, dataTest.columns, color="w")
    plt.xticks(color="w")
    plt.xlabel("특성 중요도", color="w")
    plt.ylabel("특성", color="w")
    plt.ylim(-1, n_features)


# Pass the fitted random forest; the call used to pass `tree`, which is the
# sklearn.tree module, not a model.
plot_feature_importances_cancer(forest)

duration이 deposit 예측에 가장 중요한 특성이고, 그다음으로 balance, age 순으로 중요함