[머신러닝] 랜덤포레스트를 이용한 은행 마케팅 (deposit 예측)
2020. 5. 19. 11:43ㆍ노트/Python : 프로그래밍
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn import metrics
%matplotlib inline
데이터
# UCI Bank Marketing data set
# https://archive.ics.uci.edu/ml/datasets.php
# Local path to the UCI Bank Marketing data set (Windows machine).
path = "C:\\Users\\student\\Desktop\\DY\\★ 데이터\\307.bankmarketing\\"
bank = pd.read_csv(path + "bank.csv")
# UCI Bank Marketing data set
# https://archive.ics.uci.edu/ml/datasets.php
bank.head()
# age: numeric feature
# job ~ poutcome: categorical features
# deposit: the yes/no target to predict
결측값 확인
# Count missing values per column (all zero, per the output below)
bank.isnull().sum()
>>>
age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
duration 0
campaign 0
pdays 0
previous 0
poutcome 0
deposit 0
dtype: int64
# bank.describe() #기술통계
데이터 EDA
# Exploratory box/distribution plots
sns.boxplot(x=bank["age"])
# distplot is deprecated in newer seaborn (use histplot/displot there)
sns.distplot(bank.age , bins=100)
sns.boxplot(x=bank["duration"])
데이터 전처리
# Work on a copy so the raw 'bank' frame stays untouched
# (categorical columns will be converted/encoded below)
bankData = bank.copy()
bankData.info()
bankData.columns
# Frequency of each job category
bankData.job.value_counts()
>>>
management 2566
blue-collar 1944
technician 1823
admin. 1334
services 923
retired 778
self-employed 405
student 360
unemployed 357
entrepreneur 328
housemaid 274
unknown 70
Name: job, dtype: int64
# Counts per job restricted to deposit == 'yes' (two equivalent ways)
bankData[bankData.deposit=='yes'].groupby("job").size()
bankData[bankData.deposit=="yes"].job.value_counts()
>>>
job
admin. 631
blue-collar 708
entrepreneur 123
housemaid 109
management 1301
retired 516
self-employed 187
services 369
student 269
technician 840
unemployed 202
unknown 34
dtype: int64
# Unique job categories present in the data.
jobs = list(set(bankData.job.values))
jobs
# Print an aligned table of deposit=="yes" counts per job.
subscribed = bankData[bankData.deposit == "yes"]
for job_name in jobs:
    yes_count = len(subscribed[subscribed.job == job_name])
    print("{:15}:{:5}".format(job_name, yes_count))
>>>
unknown : 34
entrepreneur : 123
retired : 516
housemaid : 109
management : 1301
self-employed : 187
technician : 840
unemployed : 202
student : 269
admin. : 631
blue-collar : 708
services : 369
# Collapse job into three buckets: white-collar / pink-collar / other.
# FIX: the data uses "admin." with a trailing period (see the value_counts
# output above), so replacing 'admin' matched nothing and admin. rows were
# silently left unmapped — use 'admin.' instead.
bankData['job'] = bankData['job'].replace(['management', 'admin.'], "white-collar")
bankData['job'] = bankData['job'].replace(['services', 'housemaid'], "pink-collar")
bankData['job'] = bankData['job'].replace(['retired', 'student', 'unemployed',
                                           'unknown'], "other")
----------------------------------------------------------------------------
# Distribution of the previous-campaign outcome
bankData.poutcome.value_counts()
>>>
unknown 8326
failure 1228
success 1071
other 537
Name: poutcome, dtype: int64
# Fold the small 'other' category into 'unknown'
bankData['poutcome']= bankData['poutcome'].replace(['other'],'unknown')
bankData.poutcome.value_counts()
>>>
unknown 8863
failure 1228
success 1071
Name: poutcome, dtype: int64
---------------------------------------------------------------------------
# Drop the contact column (contact channel; mostly cellular/unknown)
bankData['contact'].value_counts()
>>>
cellular 8042
unknown 2346
telephone 774
Name: contact, dtype: int64
bankData.drop('contact',axis=1, inplace=True)
# inplace=True: modify bankData itself instead of returning a copy
--------------------------------------------------------------------------
# Encode 'default' as 0/1
bankData['default'].value_counts()
# NOTE(review): the output below is labelled 'default_cat', so the line that
# created it (mapping no/yes to 0/1 and dropping 'default') appears to be
# missing from this excerpt — confirm against the original notebook.
>>>
0 10994
1 168
Name: default_cat, dtype: int64
# Housing loan taken or not -> encode as 0/1
bankData['housing_cat'] = bankData["housing"].map({'no':0,"yes":1})
bankData.drop("housing", axis = 1 , inplace =True )
# NOTE(review): 'loan_cat' is used here but its creation (an analogous
# map/drop of 'loan') is not shown above — presumably elided from the post.
bankData.loan_cat.value_counts()
>>>
0 9702
1 1460
Name: loan_cat, dtype: int64
-----------------------------------------------------------------------
# Drop raw calendar fields
bankData.drop('day',axis=1, inplace=True)
bankData.drop('month',axis=1, inplace=True)
# Encode the target: deposit yes/no -> deposit_cat 1/0
bankData["deposit_cat"]=bankData["deposit"].map({'no':0, "yes":1})
bankData.drop("deposit",axis=1,inplace=True)
bankData.deposit_cat.value_counts()
>>>
0 5873
1 5289
Name: deposit_cat, dtype: int64
len(bankData[bankData["pdays"]==-1]) # 8324 명 컨택 x
>>> 8324
bankData["pdays"].max() # 컨택 한지 854일 지남
>>> 854
bankData["pdays"]=bankData["pdays"].replace(-1,10000)
bankData['recent_pdays']=np.where(bankData['pdays'],1/ bankData["pdays"], 1/bankData['pdays'])
# pdays가 높을 수록 최근에 연락했다는 의미가 됌
bankData.drop('pdays', axis= 1 , inplace=True)
데이터 인코딩
# One-hot encode the remaining categorical columns.
# FIX: the 'education' prefix had a stray leading space (' education'),
# which produced column names like ' education_primary'.
bankWitDummies = pd.get_dummies(data = bankData,
                                columns = ['job', 'marital', 'education', 'poutcome'],
                                prefix = ['job', 'marital', 'education', 'poutcome'])
bankWitDummies.shape # (11162, 28)
bankWitDummies.describe()
# Quick visual checks on the encoded frame
bankWitDummies.plot(kind="scatter", x="age", y= "balance")
bankWitDummies.plot(kind="hist", x="poutcome_success", y="duration")
# Summary statistics for clients who subscribed to a term deposit.
# NOTE(review): boolean masks built from bankData index bankWitDummies —
# this works because both frames share the same index.
bankWitDummies[bankData.deposit_cat==1].describe()
# Clients holding all three products (deposit + personal loan + housing loan)
len(bankWitDummies[(bankData.deposit_cat==1)&
(bankWitDummies.loan_cat)&
(bankWitDummies.housing_cat)])
>>> 265
# 265 clients have a term deposit, a personal loan, and a housing loan
# Count clients with deposit_cat == 1 (subscribed) and default_cat == 1
# NOTE(review): 'default_cat' is referenced but its creation is not shown
# in this excerpt — presumably 'default' was mapped to 0/1 earlier.
len(bankWitDummies[(bankData.deposit_cat==1)&(bankData.default_cat==1)])
>>> 52
# 52 clients
# Term-deposit subscription rate by job category
plt.figure(figsize=(10,6))
sns.barplot(x="job", y="deposit_cat", data=bankData)
bankwd=bankWitDummies
# Correlation matrix of the (now fully numeric) columns
corr=bankwd.corr()
corr
plt.figure(figsize =(10,10))
cmap= sns.diverging_palette(220,10,as_cmap=True)
sns.heatmap(corr, cmap=cmap, linewidths = .5, square = True , center = 0 )
# Correlation of each feature with the target, excluding the target itself
corr_deposit = pd.DataFrame(corr['deposit_cat'].drop('deposit_cat'))
# Sort by correlation with deposit_cat, descending
corr_deposit.sort_values(by="deposit_cat", ascending=False)
모델 생성
# Features: everything except the target column.
# FIX: pass axis=1 by keyword — the bare positional axis argument to
# DataFrame.drop is deprecated and removed in modern pandas.
dropDeposit = bankwd.drop("deposit_cat", axis=1)  # input features
label = bankwd.deposit_cat  # target
dropDeposit
# NOTE(review): train_size=0.2 trains on only 20% of the data; the common
# convention is test_size=0.2. Kept as-is because the scores printed below
# were produced with this split — confirm the intent.
dataTrain, dataTest, labelTrain, labelTest = train_test_split(dropDeposit, label,
                                                              train_size=0.2, random_state=42)
>>>
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
# NOTE(review): the repr above shows DecisionTreeClassifier(max_depth=2),
# but the lines that create and fit that df2 model are missing from this
# excerpt — the scores below belong to the depth-2 tree.
df2_sc_train= df2.score(dataTrain,labelTrain)
df2_sc_test = df2.score(dataTest, labelTest)
print("트레이닝 스코어:", df2_sc_train)
print("테스트 스코어:", df2_sc_test)
>>>
트레이닝 스코어: 0.7670250896057348
테스트 스코어: 0.7530795072788354
# Decision tree capped at depth 3 — slightly better than the depth-2 model
df2 = tree.DecisionTreeClassifier(max_depth = 3 )
df2.fit(dataTrain, labelTrain)
df2_sc_train= df2.score(dataTrain,labelTrain)
df2_sc_test = df2.score(dataTest, labelTest)
print("트레이닝 스코어:", df2_sc_train)
print("테스트 스코어:", df2_sc_test)
>>>
트레이닝 스코어: 0.7732974910394266
테스트 스코어: 0.7559910414333707
# Unrestricted tree: fits the training data perfectly (score 1.0) but the
# test score drops — classic overfitting.
df2 = tree.DecisionTreeClassifier( )
df2.fit(dataTrain, labelTrain)
df2_sc_train= df2.score(dataTrain,labelTrain)
df2_sc_test = df2.score(dataTest, labelTest)
print("트레이닝 스코어:", df2_sc_train)
print("테스트 스코어:", df2_sc_test)
>>>
트레이닝 스코어: 1.0
테스트 스코어: 0.7189249720044792
# Random forest: 100 trees, fixed seed for reproducibility.
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, random_state=0)
model = forest.fit(dataTrain, labelTrain)
# Compute both scores first, then report.
train_score = forest.score(dataTrain, labelTrain)
test_score = forest.score(dataTest, labelTest)
print("트레이닝 스코어: {:.3f}".format(train_score))
print("테스트: {:.3f}".format(test_score))
>>>
트레이닝 스코어: 1.000
테스트: 0.799
시각화
import os
# Add the Graphviz binaries to PATH (Windows install location)
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
import graphviz
# NOTE(review): bank.dot is read here, but the export_graphviz call that
# writes it is not shown in this excerpt. (Indentation of the with-body was
# lost in the paste; f.read() should be indented under the with.)
with open("bank.dot" ,encoding="UTF-8") as f:
dot_graph = f.read()
display(graphviz.Source(dot_graph))
# Render the tree to a PNG file as well
graphviz.Source(dot_graph).render('bank', format="png")
( 깊이가 너무 깊어서 화면에서는 잘림 ) 이미지 파일로는 저장됨
특성중요도
# Prevent broken Hangul glyphs in matplotlib (use Malgun Gothic on Windows).
# NOTE(review): the indentation under the if was lost in the paste; the two
# lines below should be indented inside the Windows branch.
import matplotlib
from matplotlib import font_manager, rc
import platform
if platform.system()=="Windows":
font_name=font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
def plot_feature_importances_cancer(model):
    """Horizontal bar chart of a fitted model's feature importances.

    FIX: the original body ignored `model` and always read the global
    `forest`, and the call site passed the sklearn `tree` module — which
    only "worked" because the argument was unused. Now the passed model's
    feature_importances_ are plotted, and the call passes `forest`.
    """
    plt.figure(figsize=(15, 5))
    # One bar per feature column of the test frame (global dataTest).
    n_features = dataTest.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align="center")
    # color="w": white tick/label text (readable on the blog's dark theme)
    plt.yticks(np.arange(n_features), dataTest.columns, color="w")
    plt.xticks(color="w")
    plt.xlabel("특성 중요도", color="w")
    plt.ylabel("특성", color="w")
    plt.ylim(-1, n_features)

plot_feature_importances_cancer(forest)
duration이 deposit 예측에 가장 중요한 특성이고, 그다음으로 balance, age 순으로 중요함
'노트 > Python : 프로그래밍' 카테고리의 다른 글
[가상환경] conda 가상환경 명령어 (0) | 2020.10.15 |
---|---|
[신경망] LSTM 모델을 이용한 리뷰 요약하기 (2) | 2020.05.20 |
[머신러닝] 결정트리와 랜덤포레스트를 이용한 분류 기법 (0) | 2020.05.19 |
[자연어처리] LSTM을 이용한 챗봇(chatbot) 만들기 (0) | 2020.05.18 |
[자연어처리] 문장 생성하기 (text generation) (0) | 2020.05.18 |