[Neural Networks] Summarizing Reviews with an LSTM Model
Data

The Reviews.csv file loaded below appears to be the Amazon Fine Food Reviews dataset (Kaggle), which provides the full review Text and a short Summary for each review.

Code
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
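If the NLTK stopword list has not been downloaded on this machine yet, stopwords.words('english') below will raise a LookupError; a one-time download (not part of the original post) takes care of it:

import nltk
nltk.download('stopwords')   # one-time download of the stopword corpus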
path = "C:\\Users\\student\\Desktop\\DY\\★ 데이터\\402. word2vec\\"
data=pd.read_csv(path + "Reviews.csv", nrows=60000)
print(len(data))
>>> 60000
#data.head(5)
#data.info()
data=data[['Text','Summary']]
data.head()
Data preprocessing
data.dropna(axis=0, inplace=True)
# preprocessing
contractions = {
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}
stopWords=set(stopwords.words('english'))
len(stopWords) #179
def preprocessSentence(sent, rs=True):
    sent=sent.lower()
    sent=BeautifulSoup(sent, "lxml").text            # remove HTML tags
    sent=re.sub(r"\([^)]*\)", "", sent)              # drop parenthesized text
    sent=re.sub('"', "", sent)                       # drop double quotes
    sent=" ".join([contractions[t] if t in contractions else t
                   for t in sent.split(" ")])        # expand contractions
    sent=re.sub(r"'s\b", "", sent)                   # remove possessive 's
    sent=re.sub("[^a-zA-Z]", " ", sent)              # keep alphabetic characters only
    sent=re.sub("[m]{2,}", "mm", sent)               # collapse long runs of 'm' (e.g. "ummm")
    if rs:   # rs==True => remove stopwords (review text)
        tokens=" ".join(word for word in sent.split()
                        if word not in stopWords and len(word)>1)
    else:    # keep stopwords (summary)
        tokens=" ".join(word for word in sent.split() if len(word)>1)
    return tokens
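As a quick sanity check (not in the original post), the function can be tried on a made-up review string; with the default rs=True stopwords are removed, while rs=False keeps them, as intended for summaries:

print(preprocessSentence("I can't believe it's not <b>butter</b>!"))            # stopwords removed
print(preprocessSentence("I can't believe it's not <b>butter</b>!", rs=False))  # stopwords kept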
cleanText=[]
for sent in data['Text']:
    cleanText.append(preprocessSentence(sent))
print(cleanText[:10])
data['Text']=cleanText

cleanSummary=[]
for sent in data['Summary']:
    cleanSummary.append(preprocessSentence(sent, rs=False))  # keep stopwords in summaries
print(cleanSummary[:10])
data['Summary']=cleanSummary
data[:10]
data.replace("", np.nan,inplace=True)
data.isnull().sum()
data.dropna(axis=0, inplace=True)
textLen=[len(s.split()) for s in data['Text']]
summaryLen=[len(s.split()) for s in data['Summary']]
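Before fixing the length cutoffs below, it helps to eyeball the length distributions; here is a minimal matplotlib sketch (not in the original post) using the textLen and summaryLen lists just computed:

plt.subplot(1, 2, 1)
plt.hist(textLen, bins=40)
plt.title('Text length')
plt.subplot(1, 2, 2)
plt.hist(summaryLen, bins=40)
plt.title('Summary length')
plt.show()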
textMaxLen=50
summaryMaxLen=8
def threshLen(mlen, nlist):
    c=0
    for s in nlist:
        if(len(s.split()) <= mlen):
            c+=1
    print(c/len(nlist))

threshLen(textMaxLen, data['Text'])
threshLen(summaryMaxLen, data['Summary'])
data=data[data['Text'].apply(lambda x: len(x.split())<=textMaxLen)]
data=data[data['Summary'].apply(lambda x: len(x.split())<=summaryMaxLen)]

# seq2seq: wrap every summary with start/end tokens for the decoder
data['Summary']=data['Summary'].apply(lambda x: "sostoken "+ x +" eostoken")
data['Summary']
Splitting into training and test sets
textData=list(data['Text'])
summaryData=list(data['Summary'])
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest=train_test_split(textData, summaryData, test_size=0.2, random_state=42, shuffle=True)
len(xTrain)#54448
len(xTest)#13613
srcToken=Tokenizer()
srcToken.fit_on_texts(xTrain)
# build the source (review text) vocabulary
totalCnt=len(srcToken.word_index) # about 32,000 words
totalFreq=0 # total frequency of all words
rCnt=0      # number of words appearing fewer than 10 times
rFreq=0     # total frequency of those rare words
for k, v in srcToken.word_counts.items():
    totalFreq+=v
    if(v<10):
        rCnt+=1
        rFreq+=v
print("total word frequency", totalFreq) # 1323945
print("vocabulary size", totalCnt)       # 32668
print(rCnt/totalCnt)   # ~78%
print(rFreq/totalFreq) # ~4%
print(totalCnt-rCnt)   # 6956 -> use 7000
>>>
total word frequency 1323945
vocabulary size 32668
0.7870699155136525
0.04192696826529803
6956
srcVocab=7000
srcToken=Tokenizer(num_words=srcVocab)
srcToken.fit_on_texts(xTrain)
xTrain=srcToken.texts_to_sequences(xTrain)
xTest=srcToken.texts_to_sequences(xTest)
print(xTrain[0])
data['Text'][0]
tarToken=Tokenizer()
tarToken.fit_on_texts(yTrain)
totalFreq=0 # total frequency of all words
rCnt=0      # number of words appearing fewer than 10 times
rFreq=0     # total frequency of those rare words
for k, v in tarToken.word_counts.items():
    totalFreq+=v
    if(v<10):
        rCnt+=1
        rFreq+=v
print("total word frequency", totalFreq)
print("vocabulary size", totalCnt)
print(rCnt/totalCnt)
print(rFreq/totalFreq)
print(totalCnt-rCnt)
>>>
total word frequency 254334
vocabulary size 32668
0.28223337822946004
0.07756336156392775
23448
tarVoc=2000
tarTokenizer=Tokenizer(num_words=tarVoc)
tarTokenizer.fit_on_texts(yTrain)
yTrain=tarTokenizer.texts_to_sequences(yTrain)
yTest=tarTokenizer.texts_to_sequences(yTest)
# drop samples whose summary collapsed to just [sostoken, eostoken] after rare words were filtered out
dropTrain=[i for i, sent in enumerate(yTrain) if len(sent)==2]
dropTest=[i for i, sent in enumerate(yTest) if len(sent)==2]

xTrain=np.delete(xTrain, dropTrain, axis=0)
yTrain=np.delete(yTrain, dropTrain, axis=0)
xTest=np.delete(xTest, dropTest, axis=0)
yTest=np.delete(yTest, dropTest, axis=0)
len(xTrain) #52722
len(xTest) #13181
>>> 13181
Padding
xTrain=pad_sequences(xTrain,maxlen=textMaxLen, padding='post')
xTest=pad_sequences(xTest,maxlen=textMaxLen, padding='post')
yTrain=pad_sequences(yTrain,maxlen=summaryMaxLen, padding='post')
yTest=pad_sequences(yTest,maxlen=summaryMaxLen, padding='post')
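A quick shape check (not in the original post): after padding, every review should be a length-50 integer vector and every summary a length-8 vector.

print(xTrain.shape, yTrain.shape)   # expected (n_train, 50) and (n_train, 8)
print(xTest.shape, yTest.shape)     # expected (n_test, 50) and (n_test, 8)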
Building the model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
embedding_dim = 128
hidden_size = 256
# encoder
encoder_inputs = Input(shape=(textMaxLen,))
# encoder embedding layer
enc_emb = Embedding(srcVocab, embedding_dim)(encoder_inputs)
# encoder LSTM 1
encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
# encoder LSTM 2
encoder_lstm2 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
# encoder LSTM 3
encoder_lstm3 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)
# decoder
decoder_inputs = Input(shape=(None,))
# decoder embedding layer (kept in a variable so it can be reused for the inference decoder below)
dec_emb_layer = Embedding(tarVoc, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
# decoder LSTM
decoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
# decoder output layer
decoder_softmax_layer = Dense(tarVoc, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_outputs)
# define the model
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()
Training the model
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/thushv89/attention_keras/master/layers/attention.py", filename="attention.py")
from attention import AttentionLayer
# attention layer (attention mechanism)
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
# concatenate the attention output with the decoder hidden states
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
# decoder output layer
decoder_softmax_layer = Dense(tarVoc, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_concat_input)
# redefine the model, now with attention
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 2)
history = model.fit([xTrain, yTrain[:,:-1]], yTrain.reshape(yTrain.shape[0], yTrain.shape[1], 1)[:,1:] \
,epochs=50, callbacks=[es], batch_size = 256, validation_data=([xTest, yTest[:,:-1]], \
yTest.reshape(yTest.shape[0], yTest.shape[1], 1)[:,1:]))
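ModelCheckpoint is imported above but never used; to also keep the best weights on disk, a checkpoint callback could be added next to early stopping (a sketch, not in the original post; the filename is arbitrary):

mc = ModelCheckpoint('summarizer_weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True)  # hypothetical filename
# then pass callbacks=[es, mc] to model.fit above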
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
src_index_to_word = srcToken.index_word       # source vocabulary: integer -> word
tar_word_to_index = tarTokenizer.word_index   # summary vocabulary: word -> integer
tar_index_to_word = tarTokenizer.index_word   # summary vocabulary: integer -> word
# encoder for inference
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])
# tensors that hold the states from the previous time step
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))
dec_emb2 = dec_emb_layer(decoder_inputs)
# To predict the next word, the previous time step's states are fed in as the initial state;
# this is done in decode_sequence() below.
# Unlike during training, the hidden state and cell state returned by the LSTM are kept.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
# attention for inference
decoder_hidden_state_input = Input(shape=(textMaxLen, hidden_size))
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])
# decoder output layer
decoder_outputs2 = decoder_softmax_layer(decoder_inf_concat)
# final decoder model
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])
def decode_sequence(input_seq):
    # get the encoder outputs and states for the input review
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # start decoding with a length-1 target sequence containing sostoken
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tar_word_to_index['sostoken']

    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:   # loop until stop_condition becomes True
        # use the previous time step's states as the current initial state
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tar_index_to_word[sampled_token_index]

        if sampled_token != 'eostoken':
            decoded_sentence += ' ' + sampled_token

        # stop on eostoken or when the maximum summary length is reached
        if sampled_token == 'eostoken' or len(decoded_sentence.split()) >= (summaryMaxLen - 1):
            stop_condition = True

        # update the length-1 target sequence with the sampled word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # update the states
        e_h, e_c = h, c

    return decoded_sentence
# convert an integer sequence of the original text back to words
def seq2text(input_seq):
    temp=''
    for i in input_seq:
        if(i!=0):
            temp = temp + src_index_to_word[i] + ' '
    return temp

# convert an integer sequence of the summary back to words
def seq2summary(input_seq):
    temp=''
    for i in input_seq:
        if(i!=0 and i!=tar_word_to_index['sostoken'] and i!=tar_word_to_index['eostoken']):
            temp = temp + tar_index_to_word[i] + ' '
    return temp
for i in range(500, 1000):
    print("Original text :", seq2text(xTest[i]))
    print("Actual summary :", seq2summary(yTest[i]))
    print("Predicted summary :", decode_sequence(xTest[i].reshape(1, textMaxLen)))
    print("\n")
This inference step threw an error and still needs debugging...