[Neural Networks] Summarizing Reviews with an LSTM Model

2020. 5. 20. 10:14 · Notes/Python: Programming

Data

Reviews60000.zip (10.00 MB)

 

Code

import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
path = "C:\\Users\\student\\Desktop\\DY\\★ 데이터\\402. word2vec\\"
data=pd.read_csv(path + "Reviews.csv", nrows=60000)
print(len(data))
>>> 60000 

#data.head(5)
#data.info()
data=data[['Text','Summary']]
data.head()

 

Data Preprocessing

data.dropna(axis=0, inplace=True)

# Preprocessing: dictionary for expanding English contractions
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

stopWords=set(stopwords.words('english'))
len(stopWords) #179
def preprocessSentence(sent, rs=True):
    sent = sent.lower()
    sent = BeautifulSoup(sent, "lxml").text            # strip HTML tags
    sent = re.sub(r"\([^)]*\)", "", sent)              # drop parenthesized text
    sent = re.sub('"', "", sent)                       # drop double quotes
    sent = " ".join([contractions[t] if t in
                     contractions else t for t in
                     sent.split(" ")])                 # expand contractions
    sent = re.sub(r"'s\b", "", sent)                   # drop possessive 's
    sent = re.sub("[^a-zA-Z]", " ", sent)              # keep letters only
    sent = re.sub("[m]{2,}", "mm", sent)               # collapse repeated m's (e.g. "yummmy" -> "yummy")

    # rs == True  => remove stopwords (used for the review text)
    if rs:
        tokens = " ".join(word for word in sent.split()
                          if word not in stopWords
                          and len(word) > 1)
    else:   # rs == False => keep stopwords (used for the summary)
        tokens = " ".join(word for word in
                          sent.split() if len(word) > 1)
    return tokens
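
As a quick sanity check, the function can be run on a made-up sentence (illustrative only, not from the dataset); the HTML tag, the parenthesized part, and the contraction should all be handled:

sample = "I can't believe how good this <b>coffee</b> is (really)!"
print(preprocessSentence(sample))             # review-style: stopwords removed
print(preprocessSentence(sample, rs=False))   # summary-style: stopwords kept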
    
cleanText=[]
for sent in data['Text']:
    cleanText.append(preprocessSentence(sent))
print(cleanText[:10])

data['Text']=cleanText

cleanSummary=[]
for sent in data['Summary']:
    cleanSummary.append(preprocessSentence(sent, rs=False))   # keep stopwords in the summaries
print(cleanSummary[:10])

data['Summary']=cleanSummary

data[:10]

data.replace("", np.nan,inplace=True)

data.isnull().sum()
data.dropna(axis=0, inplace=True)

textLen=[len(s.split()) for s in data['Text']]
summaryLen=[len(s.split()) for s in data['Summary']]

textMaxLen=50
summaryMaxLen=8

def threshLen(mlen, nlist):
    # print the fraction of samples whose token count is <= mlen
    c=0
    for s in nlist:
        if(len(s.split()) <= mlen):
            c+=1
    print(c/len(nlist))
    
threshLen(textMaxLen, data['Text'])
threshLen(summaryMaxLen, data['Summary'])
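
On a toy list the function simply prints the fraction of items at or under the threshold; the two calls above report how much of the data the chosen lengths of 50 and 8 cover:

threshLen(3, ["a b", "a b c d", "a"])   # prints 0.666...: 2 of the 3 items have <= 3 tokens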

data=data[data['Text'].apply(
    lambda x:len(x.split())<=textMaxLen)]
data=data[data['Summary'].apply(
    lambda x:len(x.split())<=summaryMaxLen)]
    
# seq2seq: wrap each summary with start/end tokens for the decoder
data['Summary']=data['Summary'].apply(lambda x:"sostoken "+  x  +" eostoken")
data['Summary']
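
To confirm the markers were added, a single row can be inspected (the summary text itself will of course depend on the data):

print(data['Summary'].iloc[0])   # e.g. "sostoken good quality dog food eostoken" (illustrative)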

 

Train / Test Split

textData=list(data['Text'])
summaryData=list(data['Summary'])

from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest=train_test_split(textData, summaryData, test_size=0.2, random_state=42, shuffle=True)

len(xTrain)#54448
len(xTest)#13613

srcToken=Tokenizer()
srcToken.fit_on_texts(xTrain)
# build the source vocabulary

totalCnt=len(srcToken.word_index)   # roughly 32,000 words

totalFreq=0   # total frequency of all words
rCnt=0        # number of words that appear fewer than 10 times
rFreq=0       # total frequency of those rare words
for k, v in srcToken.word_counts.items():
    totalFreq+=v
    if(v<10):
        rCnt+=1
        rFreq+=v
print("total word frequency", totalFreq)
print("vocabulary size", totalCnt)
print(rCnt/totalCnt)      # ~78% of the words are rare
print(rFreq/totalFreq)    # but they account for only ~4% of all occurrences
print(totalCnt-rCnt)      # 6956 -> round up to 7000

>>>
total word frequency 1323945
vocabulary size 32668
0.7870699155136525
0.04192696826529803
6956
srcVocab=7000
srcToken=Tokenizer(num_words=srcVocab)
srcToken.fit_on_texts(xTrain)

xTrain=srcToken.texts_to_sequences(xTrain)
xTest=srcToken.texts_to_sequences(xTest)

print(xTrain[0])
data['Text'][0]
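
To eyeball what the tokenizer did, the first training sequence can be mapped back to words through srcToken.index_word (words outside the top 7,000 were already dropped by num_words):

print(" ".join(srcToken.index_word[i] for i in xTrain[0]))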

tarToken=Tokenizer()
tarToken.fit_on_texts(yTrain)
# build the summary (target) vocabulary

totalFreq=0   # total frequency of all words
rCnt=0        # number of words that appear fewer than 10 times
rFreq=0       # total frequency of those rare words

for k, v in tarToken.word_counts.items():
    totalFreq+=v
    if(v<10):
        rCnt+=1
        rFreq+=v

# Note: totalCnt still holds the source-vocabulary size from the block above,
# so the two ratios below are only rough indicators for the target vocabulary.
print("total word frequency", totalFreq)
print("vocabulary size", totalCnt)
print(rCnt/totalCnt)
print(rFreq/totalFreq)
print(totalCnt-rCnt)

>>>
total word frequency 254334
vocabulary size 32668
0.28223337822946004
0.07756336156392775
23448

tarVoc=2000
tarTokenizer=Tokenizer(num_words=tarVoc)
tarTokenizer.fit_on_texts(yTrain)

yTrain=tarTokenizer.texts_to_sequences(yTrain)
yTest=tarTokenizer.texts_to_sequences(yTest)

# Drop samples whose summary was reduced to just sostoken + eostoken
# (every content word fell outside the top-2000 target vocabulary)
dropTrain = [i for i, sent in enumerate(yTrain) if len(sent) == 2]
dropTest = [i for i, sent in enumerate(yTest) if len(sent) == 2]
          
xTrain=np.delete(xTrain, dropTrain, axis=0)
yTrain=np.delete(yTrain, dropTrain, axis=0)
xTest=np.delete(xTest, dropTest, axis=0)
yTest=np.delete(yTest, dropTest, axis=0)

len(xTrain) #52722
len(xTest) #13181
>>> 13181

 

Padding

xTrain=pad_sequences(xTrain,maxlen=textMaxLen, padding='post')
xTest=pad_sequences(xTest,maxlen=textMaxLen, padding='post')
yTrain=pad_sequences(yTrain,maxlen=summaryMaxLen, padding='post')
yTest=pad_sequences(yTest,maxlen=summaryMaxLen, padding='post')
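
A quick shape check after padding (the exact row counts depend on how many samples survived the filtering above):

print(xTrain.shape, yTrain.shape)   # (num_train, 50) and (num_train, 8)
print(xTest.shape, yTest.shape)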

 

Building the Model

from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 128
hidden_size = 256

# Encoder
encoder_inputs = Input(shape=(textMaxLen,))

# Encoder embedding layer
enc_emb = Embedding(srcVocab, embedding_dim)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# Encoder LSTM 2
encoder_lstm2 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder LSTM 3
encoder_lstm3 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

# Decoder
decoder_inputs = Input(shape=(None,))

# Decoder embedding layer (kept as a named layer so the inference model can reuse it later)
dec_emb_layer = Embedding(tarVoc, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Decoder output layer
decoder_softmax_layer = Dense(tarVoc, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

 

Model Training

import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/thushv89/attention_keras/master/layers/attention.py", filename="attention.py")
from attention import AttentionLayer

# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate the attention output with the decoder hidden states
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Decoder output layer (now applied to the concatenated vector)
decoder_softmax_layer = Dense(tarVoc, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_concat_input)

# Redefine the model with attention
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 2)
history = model.fit([xTrain, yTrain[:,:-1]], yTrain.reshape(yTrain.shape[0], yTrain.shape[1], 1)[:,1:] \
                  ,epochs=50, callbacks=[es], batch_size = 256, validation_data=([xTest, yTest[:,:-1]], \
                  yTest.reshape(yTest.shape[0], yTest.shape[1], 1)[:,1:]))

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
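
The ModelCheckpoint callback imported earlier is not actually used above; if you also want to keep the best weights on disk, one way (the file name here is arbitrary) is:

mc = ModelCheckpoint('best_summarizer.h5', monitor='val_loss',
                     save_best_only=True, save_weights_only=True, verbose=1)
# then pass callbacks=[es, mc] to model.fit(...)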



src_index_to_word = srcToken.index_word        # source vocabulary: index -> word
tar_word_to_index = tarTokenizer.word_index    # summary vocabulary: word -> index
tar_index_to_word = tarTokenizer.index_word    # summary vocabulary: index -> word

# Inference encoder
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Tensors that hold the states from the previous time step
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))

dec_emb2 = dec_emb_layer(decoder_inputs)
# To predict the next word, the previous step's states are fed in as initial_state;
# this is wired up in decode_sequence() below.
# Unlike during training, the hidden state and cell state returned by the LSTM are kept.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Attention for inference
decoder_hidden_state_input = Input(shape=(textMaxLen, hidden_size))
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# Decoder output layer
decoder_outputs2 = decoder_softmax_layer(decoder_inf_concat)

# Final inference decoder model
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

def decode_sequence(input_seq):
    # Get the encoder outputs and states from the input sequence
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Target sequence of length 1, starting with the integer index of sostoken
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tar_word_to_index['sostoken']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:  # loop until stop_condition becomes True
        # Use the states from the previous step as the initial states of this step
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tar_index_to_word[sampled_token_index]

        if sampled_token != 'eostoken':
            decoded_sentence += ' ' + sampled_token

        # Stop at eostoken or when the maximum summary length is reached
        if sampled_token == 'eostoken' or len(decoded_sentence.split()) >= (summaryMaxLen - 1):
            stop_condition = True

        # Update the length-1 target sequence and the states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return decoded_sentence

# Convert an integer sequence of the source text back to words
def seq2text(input_seq):
    temp=''
    for i in input_seq:
        if(i!=0):
            temp = temp + src_index_to_word[i]+' '
    return temp

# Convert an integer sequence of the summary back to words
def seq2summary(input_seq):
    temp=''
    for i in input_seq:
        if((i!=0 and i!=tar_word_to_index['sostoken']) and i!=tar_word_to_index['eostoken']):
            temp = temp + tar_index_to_word[i] + ' '
    return temp

for i in range(500, 1000):
    print("Source text : ", seq2text(xTest[i]))
    print("Actual summary :", seq2summary(yTest[i]))
    print("Predicted summary :", decode_sequence(xTest[i].reshape(1, textMaxLen)))
    print("\n")

The inference step threw errors and needed fixing...