
Text summarization is the task of condensing a long source text into a short, concise summary. There are two main approaches: extractive summarization and abstractive summarization.
Extractive summarization drawback: because it only reuses sentences selected from the source, it cannot generate new phrasing, and redundant information or overly long sentences may be carried over as-is.
Abstractive summarization drawback: training a model requires large amounts of data and high-performance computing resources, and building that data takes significant time and effort.
import pandas as pd

# Load the first 100,000 reviews and keep only the source text and its summary
data = pd.read_csv("Reviews.csv", nrows=100000)
data = data[['Text', 'Summary']]

# Remove duplicate reviews and rows with missing values
data.drop_duplicates(subset=['Text'], inplace=True)
data.dropna(axis=0, inplace=True)
print(f'Total number of samples: {len(data)}')
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

stop_words = set(stopwords.words('english'))

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    # Strip any HTML tags left in the review text
    sentence = BeautifulSoup(sentence, "lxml").text
    # Drop parenthesized content, e.g. "(...)"
    sentence = re.sub(r'\([^)]*\)', '', sentence)
    # Keep alphabetic characters only
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    # Remove English stopwords
    sentence = ' '.join(word for word in sentence.split() if word not in stop_words)
    return sentence
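
The padding step that follows assumes encoder_input and decoder_input already exist as integer sequences, but their construction is not shown in this section. A minimal sketch, assuming the Keras Tokenizer is used and the summaries are wrapped in the sostoken/eostoken markers that decode_sequence expects later on; the names src_tokenizer, tar_tokenizer, and summary_seqs are illustrative, while encoder_input, decoder_input, decoder_target, src_vocab, tar_vocab, tar_word_to_index, and tar_index_to_word match the names used elsewhere in this section:

from tensorflow.keras.preprocessing.text import Tokenizer

# Clean both columns with the function defined above
data['Text'] = data['Text'].apply(preprocess_sentence)
data['Summary'] = data['Summary'].apply(preprocess_sentence)

# Wrap summaries with start/end markers for teacher forcing
data['Summary'] = data['Summary'].apply(lambda x: 'sostoken ' + x + ' eostoken')

# Fit separate tokenizers for the source texts and the summaries
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(data['Text'])
tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(data['Summary'])

# Vocabulary sizes used by the Embedding layers below (+1 for the padding index 0)
src_vocab = len(src_tokenizer.word_index) + 1
tar_vocab = len(tar_tokenizer.word_index) + 1

encoder_input = src_tokenizer.texts_to_sequences(data['Text'])
summary_seqs = tar_tokenizer.texts_to_sequences(data['Summary'])

# Teacher forcing: the decoder input drops eostoken, the target drops sostoken
decoder_input = [seq[:-1] for seq in summary_seqs]
decoder_target = [seq[1:] for seq in summary_seqs]

# Lookup tables used by decode_sequence at the end of this section
tar_word_to_index = tar_tokenizer.word_index
tar_index_to_word = tar_tokenizer.index_word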
from tensorflow.keras.preprocessing.sequence import pad_sequences

text_max_len = 50
summary_max_len = 8

# Pad all sequences to a fixed length; the decoder target is padded too,
# since it is used as the label array during training below
encoder_input = pad_sequences(encoder_input, maxlen=text_max_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=summary_max_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=summary_max_len, padding='post')
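
model.fit below uses train/test variants of these arrays that are never created in this section. A minimal sketch, assuming a simple 80/20 holdout; the use of scikit-learn's train_test_split here is an assumption:

from sklearn.model_selection import train_test_split

# 80/20 holdout; all three arrays are split with the same shuffle
(encoder_input_train, encoder_input_test,
 decoder_input_train, decoder_input_test,
 decoder_target_train, decoder_target_test) = train_test_split(
    encoder_input, decoder_input, decoder_target,
    test_size=0.2, random_state=0)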
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
embedding_dim = 128
hidden_size = 256
# Define the encoder; return_sequences=True keeps every timestep's output for attention
encoder_inputs = Input(shape=(text_max_len,))
enc_emb = Embedding(input_dim=src_vocab, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Define the decoder; the embedding and output layers are kept as handles
# so they can be reused when building the inference models later
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(input_dim=tar_vocab, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Add the attention layer (Bahdanau-style custom AttentionLayer from a local attention.py)
from attention import AttentionLayer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate the attention context with the decoder outputs, then project to the vocabulary
decoder_concat = Concatenate()([decoder_outputs, attn_out])
decoder_dense = Dense(tar_vocab, activation='softmax')
decoder_softmax = decoder_dense(decoder_concat)
model = Model([encoder_inputs, decoder_inputs], decoder_softmax)
# sparse_categorical_crossentropy takes integer targets directly, so no one-hot encoding is needed
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once validation loss stops improving for two consecutive epochs
es = EarlyStopping(monitor='val_loss', mode='min', patience=2)
history = model.fit([encoder_input_train, decoder_input_train], decoder_target_train,
                    epochs=50, batch_size=256,
                    validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
                    callbacks=[es])
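
decode_sequence below calls encoder_model and decoder_model, separate inference models that are not built in this section. A minimal sketch of how they can be assembled by reusing the trained layers; the input and output ordering matches the predict calls in decode_sequence, but the construction itself is an assumption:

# Encoder inference model: maps a source sequence to its outputs and final states
encoder_model = Model(inputs=encoder_inputs,
                      outputs=[encoder_outputs, state_h, state_c])

# Decoder inference model: consumes one token at a time plus the previous states
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))
decoder_hidden_state_input = Input(shape=(text_max_len, hidden_size))

# Reuse the trained embedding, LSTM, attention, and projection layers
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate()([decoder_outputs2, attn_out_inf])
decoder_outputs2 = decoder_dense(decoder_inf_concat)

decoder_model = Model(
    [decoder_inputs, decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2])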
import numpy as np

def decode_sequence(input_seq):
    # Encode the source sequence into outputs and initial decoder states
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Start decoding from the start-of-summary token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tar_word_to_index['sostoken']

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq, e_out, e_h, e_c])

        # Greedy decoding: pick the most probable next word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tar_index_to_word[sampled_token_index]

        # Stop at the end token or when the maximum summary length is reached
        if sampled_token == 'eostoken' or len(decoded_sentence.split()) >= (summary_max_len - 1):
            break
        decoded_sentence += ' ' + sampled_token

        # Feed the sampled token and the updated states back in for the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c
    return decoded_sentence
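
A quick way to check the results on a held-out sample; this call is an illustration, not part of the original section:

# Summarize the first test sample; reshape to (1, text_max_len) for predict
sample = encoder_input_test[0].reshape(1, text_max_len)
print('Generated summary:', decode_sequence(sample))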