- Since image data is large, the usual approach is to store only the file paths and load the actual images batch by batch with a generator during training.
- In my case I had to train on a text dataset of more than 200 million records. That much data cannot fit in memory at once, so I decided to feed the text through a generator as well.
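- A minimal sketch of the path-only idea above for images (the directory layout, target image size, and normalization are assumptions for illustration, not part of my pipeline):
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def image_path_generator(paths, labels, batch_size):
    while True:                                        # loop forever so Keras can run multiple epochs
        for start in range(0, len(paths), batch_size):
            batch_paths = paths[start:start + batch_size]
            # decode the images only when the batch is actually needed
            batch_x = np.array([img_to_array(load_img(p, target_size=(224, 224)))
                                for p in batch_paths]) / 255.0
            batch_y = np.array(labels[start:start + batch_size])
            yield batch_x, batch_y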
import numpy as np
from tensorflow.keras.utils import to_categorical

# Despite the names, both helpers cast their input to int (token ids for the Embedding layer).
def change_float(target):
    arr = []
    for i in target:
        arr.append(int(i))
    return arr

def change_float2(sample):
    arr = []
    for i in sample:
        arr2 = []              # reset per row, otherwise every row accumulates into the same list
        for z in i:
            arr2.append(int(z))
        arr.append(arr2)
    return arr
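- The same row-by-row casting can be done with one vectorized call (shown only as an alternative; sample_rows is a placeholder for a list of equal-length token-id rows, e.g. after padding):
sample_int = np.array(sample_rows).astype(int)   # same result as change_float2(sample_rows), but as an ndarray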
def generator(train_f, train_labels_f_t, batch_size, total_size):
    index_n = 0
    count = 0
    while 1:                                   # loop forever so training can run for any number of epochs
        while count < total_size:
            start = index_n * batch_size
            end = (index_n + 1) * batch_size
            if end >= total_size:              # last (possibly partial) batch
                end = total_size
                count = total_size             # leave the inner loop after this batch
            index_n += 1
            # one-hot encode the labels for this slice (339 classes)
            target = to_categorical(train_labels_f_t[start:end], num_classes=339)
            # cast the tokenized text slice to an int array (replaces the change_float2 loop above)
            yield np.array(train_f[start:end]).astype(int), target
        index_n = 0                            # rewind so the next epoch starts from the first batch
        count = 0
text_tran_f_g = generator(train_f, train_labels_f_t, batch_size, len(train_f))
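- Before training, it is worth pulling one batch to confirm the shapes (note that this consumes a batch, so recreate the generator afterwards if exact coverage matters):
sample_x, sample_y = next(text_tran_f_g)
print(sample_x.shape)   # expected: (batch_size, max_len)
print(sample_y.shape)   # expected: (batch_size, 339)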
from tensorflow.keras.layers import LSTM,Bidirectional, Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D, Input, Flatten, concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
# Token ids go through an Embedding layer; mask_zero=True marks the padding index so it can be ignored.
sequence_input = Input(shape=(max_len,))
y = Embedding(vocab_size, 128, input_length=max_len, mask_zero=True)(sequence_input)
y = Bidirectional(LSTM(128, return_sequences=True))(y)   # keep the full sequence for the Flatten below
y = Flatten()(y)
y = Dense(256, activation="relu")(y)
y = Dropout(0.1)(y)
y = Dense(num_classes + 1, activation="softmax")(y)       # output width must match the 339 classes used in to_categorical
model = Model(inputs=sequence_input, outputs=y)
model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss='categorical_crossentropy', metrics=['accuracy'])
# int() floors the division, so a final partial batch is not counted in steps_per_epoch
history = model.fit_generator(text_tran_f_g,
                              steps_per_epoch=int(len(train_f) / batch_size),
                              epochs=1)
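- The EarlyStopping and ModelCheckpoint callbacks imported above are not wired into the run; a sketch of how they could be attached (the checkpoint file name, patience, and epochs value are assumptions for illustration):
callbacks = [
    EarlyStopping(monitor='loss', patience=2),
    ModelCheckpoint('text_bilstm.h5', monitor='loss', save_best_only=True)
]
history = model.fit_generator(text_tran_f_g,
                              steps_per_epoch=int(len(train_f) / batch_size),
                              epochs=5,
                              callbacks=callbacks)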
- By slicing out each batch with yield like this and handing it to fit_generator, even a very large text dataset can be trained without loading everything into memory.
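- Note that fit_generator is deprecated in recent TensorFlow 2.x releases; model.fit accepts a Python generator directly, so the equivalent call would be (same arguments, shown only with the updated API):
history = model.fit(text_tran_f_g,
                    steps_per_epoch=int(len(train_f) / batch_size),
                    epochs=1)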