import tensorflow as tf
[1] import
[2] Create a Simple Dataset
# Build a dataset holding the integers 0 through 9.
dataset = tf.data.Dataset.range(10)
# Bare expression: presumably run as a notebook cell, where it echoes the
# dataset's repr (a plain script would display nothing here).
dataset
#output
<_RangeDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
# Iterate the dataset eagerly; each element is a scalar int64 tensor, so
# .numpy() converts it to a plain value before printing.
for val in dataset:
    print(val.numpy())
0
1
2
3
4
5
6
7
8
9
[3] Windowing the data
# Start again from the integers 0 through 9.
dataset = tf.data.Dataset.range(10)
# Group the elements into windows of 5, moving the window start forward by 1
# element each time.
dataset = dataset.window(size=5, shift=1)
# Every element of the windowed dataset is itself a (nested) dataset, so
# printing it shows a dataset repr rather than the values it contains.
for window_dataset in dataset:
    print(window_dataset)
# output
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>
각 요소를 보려면 각 iterable 객체를 반복해야 한다.
중첩된 for-loop 또는 list comprehension으로 위의 print 문을 수정하여 확인할 수 있다.
# Each window is a nested dataset; iterate it as well to pull out the actual
# values and print one list per window.
for window_dataset in dataset:
    print([item.numpy() for item in window_dataset])
# output
[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]
[6, 7, 8, 9]
[7, 8, 9]
[8, 9]
[9]
2024-04-18 08:23:24.021439: W tensorflow/core/framework/dataset.cc:959] Input of Window will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.
2024-04-18 08:23:24.022837: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.025039: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.027288: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.029111: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.030775: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.032195: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.033432: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.034850: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.035126: W tensorflow/core/data/root_dataset.cc:350] Optimization loop failed: CANCELLED: Operation was cancelled
2024-04-18 08:23:24.037649: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.064230: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:23:24.065019: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
이제 각 창의 요소를 볼 수 있으므로, 숫자 9 뒤에 더 이상 요소가 없기 때문에 결과 집합의 크기가 균일하지 않다는 것을 확인할 수 있다.
이때는 drop_remainder를 사용하여 요소가 5개인 창만 표시되도록 할 수 있다.
dataset = tf.data.Dataset.range(10)
# drop_remainder=True discards the trailing windows that would hold fewer
# than 5 elements, so every window has the same size.
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
for window_data in dataset:
    print([item.numpy() for item in window_data])
# output
[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]
2024-04-18 08:54:25.012814: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:54:25.014446: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:54:25.015643: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:54:25.016754: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:54:25.017752: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:54:25.019100: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-18 08:54:25.019518: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[4] Flatten the Windows
dataset = tf.data.Dataset.range(10)
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
# Batch each nested window dataset into a single 5-element tensor and
# flatten the result, so the outer dataset yields tensors instead of datasets.
dataset = dataset.flat_map(lambda window: window.batch(5))
for window in dataset:
    print(window.numpy())
#output
[0 1 2 3 4]
[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]
2024-04-18 09:02:01.460113: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[5] Group into features and labels
dataset = tf.data.Dataset.range(10)
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(5))
# Split every window into features (all but the last element) and a label
# (the last element).
dataset = dataset.map(lambda window: (window[:-1], window[-1]))
for x, y in dataset:
    print(f"x -> {x.numpy()}")
    print(f"y -> {y.numpy()}")
    # Blank line to visually separate consecutive (x, y) pairs.
    print()
# output
x -> [0 1 2 3]
y -> 4
x -> [1 2 3 4]
y -> 5
x -> [2 3 4 5]
y -> 6
x -> [3 4 5 6]
y -> 7
x -> [4 5 6 7]
y -> 8
x -> [5 6 7 8]
y -> 9
2024-04-18 09:06:40.380108: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[6] Shuffle the data
모델을 훈련하는 동안 시퀀스 편향을 줄이기 위해 데이터 세트를 섞는 것이 좋다. 데이터 세트를 섞지 않으면 신경망이 입력 순서에 과적합될 수 있다. 학습할 때 훈련 입력의 순서가 신경망에 영향을 미치지 않도록 데이터를 섞는다.
간단히 shuffle() 메서드를 사용하여 이 작업을 수행할 수 있다. 이를 위해서는 buffer_size 매개변수가 필요하며 문서에서 언급한 대로 더 나은 셔플링을 위해 전체 요소 수보다 크거나 같은 숫자를 입력해야 한다.
데이터 세트의 총 window 수가 6개이므로 buffer_size로 6 또는 그 이상의 숫자를 선택한다.
dataset = tf.data.Dataset.range(10)
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(5))
dataset = dataset.map(lambda window: (window[:-1], window[-1]))
# Shuffle the (features, label) pairs. The buffer (10) is at least as large
# as the number of windows (6), so the whole dataset is shuffled together.
dataset = dataset.shuffle(buffer_size=10)
for x, y in dataset:
    print(f"x-> {x.numpy()}")
    print(f"y-> {y.numpy()}")
    # Blank line to visually separate consecutive (x, y) pairs.
    print()
# output
x-> [0 1 2 3]
y-> 4
x-> [4 5 6 7]
y-> 8
x-> [2 3 4 5]
y-> 6
x-> [3 4 5 6]
y-> 7
x-> [1 2 3 4]
y-> 5
x-> [5 6 7 8]
y-> 9
2024-04-18 09:14:48.466036: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[7] Create batches for training
dataset = tf.data.Dataset.range(10)
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(5))
dataset = dataset.map(lambda window: (window[:-1], window[-1]))
dataset = dataset.shuffle(buffer_size=10)
# Group the shuffled pairs into batches of 2; prefetch(1) keeps one batch
# prepared ahead of the consumer.
dataset = dataset.batch(2).prefetch(1)
for x, y in dataset:
    print(f"x-> {x.numpy()}")
    print(f"y-> {y.numpy()}")
# output
x-> [[5 6 7 8]
[0 1 2 3]]
y-> [9 4]
x-> [[3 4 5 6]
[1 2 3 4]]
y-> [7 5]
x-> [[2 3 4 5]
[4 5 6 7]]
y-> [6 8]
2024-04-18 09:30:13.364326: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
batch를 3으로 했을 경우
dataset = tf.data.Dataset.range(10)
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(5))
dataset = dataset.map(lambda window: (window[:-1], window[-1]))
dataset = dataset.shuffle(buffer_size=10)
# Same pipeline as before, but with batches of 3: the 6 windows now come out
# as 2 batches instead of 3.
dataset = dataset.batch(3).prefetch(1)
for x, y in dataset:
    print(f"x-> {x.numpy()}")
    print(f"y-> {y.numpy()}")
# output
x-> [[5 6 7 8]
[3 4 5 6]
[4 5 6 7]]
y-> [9 7 8]
x-> [[1 2 3 4]
[0 1 2 3]
[2 3 4 5]]
y-> [5 4 6]
2024-04-18 09:30:32.913292: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
여기서는 tf.data.Dataset 클래스의 다양한 메서드를 연결하여 순서를 섞고 일괄 처리된 window 데이터세트로 시퀀스를 준비하는 방법을 코딩했다. 다음에는 이것을 합성 데이터에 적용하고 그 결과를 사용하여 신경망을 훈련하게 된다.