Load and Process Images


Different ways to Load Data

Set up

# import libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import PIL.Image
import tensorflow as tf
import tensorflow_datasets as tfds
# download image data
import pathlib
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file(origin=dataset_url,
                                   fname='flower_photos',
                                   untar=True)
data_dir = pathlib.Path(data_dir)

Keras Utility

  • create dataset
# define some loader parameters
batch_size = 32
img_height = 180
img_width = 180
# load train and validation data separately
# load the training dataset
train_ds = tf.keras.utils.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="training",
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size)
# load the validation dataset
val_ds = tf.keras.utils.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="validation",
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size)
  • find class names
# check class names
class_names = train_ds.class_names
print(class_names)
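  • visualize the data
As a quick sanity check, a few training images can be displayed with their class names (a minimal sketch; the 3×3 grid is just illustrative and uses the matplotlib import from the setup section).
# show the first nine images of the first batch
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(images[i].numpy().astype("uint8"))
    plt.title(class_names[labels[i]])
    plt.axis("off")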
  • configure dataset for performance
    • Dataset.cache() keeps the images in memory after they are loaded off disk during the first epoch, so the dataset does not become a bottleneck while training the model.
    • Dataset.prefetch() overlaps data preprocessing and model execution while training.
# configure dataset
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
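  • train a model (optional sanity check)
To confirm that the pipeline feeds Keras correctly, a small model can be fit on these datasets. This is only a rough sketch: the layer stack, the Adam optimizer, and the 3 epochs are illustrative choices, not part of the original loading workflow.
# minimal illustrative model to smoke-test the input pipeline
num_classes = len(class_names)

model = tf.keras.Sequential([
  tf.keras.layers.Rescaling(1./255),                 # scale pixels to [0, 1]
  tf.keras.layers.Conv2D(32, 3, activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Conv2D(32, 3, activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_classes)                 # one logit per flower class
])

model.compile(
  optimizer='adam',
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=['accuracy'])

model.fit(train_ds, validation_data=val_ds, epochs=3)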

tf.data for finer control

  • tf.keras.utils.image_dataset_from_directory (used above) is a convenient way to create a tf.data.Dataset from a directory of images.
  • for finer-grained control, you can write your own input pipeline with tf.data

  • create dataset
# list the image files and shuffle them once
image_count = len(list(data_dir.glob('*/*.jpg')))
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*'), shuffle=False)
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
# check class names
class_names = np.array(sorted([item.name for item in data_dir.glob('*') if item.name != "LICENSE.txt"]))
print(class_names)
# split the dataset into training and validation sets
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)
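  • check the split
To verify the skip/take split, you can peek at a few shuffled file paths and count how many files ended up in each set (a small sketch; tf.data.experimental.cardinality simply reports the length of each dataset here).
# peek at a few shuffled file paths
for f in list_ds.take(3):
  print(f.numpy())
# count the files in each split
print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())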
  • functions that convert a file path into an (img, label) pair
def get_label(file_path):
  # Convert the path to a list of path components
  parts = tf.strings.split(file_path, os.path.sep)
  # The second to last is the class-directory
  one_hot = parts[-2] == class_names
  # Integer encode the label
  return tf.argmax(one_hot)
  
def decode_img(img):
  # Convert the compressed string to a 3D uint8 tensor
  img = tf.io.decode_jpeg(img, channels=3)
  # Resize the image to the desired size
  return tf.image.resize(img, [img_height, img_width])
  
def process_path(file_path):
  label = get_label(file_path)
  # Load the raw data from the file as a string
  img = tf.io.read_file(file_path)
  img = decode_img(img)
  return img, label
  • use Dataset.map to turn the dataset of file paths into a dataset of (image, label) pairs
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_ds = train_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(process_path, num_parallel_calls=AUTOTUNE)

for image, label in train_ds.take(1):
  print("Image shape: ", image.numpy().shape)
  print("Label: ", label.numpy())
  • configure dataset for performance
  • to train a model with this dataset, you want the data:
    • to be well shuffled
    • to be batched
    • batches to be available as soon as possible
# configure datasets for performance
def configure_for_performance(ds):
  ds = ds.cache()
  ds = ds.shuffle(buffer_size=1000)
  ds = ds.batch(batch_size)
  ds = ds.prefetch(buffer_size=AUTOTUNE)
  return ds

train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)
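  • with the pipeline configured, these datasets can be passed straight to Model.fit, just like the ones from the Keras utility (the sketch below reuses the illustrative model compiled earlier in this post)
# the hand-built tf.data pipeline is consumed by Keras in exactly the same way
model.fit(train_ds, validation_data=val_ds, epochs=3)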

TensorFlow Datasets

  • create dataset
# load dataset using tfds.load() method
(train_ds, val_ds, test_ds), metadata = tfds.load(
    'tf_flowers',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    # split train/validation/test datasets
    with_info=True,
    as_supervised=True,
)
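  • the returned metadata also reports split sizes; a quick check (tf_flowers ships a single 'train' split, which the percentage slices above carve up)
# total number of examples in the underlying 'train' split
print(metadata.splits['train'].num_examples)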
  • find class names
# check number of classes
num_classes = metadata.features['label'].num_classes
print(num_classes)
# retrieve an image from the dataset
get_label_name = metadata.features['label'].int2str

image, label = next(iter(train_ds))
_ = plt.imshow(image)
_ = plt.title(get_label_name(label))
  • configure dataset for performance
# batch, shuffle, configure for better performance
# the function already defined above is used here
train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)
test_ds = configure_for_performance(test_ds)