Different Ways to Load Data
Setup
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import PIL.Image
import tensorflow as tf
import tensorflow_datasets as tfds
import pathlib
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file(origin=dataset_url,
                                   fname='flower_photos',
                                   untar=True)
data_dir = pathlib.Path(data_dir)
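To verify the download, you can open a sample image with PIL (a quick check, assuming the standard flower_photos layout with a roses/ subdirectory):

roses = list(data_dir.glob('roses/*'))
PIL.Image.open(str(roses[0]))  # displays inline in a notebook; call .show() in a script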
Keras Utility
batch_size = 32
img_height = 180
img_width = 180
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)
class_names = train_ds.class_names
print(class_names)
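As a quick check, pull one batch and inspect its shape: images come back as (batch_size, img_height, img_width, 3) and labels as (batch_size,).

for image_batch, labels_batch in train_ds:
    print(image_batch.shape)   # (32, 180, 180, 3)
    print(labels_batch.shape)  # (32,)
    break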
- configure dataset for performance
Dataset.cache()
keeps the images in memory after they are loaded off disk during the first epoch. This prevents the dataset from becoming a bottleneck while training the model.
Dataset.prefetch()
overlaps data preprocessing and model execution while training.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
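The configured datasets can be passed straight to Keras. A minimal training sketch (the small convnet below is only an illustration, not part of the loading pipeline):

num_classes = len(class_names)

model = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1./255),  # scale pixels from [0, 255] to [0, 1]
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes)
])

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

model.fit(train_ds, validation_data=val_ds, epochs=3)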
tf.data for finer control
tf.keras.utils.image_dataset_from_directory
is a convenient way to create a tf.data.Dataset
from a directory of images.
- to write your own input pipeline with finer-grained control, use
tf.data
directly
image_count = len(list(data_dir.glob('*/*.jpg')))  # total number of image files
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*'), shuffle=False)
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
class_names = np.array(sorted([item.name for item in data_dir.glob('*') if item.name != "LICENSE.txt"]))
print(class_names)
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)
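To confirm the 80/20 split, check the cardinality of each dataset:

print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())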
- functions that convert a file path into an
(img, label)
pair
def get_label(file_path):
    # split the path into components; the class name is the second-to-last part
    parts = tf.strings.split(file_path, os.path.sep)
    one_hot = parts[-2] == class_names
    # integer-encode the label
    return tf.argmax(one_hot)

def decode_img(img):
    # convert the compressed string to a 3D uint8 tensor
    img = tf.io.decode_jpeg(img, channels=3)
    # resize the image to the desired size
    return tf.image.resize(img, [img_height, img_width])

def process_path(file_path):
    label = get_label(file_path)
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label
- convert the dataset of file paths to a dataset of (image, label) pairs using
Dataset.map
train_ds = train_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(process_path, num_parallel_calls=AUTOTUNE)
for image, label in train_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())
- configure dataset for performance
- you want the data:
- to be well shuffled
- to be batched
- batches to be available as soon as possible
def configure_for_performance(ds):
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)
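To sanity-check the hand-built pipeline, plot a few images from one batch (a small sketch using matplotlib and the class_names array from above):

image_batch, label_batch = next(iter(train_ds))

plt.figure(figsize=(10, 10))
for i in range(9):
    plt.subplot(3, 3, i + 1)
    # pixel values are floats in [0, 255] after tf.image.resize, so cast for display
    plt.imshow(image_batch[i].numpy().astype("uint8"))
    plt.title(class_names[label_batch[i].numpy()])
    plt.axis("off")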
TensorFlow Datasets
(train_ds, val_ds, test_ds), metadata = tfds.load(
    'tf_flowers',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    with_info=True,
    as_supervised=True,
)
num_classes = metadata.features['label'].num_classes
print(num_classes)
get_label_name = metadata.features['label'].int2str
image, label = next(iter(train_ds))
_ = plt.imshow(image)
_ = plt.title(get_label_name(label))
- configure dataset for performance
train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)
test_ds = configure_for_performance(test_ds)