tensorflow_datasets tfds custom dataset 만들기

Hanna·2022년 2월 22일
0

참고 페이지 : https://www.tensorflow.org/datasets/add_dataset

  1. 라이브러리 설치
    pip install -q tfds-nightly

  2. my_dataset/my_dataset.py 파일 만들기
    cd path/to/my/Project/Dataset
    tfds new my_dataset
    ls -1 my_dataset/

  3. my_dataset 디렉토리 아래 생성된 my_dataset.py 파일을 적절하게 수정한다

"""my_dataset dataset."""

import tensorflow_datasets as tfds

# TODO(my_dataset): Markdown description that will appear on the catalog page.
_DESCRIPTION = """
Description is **formatted** as markdown.

It should also contain any processing which has been applied (if any),
(e.g. corrupted example skipped, images cropped,...):
"""

# TODO(my_dataset): BibTeX citation for the paper/source this dataset comes from.
_CITATION = """
"""


class MyDataset(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for my_dataset dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    # TODO(my_dataset): Specifies the tfds.core.DatasetInfo object
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            # These are the features of your dataset like images, labels ...
            'image': tfds.features.Image(shape=(None, None, 3)),
            'label': tfds.features.ClassLabel(names=['no', 'yes']),
        }),
        # If there's a common (input, target) tuple from the
        # features, specify them here. They'll be used if
        # `as_supervised=True` in `builder.as_dataset`.
        supervised_keys=('image', 'label'),  # Set to `None` to disable
        homepage='https://dataset-homepage/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # TODO(my_dataset): Downloads the data and defines the splits
    path = dl_manager.download_and_extract('https://todo-data-url')

    # TODO(my_dataset): Returns the Dict[split names, Iterator[Key, Example]]
    return {
        'train': self._generate_examples(path / 'train_imgs'),
    }

  def _generate_examples(self, path):
    """Yields (key, example) tuples for one split.

    Args:
      path: directory (epath.Path-like) containing the split's `*.jpeg` files.

    Yields:
      `(key, example)` pairs; `key` must be unique within the split.
    """
    # TODO(my_dataset): Yields (key, example) tuples from the dataset
    for f in path.glob('*.jpeg'):
      # Keys must be unique per example: the template's constant 'key' makes
      # tfds raise a duplicate-key error as soon as the directory holds more
      # than one image. The file name is unique within the directory.
      yield f.name, {
          'image': f,
          'label': 'yes',
      }

초기값이 이렇게 되어 있으므로 TODO(my_dataset)이라고 된 부분을 수정하면 됩니다.

"""my_dataset dataset."""

import tensorflow_datasets as tfds
import csv

# TODO(my_dataset): Markdown description  that will appear on the catalog page.
_DESCRIPTION = """
Description is **formatted** as markdown.

It should also contain any processing which has been applied (if any),
(e.g. corrupted example skipped, images cropped,...):
"""

# TODO(my_dataset): BibTeX citation
_CITATION = """

@aricle{pigface, auther = {seungha Lee}}
"""


class MyDataset(tfds.core.GeneratorBasedBuilder):
    """DatasetBuilder for my_dataset, built from a manually downloaded archive."""

    # Shown to users when the manually downloaded data is missing.
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    file in the `~/tensorflow_datasets/downloads/manual/manual_dir/`.
    """

    VERSION = tfds.core.Version('1.0.0')
    RELEASE_NOTES = {
    '1.0.0': 'Initial release.',
    }

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""
        # TODO(my_dataset): Specifies the tfds.core.DatasetInfo object
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict({
            # These are the features of your dataset like images, labels ...
            'image': tfds.features.Image(shape=(None, None, 3)),
            # String labels must each be listed explicitly. The original code
            # contained a literal `...` (Ellipsis) inside the list, which is
            # not a valid class name and breaks the feature spec.
            'label': tfds.features.ClassLabel(
                names=['a101', 'a102', 'a103', 'a104', 'a105']),
            }),
            # If there's a common (input, target) tuple from the
            # features, specify them here. They'll be used if
            # `as_supervised=True` in `builder.as_dataset`.
            supervised_keys=('image', 'label'),  # Set to `None` to disable
            # Keep examples in generation order (keys are image_ids below).
            disable_shuffling = True,
            # homepage='https://dataset-homepage/',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        # TODO(my_dataset): Downloads the data and defines the splits
        # Path to the manually placed data.
        archive_path = dl_manager.manual_dir
        # Extract the manually downloaded archive.
        # NOTE(review): `manual_dir` is a directory; `extract()` normally
        # expects an archive file (e.g. `dl_manager.manual_dir / 'data.zip'`)
        # — confirm against the actual on-disk layout.
        extracted_path = dl_manager.extract(archive_path)

        # TODO(my_dataset): Returns the Dict[split names, Iterator[Key, Example]]
        return {
            'train': self._generate_examples(
                images_path=extracted_path / 'train_images',
                label_path=extracted_path / 'train_labels.csv',
            ),
            'test': self._generate_examples(
                images_path=extracted_path / 'test_images',
                label_path=extracted_path / 'test_labels.csv',
            ),
        }

    def _generate_examples(self, images_path, label_path):
        """Yields (image_id, example) pairs read from a labels CSV.

        Args:
          images_path: directory holding `<image_id>.jpg` files.
          label_path: CSV file with at least `image_id` and `label` columns.
        """
        # Read the input data out of the source files
        with label_path.open() as f:
            for row in csv.DictReader(f):
                image_id = row['image_id']
                # And yield (key, feature_dict); image_id serves as the
                # unique example key.
                yield image_id, {
                    'image': images_path / f'{image_id}.jpg',
                    'label': row['label'],
                }
profile
매일 성장하고 있습니다

0개의 댓글