참고 페이지 : https://www.tensorflow.org/datasets/add_dataset
라이브러리 설치
pip install -q tfds-nightly
my_dataset/my_dataset.py 파일 만들기
cd path/to/my/Project/Dataset
tfds new my_dataset
ls -1 my_dataset/
my_dataset 디렉토리 아래 생성된 my_dataset.py 파일을 적절하게 수정한다
"""my_dataset dataset."""
import tensorflow_datasets as tfds
# TODO(my_dataset): Markdown description that will appear on the catalog page.
_DESCRIPTION = """
Description is **formatted** as markdown.
It should also contain any processing which has been applied (if any),
(e.g. corrupted example skipped, images cropped,...):
"""
# TODO(my_dataset): BibTeX citation
_CITATION = """
"""
class MyDataset(tfds.core.GeneratorBasedBuilder):
    """DatasetBuilder for my_dataset dataset."""

    VERSION = tfds.core.Version('1.0.0')
    RELEASE_NOTES = {
        '1.0.0': 'Initial release.',
    }

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""
        # TODO(my_dataset): Specifies the tfds.core.DatasetInfo object
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict({
                # Features of this dataset: an RGB image of arbitrary
                # height/width and a binary class label.
                'image': tfds.features.Image(shape=(None, None, 3)),
                'label': tfds.features.ClassLabel(names=['no', 'yes']),
            }),
            # (input, target) pair returned when `as_supervised=True` is
            # passed to `builder.as_dataset`; set to `None` to disable.
            supervised_keys=('image', 'label'),
            homepage='https://dataset-homepage/',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        # TODO(my_dataset): Downloads the data and defines the splits
        path = dl_manager.download_and_extract('https://todo-data-url')
        # TODO(my_dataset): Returns the Dict[split names, Iterator[Key, Example]]
        return {
            'train': self._generate_examples(path / 'train_imgs'),
        }

    def _generate_examples(self, path):
        """Yields examples."""
        # TODO(my_dataset): Yields (key, example) tuples from the dataset
        for img_file in path.glob('*.jpeg'):
            yield 'key', {
                'image': img_file,
                'label': 'yes',
            }
초기값이 이렇게 되어 있으므로 TODO(my_dataset)이라고 된 부분을 수정하면 됩니다.
"""my_dataset dataset."""
import tensorflow_datasets as tfds
import csv
# TODO(my_dataset): Markdown description that will appear on the catalog page.
_DESCRIPTION = """
Description is **formatted** as markdown.
It should also contain any processing which has been applied (if any),
(e.g. corrupted example skipped, images cropped,...):
"""
# TODO(my_dataset): BibTeX citation
_CITATION = """
@aricle{pigface, auther = {seungha Lee}}
"""
class MyDataset(tfds.core.GeneratorBasedBuilder):
    """DatasetBuilder for my_dataset, built from a manually downloaded archive."""

    # Shown to users when the manually placed archive is missing.
    # Reworded from the original sentence fragment into a full instruction.
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    Place the downloaded `data.zip` file in
    `~/tensorflow_datasets/downloads/manual/`.
    """

    VERSION = tfds.core.Version('1.0.0')
    RELEASE_NOTES = {
        '1.0.0': 'Initial release.',
    }

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict({
                # RGB image of arbitrary height/width.
                'image': tfds.features.Image(shape=(None, None, 3)),
                # String labels must each be listed explicitly. The original
                # list contained a literal `...` (Python Ellipsis), which is
                # not a valid class name — enumerate a101..a105 instead.
                'label': tfds.features.ClassLabel(
                    names=[f'a{i}' for i in range(101, 106)]),
            }),
            # (input, target) pair used when `as_supervised=True` is passed
            # to `builder.as_dataset`; set to `None` to disable.
            supervised_keys=('image', 'label'),
            # Store examples without shuffling (keys are the CSV image ids).
            disable_shuffling=True,
            # homepage='https://dataset-homepage/',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators.

        Expects `data.zip` to have been placed in the manual download
        directory (see MANUAL_DOWNLOAD_INSTRUCTIONS).
        """
        # `manual_dir` is a directory, not an archive; the original passed
        # the directory itself to `extract`. Extract the `data.zip` archive
        # named in the original comment instead.
        # NOTE(review): confirm the archive filename against the actual data.
        archive_path = dl_manager.manual_dir / 'data.zip'
        extracted_path = dl_manager.extract(archive_path)
        return {
            'train': self._generate_examples(
                images_path=extracted_path / 'train_images',
                label_path=extracted_path / 'train_labels.csv',
            ),
            'test': self._generate_examples(
                images_path=extracted_path / 'test_images',
                label_path=extracted_path / 'test_labels.csv',
            ),
        }

    def _generate_examples(self, images_path, label_path):
        """Yields (image_id, example) pairs read from the label CSV.

        The CSV must have `image_id` and `label` columns; each image is
        expected at `<images_path>/<image_id>.jpg`.
        """
        with label_path.open() as f:
            for row in csv.DictReader(f):
                image_id = row['image_id']
                yield image_id, {
                    'image': images_path / f'{image_id}.jpg',
                    'label': row['label'],
                }