Labels : 54 --- ['apple_fuji_L', 'apple_fuji_M', 'apple_fuji_S', 'apple_yanggwang_L', 'apple_yanggwang_M', 'apple_yanggwang_S', 'cabbage_green_L', 'cabbage_green_M', 'cabbage_green_S', 'cabbage_red_L', 'cabbage_red_M', 'cabbage_red_S', 'chinese-cabbage_L', 'chinese-cabbage_M', 'chinese-cabbage_S', 'garlic_uiseong_L', 'garlic_uiseong_M', 'garlic_uiseong_S', 'mandarine_hallabong_L', 'mandarine_hallabong_M', 'mandarine_hallabong_S', 'mandarine_onjumilgam_L', 'mandarine_onjumilgam_M', 'mandarine_onjumilgam_S', 'onion_red_L', 'onion_red_M', 'onion_red_S', 'onion_white_L', 'onion_white_M', 'onion_white_S', 'pear_chuhwang_L', 'pear_chuhwang_M', 'pear_chuhwang_S', 'pear_singo_L', 'pear_singo_M', 'pear_singo_S', 'persimmon_bansi_L', 'persimmon_bansi_M', 'persimmon_bansi_S', 'persimmon_booyu_L', 'persimmon_booyu_M', 'persimmon_booyu_S', 'persimmon_daebong_L', 'persimmon_daebong_M', 'persimmon_daebong_S', 'potato_seolbong_L', 'potato_seolbong_M', 'potato_seolbong_S', 'potato_sumi_L', 'potato_sumi_M', 'potato_sumi_S', 'radish_winter-radish_L', 'radish_winter-radish_M', 'radish_winter-radish_S']
train count : [33, 37, 33, 37, 36, 37, 52, 54, 51, 56, 54, 52, 49, 51, 48, 56, 57, 51, 37, 38, 38, 38, 35, 34, 37, 37, 38, 36, 38, 38, 36, 32, 37, 39, 35, 36, 36, 39, 35, 36, 37, 38, 36, 34, 35, 37, 36, 37, 38, 37, 29, 35, 38, 35]
valid count : [7, 3, 7, 3, 4, 3, 8, 6, 9, 4, 6, 8, 8, 6, 6, 4, 3, 9, 3, 2, 2, 2, 5, 6, 3, 3, 2, 4, 2, 2, 4, 8, 3, 1, 5, 4, 4, 1, 5, 4, 3, 2, 4, 6, 5, 3, 4, 3, 2, 3, 3, 5, 2, 5]
라벨은 54개 나왔고
그런데 train/valid 나누는 부분에서
폴더 구성을 안하고 전체에서 그냥 나눠버렸더니
비율이 제멋대로라 이대로 해도 괜찮을까 걱정은 됨
def data_split(path="./data/org", split_predictions=0.1):
# train_dict, valid_dict로 데이터를 담을 변수를 만들어주고
train_dict = defaultdict(list)
valid_dict = defaultdict(list)
for root, dirs, files in os.walk(path):
# 파일을 다 돌면서 file_paths를 구성해주고
file_paths = [os.path.join(root, files[i]) for i in range(len(files))]
# 10%는 valid_index로,
valid_index = np.random.randint(low=0, high=len(files), size=int(len(files) * split_predictions))
# 나머지는 train_index로 만들어준다.
train_index = list(set(range(0, len(files))) - set(valid_index))
for idx, file in enumerate(files):
file_paths.append(os.path.join(root, file))
print("LL : ", file_paths[-1])
if file.find('_S') > 0:
file_name = file[0:file.find('_S') + 2]
elif file.find('_M') > 0:
file_name = file[0:file.find('_M') + 2]
elif file.find('_L') > 0:
file_name = file[0:file.find('_L') + 2]
else:
print("!!! Size not described.")
if idx in train_index:
train_dict[file_name].append(file_paths[-1])
else:
valid_dict[file_name].append(file_paths[-1])
# train_dict, valid_dict를 반환하여 데이터셋을 구성한다.
return train_dict, valid_dict