더미 데이터를 생성하는 과정에서 다음과 같은 문제점들이 발생했다.
2. Order 클래스의 createdAt도 임의의 날짜 데이터인데 최근 날짜로 변경이 필요
3. Product 클래스의 가격 범위가 1,000원~10,000,000원인데 범위가 너무 넓어서 수정이 좀 필요하지 않을까 싶음.
4. User 클래스 role에 대한 비율 조절이 필요함.
해당 문제점들을 각각 해결해보자..!
현재 코드 분석
# -*- coding: utf-8 -*-
from table.User import User
from faker import Faker
from faker.providers import DynamicProvider
fake = Faker('ko_KR')
def create_user_dataset(num):
    """Build ``num`` fake users whose role comes from a dynamic provider.

    A ``DynamicProvider`` named ``set_user_role`` with the elements
    ``"ADMIN"`` and ``"USER"`` is registered on the module-level ``fake``
    generator each time this function is called. No weights are supplied,
    so this version offers no control over the ADMIN/USER ratio.

    Args:
        num: Number of users to create; ids run from 1 to ``num``.

    Returns:
        A list of ``User`` objects in ascending id order.
    """
    role_provider = DynamicProvider(
        provider_name="set_user_role",
        elements=["ADMIN", "USER"],
    )
    # Registering the provider is what exposes fake.set_user_role().
    fake.add_provider(role_provider)

    users = []
    for user_id in range(1, num + 1):
        # Arguments are evaluated left to right: email, password, role.
        users.append(
            User(
                user_id,
                fake.unique.ascii_free_email(),
                fake.password(),
                fake.set_user_role(),
            )
        )
    return users
if __name__ == "__main__":
    # Sanity check: generate users and report how many landed in each role.
    user_class_list = create_user_dataset(1000)

    user_cnt = 0
    admin_cnt = 0
    # Walk the list itself rather than indexing with a second hard-coded
    # 1000, so the tally cannot drift from the size requested above.
    for user in user_class_list:
        if user.role == "ADMIN":
            admin_cnt += 1
        elif user.role == "USER":
            user_cnt += 1

    print("일반 사용자 수 : ", user_cnt)
    # The ADMIN tally is reported under the "셀러 수" label.
    print("셀러 수 : ", admin_cnt)
일반 사용자 수 : 469
셀러 수 : 531
faker 패키지 공식문서에서 random_element라는 메서드를 발견했다. elements에 OrderedDict를 넘기면 각 키가 후보 값이 되고, 각 값은 해당 후보가 선택될 가중치(확률)로 사용된다. 사용법은 아래와 같다.
>>> Faker.seed(0)
>>> for _ in range(10):
... fake.random_element(elements=OrderedDict([("a", 0.45), ("b", 0.35), ("c", 0.15), ("d", 0.05), ]))
...
'c'
'b'
'a'
'a'
'b'
'a'
'b'
'a'
'b'
'b'
적용 후 코드
# -*- coding: utf-8 -*-
from table.User import User
from faker import Faker
from collections import OrderedDict
fake = Faker('ko_KR')
def create_user_dataset(num):
    """Build ``num`` fake users with a weighted USER/ADMIN role choice.

    The role is picked with ``fake.random_element`` from an ``OrderedDict``
    that pairs ``"USER"`` with 0.9 and ``"ADMIN"`` with 0.1. The password
    field is filled with ``fake.sha256()``.

    Args:
        num: Number of users to create; ids run from 1 to ``num``.

    Returns:
        A list of ``User`` objects in ascending id order.
    """

    def build_user(user_id):
        # Keep the draw order email -> password -> role for every user.
        email = fake.unique.ascii_free_email()
        password = fake.sha256()
        role = fake.random_element(
            elements=OrderedDict([("USER", 0.9), ("ADMIN", 0.1)])
        )
        return User(user_id, email, password, role)

    return [build_user(user_id) for user_id in range(1, num + 1)]
if __name__ == "__main__":
    # Sanity check for the weighted role selection: count users per role.
    user_class_list = create_user_dataset(1000)

    # Collect the roles from the list itself instead of indexing with a
    # second hard-coded 1000 that would have to be kept in sync by hand.
    roles = [user.role for user in user_class_list]
    user_cnt = roles.count("USER")
    admin_cnt = roles.count("ADMIN")

    print("일반 사용자 수 : ", user_cnt)
    # The ADMIN tally is reported under the "셀러 수" label.
    print("셀러 수 : ", admin_cnt)
결과값은 아래와 같다.
일반 사용자 수 : 901
셀러 수 : 99
현재 코드 분석
def create_product_dataset(brand_class_list, categroy_class_list, num):
    """Build ``num`` fake products (excerpt).

    The ``...`` lines are kept exactly as they appear in the excerpt;
    presumably they stand for omitted code that prepares ``brand_id``,
    ``name``, ``thumbnail``, ``category_id``, ``amount``, ``review_num``
    and ``review_avg``, none of which are assigned in the visible lines.

    Args:
        brand_class_list: Brand objects (not used in the visible lines).
        categroy_class_list: Category objects (not used in the visible lines).
        num: Number of products to create; ids run from 1 to ``num``.

    Returns:
        A list of ``Product`` objects in ascending id order.
    """
    products = []
    ...
    for i in range(1, num + 1):
        ...
        # One draw across the whole 1,000 - 10,000,000 span, in steps of 100.
        price = fake.pyint(min_value=1000, max_value=10_000_000, step=100)
        ...
        products.append(
            Product(
                i,
                brand_id,
                name,
                thumbnail,
                category_id,
                price,
                amount,
                review_num,
                review_avg,
            )
        )
    return products
문제해결시도!
from faker import Faker
from collections import OrderedDict
fake = Faker('ko_KR')
# Experiment: draw a price ceiling with weighted probabilities, draw a price
# below that ceiling, and count how often each ceiling was selected.

# ceiling (as returned by the weighted choice) -> (price floor, step)
tier_params = {
    "100000": (1000, 100),
    "500000": (100001, 1000),
    "1000000": (500001, 1000),
    "5000000": (1000001, 1000),
    "10000000": (5000001, 1000),
}
tier_counts = dict.fromkeys(tier_params, 0)

# NOTE: range(1, 1000) performs 999 draws, not 1000.
for _ in range(1, 1000):
    # Pick the ceiling using the weights attached to each candidate.
    max_value = fake.random_element(
        elements=OrderedDict([
            ("100000", 0.7),
            ("500000", 0.2),
            ("1000000", 0.05),
            ("5000000", 0.03),
            ("10000000", 0.02),
        ])
    )
    floor, step = tier_params[max_value]
    # The price itself is not used further; only the tier tally is reported.
    price = fake.pyint(min_value=floor, max_value=int(max_value), step=step)
    tier_counts[max_value] += 1

print("max_value_100000_cnt", tier_counts["100000"])
print("max_value_500000_cnt", tier_counts["500000"])
print("max_value_1000000_cnt", tier_counts["1000000"])
print("max_value_5000000_cnt", tier_counts["5000000"])
print("max_value_10000000_cnt", tier_counts["10000000"])
또 다른 시도!
def create_product_dataset(brand_class_list, categroy_class_list, num):
    """Build ``num`` fake products with prices split into fixed-share tiers.

    Products are assigned to a price tier by their 1-based position ``i``:
    the first 60% get the cheapest tier, the next 30% the second, then 5%,
    3% and 2%. Tier boundaries are compared with integer arithmetic
    (``i * 100 <= num * cumulative_percent``), so they do not depend on
    floating-point rounding of sums such as ``0.6 + 0.3``.

    The ``...`` lines are kept exactly as they appear in the excerpt;
    presumably they stand for omitted code that prepares ``brand_id``,
    ``name``, ``thumbnail``, ``category_id``, ``amount``, ``review_num``
    and ``review_avg``, none of which are assigned in the visible lines.

    Args:
        brand_class_list: Brand objects (not used in the visible lines).
        categroy_class_list: Category objects (not used in the visible lines).
        num: Number of products to create; ids run from 1 to ``num``.

    Returns:
        A list of ``Product`` objects in ascending id order.
    """
    # (cumulative share in percent, min price, max price, step)
    # Shares per tier: 60% / 30% / 5% / 3% / 2%.
    price_tiers = (
        (60, 1000, 100000, 100),
        (90, 100000, 500000, 1000),
        (95, 500000, 1000000, 1000),
        (98, 1000000, 5000000, 1000),
        (100, 5000000, 10000000, 1000),
    )

    product_class_list = []
    ...
    for i in range(1, num + 1):
        ...
        # The first tier whose cumulative share covers position i wins.
        # The final tier is 100%, so every i in 1..num is matched.
        for cumulative_percent, low, high, step in price_tiers:
            if i * 100 <= num * cumulative_percent:
                price = fake.pyint(min_value=low, max_value=high, step=step)
                break
        ...
        product = Product(i, brand_id, name, thumbnail, category_id, price, amount, review_num, review_avg)
        product_class_list.append(product)
    return product_class_list
if __name__ == "__main__":
    # Generate the dependent datasets, then 1000 products to inspect.
    user_class_list = user_faker.create_user_dataset(50)
    brand_class_list = brand_faker.create_brand_dataset(user_class_list, 100)
    categroy_class_list = category_faker.create_catogory_dataset()
    product_class_list = create_product_dataset(brand_class_list, categroy_class_list, 1000)

    # Each entry: [label printed in the report, inclusive upper bound, tally].
    price_buckets = [
        ["range_1000_to_100000_cnt : ", 100000, 0],
        ["range_100000_to_500000_cnt : ", 500000, 0],
        ["range_500000_to_1000000_cnt : ", 1000000, 0],
        ["range_1000000_to_5000000_cnt : ", 5000000, 0],
        ["range_5000000_to_10000000_cnt : ", 10000000, 0],
    ]

    for product in product_class_list:
        print(product)
        price = product.price
        # Prices below 1000 fall outside every bucket.
        if not 1000 <= price:
            continue
        # Bounds ascend, so the first bound that is >= price identifies the
        # bucket; a price exactly on a bound belongs to the lower bucket.
        for bucket in price_buckets:
            if price <= bucket[1]:
                bucket[2] += 1
                break

    for label, _, tally in price_buckets:
        print(label, tally)
    print(len(product_class_list))