Install mimesis 6.0.0:
python3.9 -m pip install mimesis
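To confirm that the expected version was installed (mimesis exposes __version__):

python3.9 -c "import mimesis; print(mimesis.__version__)"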
import time
from mimesis.schema import Field, Schema
from mimesis import builtins
us = builtins.USASpecProvider()  # US-specific provider, used here for SSNs
_ = Field(locale="en")  # generic field handler; _("full_name") resolves to the person provider
schema = Schema(schema=lambda: {
"name": _("full_name"),
"ssn": us.ssn(),
"address": _("address"),
"blood_type": _("blood_type"),
"job": _("occupation"),
"company": _("company"),
"continent": _("continent"),
"sex": _("sex"),
"email": _("person.email", domains=["test.com"], key=str.lower),
"birthdate": _("timestamp", posix=False),
})
start = time.perf_counter()
schema.to_csv(file_path='data.csv', iterations=100000)
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
Finished in 8.44 second(s)
With a single process, generating 100,000 rows took about 8 seconds.
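Before launching a full run, it can help to preview what the schema produces. schema.create() (the same call used later for chunked writing) returns the rows as a list of dicts instead of writing a file:

# preview a couple of records without writing any file
sample = schema.create(2)
for row in sample:
    print(row)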
from mimesis.schema import Field, Schema
from mimesis import builtins
from concurrent.futures import ProcessPoolExecutor
import os
import threading
import time
us = builtins.USASpecProvider()
csv_save_dir = './mime_csv/'
worker_count = 5  # number of worker processes
per_row = 20000  # rows per process (5 * 20,000 = 100,000 total)
file_name = "mime.csv"
file_num = list(range(worker_count))  # one output-file index per worker
if not os.path.exists(csv_save_dir):
os.makedirs(csv_save_dir)
def write_fake_data(file_num):
csv_file_save_path = os.path.join(csv_save_dir, f"{file_num}_{file_name}")
    _ = Field(locale="en")  # each worker process builds its own Field and Schema
schema = Schema(schema=lambda: {
"name": _("full_name"),
"ssn": us.ssn(),
"address": _("address"),
"blood_type": _("blood_type"),
"job": _("occupation"),
"company": _("company"),
"continent": _("continent"),
"sex": _("sex"),
"email": _("person.email", domains=["test.com"], key=str.lower),
"birthdate": _("timestamp", posix=False),
})
schema.to_csv(file_path=csv_file_save_path, iterations=per_row)
print(f"Thread={threading.get_ident()}, Process={os.getpid()} CSV File saved...")
def main():
with ProcessPoolExecutor(max_workers=worker_count) as executor:
executor.map(write_fake_data, file_num)
if __name__ == '__main__':
start = time.perf_counter()
main()
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
Thread=139880914614080, Process=40013 CSV File saved...
Thread=139880914614080, Process=40015 CSV File saved...
Thread=139880914614080, Process=40009 CSV File saved...
Thread=139880914614080, Process=40012 CSV File saved...
Thread=139880914614080, Process=40014 CSV File saved...
Finished in 1.75 second(s)
With 5 processes, generating 100,000 rows took about 2 seconds.
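Each worker writes its own file (0_mime.csv through 4_mime.csv). If a single combined CSV is needed, the parts can be concatenated afterwards, keeping only the first file's header row. A minimal sketch (combined.csv is just an example name, and this assumes each part starts with a header line, as to_csv writes one):

import glob

paths = sorted(glob.glob('./mime_csv/*_mime.csv'))
with open('./mime_csv/combined.csv', 'w', encoding='utf-8', newline='') as out:
    for i, path in enumerate(paths):
        with open(path, encoding='utf-8', newline='') as part:
            header = part.readline()
            if i == 0:
                out.write(header)  # keep the header from the first part only
            out.writelines(part)  # stream the remaining data lines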
This is dramatically faster than the Faker-based version (https://kyeongseo.tistory.com/entry/python-fake-data-with-multiprocess).
Generating 10 million rows this way took 173.27 second(s).
However, generating a very large dataset with mimesis triggered an OOM.
On inspection, to_csv holds every generated row in memory and writes them to the file in one go only after generation completes.
After modifying the code to flush to the file in fixed-size chunks, large-scale data generation became possible.
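Schematically, the fix swaps a buffer-everything pattern for generate-and-flush chunks. In the sketch below, generate_row and write_rows are hypothetical stand-ins for the schema callable and the CSV writer:

def buffered(generate_row, n, write_rows):
    rows = [generate_row() for _ in range(n)]  # all n rows sit in memory at once
    write_rows(rows)  # written only after generation finishes

def chunked(generate_row, n, write_rows, chunk_size=100000):
    for start in range(0, n, chunk_size):
        size = min(chunk_size, n - start)
        write_rows([generate_row() for _ in range(size)])  # peak memory is ~ one chunk

The full modified script: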
from mimesis.schema import Field, Schema
from mimesis import builtins
from concurrent.futures import ProcessPoolExecutor
import os
import threading
import time
import csv
us = builtins.USASpecProvider()
csv_save_dir = './mime_csv/'
worker_count = 5  # number of worker processes
per_row = 20000000  # rows per process (5 * 20,000,000 = 100,000,000 total)
bulk_write = 100000  # rows generated and flushed to disk per chunk
quotient, remainder = divmod(per_row, bulk_write)  # full chunks per worker, plus leftover rows
file_name = "mime.csv"
file_num = list(range(worker_count))
if not os.path.exists(csv_save_dir):
os.makedirs(csv_save_dir)
def write_fake_data(file_num):
csv_file_save_path = os.path.join(csv_save_dir, f"{file_num}_{file_name}")
_ = Field(locale="en")
schema = Schema(schema=lambda: {
"name": _("full_name"),
"ssn": us.ssn(),
"address": _("address"),
"blood_type": _("blood_type"),
"job": _("occupation"),
"company": _("company"),
"continent": _("continent"),
"sex": _("sex"),
"email": _("person.email", domains=["test.com"], key=str.lower),
"birthdate": _("timestamp", posix=False),
})
    data = schema.create(1)  # one throwaway record, used only to discover the column names
    fieldnames = list(data[0])
    # "w" so re-runs start fresh instead of appending duplicate headers and rows
    with open(csv_file_save_path, "w", encoding="utf-8", newline="") as fp:
        dict_writer = csv.DictWriter(fp, fieldnames)
        dict_writer.writeheader()
        for i in range(quotient):  # full chunks of bulk_write rows
            data = schema.create(bulk_write)
            dict_writer.writerows(data)
        if remainder != 0:  # any leftover rows
            data = schema.create(remainder)
            dict_writer.writerows(data)
print(f"Thread={threading.get_ident()}, Process={os.getpid()} CSV File saved...")
def main():
with ProcessPoolExecutor(max_workers=worker_count) as executor:
executor.map(write_fake_data, file_num)
if __name__ == '__main__':
start = time.perf_counter()
main()
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
Thread=140022190335808, Process=126781 CSV File saved...
Thread=140022190335808, Process=126786 CSV File saved...
Thread=140022190335808, Process=126787 CSV File saved...
Thread=140022190335808, Process=126785 CSV File saved...
Thread=140022190335808, Process=126784 CSV File saved...
Finished in 1706.34 second(s)
Five processes generated 100 million rows in total, flushing to the files in chunks of 100,000 rows.
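As a sanity check, the row counts can be tallied per file (a simple line count; slow on files this size, but straightforward):

import glob

total = 0
for path in sorted(glob.glob('./mime_csv/*_mime.csv')):
    with open(path, encoding='utf-8') as fp:
        rows = sum(1 for _ in fp) - 1  # subtract the header line
    print(path, rows)
    total += rows
print('total:', total)  # expected: 100,000,000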