python fake data with multiprocess (Faker)

from concurrent.futures import ProcessPoolExecutor
import os
import threading
import time
from faker import Faker
import csv
# Module-level Faker instance; write_fake_data() reads this global to generate rows.
fake = Faker()

# Directory the generated CSV files are written into.
csv_save_dir = './save_csv/'
# Size of the process pool; file_num below is derived from it, so it is also
# the number of files produced.
worker_count = 5
# Number of data rows written to each file.
per_row = 20000
# Base file name; write_fake_data() prefixes it with the file number.
file_name = "test.txt"
# One entry per file to generate (0 .. worker_count - 1).
file_num = list(range(worker_count))

# exist_ok=True replaces the separate "exists?" check: the check-then-create
# sequence can fail with FileExistsError if the directory appears in between,
# e.g. when this module-level code is executed by more than one process.
os.makedirs(csv_save_dir, exist_ok=True)

def write_fake_data(file_num):
    """Write one CSV file of fake profile rows.

    The file is created at ``<csv_save_dir>/<file_num>_<file_name>`` and
    contains a header row followed by ``per_row`` data rows. When finished,
    the thread id and process id of the writer are printed.

    Args:
        file_num: Value used as the file-name prefix. ``main`` passes one
            entry of the module-level ``file_num`` list per task.
    """
    csv_file_save_path = os.path.join(csv_save_dir, f"{file_num}_{file_name}")
    columns = ["name", "ssn", "address", "blood_group", "job", "company", "residence", "sex", "mail", "birthdate"]
    # Columns whose values have embedded newlines removed before writing.
    strip_newlines = {"address", "company", "residence"}

    with open(csv_file_save_path, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for _ in range(per_row):
            # Build exactly one profile per row. Calling fake.profile() once
            # per column would generate a separate profile for every field,
            # so a row would mix values from unrelated profiles and cost
            # roughly ten times the generation work.
            profile = fake.profile()
            writer.writerow([
                profile[col].replace("\n", "") if col in strip_newlines else profile[col]
                for col in columns
            ])
    print(f"Thread={threading.get_ident()}, Process={os.getpid()} CSV File saved...")

def _reseed_worker():
    """Reseed the shared Faker instance inside a freshly started worker.

    NOTE(review): with the ``fork`` start method, workers appear to inherit
    the parent's Faker random state and would then all generate the same
    sequence, i.e. identical files. Reseeding with no argument draws a new
    seed in each worker. Confirm against the Faker version in use.
    """
    fake.seed_instance()


def main():
    """Generate all CSV files in parallel, one task per entry of ``file_num``.

    Raises:
        Exception: Whatever ``write_fake_data`` raised in a worker process is
            re-raised here instead of being silently discarded.
    """
    with ProcessPoolExecutor(max_workers=worker_count, initializer=_reseed_worker) as executor:
        # executor.map() only surfaces a worker's exception when its result is
        # retrieved; consuming the iterator makes failures visible.
        list(executor.map(write_fake_data, file_num))



if __name__ == '__main__':
    # Time the complete run of main() and report it rounded to two decimals.
    start = time.perf_counter()
    main()
    finish = time.perf_counter()
    elapsed = round(finish - start, 2)
    print(f'Finished in {elapsed} second(s)')

Thread=140308890367808, Process=25978 CSV File saved...
Thread=140308890367808, Process=25979 CSV File saved...
Thread=140308890367808, Process=25980 CSV File saved...
Thread=140308890367808, Process=25976 CSV File saved...
Thread=140308890367808, Process=25977 CSV File saved...
Finished in 341.58 second(s)

5개의 process로 10만 건의 데이터를 생성하는 데 341초가 걸렸다.

from faker import Faker
import csv
import time

# Single-process baseline: write 100000 fake profile rows to test.txt and
# report the elapsed time.
fake = Faker()
start = time.perf_counter()

columns = ["name", "ssn", "address", "blood_group", "job", "company", "residence", "sex", "mail", "birthdate"]
# Columns whose values have embedded newlines removed before writing.
strip_newlines = {"address", "company", "residence"}

# newline="" hands line-ending control to the csv module, as the csv
# documentation recommends for files passed to csv.writer.
with open("test.txt", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(columns)
    for _ in range(100000):
        # One profile per row: calling fake.profile() once per column would
        # mix fields from unrelated profiles and multiply the generation cost.
        profile = fake.profile()
        writer.writerow([
            profile[col].replace("\n", "") if col in strip_newlines else profile[col]
            for col in columns
        ])

finish = time.perf_counter()

print(f'Finished in {round(finish-start, 2)} second(s)')

Finished in 1674.05 second(s)

1개의 process로 10만 건의 데이터를 생성하는 데 1674초가 걸렸다.

저작자표시 비영리 변경금지

'python' 카테고리의 다른 글

이것저것 (0)	2023.01.27
report 생성 (0)	2023.01.23
python single line to multi line (0)	2023.01.18
aiosmtpd (1)	2022.12.29
python fake data (mimesis) (0)	2022.09.10

kyeongseo.oh

python fake data with multiprocess (Faker)

'python' 카테고리의 다른 글

댓글

티스토리툴바

python fake data with multiprocess (Faker)

'python' 카테고리의 다른 글

관련글

댓글

티스토리툴바