본문 바로가기
python

python fake data with multiprocess (Faker)

by kyeongseo.oh 2022. 9. 9.
from concurrent.futures import ProcessPoolExecutor
import os
import threading
import time
from faker import Faker
import csv
# Module-level Faker generator; write_fake_data reads it to produce each row.
fake = Faker()

# Directory that receives the generated CSV files.
csv_save_dir = './save_csv/'
# Size of the process pool; also the number of output files (see file_num).
worker_count = 5
# Number of data rows written to each output file.
per_row = 20000
# Common suffix of every output file name; the task id is prepended to it.
file_name = "test.txt"
# One task id per worker: [0, 1, ..., worker_count - 1].
file_num = list(range(worker_count))

# exist_ok=True replaces the check-then-create pair, which could raise
# FileExistsError if another process created the directory in between.
os.makedirs(csv_save_dir, exist_ok=True)

def write_fake_data(file_num):
    """Write one CSV file of fake profile rows.

    The file is created at ``csv_save_dir`` joined with
    ``f"{file_num}_{file_name}"``. It contains a header row followed by
    ``per_row`` data rows, and a confirmation line is printed once the
    file has been written.

    Args:
        file_num: Value used as the prefix of the output file name.
    """
    csv_file_save_path = os.path.join(csv_save_dir, f"{str(file_num)}_{file_name}")
    with open(csv_file_save_path, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "ssn", "address", "blood_group", "job", "company", "residence", "sex", "mail", "birthdate"])
        # NOTE(review): `fake` is created at import time; with the "fork"
        # start method the workers may inherit identical random state and
        # emit identical rows -- confirm, and reseed per process if needed.
        for _ in range(per_row):
            # Build ONE profile per row. Calling fake.profile() once per
            # column generated ten unrelated profiles for every row, which
            # mixed fields of different people and did ten times the work.
            profile = fake.profile()
            writer.writerow([
                profile["name"],
                profile["ssn"],
                # Multi-line values are flattened so each record stays on one line.
                profile["address"].replace("\n", ""),
                profile["blood_group"],
                profile["job"],
                profile["company"].replace("\n", ""),
                profile["residence"].replace("\n", ""),
                profile["sex"],
                profile["mail"],
                profile["birthdate"],
            ])
    print(f"Thread={threading.get_ident()}, Process={os.getpid()} CSV File saved...")

def main():
    """Run ``write_fake_data`` for every entry of ``file_num`` in a process pool.

    The pool is created with ``worker_count`` processes and is shut down
    when the ``with`` block exits.

    Raises:
        Exception: Any exception raised inside a worker is re-raised here.
    """
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        # executor.map only re-raises a worker's exception when its result
        # is retrieved; draining the iterator keeps failures from being
        # silently discarded.
        for _ in executor.map(write_fake_data, file_num):
            pass



if __name__ == '__main__':
    # Time the whole run with a monotonic high-resolution clock.
    started_at = time.perf_counter()
    main()
    elapsed = time.perf_counter() - started_at
    print(f'Finished in {round(elapsed, 2)} second(s)')
Thread=140308890367808, Process=25978 CSV File saved...
Thread=140308890367808, Process=25979 CSV File saved...
Thread=140308890367808, Process=25980 CSV File saved...
Thread=140308890367808, Process=25976 CSV File saved...
Thread=140308890367808, Process=25977 CSV File saved...
Finished in 341.58 second(s)

5개의 process로 10만 건(process당 2만 건)의 데이터를 생성하는 데 341초가 걸렸다.

 

from faker import Faker
import csv
import time

# Single-process baseline: write 100000 fake profile rows to test.txt and
# report the elapsed wall-clock time.
fake = Faker()
start = time.perf_counter()
# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra "\r" on every row.
with open("test.txt", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "ssn", "address", "blood_group", "job", "company", "residence", "sex", "mail", "birthdate"])
    for _ in range(100000):
        # Build ONE profile per row. Calling fake.profile() once per column
        # generated ten unrelated profiles for every row, which mixed fields
        # of different people and did ten times the work.
        profile = fake.profile()
        writer.writerow([
            profile["name"],
            profile["ssn"],
            # Multi-line values are flattened so each record stays on one line.
            profile["address"].replace("\n", ""),
            profile["blood_group"],
            profile["job"],
            profile["company"].replace("\n", ""),
            profile["residence"].replace("\n", ""),
            profile["sex"],
            profile["mail"],
            profile["birthdate"],
        ])

finish = time.perf_counter()

print(f'Finished in {round(finish-start, 2)} second(s)')
Finished in 1674.05 second(s)

1개의 process로 10만 건의 데이터를 생성하는 데 1674초가 걸렸다 (5개 process 대비 약 4.9배).

'python' 카테고리의 다른 글

이것저것  (0) 2023.01.27
report 생성  (0) 2023.01.23
python single line to multi line  (0) 2023.01.18
aiosmtpd  (1) 2022.12.29
python fake data (mimesis)  (0) 2022.09.10

댓글