python fake data with multiprocess (Faker)

from concurrent.futures import ProcessPoolExecutor
import os
import threading
import time
from faker import Faker
import csv
fake = Faker()

# Directory (relative to the working directory) that receives the CSV files.
csv_save_dir = './save_csv/'
# Number of worker processes; one output file is produced per worker.
worker_count = 5
# Number of data rows written to each file (the header row is extra).
per_row = 20000
# Common file-name suffix; the file index is prepended to it.
file_name = "test.txt"
# File indices handed to the workers: [0, 1, ..., worker_count - 1].
file_num = list(range(worker_count))

# exist_ok=True replaces the exists()/makedirs() pair, which can raise
# FileExistsError when several processes import this module at the same time
# (each worker re-runs module-level code under the "spawn" start method).
os.makedirs(csv_save_dir, exist_ok=True)

def write_fake_data(file_num):
    """Write one CSV file of fake profiles for the given file index.

    The file is created at ``<csv_save_dir>/<file_num>_<file_name>`` and holds
    a header row followed by ``per_row`` data rows. When the file is complete,
    the current thread id and process id are printed.

    Args:
        file_num: Index used as the prefix of the output file name.
    """
    # Worker processes created by fork start with a copy of the parent's Faker
    # random state; reseeding from OS entropy keeps the workers from all
    # producing the same sequence of rows.
    fake.seed_instance()

    csv_file_save_path = os.path.join(csv_save_dir, f"{file_num}_{file_name}")
    with open(csv_file_save_path, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "ssn", "address", "blood_group", "job", "company", "residence", "sex", "mail", "birthdate"])
        for _ in range(per_row):
            # Build a single profile per row. Calling fake.profile() once per
            # column mixed fields from ten unrelated people into one record
            # and did roughly ten times the generation work.
            profile = fake.profile()
            writer.writerow([
                profile["name"],
                profile["ssn"],
                # Embedded newlines are stripped so each record stays on one line.
                profile["address"].replace("\n", ""),
                profile["blood_group"],
                profile["job"],
                profile["company"].replace("\n", ""),
                profile["residence"].replace("\n", ""),
                profile["sex"],
                profile["mail"],
                profile["birthdate"],
            ])
    print(f"Thread={threading.get_ident()}, Process={os.getpid()} CSV File saved...")

def main():
    """Generate every CSV file in parallel, one task per entry in ``file_num``.

    Raises:
        Exception: Whatever a worker raised inside ``write_fake_data``; it is
            re-raised here when that task's result is retrieved.
    """
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        # executor.map only surfaces a worker's exception when its result is
        # retrieved, so the results are iterated even though each one is None.
        # Without this, a failed worker would go unnoticed.
        for _ in executor.map(write_fake_data, file_num):
            pass



if __name__ == '__main__':
    # Measure the wall-clock duration of the whole generation run.
    started_at = time.perf_counter()
    main()
    elapsed = time.perf_counter() - started_at
    print(f'Finished in {round(elapsed, 2)} second(s)')

Thread=140308890367808, Process=25978 CSV File saved...
Thread=140308890367808, Process=25979 CSV File saved...
Thread=140308890367808, Process=25980 CSV File saved...
Thread=140308890367808, Process=25976 CSV File saved...
Thread=140308890367808, Process=25977 CSV File saved...
Finished in 341.58 second(s)

5개의 process로 10만건 데이터 생성에 341초

from faker import Faker
import csv
import time

# Single-process baseline: write 100000 fake profiles to one CSV file and
# report the elapsed time.
fake = Faker()
start = time.perf_counter()
# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra blank line between rows.
with open("test.txt", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "ssn", "address", "blood_group", "job", "company", "residence", "sex", "mail", "birthdate"])
    for _ in range(100000):
        # Build a single profile per row. Calling fake.profile() once per
        # column mixed fields from ten unrelated people into one record and
        # did roughly ten times the generation work.
        profile = fake.profile()
        writer.writerow([
            profile["name"],
            profile["ssn"],
            # Embedded newlines are stripped so each record stays on one line.
            profile["address"].replace("\n", ""),
            profile["blood_group"],
            profile["job"],
            profile["company"].replace("\n", ""),
            profile["residence"].replace("\n", ""),
            profile["sex"],
            profile["mail"],
            profile["birthdate"],
        ])

finish = time.perf_counter()

print(f'Finished in {round(finish-start, 2)} second(s)')

Finished in 1674.05 second(s)

1개의 process로 10만건 데이터 생성에 1674초

저작자표시 비영리 변경금지

'python' 카테고리의 다른 글

이것저것 (0)	2023.01.27
report 생성 (0)	2023.01.23
python single line to multi line (0)	2023.01.18
aiosmtpd (1)	2022.12.29
python fake data (mimesis) (0)	2022.09.10

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

kyeongseo.oh

python fake data with multiprocess (Faker)

'python' 카테고리의 다른 글

댓글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

python fake data with multiprocess (Faker)

'python' 카테고리의 다른 글

관련글

댓글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역