#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Generate a dataset with millions of rows
Coded by Arul John in 2024

Usage:
    -n/--number  number of records to generate, in millions (1-100)
    -f/--format  output format, either ``csv`` or ``json``

The output is written to ``<n>_million_people.<format>`` in the current
working directory.
"""

import argparse
import calendar
import csv
import json
import random
import time

# Vars
road_names = 'Main Church High Elm Park Walnut Washington Chestnut Broad Maple Oak Maple Center Pine River Market Washington Water Union'.split()
road_types = 'Road Way Street Avenue Boulevard Lane Drive Terrace Place Court Plaza Square'.split()
cities = 'Savannah Eugene Jackson Spokane Florence Morro Missoula Flagstaff Covington Newport Springfield'.split()
states = 'MA MI MN MS MO MT NE NV NH OH OK OR PA RI SC SD TN TX UT VT VA WA WV'.split()
first_names = 'Olivia Noah Emma Liam Amelia Oliver Sophia Elijah Charlotte Mateo Ava Lucas Isabella Levi'.split()
last_names = 'Smith Johnson Williams Brown Jones Miller Davis Garcia Rodriguez Wilson'.split()
one_million = 1000000
# Column order of every record; generate_record() must emit values in this order.
headers = ['Name', 'Date of Birth', 'SSN', 'City', 'State', 'Address', 'Zip', 'Phone']


def generate_name():
    """Return a random '<first> <last>' name."""
    return f'{random.choice(first_names)} {random.choice(last_names)}'


def generate_ssn():
    """Return a random SSN-shaped string 'NNN-NN-NNNN'."""
    return f'{random.randint(100, 999)}-{random.randint(10, 99)}-{random.randint(1000, 9999)}'


def generate_dob():
    """Return a random, valid calendar date between 1919 and 2023 as 'Y-M-D'.

    Month and day are not zero-padded. The day is bounded by the real length
    of the chosen month so impossible dates (e.g. February 31) never appear.
    """
    year = random.randint(1919, 2023)
    month = random.randint(1, 12)
    # monthrange() returns (weekday_of_first_day, days_in_month).
    days_in_month = calendar.monthrange(year, month)[1]
    day = random.randint(1, days_in_month)
    return f'{year}-{month}-{day}'


def generate_phone():
    """Return a random phone-shaped string 'NNN-NNN-NNNN'."""
    return f'{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}'


def generate_address():
    """Return a random street address '<number> <road name> <road type>'."""
    return f'{random.randint(100,99999)} {random.choice(road_names)} {random.choice(road_types)}'


def generate_city():
    """Return a random city from ``cities``."""
    return f'{random.choice(cities)}'


def generate_state():
    """Return a random state abbreviation from ``states``."""
    return f'{random.choice(states)}'


def generate_zipcode():
    """Return a random 5-character zip code, left-padded with zeros."""
    return f'{str(random.randint(100, 99999)).zfill(5)}'


def generate_record():
    """Return one record as a tuple whose fields line up with ``headers``.

    The order here is Name, Date of Birth, SSN, City, State, Address, Zip,
    Phone. Keeping it in a single place guarantees the CSV and JSON outputs
    put each value under its matching column / key.
    """
    return (
        generate_name(),
        generate_dob(),
        generate_ssn(),
        generate_city(),
        generate_state(),
        generate_address(),
        generate_zipcode(),
        generate_phone(),
    )


def generate_records(count):
    """Lazily yield ``count`` records so callers never hold them all in memory."""
    for _ in range(count):
        yield generate_record()


def write_json_records(records, f):
    """Stream ``records`` to the text file ``f`` as one JSON array.

    The text written is the same as
    ``json.dump(list(records), f, indent=2, ensure_ascii=False)`` but the
    records are consumed one at a time, so ``records`` may be a generator of
    arbitrary length.

    Args:
        records: iterable of JSON-serialisable objects.
        f: object with a ``write(str)`` method.
    """
    wrote_any = False
    for record in records:
        f.write(',\n' if wrote_any else '[\n')
        text = json.dumps(record, indent=2, ensure_ascii=False)
        # Nest the element one level (2 spaces) inside the array. Newlines
        # inside string values are escaped by json, so every raw '\n' here is
        # a formatting line break.
        f.write('  ' + text.replace('\n', '\n  '))
        wrote_any = True
    f.write('\n]' if wrote_any else '[]')


# Create dataset with <num_millions> * 1,000,000 records
def generate_dataset(num_millions, format):
    """Write ``num_millions`` million random records to a CSV or JSON file.

    Rows are generated and written incrementally rather than collected in a
    list first, so memory use stays flat regardless of dataset size.

    Args:
        num_millions: number of records to write, in millions.
        format: 'csv' or 'json'. (The name shadows the builtin but is kept
            so existing keyword callers continue to work.)

    Raises:
        ValueError: if ``format`` is neither 'csv' nor 'json'.
    """
    if format not in ('csv', 'json'):
        raise ValueError(f"Unsupported format {format!r}; expected 'csv' or 'json'")

    total_records = num_millions * one_million

    if format == 'json':
        json_keys = [h.replace(' ', '_').lower() for h in headers]
        output_filename = f'{num_millions}_million_people.json'
        # Explicit UTF-8: ensure_ascii=False must not depend on the locale.
        with open(output_filename, 'w', encoding='utf-8') as f:
            people = (dict(zip(json_keys, record)) for record in generate_records(total_records))
            write_json_records(people, f)
    else:
        output_filename = f'{num_millions}_million_people.csv'
        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)
            writer.writerows(generate_records(total_records))


def main():
    """Parse the command line, generate the dataset and report elapsed time."""
    start_time = time.perf_counter()
    parser = argparse.ArgumentParser(description='Dataset Generator')
    parser.add_argument('-n', '--number', type=int, required=True, choices=list(range(1, 101)), help='Number of records, in millions')
    parser.add_argument('-f', '--format', type=str, required=True, choices=['csv', 'json'], help='Create CSV or JSON dataset')
    args = parser.parse_args()
    print(f'>> Generating {args.format} dataset with {args.number} million rows')
    generate_dataset(args.number, args.format)
    end_time = time.perf_counter()
    print(f'Total Time: {round(end_time - start_time, 5)} seconds.')


# Main function
if __name__ == '__main__':
    main()