
Single table - Adult dataset

The following example script shows how to use the aindo.rdml library to generate synthetic data in the single-table case. It makes use of the UCI Adult dataset.

import argparse
import json
from pathlib import Path

import pandas as pd
import torch

from aindo.rdml.eval import compute_privacy_stats, report
from aindo.rdml.relational import Column, RelationalData, Schema, Table
from aindo.rdml.synth import (
    Size,
    TabularDataset,
    TabularModel,
    TabularModelSize,
    TabularPreproc,
    TabularTrainer,
    Validation,
)


def example_adult(
    data_dir: Path,
    output_dir: Path,
    data_frac: float | None,
    model_size: Size | TabularModelSize | str,
    n_epochs: int | None,
    n_steps: int | None,
    valid_each: int,
    device: str | torch.device | None,
    memory: int,
) -> None:
    # Define the schema and load the data
    schema = Schema(
        adult=Table(
            columns={
                "age": Column.INTEGER,
                "workclass": Column.CATEGORICAL,
                "fnlwgt": Column.INTEGER,
                "education": Column.CATEGORICAL,
                "education-num": Column.CATEGORICAL,
                "marital-status": Column.CATEGORICAL,
                "occupation": Column.CATEGORICAL,
                "relationship": Column.CATEGORICAL,
                "race": Column.CATEGORICAL,
                "sex": Column.CATEGORICAL,
                "capital-gain": Column.INTEGER,
                "capital-loss": Column.INTEGER,
                "hours-per-week": Column.INTEGER,
                "native-country": Column.CATEGORICAL,
                "y": Column.CATEGORICAL,
            }
        ),
    )
    data = {
        "adult": pd.read_csv(
            data_dir / "adult.data",
            names=list(schema.tables["adult"].columns),
        ),
    }
    data = RelationalData(data=data, schema=schema)
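    # Optionally keep only a fraction of the data, e.g. for quick experiments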
    if data_frac is not None:
        _, data = data.split(ratio=data_frac)

    # Define and fit the preprocessor
    preproc = TabularPreproc.from_schema(schema=schema).fit(data=data)

    # Split the data into training, validation, and test sets
    split_ratio = 0.1
    data_train_valid, data_test = data.split(ratio=split_ratio)
    data_train, data_valid = data_train_valid.split(ratio=split_ratio)

    # Build model
    model = TabularModel.build(preproc=preproc, size=model_size)
    model.device = device  # If device is None, it is set to CUDA when available, otherwise to CPU

    # Build the training and validation datasets and train the model
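    # With on_disk=True, the preprocessed training data is stored on disk rather than in memory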
    dataset_train = TabularDataset.from_data(data=data_train, preproc=preproc, on_disk=True)
    dataset_valid = TabularDataset.from_data(data=data_valid, preproc=preproc)
    trainer = TabularTrainer(model=model)
    trainer.train(
        dataset=dataset_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
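        # Validate every `valid_each` steps, keeping the best checkpoint and logging to TensorBoard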
        valid=Validation(
            dataset=dataset_valid,
            early_stop="normal",
            save_best=output_dir / "best.pt",
            tensorboard=output_dir / "tb",
            each=valid_each,
            trigger="step",
        ),
    )

    # Generate a synthetic dataset with as many samples as the original table
    data_synth = model.generate(
        n_samples=data["adult"].shape[0],
        batch_size=1024,
    )
    data_synth.to_csv(output_dir / "synth")

    # Compute the evaluation report and save it as a PDF
    report(
        data_train=data_train,
        data_test=data_test,
        data_synth=data_synth,
        path=output_dir / "report.pdf",
    )

    # Compute additional privacy statistics and save a summary to JSON
    privacy_stats = compute_privacy_stats(
        data_train=data_train,
        data_synth=data_synth,
    )
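    # Summarize per-table privacy metrics: the privacy score, its standard deviation,
    # and the percentage of training points at risk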
    privacy_stats_out = {
        t: {
            "privacy_score": ps.privacy_score,
            "privacy_score_std": ps.privacy_score_std,
            "%_points_at_risk": ps.risk * 100,
        }
        for t, ps in privacy_stats.items()
    }
    with open(output_dir / "privacy_stats.json", mode="w", encoding="utf-8") as f:
        json.dump(privacy_stats_out, f)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", type=Path, help="The directory were to find the 'adult' dataset")
    parser.add_argument("output_dir", type=Path, help="The output directory")
    parser.add_argument("--data-frac", "-d", type=float, help="Fraction of data to use")
    parser.add_argument("--model-size", "-m", type=Size.from_str, default=Size.SMALL, help="Model size")
    parser.add_argument(
        "--n",
        "-n",
        type=int,
        default=1000,
        help="Training epochs (or steps if the --steps flag is used)",
    )
    parser.add_argument("--steps", "-s", action="store_true", help="Use steps instead of epochs")
    parser.add_argument("--valid-each", "-v", type=int, default=200, help="# steps between validations")
    parser.add_argument("--device", "-g", default=None, help="Training device")
    parser.add_argument("--memory", "-y", type=int, default=4096, help="Available memory (MB)")
    args = parser.parse_args()

    example_adult(
        data_dir=args.data_dir,
        output_dir=args.output_dir,
        data_frac=args.data_frac,
        model_size=args.model_size,
        n_epochs=None if args.steps else args.n,
        n_steps=args.n if args.steps else None,
        valid_each=args.valid_each,
        device=args.device,
        memory=args.memory,
    )
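
The script can be run from the command line. A possible invocation, assuming the script is saved as example_adult.py and the adult.data file is located in a local data directory:

python example_adult.py data out --steps -n 5000

This trains the default (small) model for 5000 steps and writes the synthetic data, the PDF report, and the privacy statistics to the out directory.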