Multi table with text - Airbnb dataset

In the following we present an example script that uses the aindo.rdml library to generate synthetic data containing both tabular and text columns.

We make use of the Airbnb Open Data dataset, which in its original form consists of a single table. However, after looking at the content of the dataset columns, we find it natural to rearrange the data into two tables:

  1. A table host, with primary key host_id.
  2. A table listings, with primary key id and foreign key host_id, referring to the primary key of host.

The columns host_name and calculated_host_listings_count are indeed attributes of the host: they are constant across all listings belonging to the same host. The remaining columns, by contrast, describe each individual listing. In the script, the function preproc_data takes care of this rearrangement, while postproc_data performs the inverse transformation, joining the two tables back into a single one.
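
For illustration, the following toy snippet (not part of the script, and with made-up values) shows the same split and join performed with plain pandas:

import pandas as pd

# A toy subset of the original single-table layout (hypothetical values)
df_toy = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "host_id": [10, 10, 20],
        "host_name": ["Ann", "Ann", "Bo"],
        "calculated_host_listings_count": [2, 2, 1],
        "name": ["Cozy loft", "Sunny room", "Quiet flat"],
    }
)

# Parent table: one row per host
host = df_toy[["host_id", "host_name", "calculated_host_listings_count"]].drop_duplicates()
# Child table: one row per listing, keeping the foreign key host_id
listings = df_toy[["id", "host_id", "name"]]

# Joining along host_id recovers the original single-table layout
recovered = host.merge(listings, on="host_id")[df_toy.columns].sort_values("id", ignore_index=True)
print(recovered.equals(df_toy))  # True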

Two columns of the original dataset, host_name and name, should be treated as text columns. After the rearrangement, host_name belongs to the host table, while name belongs to the listings table. Since the two columns live in different tables, we need to build and train two text models on top of the tabular one (which generates the rest of the tabular data).

import argparse
import json
from pathlib import Path

import pandas as pd
import torch

from aindo.rdml.eval import compute_privacy_stats, report
from aindo.rdml.relational import Column, ForeignKey, PrimaryKey, RelationalData, Schema, Table
from aindo.rdml.synth import (
    TabularDataset,
    TabularModel,
    TabularPreproc,
    TabularTrainer,
    TextDataset,
    TextModel,
    TextPreproc,
    TextTrainer,
    Validation,
)


def preproc_data(df: pd.DataFrame, schema: Schema) -> dict[str, pd.DataFrame]:
    """Split the Airbnb dataset in two tables: host (parent) and listings (child)."""
    return {
        "host": df.loc[:, list(schema.tables["host"].all_columns)].drop_duplicates(),
        "listings": df.loc[:, list(schema.tables["listings"].all_columns)],
    }


def postproc_data(data: RelationalData) -> pd.DataFrame:
    """Join the host and listings tables along the foreign key to recover the original format of the Airbnb dataset."""
    return data["host"].merge(data["listings"], on="host_id")


def example_airbnb(
    data_dir: Path,
    output_dir: Path,
    n_epochs: int | None,
    n_steps: int | None,
    valid_each: int,
    device: str | torch.device | None,
    memory: int,
    quick: bool,
) -> None:
    # Load data and define schema
    df = pd.read_csv(data_dir / "airbnb.csv")
    schema = Schema(
        host=Table(
            host_id=PrimaryKey(),
            host_name=Column.TEXT,
            calculated_host_listings_count=Column.NUMERIC,
        ),
        listings=Table(
            id=PrimaryKey(),
            host_id=ForeignKey(parent="host"),
            name=Column.TEXT,
            neighbourhood_group=Column.CATEGORICAL,
            neighbourhood=Column.CATEGORICAL,
            latitude=Column.NUMERIC,
            longitude=Column.NUMERIC,
            room_type=Column.CATEGORICAL,
            price=Column.INTEGER,
            minimum_nights=Column.INTEGER,
            number_of_reviews=Column.INTEGER,
            last_review=Column.DATETIME,
            reviews_per_month=Column.NUMERIC,
            availability_365=Column.INTEGER,
        ),
    )
    data = preproc_data(df=df, schema=schema)
    data = RelationalData(data=data, schema=schema)
    if quick:
        _, data = data.split(ratio=0.2)

    # Define preprocessors
    preproc = TabularPreproc.from_schema(schema=schema).fit(data=data)
    preproc_text_host = TextPreproc.from_tabular(preproc=preproc, table="host").fit(data=data)
    preproc_text_listings = TextPreproc.from_tabular(preproc=preproc, table="listings").fit(data=data)

    # Split data
    split_ratio = 0.1
    data_train_valid, data_test = data.split(ratio=split_ratio)
    data_train, data_valid = data_train_valid.split(ratio=split_ratio)

    # Build datasets
    dataset_train = TabularDataset.from_data(data=data_train, preproc=preproc, on_disk=True)
    dataset_valid = TabularDataset.from_data(data=data_valid, preproc=preproc, on_disk=True)

    dataset_text_host_train = TextDataset.from_data(data=data_train, preproc=preproc_text_host, on_disk=True)
    dataset_text_host_valid = TextDataset.from_data(data=data_valid, preproc=preproc_text_host, on_disk=True)

    dataset_text_listings_train = TextDataset.from_data(data=data_train, preproc=preproc_text_listings, on_disk=True)
    dataset_text_listings_valid = TextDataset.from_data(data=data_valid, preproc=preproc_text_listings, on_disk=True)

    # Build models
    model = TabularModel.build(preproc=preproc, size="tiny" if quick else "small")
    model.device = device  # A device of None means CUDA will be used if available, otherwise the CPU

    model_text_host = TextModel.build(
        preproc=preproc_text_host,
        size="tiny" if quick else "normal",
        block_size=max(dataset_text_host_train.max_text_len, dataset_text_host_valid.max_text_len),
    )
    model_text_host.device = device

    model_text_listings = TextModel.build(
        preproc=preproc_text_listings,
        size="tiny" if quick else "normal",
        block_size=max(dataset_text_listings_train.max_text_len, dataset_text_listings_valid.max_text_len),
    )
    model_text_listings.device = device

    # Train the models
    TabularTrainer(model=model).train(
        dataset=dataset_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
        valid=Validation(
            dataset=dataset_valid,
            early_stop="normal",
            save_best=output_dir / "ckpt" / "tabular.pt",
            tensorboard=output_dir / "tb" / "tabular",
            each=valid_each,
            trigger="step",
        ),
    )

    TextTrainer(model=model_text_host).train(
        dataset=dataset_text_host_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
        valid=Validation(
            dataset=dataset_text_host_valid,
            early_stop="normal",
            save_best=output_dir / "ckpt" / "text_host.pt",
            tensorboard=output_dir / "tb" / "text_host",
            each=valid_each,
            trigger="step",
        ),
    )

    TextTrainer(model=model_text_listings).train(
        dataset=dataset_text_listings_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
        valid=Validation(
            dataset=dataset_text_listings_valid,
            early_stop="normal",
            save_best=output_dir / "ckpt" / "text_listings.pt",
            tensorboard=output_dir / "tb" / "text_listings",
            each=valid_each,
            trigger="step",
        ),
    )

    # Generate synthetic data
    data_synth = model.generate(
        n_samples=data["host"].shape[0],
        batch_size=1024,
    )
    data_synth = model_text_host.generate(
        data=data_synth,
        batch_size=512,
    )
    data_synth = model_text_listings.generate(
        data=data_synth,
        batch_size=512,
    )

    synth_dir = output_dir / "synth"
    data_synth.to_csv(synth_dir, escapechar="\\")

    # Revert to the original form
    df_synth = postproc_data(data=data_synth).loc[:, df.columns]
    df_synth.to_csv(synth_dir / "airbnb.csv", index=False, escapechar="\\")

    # Compute and print PDF report
    report(
        data_train=data_train,
        data_test=data_test,
        data_synth=data_synth,
        path=output_dir / "report.pdf",
    )

    # Compute extra privacy stats and print some results
    privacy_stats = compute_privacy_stats(
        data_train=data_train,
        data_synth=data_synth,
    )
    privacy_stats_out = {
        t: {
            "privacy_score": ps.privacy_score,
            "privacy_score_std": ps.privacy_score_std,
            "%_points_at_risk": ps.risk * 100,
        }
        for t, ps in privacy_stats.items()
    }
    with open(output_dir / "privacy_stats.json", mode="w", encoding="utf-8") as f:
        json.dump(privacy_stats_out, f)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", type=Path, help="The directory were to find the 'airbnb' dataset")
    parser.add_argument("output_dir", type=Path, help="The output directory")
    parser.add_argument(
        "--n",
        "-n",
        type=int,
        default=1000,
        help="Training epochs (or steps if the --steps flag is used)",
    )
    parser.add_argument("--steps", "-s", action="store_true", help="Use steps instead of epochs")
    parser.add_argument("--valid-each", "-v", type=int, default=200, help="# steps between validations")
    parser.add_argument("--device", "-g", default=None, help="Training device")
    parser.add_argument("--memory", "-m", type=int, default=4096, help="Available memory (MB)")
    parser.add_argument(
        "--quick", "-q", action="store_true", help="Perform a quick test run, with reduced data and small models"
    )
    args = parser.parse_args()

    example_airbnb(
        data_dir=args.data_dir,
        output_dir=args.output_dir,
        n_epochs=None if args.steps else args.n,
        n_steps=args.n if args.steps else None,
        valid_each=args.valid_each,
        device=args.device,
        memory=args.memory,
        quick=args.quick,
    )
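
The script can then be run from the command line. Assuming it is saved as example_airbnb.py (the file name is arbitrary) and that airbnb.csv sits in a data/ directory, a quick test run and a full run with 2000 training steps might look like:

python example_airbnb.py data out --quick
python example_airbnb.py data out -n 2000 --steps --valid-each 200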