Berka example

Setting up the data schema

from hazy_configurator import (
    CategoryType,
    DataSchema,
    DatetimeType,
    FloatType,
    ForeignKeyType,
    IdType,
    IncrementalIdSettings,
    IntType,
    MappedType,
    NumericalIdSettings,
    RealType,
    ReferenceTable,
    SequentialTable,
    TabularTable,
)

# Berka table names, grouped by the table class each one is declared with
# in get_berka_tables().

# ReferenceTable
DISTRICT = "district"

# TabularTable
ACCOUNT = "account"
CARD = "card"
CLIENT = "client"
DISP = "disp"
LOAN = "loan"
ORDER = "order"

# SequentialTable
TRANS = "trans"


def _incremental_pk(col):
    """Return a primary-key IdType for ``col`` using IncrementalIdSettings(start=1, increment=1)."""
    return IdType(
        col=col,
        settings=IncrementalIdSettings(start=1, increment=1),
        primary_key=True,
    )


def _numerical_pk(col):
    """Return a primary-key IdType for ``col`` using NumericalIdSettings(length=12)."""
    return IdType(
        col=col,
        settings=NumericalIdSettings(length=12),
        primary_key=True,
    )


def get_berka_tables():
    """Build the table definitions for the eight Berka tables.

    Returns:
        A list with one table object per Berka table, in the order
        district, account, card, client, disp, loan, order, trans.
        ``get_berka_schema`` removes the trans table by dropping the last
        element, so ``trans`` has to stay at the end of the list.
    """
    date_format = "%Y-%m-%d"
    district_ref = (DISTRICT, "district_id")
    account_ref = (ACCOUNT, "account_id")

    # (dtype class, column name) for every non-key column of the district table.
    district_columns = (
        (CategoryType, "A2"),
        (CategoryType, "A3"),
        (IntType, "A4"),
        (IntType, "A5"),
        (IntType, "A6"),
        (IntType, "A7"),
        (IntType, "A8"),
        (IntType, "A9"),
        (FloatType, "A10"),
        (IntType, "A11"),
        (FloatType, "A12"),
        (FloatType, "A13"),
        (IntType, "A14"),
        (FloatType, "A15"),
        (IntType, "A16"),
    )
    district = ReferenceTable(
        name=DISTRICT,
        dtypes=[
            RealType(col="district_id", primary_key=True),
            *(dtype(col=column) for dtype, column in district_columns),
        ],
    )

    account = TabularTable(
        name=ACCOUNT,
        dtypes=[
            _incremental_pk("account_id"),
            ForeignKeyType(col="district_id", ref=district_ref),
            CategoryType(col="frequency"),
            DatetimeType(col="date", format=date_format),
        ],
    )

    card = TabularTable(
        name=CARD,
        dtypes=[
            _incremental_pk("card_id"),
            ForeignKeyType(col="disp_id", ref=(DISP, "disp_id")),
            CategoryType(col="type"),
            DatetimeType(col="issued", format=date_format),
        ],
    )

    client = TabularTable(
        name=CLIENT,
        dtypes=[
            _incremental_pk("client_id"),
            ForeignKeyType(col="district_id", ref=district_ref),
            CategoryType(col="gender"),
            DatetimeType(col="birth_date", format=date_format),
        ],
    )

    disp = TabularTable(
        name=DISP,
        dtypes=[
            _incremental_pk("disp_id"),
            ForeignKeyType(col="client_id", ref=(CLIENT, "client_id")),
            ForeignKeyType(col="account_id", ref=account_ref),
            CategoryType(col="type"),
        ],
    )

    loan = TabularTable(
        name=LOAN,
        dtypes=[
            _numerical_pk("loan_id"),
            ForeignKeyType(col="account_id", ref=account_ref),
            CategoryType(col="status"),
            DatetimeType(col="date", format=date_format),
            IntType(col="amount"),
            IntType(col="payments"),
            IntType(col="duration"),
        ],
    )

    order = TabularTable(
        name=ORDER,
        dtypes=[
            _numerical_pk("order_id"),
            ForeignKeyType(col="account_id", ref=account_ref),
            CategoryType(col="bank_to"),
            MappedType(col="account_to", settings=NumericalIdSettings(length=8)),
            CategoryType(col="k_symbol"),
            FloatType(col="amount"),
        ],
    )

    # The only sequential table: rows are grouped by account_id and sorted by date.
    trans = SequentialTable(
        name=TRANS,
        sort_by=["date"],
        seq_id="account_id",
        dtypes=[
            _numerical_pk("trans_id"),
            ForeignKeyType(col="account_id", ref=account_ref),
            MappedType(col="account", settings=NumericalIdSettings(length=8)),
            CategoryType(col="type"),
            CategoryType(col="operation"),
            CategoryType(col="k_symbol"),
            CategoryType(col="bank"),
            DatetimeType(col="date", format=date_format),
            IntType(col="amount"),
            IntType(col="balance"),
        ],
    )

    return [district, account, card, client, disp, loan, order, trans]


def get_berka_schema(trans_table=True):
    """Wrap the Berka table definitions in a DataSchema.

    Args:
        trans_table: When true, every table from ``get_berka_tables`` is
            included. When false, the last table in that list (the ``trans``
            table) is left out.

    Returns:
        A DataSchema built from the selected tables.
    """
    tables = get_berka_tables()
    if not trans_table:
        # The trans table is the final entry of get_berka_tables().
        tables = tables[:-1]
    return DataSchema(tables=tables)

A simple training config

from datetime import datetime

from hazy_configurator import DataLocationInput, EvaluationConfig, TrainingConfig
from hazy_configurator.examples.berka.data_schema import get_berka_schema

# Table names; each one is also the stem of that table's CSV file name.
ACCOUNT = "account"
CARD = "card"
CLIENT = "client"
DISP = "disp"
DISTRICT = "district"
LOAN = "loan"
ORDER = "order"
TRANS = "trans"

data_path = "/path/to/berka/"

# One DataLocationInput per table, located at <data_path><table>.csv.
# TRANS must stay last: get_data_input() drops it by slicing off the final entry.
data_input = [
    DataLocationInput(name=table, location=f"{data_path}{table}.csv")
    for table in (ACCOUNT, CLIENT, CARD, DISP, DISTRICT, LOAN, ORDER, TRANS)
]


def get_data_input(trans_table=True):
    """Return the data locations to train on, always as a new list.

    Args:
        trans_table: When true, every entry of the module-level
            ``data_input`` list is included. When false, its final entry
            (the ``trans`` table) is left out.

    Returns:
        A new list holding the selected entries of ``data_input``. The list
        is a shallow copy, so adding or removing items on the result never
        changes the module-level ``data_input``.
    """
    # Slice in both branches so the caller never receives the shared
    # module-level list itself.
    if trans_table:
        return data_input[:]
    return data_input[:-1]


def berka_basic_training_config(trans_table=True):
    """Assemble a TrainingConfig for the Berka data with no model overrides.

    Args:
        trans_table: Forwarded to both ``get_berka_schema`` and
            ``get_data_input``, so the schema and the data inputs include or
            leave out the ``trans`` table together.

    Returns:
        A TrainingConfig stamped with the current local time, using an
        EvaluationConfig constructed with no arguments.
    """
    schema = get_berka_schema(trans_table)
    inputs = get_data_input(trans_table)
    return TrainingConfig(
        data_schema=schema,
        model_output="berka/fixtures/model.hmf",
        data_input=inputs,
        created_at=datetime.now(),
        evaluation=EvaluationConfig(),
    )


if __name__ == "__main__":
    training_config = berka_basic_training_config()

    # Write the config as JSON into the current working directory.
    # The encoding is pinned to UTF-8 so the file's bytes do not depend on
    # the platform's locale default.
    with open("training_config.json", "w", encoding="utf-8") as config_file:
        config_file.write(training_config.json(indent=4))

A more complex training config

from datetime import datetime

from hazy_configurator import (
    AdjacencyType,
    CrossTableMutualInformationSimilarityParams,
    DataLocationInput,
    DegreeDistributionSimilarityParams,
    EvalSampleParams,
    EvaluationConfig,
    HistogramSimilarityParams,
    ModelParameters,
    MultiTableTrainingParams,
    MutualInformationSimilarityParams,
    PrivBayesConfig,
    PrivBayesUnknownCombinationStrategyType,
    SequentialTrainingParams,
    TrainingConfig,
)
from hazy_configurator.examples.berka.data_schema import get_berka_schema

# Table names; each one is also the stem of that table's CSV file name.
ACCOUNT = "account"
CARD = "card"
CLIENT = "client"
DISP = "disp"
DISTRICT = "district"
LOAN = "loan"
ORDER = "order"
TRANS = "trans"

data_path = "/path/to/berka/"

# One DataLocationInput per table, located at <data_path><table>.csv.
data_input = [
    DataLocationInput(name=table, location=f"{data_path}{table}.csv")
    for table in (ACCOUNT, CLIENT, CARD, DISP, DISTRICT, LOAN, ORDER, TRANS)
]


def berka_complex_training_config():
    """Assemble a TrainingConfig for Berka with explicit model and evaluation settings.

    The config covers all tables (``get_berka_schema()`` is called with its
    default) and the full module-level ``data_input`` list. Unlike a bare
    TrainingConfig it sets the generator, multi-table and sequential
    parameters and names the evaluation metrics explicitly.

    Returns:
        A TrainingConfig stamped with the current local time.
    """
    schema = get_berka_schema()
    created = datetime.now()

    # Generator settings.
    generator = PrivBayesConfig(
        epsilon=0.001,
        n_parents=2,
        default_strategy=PrivBayesUnknownCombinationStrategyType.MARGINAL,
        n_bins=50,
        max_cat=100,
    )
    multi_table = MultiTableTrainingParams(
        adjacency_type=AdjacencyType.DEGREE_PRESERVING,
    )
    sequential = SequentialTrainingParams(window_size=6, n_predict=2)
    model_parameters = ModelParameters(
        generator=generator,
        multi_table=multi_table,
        sequential=sequential,
    )

    # Evaluation: four similarity metrics, each with its default parameters.
    metrics = [
        HistogramSimilarityParams(),
        MutualInformationSimilarityParams(),
        CrossTableMutualInformationSimilarityParams(),
        DegreeDistributionSimilarityParams(),
    ]
    evaluation = EvaluationConfig(
        metrics=metrics,
        eval_sample_params=EvalSampleParams(magnitude=0.5),
    )

    return TrainingConfig(
        data_schema=schema,
        created_at=created,
        model_output="berka/fixtures/model.hmf",
        data_input=data_input,
        model_parameters=model_parameters,
        evaluation=evaluation,
    )


if __name__ == "__main__":
    training_config = berka_complex_training_config()

    # Write the config as JSON into the current working directory.
    # The encoding is pinned to UTF-8 so the file's bytes do not depend on
    # the platform's locale default.
    with open("training_config.json", "w", encoding="utf-8") as config_file:
        config_file.write(training_config.json(indent=4))