Berka example
Setting up the data schema
from hazy_configurator import (
CategoryType,
DataSchema,
DatetimeType,
FloatType,
ForeignKeyType,
IdType,
IncrementalIdSettings,
IntType,
MappedType,
NumericalIdSettings,
RealType,
ReferenceTable,
SequentialTable,
TabularTable,
)
# Names of the Berka tables, grouped by the kind of table that
# get_berka_tables() builds for each of them.

# Declared as a ReferenceTable.
DISTRICT = "district"

# Declared as TabularTable instances.
ACCOUNT = "account"
CARD = "card"
CLIENT = "client"
DISP = "disp"
LOAN = "loan"
ORDER = "order"

# Declared as a SequentialTable.
TRANS = "trans"
def get_berka_tables():
    """Build the table definitions of the Berka dataset.

    Returns:
        A list of eight table definitions in the order district, account,
        card, client, disp, loan, order, trans. The sequential ``trans``
        table is kept last because ``get_berka_schema`` drops it by slicing
        off the final element.
    """
    date_format = "%Y-%m-%d"

    def incremental_pk(col):
        # IdType primary key using IncrementalIdSettings(start=1, increment=1).
        return IdType(
            col=col,
            settings=IncrementalIdSettings(start=1, increment=1),
            primary_key=True,
        )

    def numerical_pk(col):
        # IdType primary key using NumericalIdSettings(length=12).
        return IdType(
            col=col,
            settings=NumericalIdSettings(length=12),
            primary_key=True,
        )

    def foreign_key(col, table):
        # Every foreign key here points at a column with the same name in
        # the referenced table.
        return ForeignKeyType(col=col, ref=(table, col))

    district = ReferenceTable(
        name=DISTRICT,
        dtypes=[
            RealType(col="district_id", primary_key=True),
            CategoryType(col="A2"),
            CategoryType(col="A3"),
            IntType(col="A4"),
            IntType(col="A5"),
            IntType(col="A6"),
            IntType(col="A7"),
            IntType(col="A8"),
            IntType(col="A9"),
            FloatType(col="A10"),
            IntType(col="A11"),
            FloatType(col="A12"),
            FloatType(col="A13"),
            IntType(col="A14"),
            FloatType(col="A15"),
            IntType(col="A16"),
        ],
    )

    account = TabularTable(
        name=ACCOUNT,
        dtypes=[
            incremental_pk("account_id"),
            foreign_key("district_id", DISTRICT),
            CategoryType(col="frequency"),
            DatetimeType(col="date", format=date_format),
        ],
    )

    card = TabularTable(
        name=CARD,
        dtypes=[
            incremental_pk("card_id"),
            foreign_key("disp_id", DISP),
            CategoryType(col="type"),
            DatetimeType(col="issued", format=date_format),
        ],
    )

    client = TabularTable(
        name=CLIENT,
        dtypes=[
            incremental_pk("client_id"),
            foreign_key("district_id", DISTRICT),
            CategoryType(col="gender"),
            DatetimeType(col="birth_date", format=date_format),
        ],
    )

    disp = TabularTable(
        name=DISP,
        dtypes=[
            incremental_pk("disp_id"),
            foreign_key("client_id", CLIENT),
            foreign_key("account_id", ACCOUNT),
            CategoryType(col="type"),
        ],
    )

    loan = TabularTable(
        name=LOAN,
        dtypes=[
            numerical_pk("loan_id"),
            foreign_key("account_id", ACCOUNT),
            CategoryType(col="status"),
            DatetimeType(col="date", format=date_format),
            IntType(col="amount"),
            IntType(col="payments"),
            IntType(col="duration"),
        ],
    )

    order = TabularTable(
        name=ORDER,
        dtypes=[
            numerical_pk("order_id"),
            foreign_key("account_id", ACCOUNT),
            CategoryType(col="bank_to"),
            MappedType(
                col="account_to",
                settings=NumericalIdSettings(length=8),
            ),
            CategoryType(col="k_symbol"),
            FloatType(col="amount"),
        ],
    )

    # Transactions are sequences keyed by account and sorted by date.
    trans = SequentialTable(
        name=TRANS,
        sort_by=["date"],
        seq_id="account_id",
        dtypes=[
            numerical_pk("trans_id"),
            foreign_key("account_id", ACCOUNT),
            MappedType(
                col="account",
                settings=NumericalIdSettings(length=8),
            ),
            CategoryType(col="type"),
            CategoryType(col="operation"),
            CategoryType(col="k_symbol"),
            CategoryType(col="bank"),
            DatetimeType(col="date", format=date_format),
            IntType(col="amount"),
            IntType(col="balance"),
        ],
    )

    return [district, account, card, client, disp, loan, order, trans]
def get_berka_schema(trans_table=True):
    """Wrap the Berka table definitions in a ``DataSchema``.

    Args:
        trans_table: When false, the last table returned by
            ``get_berka_tables`` (the sequential ``trans`` table) is
            left out of the schema.

    Returns:
        A ``DataSchema`` built from the selected tables.
    """
    tables = get_berka_tables()
    if not trans_table:
        # The transaction table is the final entry of the list.
        tables = tables[:-1]
    return DataSchema(tables=tables)
A simple training config
from datetime import datetime
from hazy_configurator import DataLocationInput, EvaluationConfig, TrainingConfig
from hazy_configurator.examples.berka.data_schema import get_berka_schema
# Table names of the Berka dataset.
ACCOUNT = "account"
CARD = "card"
CLIENT = "client"
DISP = "disp"
DISTRICT = "district"
LOAN = "loan"
ORDER = "order"
TRANS = "trans"

# Directory that holds one "<table>.csv" file per table.
data_path = "/path/to/berka/"

# One input location per table. TRANS stays last because get_data_input()
# drops the final entry when the transaction table is not wanted.
data_input = [
    DataLocationInput(name=table, location=data_path + f"{table}.csv")
    for table in (ACCOUNT, CLIENT, CARD, DISP, DISTRICT, LOAN, ORDER, TRANS)
]
def get_data_input(trans_table=True):
    """Return the data locations to use for training.

    Args:
        trans_table: When true, every entry of the module-level
            ``data_input`` is returned; otherwise the last entry (the
            ``trans`` table) is left out.

    Returns:
        A new list on every call, so callers may modify the result without
        altering the module-level ``data_input``.
    """
    # Slice in both branches: previously the full-table branch handed out the
    # module-level list itself while the other branch returned a copy.
    return data_input[:] if trans_table else data_input[:-1]
def berka_basic_training_config(trans_table=True):
    """Assemble a minimal ``TrainingConfig`` for the Berka dataset.

    Args:
        trans_table: Forwarded to ``get_berka_schema`` and
            ``get_data_input`` so the schema and the data locations agree
            on whether the transaction table is included.

    Returns:
        A ``TrainingConfig`` with a default ``EvaluationConfig`` and
        ``created_at`` set to the current local time.
    """
    schema = get_berka_schema(trans_table)
    locations = get_data_input(trans_table)
    return TrainingConfig(
        data_schema=schema,
        model_output="berka/fixtures/model.hmf",
        data_input=locations,
        created_at=datetime.now(),
        evaluation=EvaluationConfig(),
    )
if __name__ == "__main__":
    # Build the basic config and write it out as indented JSON.
    training_config = berka_basic_training_config()
    # Pass the encoding explicitly so the file does not depend on the
    # platform's locale default.
    with open("training_config.json", "w", encoding="utf-8") as f:
        f.write(training_config.json(indent=4))
A more complex training config
from datetime import datetime
from hazy_configurator import (
AdjacencyType,
CrossTableMutualInformationSimilarityParams,
DataLocationInput,
DegreeDistributionSimilarityParams,
EvalSampleParams,
EvaluationConfig,
HistogramSimilarityParams,
ModelParameters,
MultiTableTrainingParams,
MutualInformationSimilarityParams,
PrivBayesConfig,
PrivBayesUnknownCombinationStrategyType,
SequentialTrainingParams,
TrainingConfig,
)
from hazy_configurator.examples.berka.data_schema import get_berka_schema
# Table names of the Berka dataset.
ACCOUNT = "account"
CARD = "card"
CLIENT = "client"
DISP = "disp"
DISTRICT = "district"
LOAN = "loan"
ORDER = "order"
TRANS = "trans"

# Directory that holds one "<table>.csv" file per table.
data_path = "/path/to/berka/"

# One input location per table, in the order listed here.
data_input = [
    DataLocationInput(name=table, location=data_path + f"{table}.csv")
    for table in (ACCOUNT, CLIENT, CARD, DISP, DISTRICT, LOAN, ORDER, TRANS)
]
def berka_complex_training_config():
    """Assemble a ``TrainingConfig`` with explicit model and evaluation settings.

    Returns:
        A ``TrainingConfig`` for the full Berka schema that sets a
        ``PrivBayesConfig`` generator, multi-table and sequential training
        parameters, four similarity metrics and evaluation sample
        parameters.
    """
    schema = get_berka_schema()
    created = datetime.now()

    generator = PrivBayesConfig(
        epsilon=0.001,
        n_parents=2,
        default_strategy=PrivBayesUnknownCombinationStrategyType.MARGINAL,
        n_bins=50,
        max_cat=100,
    )
    multi_table = MultiTableTrainingParams(
        adjacency_type=AdjacencyType.DEGREE_PRESERVING,
    )
    sequential = SequentialTrainingParams(window_size=6, n_predict=2)
    model_parameters = ModelParameters(
        generator=generator,
        multi_table=multi_table,
        sequential=sequential,
    )

    metrics = [
        HistogramSimilarityParams(),
        MutualInformationSimilarityParams(),
        CrossTableMutualInformationSimilarityParams(),
        DegreeDistributionSimilarityParams(),
    ]
    evaluation = EvaluationConfig(
        metrics=metrics,
        eval_sample_params=EvalSampleParams(magnitude=0.5),
    )

    return TrainingConfig(
        data_schema=schema,
        created_at=created,
        model_output="berka/fixtures/model.hmf",
        data_input=data_input,
        model_parameters=model_parameters,
        evaluation=evaluation,
    )
if __name__ == "__main__":
    # Build the complex config and write it out as indented JSON.
    training_config = berka_complex_training_config()
    # Pass the encoding explicitly so the file does not depend on the
    # platform's locale default.
    with open("training_config.json", "w", encoding="utf-8") as f:
        f.write(training_config.json(indent=4))