Movielens example

Setting up the data schema

from hazy_configurator import (
    CategoryType,
    DataSchema,
    DatetimeType,
    ForeignKeyType,
    GeoLocales,
    IdType,
    IncrementalIdSettings,
    IntType,
    LocationEntity,
    PostcodeType,
    TabularTable,
)

# Table names, shared by the TabularTable definitions and the foreign-key
# references in get_movielens_tables().
USER, MOVIE, RATINGS = "user", "movie", "ratings"


def get_movielens_tables():
    """Build the table definitions for the Movielens example.

    Returns:
        list: Three ``TabularTable`` objects, in this order: the movie
        table, the user table, and the ratings table whose two foreign
        keys reference the other two tables.
    """
    # Every column named here is declared as a plain CategoryType on the
    # movie table, in exactly this order.
    movie_category_cols = (
        "Action",
        "Adventure",
        "Animation",
        "Childrens",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film_Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci_Fi",
        "Thriller",
        "War",
        "Western",
    )

    movie_dtypes = [
        IdType(
            col="movie_id",
            settings=IncrementalIdSettings(start=1, increment=1),
            primary_key=True,
        ),
        DatetimeType(col="release_date", format="%Y-%m-%d"),
    ]
    movie_dtypes += [CategoryType(col=name) for name in movie_category_cols]

    movie_table = TabularTable(
        name=MOVIE,
        dtypes=movie_dtypes,
        drop_columns=[
            "movie_title",  # Free text not supported
            "IMDb_URL",  # URLs not supported
        ],
    )

    user_table = TabularTable(
        name=USER,
        dtypes=[
            IdType(
                col="user_id",
                settings=IncrementalIdSettings(start=1, increment=1),
                primary_key=True,
            ),
            IntType(col="age"),
            CategoryType(col="gender"),
            CategoryType(col="occupation"),
            # Same entity_id as the LocationEntity returned by
            # get_movielens_entities().
            PostcodeType(col="zip_code", entity_id=1),
        ],
    )

    # Both foreign keys are flagged primary_key=True.
    movie_ref = ForeignKeyType(
        col="movie_id", primary_key=True, ref=(MOVIE, "movie_id")
    )
    user_ref = ForeignKeyType(
        col="user_id", primary_key=True, ref=(USER, "user_id")
    )
    ratings_table = TabularTable(
        name=RATINGS,
        dtypes=[
            movie_ref,
            user_ref,
            CategoryType(col="rating"),
            DatetimeType(col="timestamp", format="%Y-%m-%d %H:%M:%S"),
        ],
    )

    return [movie_table, user_table, ratings_table]


def get_movielens_entities():
    """Return the entity definitions for the Movielens example.

    Returns:
        list: A single ``LocationEntity`` with ``entity_id=1``, 1000
        clusters and the ``en_US`` locale.
    """
    location_entity = LocationEntity(
        entity_id=1,
        num_clusters=1000,
        locales=[GeoLocales.en_US],
    )
    return [location_entity]


def get_movielens_tables_and_entities():
    """Return the Movielens tables and entities as a ``(tables, entities)`` tuple."""
    tables = get_movielens_tables()
    entities = get_movielens_entities()
    return tables, entities


def get_movielens_schema():
    """Combine the Movielens tables and entities into one ``DataSchema``.

    Returns:
        DataSchema: Built from ``get_movielens_tables()`` and
        ``get_movielens_entities()``.
    """
    tables = get_movielens_tables()
    entities = get_movielens_entities()
    return DataSchema(tables=tables, entities=entities)

A simple training config

from datetime import datetime

from hazy_configurator import DataLocationInput, EvaluationConfig, TrainingConfig
from hazy_configurator.examples.movielens.data_schema import get_movielens_schema

# Table names; each one also serves as the CSV file stem under data_path.
USER, MOVIE, RATINGS = "user", "movie", "ratings"


# Directory prefix for the input CSVs. The trailing slash matters because the
# file names are appended directly to this string.
data_path = "/path/to/movielens/"

# One input location per table, in the order user, movie, ratings; each file
# is "<data_path><table name>.csv".
data_input = [
    DataLocationInput(name=table_name, location=f"{data_path}{table_name}.csv")
    for table_name in (USER, MOVIE, RATINGS)
]


def movielens_basic_training_config():
    """Build a minimal training config for the Movielens example.

    Returns:
        TrainingConfig: Uses the Movielens schema, the module-level
        ``data_input`` locations, the current local time as ``created_at``,
        a default-constructed ``EvaluationConfig``, and
        ``"movielens_model.hmf"`` as the model output.
    """
    schema = get_movielens_schema()
    created = datetime.now()
    evaluation = EvaluationConfig()
    return TrainingConfig(
        data_schema=schema,
        model_output="movielens_model.hmf",
        data_input=data_input,
        created_at=created,
        evaluation=evaluation,
    )


if __name__ == "__main__":
    training_config = movielens_basic_training_config()

    # For writing config to JSON. The encoding is pinned to UTF-8 so the
    # output does not depend on the platform's default locale encoding.
    with open("training_config.json", "w", encoding="utf-8") as f:
        f.write(training_config.json(indent=4))

A more complex training config

from datetime import datetime

from hazy_configurator import (
    AdjacencyType,
    CrossTableMutualInformationSimilarityParams,
    DataLocationInput,
    DegreeDistributionSimilarityParams,
    EvalSampleParams,
    EvaluationConfig,
    HistogramSimilarityParams,
    ModelParameters,
    MultiTableTrainingParams,
    MutualInformationSimilarityParams,
    PrivBayesConfig,
    PrivBayesUnknownCombinationStrategyType,
    TrainingConfig,
)
from hazy_configurator.examples.movielens.data_schema import get_movielens_schema
from hazy_configurator.general_params.sample_generation_config import SampleParams

# Table names; each one also serves as the CSV file stem under data_path.
USER, MOVIE, RATINGS = "user", "movie", "ratings"


# Directory prefix for the input CSVs. The trailing slash matters because the
# file names are appended directly to this string.
data_path = "/path/to/movielens/"

# One input location per table, in the order user, movie, ratings; each file
# is "<data_path><table name>.csv".
data_input = [
    DataLocationInput(name=table_name, location=f"{data_path}{table_name}.csv")
    for table_name in (USER, MOVIE, RATINGS)
]


def movielens_complex_training_config():
    """Build a Movielens training config with explicit model, evaluation and sampling settings.

    Returns:
        TrainingConfig: Uses the Movielens schema, the module-level
        ``data_input`` locations and the current local time as
        ``created_at``, plus a PrivBayes generator, degree-preserving
        multi-table adjacency, four similarity metrics and explicit
        sample magnitudes.
    """
    schema = get_movielens_schema()
    created = datetime.now()

    # Model settings: generator first, then the multi-table parameters.
    generator = PrivBayesConfig(
        epsilon=0.001,
        n_parents=2,
        default_strategy=PrivBayesUnknownCombinationStrategyType.MARGINAL,
        n_bins=50,
        max_cat=100,
    )
    multi_table = MultiTableTrainingParams(
        adjacency_type=AdjacencyType.DEGREE_PRESERVING,
    )
    model_parameters = ModelParameters(
        generator=generator,
        multi_table=multi_table,
    )

    # Evaluation settings: the metric list and the evaluation sample size.
    metrics = [
        HistogramSimilarityParams(),
        MutualInformationSimilarityParams(),
        CrossTableMutualInformationSimilarityParams(),
        DegreeDistributionSimilarityParams(),
    ]
    evaluation = EvaluationConfig(
        metrics=metrics,
        eval_sample_params=EvalSampleParams(magnitude=0.5),
    )

    sample_params = SampleParams(magnitude=0.2)

    return TrainingConfig(
        data_schema=schema,
        created_at=created,
        model_output="movielens_model.hmf",
        data_input=data_input,
        model_parameters=model_parameters,
        evaluation=evaluation,
        sample_params=sample_params,
    )


if __name__ == "__main__":
    training_config = movielens_complex_training_config()

    # For writing config to JSON. The encoding is pinned to UTF-8 so the
    # output does not depend on the platform's default locale encoding.
    with open("training_config.json", "w", encoding="utf-8") as f:
        f.write(training_config.json(indent=4))