MovieLens example
Setting up the data schema
from hazy_configurator import (
CategoryType,
DataSchema,
DatetimeType,
ForeignKeyType,
GeoLocales,
IdType,
IncrementalIdSettings,
IntType,
LocationEntity,
PostcodeType,
TabularTable,
)
# Table names shared by every table definition and foreign-key reference below.
USER, MOVIE, RATINGS = "user", "movie", "ratings"
def get_movielens_tables():
    """Build the table definitions used by the MovieLens example.

    Returns:
        A list of three ``TabularTable`` objects, in this order: the movie
        table, the user table and the ratings table.
    """
    # Movie columns that are all modelled as plain categories (by their names
    # these look like per-genre flags). Order matches the original listing.
    category_columns = (
        "Action",
        "Adventure",
        "Animation",
        "Childrens",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film_Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci_Fi",
        "Thriller",
        "War",
        "Western",
    )

    movie_table = TabularTable(
        name=MOVIE,
        dtypes=[
            IdType(
                col="movie_id",
                settings=IncrementalIdSettings(start=1, increment=1),
                primary_key=True,
            ),
            DatetimeType(col="release_date", format="%Y-%m-%d"),
            *(CategoryType(col=column) for column in category_columns),
        ],
        drop_columns=[
            "movie_title",  # Free text not supported
            "IMDb_URL",  # URLs not supported
        ],
    )

    user_table = TabularTable(
        name=USER,
        dtypes=[
            IdType(
                col="user_id",
                settings=IncrementalIdSettings(start=1, increment=1),
                primary_key=True,
            ),
            IntType(col="age"),
            CategoryType(col="gender"),
            CategoryType(col="occupation"),
            # entity_id=1 is the same id given to the LocationEntity in
            # get_movielens_entities() -- presumably that is what links them.
            PostcodeType(
                col="zip_code",
                entity_id=1,
            ),
        ],
    )

    # Both foreign keys are flagged primary_key=True in this table.
    ratings_table = TabularTable(
        name=RATINGS,
        dtypes=[
            ForeignKeyType(
                col="movie_id", primary_key=True, ref=(MOVIE, "movie_id")
            ),
            ForeignKeyType(col="user_id", primary_key=True, ref=(USER, "user_id")),
            CategoryType(col="rating"),
            DatetimeType(col="timestamp", format="%Y-%m-%d %H:%M:%S"),
        ],
    )

    return [movie_table, user_table, ratings_table]
def get_movielens_entities():
    """Build the entity definitions used by the MovieLens example.

    Returns:
        A one-element list holding a ``LocationEntity`` configured with
        ``entity_id=1``, ``num_clusters=1000`` and the ``en_US`` locale.
    """
    location_entity = LocationEntity(
        entity_id=1,
        num_clusters=1000,
        locales=[GeoLocales.en_US],
    )
    return [location_entity]
def get_movielens_tables_and_entities():
    """Return the MovieLens tables and entities together.

    Returns:
        A ``(tables, entities)`` tuple built from ``get_movielens_tables()``
        and ``get_movielens_entities()``.
    """
    tables = get_movielens_tables()
    entities = get_movielens_entities()
    return tables, entities
def get_movielens_schema():
    """Assemble the full ``DataSchema`` for the MovieLens example.

    Returns:
        A ``DataSchema`` whose ``tables`` come from ``get_movielens_tables()``
        and whose ``entities`` come from ``get_movielens_entities()``.
    """
    tables = get_movielens_tables()
    entities = get_movielens_entities()
    return DataSchema(tables=tables, entities=entities)
A simple training config
from datetime import datetime
from hazy_configurator import DataLocationInput, EvaluationConfig, TrainingConfig
from hazy_configurator.examples.movielens.data_schema import get_movielens_schema
# Table names; each one is also the stem of its CSV file under ``data_path``.
USER, MOVIE, RATINGS = "user", "movie", "ratings"

# Directory holding the CSV files (placeholder path; keep the trailing slash,
# since file names are appended to it directly).
data_path = "/path/to/movielens/"

# One input location per table: <data_path><table name>.csv
data_input = [
    DataLocationInput(name=table, location=f"{data_path}{table}.csv")
    for table in (USER, MOVIE, RATINGS)
]
def movielens_basic_training_config():
    """Build a minimal ``TrainingConfig`` for the MovieLens example.

    Uses the MovieLens schema, the module-level ``data_input`` locations and
    an ``EvaluationConfig`` constructed with no arguments.

    Returns:
        A ``TrainingConfig`` whose model output is ``movielens_model.hmf``
        and whose ``created_at`` is the current local time.
    """
    schema = get_movielens_schema()
    created = datetime.now()
    evaluation = EvaluationConfig()
    return TrainingConfig(
        data_schema=schema,
        model_output="movielens_model.hmf",
        data_input=data_input,
        created_at=created,
        evaluation=evaluation,
    )
if __name__ == "__main__":
    # Build the basic config and dump it as indented JSON in the working
    # directory.
    training_config = movielens_basic_training_config()
    serialized = training_config.json(indent=4)
    with open("training_config.json", "w") as config_file:
        config_file.write(serialized)
A more complex training config
from datetime import datetime
from hazy_configurator import (
AdjacencyType,
CrossTableMutualInformationSimilarityParams,
DataLocationInput,
DegreeDistributionSimilarityParams,
EvalSampleParams,
EvaluationConfig,
HistogramSimilarityParams,
ModelParameters,
MultiTableTrainingParams,
MutualInformationSimilarityParams,
PrivBayesConfig,
PrivBayesUnknownCombinationStrategyType,
TrainingConfig,
)
from hazy_configurator.examples.movielens.data_schema import get_movielens_schema
from hazy_configurator.general_params.sample_generation_config import SampleParams
# Table names; each one is also the stem of its CSV file under ``data_path``.
USER, MOVIE, RATINGS = "user", "movie", "ratings"

# Directory holding the CSV files (placeholder path; keep the trailing slash,
# since file names are appended to it directly).
data_path = "/path/to/movielens/"

# One input location per table: <data_path><table name>.csv
data_input = [
    DataLocationInput(name=table, location=f"{data_path}{table}.csv")
    for table in (USER, MOVIE, RATINGS)
]
def movielens_complex_training_config():
    """Build a ``TrainingConfig`` with explicit model and evaluation settings.

    On top of the MovieLens schema and the module-level ``data_input``, this
    sets a ``PrivBayesConfig`` generator, degree-preserving multi-table
    adjacency, four similarity metrics and explicit sample magnitudes.

    Returns:
        A ``TrainingConfig`` whose model output is ``movielens_model.hmf``
        and whose ``created_at`` is the current local time.
    """
    schema = get_movielens_schema()
    created = datetime.now()

    # Generator and multi-table settings.
    generator = PrivBayesConfig(
        epsilon=0.001,
        n_parents=2,
        default_strategy=PrivBayesUnknownCombinationStrategyType.MARGINAL,
        n_bins=50,
        max_cat=100,
    )
    multi_table = MultiTableTrainingParams(
        adjacency_type=AdjacencyType.DEGREE_PRESERVING,
    )
    model_parameters = ModelParameters(
        generator=generator,
        multi_table=multi_table,
    )

    # Evaluation: every metric is constructed with no arguments.
    metrics = [
        HistogramSimilarityParams(),
        MutualInformationSimilarityParams(),
        CrossTableMutualInformationSimilarityParams(),
        DegreeDistributionSimilarityParams(),
    ]
    evaluation = EvaluationConfig(
        metrics=metrics,
        eval_sample_params=EvalSampleParams(magnitude=0.5),
    )

    sample_params = SampleParams(magnitude=0.2)

    return TrainingConfig(
        data_schema=schema,
        created_at=created,
        model_output="movielens_model.hmf",
        data_input=data_input,
        model_parameters=model_parameters,
        evaluation=evaluation,
        sample_params=sample_params,
    )
if __name__ == "__main__":
    # Build the complex config and dump it as indented JSON in the working
    # directory.
    training_config = movielens_complex_training_config()
    serialized = training_config.json(indent=4)
    with open("training_config.json", "w") as config_file:
        config_file.write(serialized)