Recommendation systems are integral to the modern Internet. Hardly any digital experience today comes without some form of personalization. There is even the rise of the term ‘hyper-personalization’: using AI to surface the most relevant items for each user by exploiting our ‘digital breadcrumbs’. In technical terms, this means feeding various datasets (clickstream, ratings, text, images, the items themselves, etc.) to machine learning algorithms in as near real-time as possible to offer a dynamic recommendation experience.
In my previous post, I introduced MLOps and how Kedro can be your framework of choice for implementing modular, maintainable, and reproducible pipelines. Recommenders benefit from MLOps because they demand rapid experimentation, continuous retraining, and frequent deployment of models. Successful companies even use advanced A/B testing techniques to deploy several models at once (see the Netflix Experimentation Platform). So I think it will be exciting to share my experience in bringing a recommender system to life, from engineering and training all the way to serving.
To demo the workflow, I will be using MovieLens 10M, a modest-sized dataset by recommender standards. It is also rather dense as interaction data goes, but it makes for a good exercise, and it contains tags, which can be used as item features. The end-to-end process is unique in that it isn't just serving a scikit-learn model from MLFlow; there are some customizations to make. But before that, let's begin with the user API requirements.
Note that I am assuming the reader is already familiar with collaborative filtering and nearest neighbor indexing.
If you’d like to skip everything instead, head on over here for my code.
The User API
We will handle two cases: the incoming user is already part of the system (known user), or the incoming user is unknown (cold start). To make things more interesting, we will simulate the case where our API responds to a live browsing session. Hence, we will receive item ids corresponding to what the user has browsed in the previous hour or so.
- Known User Case
- Our user is already logged in and has browsed several items.
- We want to recommend the closest items to what the user browsed.
- Our API receives user id and item id pairs
- Cold-Start Case
- It’s our first time seeing this user (or they have not logged in)
- Same as above, we recommend the closest items
- Our API receives only item ids
We could also have a third case, where we receive only user ids; there, we could serve the candidates generated during training. That's a good exercise for the reader to implement!
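To make the contract concrete, here is a rough sketch of what the request payloads could look like. The field names and ids below are illustrative assumptions, not the exact schema used by the served model.

```python
# Hypothetical request payloads for the two main cases (field names are illustrative).

# Known user: a user id plus the item ids browsed in the last hour or so.
known_user_request = {
    "users": [2241],
    "items": [101, 202],
}

# Cold start: no usable user id, item ids only.
cold_start_request = {
    "items": [101, 202],
}
```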
Workflow Overview
Kedro is our workflow framework. Algorithm-wise, we will use LightFM with WARP loss, and the resulting item embeddings will be used to find the closest items. We use Optuna for hyperparameter optimization. For a fast indexing service, we use ANNOY to build our approximate nearest neighbor index. Finally, we use MLFlow to track experiments, store artifacts, and serve our model; to integrate it with Kedro, we use kedro-mlflow.
The following diagram, generated from kedro-viz, illustrates the workflow. It is deliberately simple, as Netflix-scale systems can be rather daunting! The green boxes are comments I pasted on.
Prep Ratings & Prep Item Features
I convert the ratings and item features from pandas DataFrames to a sparse format. The mappings from user and item IDs to their matrix indices are stored (‘cid’ refers to item ids and ‘rid’ to user ids), along with the default ranking and the names mapping. Visualizing this phase:
In Kedro’s catalog.yaml, I’ve defined all the mappings (dict types) as pickled objects. The ranking is a CSV file.
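As a rough sketch of the idea behind the prep nodes (not the exact prep_sparse_ratings implementation, and assuming the standard MovieLens column names), the conversion could look like this:

```python
import numpy as np
import pandas as pd
from scipy import sparse


def prep_sparse(ratings: pd.DataFrame):
    """Convert a ratings DataFrame into a sparse matrix plus id <-> index mappings."""
    # 'rid' = raw user ids, 'cid' = raw item ids, mapped to contiguous row/column indices.
    rid_to_idx = {rid: i for i, rid in enumerate(ratings["userId"].unique())}
    cid_to_idx = {cid: i for i, cid in enumerate(ratings["movieId"].unique())}
    idx_to_rid = {i: rid for rid, i in rid_to_idx.items()}
    idx_to_cid = {i: cid for cid, i in cid_to_idx.items()}

    rows = ratings["userId"].map(rid_to_idx).to_numpy()
    cols = ratings["movieId"].map(cid_to_idx).to_numpy()
    # Treat ratings as implicit feedback (an assumption here); weights could be used instead.
    interactions = sparse.coo_matrix(
        (np.ones(len(ratings), dtype=np.float32), (rows, cols)),
        shape=(len(rid_to_idx), len(cid_to_idx)),
    )
    return interactions, rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid
```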
Factorization
Following the data engineering phase, we train our model. We split our data into training and test sets and run LightFM, then produce our embeddings, biases, and model metrics. Lastly, we sample some recommendations. In a real-world scenario, we could also include sanity checks and visualizations here to automate as much as possible.
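A minimal sketch of this step, with illustrative hyperparameters and the catalog names from the pipeline (train, test, sp_item_feats), might look like:

```python
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# train, test: sparse interaction matrices; sp_item_feats: sparse item-feature matrix.
model = LightFM(no_components=64, loss="warp", learning_rate=0.05, random_state=42)
model.fit(train, item_features=sp_item_feats, epochs=10, num_threads=4)

# Precision@k on the train and test sets, as seen in the run log further below.
train_prec = precision_at_k(model, train, item_features=sp_item_feats, k=10).mean()
test_prec = precision_at_k(model, test, item_features=sp_item_feats, k=10).mean()

# Embeddings and biases that feed the indexing phase.
item_biases, item_factors = model.get_item_representations(features=sp_item_feats)
user_biases, user_factors = model.get_user_representations()
```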
As mentioned earlier, Optuna is used for hyperparameter optimization. To integrate with MLFlow, we use callbacks. This actually results in a new MLFlow experiment, separate from the one created by kedro-mlflow. The original experiment, however, will store the best model's parameters, plus all the artifacts shown later.
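A rough sketch of how the study and the callback could be wired up (the search space, metric name, and trial count are illustrative, and train, test, sp_item_feats are the matrices from the previous phase):

```python
import optuna
from optuna.integration import MLflowCallback
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# Logs each trial's parameters and metric to MLFlow as a separate experiment.
mlflow_cb = MLflowCallback(metric_name="test_precision_at_k")


def objective(trial: optuna.Trial) -> float:
    # Search over the embedding dimensionality (range is illustrative).
    n_components = trial.suggest_int("n_components", 16, 128)
    model = LightFM(no_components=n_components, loss="warp")
    model.fit(train, item_features=sp_item_feats, epochs=10, num_threads=4)
    return precision_at_k(model, test, item_features=sp_item_feats, k=10).mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, callbacks=[mlflow_cb])
```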
Indexing
Our final phase is to index our nearest neighbors using the item factors. This gives us a fast way to serve the queried items. In the workflow below, validate_index is the node that immediately tests the created index as part of the training run.
This is where we have our first Kedro customization. Objects from the ANNOY library cannot be pickled, so we need to customize the loading and saving functionality for the Data Catalog. We implement Kedro's AbstractDataset to handle this.
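A rough sketch of what such a custom dataset can look like is below; the class name and constructor arguments are illustrative rather than the exact implementation in the repo, and older Kedro versions spell the base class AbstractDataSet.

```python
from pathlib import Path

from annoy import AnnoyIndex
from kedro.io import AbstractDataset


class AnnoyIndexDataset(AbstractDataset):
    """Persist an Annoy index with its native save()/load() instead of pickle."""

    def __init__(self, filepath: str, embedding_length: int, metric: str = "angular"):
        self._filepath = Path(filepath)
        self._embedding_length = embedding_length
        self._metric = metric

    def _load(self) -> AnnoyIndex:
        index = AnnoyIndex(self._embedding_length, self._metric)
        index.load(str(self._filepath))
        return index

    def _save(self, annoy_index: AnnoyIndex) -> None:
        annoy_index.save(str(self._filepath))

    def _describe(self) -> dict:
        return {
            "filepath": str(self._filepath),
            "embedding_length": self._embedding_length,
            "metric": self._metric,
        }
```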
Upload to MLFlow
If we get everything right, we then upload every artifact we generated to MLFlow, since it will be used for model serving. To keep things simple, we will stay within the local filesystem. We use MlflowModelLoggerDataSet to save the artifacts for us, and we define another custom class, KedroMLFlowLightFM, following MLFlow's custom model standard. This class defines how MLFlow will load the model and process the input.
Several things happen here. In the first snippet, we define the files uploaded to the MLFlow artifact repository. It looks rather ugly, since it is full of temporary files used as a staging area for the upload process; I would be very interested to know if you have a better way to do this. In the second snippet, we define KedroMLFlowLightFM, which tells MLFlow how to store and serve the model. This is very important, since it determines how mlflow models serve will work.
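For reference, here is a heavily simplified sketch of what a custom model under MLFlow's pyfunc standard looks like; the artifact keys, index parameters, and input handling below are assumptions, not the actual KedroMLFlowLightFM code.

```python
import pickle

import mlflow.pyfunc
from annoy import AnnoyIndex


class RecoModel(mlflow.pyfunc.PythonModel):
    """Simplified stand-in for a KedroMLFlowLightFM-style wrapper."""

    def load_context(self, context):
        # Keys must match the artifacts dict passed to mlflow.pyfunc.log_model().
        with open(context.artifacts["idx_to_names"], "rb") as f:
            self.idx_to_names = pickle.load(f)
        self.index = AnnoyIndex(64, "angular")  # embedding size is illustrative
        self.index.load(context.artifacts["annoy_index"])

    def predict(self, context, model_input):
        # model_input: assumed here to be a DataFrame with an 'item_idx' column
        # holding the browsed items' indices.
        neighbors = set()
        for idx in model_input["item_idx"]:
            neighbors.update(self.index.get_nns_by_item(int(idx), 10))
        return [self.idx_to_names[i] for i in neighbors]


# Logging the model together with its artifacts (paths are illustrative):
# mlflow.pyfunc.log_model(
#     artifact_path="model",
#     python_model=RecoModel(),
#     artifacts={"annoy_index": "data/index.ann",
#                "idx_to_names": "data/idx_to_names.pkl"},
# )
```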
Running the Pipeline
Now that the key components of the pipeline are complete, let's run it. Some things to note here:
- Prep features – we're running a very small subset of MovieLens 10M here, for a fast cycle time in this demo.
- Training – we’re using Optuna to optimize the dimensionality of the embeddings.
- Indexing – the return value is of type KedroAnnoyIndex.
- Sample recommendations – some fun things to see here.
- At first glance, Shrek shouldn't be close to The Dark Knight (widely different genres), but both are mainstream blockbusters, so the algorithm might have picked up on that.
- Included in the nearest neighbors of Ratatouille are other animated films like Monsters Inc, and even international titles like My Neighbor Totoro.
- Rashomon is a timeless film, and it’s cool to see that it’s close to vintage films (Maltese Falcon, Big Sleep) and other cerebral films (Lost Highway, The Name of The Rose).
> kedro run
-- a lot of things get logged here and I'll reduce it to key things
-- (1) Prep Features
Running node: prep_ratings: prep_sparse_ratings([ratings,params:preprocessing]) -> [interactions,rid_to_idx,idx_to_rid,cid_to_idx,idx_to_cid]
Number of users: 5387
Number of items: 2620
Number of rows: (195359, 4)
Sparsity: 0.013841563730609597
-- (2) Training
kedro.pipeline.node - INFO - Running node: factorize: factorize_optimize([train,test,eval_train,sp_item_feats,params:model]) -> [user_factors,item_factors,user_biases,item_biases,model_metrics]
Train: 0.20478932559490204, Test: 0.1860404759645462
Train: 0.24299238622188568, Test: 0.21084092557430267
Train: 0.2665676772594452, Test: 0.22465194761753082
Train: 0.28074997663497925, Test: 0.23137184977531433
Train: 0.2892519235610962, Test: 0.23690366744995117
Train: 0.2953035533428192, Test: 0.2383144646883011
Train: 0.3050306737422943, Test: 0.24187859892845154
Train: 0.3089289367198944, Test: 0.24299241602420807
Train: 0.3151661455631256, Test: 0.2450343668460846
Train: 0.3220716714859009, Test: 0.24473735690116882
Trial 0 finished with value: 0.24800445139408112 and parameters: {'n_components': 59}. Best is trial 0 with value: 0.24800445139408112
-- (3) Indexing
Running node: build_index: build_index([item_factors,params:index_params]) -> [kedro_annoy_dataset]
Running node: validate_index: validate_index([kedro_annoy_dataset,idx_to_names]) -> [validated_kedro_annoy_dataset]
-- (4) Sampling indexing results (recommendations)
Closest to Dark Knight, The (2008) :
Dark Knight, The (2008)
Sin City (2005)
Shrek 2 (2004)
Kill Bill: Vol. 1 (2003)
Batman Begins (2005)
Princess Mononoke (Mononoke-hime) (1997)
Ratatouille (2007)
Harry Potter and the Order of the Phoenix (2007)
Scarface (1983)
Lord of the Rings: The Fellowship of the Ring, The (2001)
Closest to Ratatouille (2007) :
Ratatouille (2007)
Monsters, Inc. (2001)
My Neighbor Totoro (Tonari no Totoro) (1988)
Kiki's Delivery Service (Majo no takkyûbin) (1989)
Spirited Away (Sen to Chihiro no kamikakushi) (2001)
Who Framed Roger Rabbit? (1988)
WALL·E (2008)
Cars (2006)
Howl's Moving Castle (Hauru no ugoku shiro) (2004)
Shrek (2001)
Closest to Rashomon (Rashômon) (1950) :
Rashomon (Rashômon) (1950)
Maltese Falcon, The (a.k.a. Dangerous Female) (1931)
Lost Highway (1997)
Big Sleep, The (1946)
Name of the Rose, The (Der Name der Rose) (1986)
Fanny and Alexander (Fanny och Alexander) (1982)
Brick (2005)
Dogville (2003)
Vertigo (1958)
Nine Queens (Nueve Reinas) (2000)
If everything went well, then we should have the following in our MLFlow experiment.

Serving (and Testing!)
To serve the model, we will package our project, then use MLFlow to deploy an API for us. The build and deploy script is as follows. As always, we’ll keep things local.
-- our project is named prod-reco
> kedro package
-- in case we have installed our module already
> pip uninstall prod-reco -y
-- install locally
> pip install src/dist/prod_reco-0.1-py3-none-any.whl
-- mlflow serve. You can use the following, or the model registry
mlflow models serve -m "runs:/<run-id>/model" -p 5001 --no-conda
For the above, I used the run-id of the experiment, but using the Model Registry instead is fine. Also, --no-conda is used since I've installed the package locally only.
Now, to test our API, we use pytest. We call the API in a few different ways and check that it responds correctly; a rough sketch of such a test follows, and a successful run produces the output below.
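A stripped-down version of such a test might look like the following. The payload format and item ids are assumptions; the exact JSON schema depends on the MLflow version and on how the custom pyfunc model parses its input.

```python
import requests

ENDPOINT = "http://127.0.0.1:5001/invocations"  # mlflow models serve from above


class TestEndpoint:
    def test_cold_start_items(self):
        # Payload shape is illustrative; adjust it to the schema your model expects.
        payload = {"columns": ["item_id"], "data": [[101], [202]]}
        response = requests.post(ENDPOINT, json=payload, timeout=5)
        assert response.status_code == 200
        recos = response.json()
        assert len(recos) > 0
        print(recos)
```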
> pytest --no-cov -s src/tests/test_endpoint.py::TestEndpoint
==test session starts ==
...
...
...
plugins: mock-1.13.0, cov-3.0.0, anyio-3.5.0
collected 5 items
src/tests/test_endpoint.py
Dark Knight
["City of God (Cidade de Deus) (2002)", "Dark Knight, The (2008)", "History of Violence, A (2005)", "3:10 to Yuma (2007)", "Animatrix, The (2003)"]
.
Dark Knight & Kung Fu Panda
["Wallace & Gromit: The Wrong Trousers (1993)", "Monsters, Inc. (2001)", "City of God (Cidade de Deus) (2002)", "Dark Knight, The (2008)", "Mulan (1998)", "Ratatouille (2007)", "History of Violence, A (2005)", "3:10 to Yuma (2007)", "Animatrix, The (2003)", "Kung Fu Panda (2008)"]
.
Dark Knight & Kung Fu Panda & Godfather II
["Godfather, The (1972)", "Godfather: Part II, The (1974)", "Wallace & Gromit: The Wrong Trousers (1993)", "Monsters, Inc. (2001)", "City of God (Cidade de Deus) (2002)", "City of God (Cidade de Deus) (2002)", "Untouchables, The (1987)", "Dark Knight, The (2008)", "Carlito's Way (1993)", "Mulan (1998)"]
.
User likes vintage movies
[{"userId": 2241, "recos": ["Mulan (1998)", "Wallace & Gromit: The Wrong Trousers (1993)", "Ratatouille (2007)", "Monsters, Inc. (2001)", "History of Violence, A (2005)", "Kung Fu Panda (2008)", "3:10 to Yuma (2007)", "Animatrix, The (2003)", "City of God (Cidade de Deus) (2002)", "Dark Knight, The (2008)"]}]
.
First user likes vintage movies, second likes animation
[{"userId": 190, "recos": ["Animatrix, The (2003)", "History of Violence, A (2005)", "Kung Fu Panda (2008)", "3:10 to Yuma (2007)", "Ratatouille (2007)", "City of God (Cidade de Deus) (2002)", "Dark Knight, The (2008)", "Mulan (1998)", "Monsters, Inc. (2001)", "Wallace & Gromit: The Wrong Trousers (1993)"]}, {"userId": 2241, "recos": ["Mulan (1998)", "Wallace & Gromit: The Wrong Trousers (1993)", "Ratatouille (2007)", "Monsters, Inc. (2001)", "History of Violence, A (2005)", "Kung Fu Panda (2008)", "3:10 to Yuma (2007)", "Animatrix, The (2003)", "City of God (Cidade de Deus) (2002)", "Dark Knight, The (2008)"]}]
== 5 passed in 1.40s ==
Summary
It took us a lot of time to get the pipeline running, but it was well worth it. We defined our data preparation, training, indexing, serving, and testing. I'd like to stress that at this point, you can do several things rather easily:
- Continuous training – Write a cronjob or convert it to an Airflow DAG using kedro-airflow to make the pipeline run daily. Note that we have tests in place to always check if it works. Batch jobs have never been easier.
- Dockerize your training application – You can integrate the resulting container into your organization’s larger orchestration pipelines with a guarantee that it will work.
- Deploy your serving application – Use MLFlow to create a container from your customized model. Lock and load!
- Change configurations – Want to use cloud resources? It’s just as simple as changing configurations. The codebase will largely be untouched!
Thanks for reading, and good luck in your MLOps journey!