# Source code for xframes.toolkit.recommend

from abc import ABCMeta, abstractmethod

import os
import pickle

from pyspark import RDD
from pyspark.mllib import recommendation
from pyspark.mllib.recommendation import ALS, Rating

from xframes.spark_context import CommonSparkContext
from xframes.toolkit.model import Model, ModelBuilder
from xframes import XArray, XFrame
from xframes.xarray_impl import XArrayImpl
from xframes.xframe_impl import XFrameImpl
from xframes.utils import delete_file_or_dir
from xframes import fileio

##
## TODO
## write metadata in sub-directory
# add __repr__ and __str__ functions for display


# Models
class RecommenderModel(Model):
    """
    Abstract base for recommender models.

    Bundles the trained underlying model with the ratings it was given and
    the names of the user, item and rating columns.
    """
    __metaclass__ = ABCMeta

    def __init__(self, model, ratings, user_col, item_col, rating_col):
        """
        Store the underlying model, the ratings and the three column names.
        """
        self.model, self.ratings = model, ratings
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col

    def __repr__(self):
        """
        Show the underlying model's repr followed by the column names.
        """
        pieces = [
            '{!r}\n'.format(self.model),
            'user_col: {}\n'.format(self.user_col),
            'item_col: {}\n'.format(self.item_col),
            'rating_col: {}'.format(self.rating_col),
        ]
        return ''.join(pieces)

    @staticmethod
    def _file_paths(path):
        """
        Return the (model, ratings, metadata) paths located under ``path``.
        """
        leaves = ('model', 'ratings', '_metadata')
        return tuple(os.path.join(path, leaf) for leaf in leaves)

class MatrixFactorizationModel(RecommenderModel):
    """
    Recommender model.

    Wraps an underlying matrix factorization model together with the ratings
    it was built from, and keeps the distinct users and items seen in those
    ratings.
    """

    def __init__(self, model, ratings, user_col, item_col, rating_col):
        """
        Create the model wrapper.

        Parameters
        ----------
        model : object
            The underlying trained model.

        ratings : XFrame
            The ratings that the model was trained on.

        user_col : string
            The column name of the users.

        item_col : string
            The column name of the items.

        rating_col : string
            The column name of the ratings.
        """
        super(MatrixFactorizationModel, self). \
            __init__(model, ratings, user_col, item_col, rating_col)
        # Select the columns directly (as recommend_top_k does) rather than
        # extracting them with a per-row lambda.
        self.users = self.ratings[user_col].unique()
        self.items = self.ratings[item_col].unique()

    # TODO - when there is a second derived type,
    # see what we can move to the base class.

    def predict(self, user, item):
        """
        Predict the rating given by a user to an item.

        Parameters
        ----------
        user : int
            The user to predict.

        item : int
            The item to rate.

        Returns
        -------
        out : float
            The predicted rating.
        """
        res = self.model.predict(user, item)
        return res

    def predict_all(self, user):
        """
        Predict ratings for all items.

        Parameters
        ----------
        user : int
            The user to make predictions for.

        Returns
        -------
        out : XFrame
            Each row of the frame consists of a user id, an item id,
            and a predicted rating.
        """
        # build rdd to pass to predictAll
        user_item = XFrame()
        user_item[self.item_col] = self.items
        user_item[self.user_col] = user
        # NOTE(review): the result of swap_columns is discarded, so this
        # relies on the call reordering user_item in place -- confirm.
        user_item.swap_columns(self.item_col, self.user_col)
        rdd = user_item.to_rdd()
        res = self.model.predictAll(rdd)
        res = res.map(lambda rating: (rating.user, rating.product, rating.rating))
        col_names = [self.user_col, self.item_col, self.rating_col]
        user_type = self.users.dtype()
        item_type = self.items.dtype()
        col_types = [user_type, item_type, float]
        return XFrame.from_rdd(res, column_names=col_names, column_types=col_types)

    def recommend(self, user, item):
        """
        Return the underlying model's product recommendations for a user.

        Both arguments are forwarded unchanged to ``recommendProducts`` on
        the wrapped java model.

        NOTE(review): Spark documents the second argument of
        ``recommendProducts`` as the number of products to return, so the
        name ``item`` looks like a misnomer -- confirm before relying on it.
        """
        return self.model._java_model.recommendProducts(user, item)

    def recommend_top_k(self, user, k=10):
        """
        Recommend some items for a user.

        Parameters
        ----------
        user : int
            The user to make recommendations for.

        k : int, optional
            The number of recommendations to return.

        Returns
        -------
        out : XFrame
            A XFrame containing the highest predictions for the user.
            The items that the user has explicitly rated are excluded.
        """
        predictions = self.predict_all(user)
        # filter out the items that the user has already rated
        rated_items = self.ratings.filterby(user, self.user_col)[self.item_col]
        predictions = predictions.filterby(rated_items, self.item_col, exclude=True)
        topk = predictions.topk(self.rating_col, k)
        return topk

    def item_features(self):
        """
        The item features.
        Underlying model parameters.
        """
        return XArray.from_rdd(self.model.productFeatures(), list)

    def user_features(self):
        """
        The user features.
        Underlying model parameters.
        """
        return XArray.from_rdd(self.model.userFeatures(), list)

    def save(self, path):
        """
        Save a model.

        The model can be saved, then reloaded later to provide recommendations.

        Parameters
        ----------
        path : str
            The path where the model will be saved.
            Anything already present at this path is deleted, and the path
            is then created as a directory.
            Three items are stored inside it: the underlying model
            parameters, the original ratings, and the column names.
            These are stored under the names 'model', 'ratings',
            and '_metadata'.
        """
        sc = CommonSparkContext().sc()
        delete_file_or_dir(path)
        os.makedirs(path)
        model_path, ratings_path, metadata_path = self._file_paths(path)
        # save model
        self.model.save(sc, model_path)
        # save ratings
        self.ratings.save(ratings_path)
        # save metadata
        metadata = [self.user_col, self.item_col, self.rating_col]
        with fileio.open_file(metadata_path, 'w') as f:
            # TODO detect filesystem errors
            pickle.dump(metadata, f)

    @classmethod
    def load(cls, path):
        """
        Load a model that was saved previously.

        Parameters
        ----------
        path : str
            The path where the model files are stored.
            This is the same path that was passed to ``save``.
            It is a directory holding three entries named 'model',
            'ratings', and '_metadata'.

        Returns
        -------
        out : MatrixFactorizationModel
            A model that can be used to predict ratings.
        """
        sc = CommonSparkContext().sc()
        model_path, ratings_path, metadata_path = cls._file_paths(path)
        # load model
        model = recommendation.MatrixFactorizationModel.load(sc, model_path)
        # load ratings
        ratings = XFrame.load(ratings_path)
        # load metadata
        # Read through fileio, the same layer that ``save`` writes with,
        # so that both sides resolve the path the same way.
        # NOTE: unpickling executes arbitrary code; only load models from
        # trusted locations.
        with fileio.open_file(metadata_path, 'r') as f:
            user_col, item_col, rating_col = pickle.load(f)
        return cls(model, ratings, user_col, item_col, rating_col)
# Builders
class RecommenderBuilder(ModelBuilder):
    """
    Abstract base for builders that train recommender models from ratings.
    """
    __metaclass__ = ABCMeta

    def __init__(self, ratings, user_col, item_col, rating_col):
        """
        Remember the ratings table and the names of its relevant columns.
        """
        self.ratings = ratings
        self.user_col, self.item_col, self.rating_col = \
            user_col, item_col, rating_col

    def _prepare_ratings(self):
        """
        Reduce each ratings row to a ``[user, item, rating]`` list.
        """
        # Bind the column names to a local so that the function handed to
        # ``apply`` closes over plain strings and not over ``self``.
        columns = (self.user_col, self.item_col, self.rating_col)
        return self.ratings.apply(lambda row: [row[name] for name in columns])

    @abstractmethod
    def train(self):
        pass


class ALSBuilder(RecommenderBuilder):
    """
    Builder that trains recommender models with ALS.
    """

    def __init__(self, ratings, user_col, item_col, rating_col):
        """
        Create an ALSBuilder.  The builder can be used to train a model.

        Parameters
        ----------
        ratings : XFrame
            A table containing the user ratings.  It must contain three
            columns holding the users, the items, and the ratings.
            Any other columns are not used.

        user_col : string
            The column name of the users.

        item_col : string
            The column name of the items.

        rating_col : string
            The column name of the ratings.  This must be a number.
        """
        super(ALSBuilder, self).__init__(ratings, user_col, item_col, rating_col)

    def train(self, rank, iterations=10, lambda_=0.01, seed=0, **kwargs):
        """
        Train the model.

        Parameters
        ----------
        rank : int
            The number of factors in the underlying model.  Generally, larger
            numbers of factors lead to better models, but increase the memory
            required.  A rank in the range of 10 to 200 is usually reasonable.

        iterations : int, optional
            The number of iterations to perform.  With each iteration, the
            model improves.  ALS typically converges quickly, so a value of
            10 is recommended.

        lambda_ : float, optional
            This parameter controls regularization, which controls
            overfitting.  Higher values apply more regularization.
            The appropriate value depends on the problem, and needs to be
            tuned by train/test techniques, which measure overfitting.

        seed : int, optional
            Passed through to ``ALS.train`` as ``seed``.

        kwargs : various, optional
            Any further keyword arguments are passed through to ``ALS.train``.

        Returns
        -------
        out : MatrixFactorizationModel
            A model that can be used to make predictions on how a user
            would rate an item.
        """
        triples = self._prepare_ratings().to_rdd()
        als_model = ALS.train(triples,
                              rank,
                              iterations=iterations,
                              lambda_=lambda_,
                              seed=seed,
                              **kwargs)
        return MatrixFactorizationModel(als_model,
                                        self.ratings,
                                        self.user_col,
                                        self.item_col,
                                        self.rating_col)

    def train_implicit(self, rank, seed=0, iterations=50, lambda_=0.01, **kwargs):
        """
        Train the model using implicit ratings.

        Parameters
        ----------
        rank : int
            The number of factors in the underlying model.  Generally, larger
            numbers of factors lead to better models, but increase the memory
            required.  A rank in the range of 10 to 200 is usually reasonable.

        seed : int, optional
            Passed through to ``ALS.trainImplicit`` as ``seed``.

        iterations : int, optional
            The number of iterations to perform.  With each iteration, the
            model improves.  Defaults to 50.

        lambda_ : float, optional
            This parameter controls regularization, which controls
            overfitting.  Higher values apply more regularization.
            The appropriate value depends on the problem, and needs to be
            tuned by train/test techniques, which measure overfitting.

        kwargs : various, optional
            Any further keyword arguments are passed through to
            ``ALS.trainImplicit``.

        Returns
        -------
        out : MatrixFactorizationModel
            A model that can be used to make predictions on how a user
            would rate an item.
        """
        triples = self._prepare_ratings().to_rdd()
        als_model = ALS.trainImplicit(triples,
                                      rank,
                                      iterations=iterations,
                                      lambda_=lambda_,
                                      seed=seed,
                                      **kwargs)
        return MatrixFactorizationModel(als_model,
                                        self.ratings,
                                        self.user_col,
                                        self.item_col,
                                        self.rating_col)
def create(data, user_col, item_col, rating_col,
           recommender_type='ALS', rank=50, iterations=10, lambda_=0.01, seed=0,
           **kwargs):
    """
    Create a recommendation model.

    Parameters
    ----------
    data : XFrame
        A table containing the user ratings.  It must contain three
        columns holding the users, the items, and the ratings.
        Any other columns are not used.

    user_col : string
        The column name of the users.

    item_col : string
        The column name of the items.

    rating_col : string
        The column name of the ratings.  This must be a number.

    recommender_type : string, optional
        The type of recommender.  Options are:

        * ALS
        * ALS-implicit

    rank : int, optional
        See ``ALSBuilder.train``

    iterations : int, optional
        See ``ALSBuilder.train``

    lambda_ : float, optional
        See ``ALSBuilder.train``

    seed : int, optional
        See ``ALSBuilder.train``

    kwargs : various, optional
        Passed through to the selected ``ALSBuilder`` training method.

    Returns
    -------
    out : MatrixFactorizationModel
        The trained model.

    Raises
    ------
    ValueError
        If ``recommender_type`` is not one of the options listed above.
    """
    # Reject unknown types up front so that no builder is created for them.
    if recommender_type != 'ALS' and recommender_type != 'ALS-implicit':
        raise ValueError('recommender type is not recognized')

    builder = ALSBuilder(data, user_col, item_col, rating_col)
    if recommender_type == 'ALS':
        fit = builder.train
    else:
        fit = builder.train_implicit
    return fit(rank=rank,
               iterations=iterations,
               lambda_=lambda_,
               seed=seed,
               **kwargs)