# Source code for xframes.aggregate


from aggregator_property_set import AggregatorPropertySet

# Builtin aggregators for groupby
from aggregator_impl import agg_sum, agg_argmax, agg_argmin, agg_max, agg_min, \
    agg_count, agg_mean, agg_variance, agg_stdv, agg_select_one, \
    agg_concat_list, agg_concat_dict, agg_values, agg_values_count, agg_quantile


# noinspection PyPep8Naming
def SUM(src_column):
    """
    Builtin sum aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose values are summed within each group.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    Get the sum of the rating column for each user.

    >>> xf.groupby("user", {'rating_sum': aggregate.SUM('rating')})

    """
    # NOTE(review): the second argument is fixed to ``int`` here, whereas
    # MAX and MIN pass 0 in that position -- confirm that summing a float
    # column behaves as intended.
    properties = AggregatorPropertySet(agg_sum, int, 'sum', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def ARGMAX(agg_column, out_column):
    """
    Builtin arg maximum aggregator for groupby.

    Parameters
    ----------
    agg_column : str
        Name of the column whose maximum is located within each group.
    out_column : str
        Name of the column whose value is reported for the row holding
        that maximum.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [agg_column, out_column])``: the
        aggregator description paired with the columns it is applied to.

    Examples
    --------

    Get the movie with maximum rating per user.

    >>> xf.groupby("user",
    ...            {'best_movie': aggregate.ARGMAX('rating', 'movie')})

    """
    properties = AggregatorPropertySet(agg_argmax, 1, 'argmax', 2)
    return properties, [agg_column, out_column]


# noinspection PyPep8Naming
def ARGMIN(agg_column, out_column):
    """
    Builtin arg minimum aggregator for groupby.

    Parameters
    ----------
    agg_column : str
        Name of the column whose minimum is located within each group.
    out_column : str
        Name of the column whose value is reported for the row holding
        that minimum.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [agg_column, out_column])``: the
        aggregator description paired with the columns it is applied to.

    Examples
    --------

    Get the movie with minimum rating per user.

    >>> xf.groupby("user",
    ...            {'worst_movie': aggregate.ARGMIN('rating', 'movie')})

    """
    properties = AggregatorPropertySet(agg_argmin, 1, 'argmin', 2)
    return properties, [agg_column, out_column]


# noinspection PyPep8Naming
def MAX(src_column):
    """
    Builtin maximum aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose maximum is taken within each group.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    Get the maximum rating of each user.

    >>> xf.groupby("user",
    ...            {'rating_max': aggregate.MAX('rating')})

    """
    properties = AggregatorPropertySet(agg_max, 0, 'max', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def MIN(src_column):
    """
    Builtin minimum aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose minimum is taken within each group.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    Get the minimum rating of each user.

    >>> xf.groupby("user",
    ...            {'rating_min': aggregate.MIN('rating')})

    """
    properties = AggregatorPropertySet(agg_min, 0, 'min', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def COUNT():
    """
    Builtin count aggregator for groupby.

    Takes no column argument.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [''])``: the aggregator description
        paired with a column list holding a single empty string.

    Examples
    --------

    Get the number of occurrences of each user.

    >>> xf.groupby("user",
    ...            {'count': aggregate.COUNT()})

    """
    properties = AggregatorPropertySet(agg_count, int, 'count', 0)
    # No source column is involved; an empty name fills the column slot so
    # the return value has the same shape as the other aggregators.
    return properties, ['']


# noinspection PyPep8Naming
def MEAN(src_column):
    """
    Builtin average aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose values are averaged within each group.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    Get the average rating of each user.

    >>> xf.groupby("user",
    ...            {'rating_mean': aggregate.MEAN('rating')})

    """
    properties = AggregatorPropertySet(agg_mean, float, 'mean', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def VARIANCE(src_column):
    """
    Builtin variance aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose variance is computed within each group.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    Get the rating variance of each user.

    >>> xf.groupby("user",
    ...            {'rating_var': aggregate.VARIANCE('rating')})

    """
    properties = AggregatorPropertySet(agg_variance, float, 'variance', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def STDV(src_column):
    """
    Builtin standard deviation aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose standard deviation is computed within
        each group.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    Get the rating standard deviation of each user.

    >>> xf.groupby("user",
    ...            {'rating_stdv': aggregate.STDV('rating')})

    """
    properties = AggregatorPropertySet(agg_stdv, float, 'stdv', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def SELECT_ONE(src_column):
    """
    Builtin aggregator for groupby which selects one row in the group.

    Parameters
    ----------
    src_column : str
        Name of the column from which one value per group is selected.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column, seed])``: the aggregator
        description paired with the source column and the seed, where the
        seed is the column name itself.

    Examples
    --------

    Get one rating row from a user.

    >>> xf.groupby("user", {'rating': aggregate.SELECT_ONE('rating')})

    If multiple columns are selected, they are guaranteed to come from the
    same row. For instance:

    >>> xf.groupby("user", {'rating': aggregate.SELECT_ONE('rating'),
    ...                     'item': aggregate.SELECT_ONE('item')})

    The selected 'rating' and 'item' value for each user will come from the
    same row in the XFrame.
    """
    # The column name doubles as the seed so that the selection is repeatable.
    # NOTE(review): each column therefore gets a different seed -- confirm
    # this is compatible with the same-row guarantee stated in the docstring.
    seed = src_column
    # NOTE(review): the last argument is 1 although two entries are passed in
    # the list below -- presumably the seed is not counted; verify against
    # AggregatorPropertySet.
    properties = AggregatorPropertySet(agg_select_one, 0, 'select-one', 1)
    return properties, [src_column, seed]


# noinspection PyPep8Naming
def CONCAT(src_column, dict_value_column=None):
    """
    Builtin aggregator that combines values from one or two columns in one
    group into either a list value or a dictionary value.

    Parameters
    ----------
    src_column : str
        Name of the column whose values are collected. When
        `dict_value_column` is given, this column is listed first
        (the "word" column in the dictionary example below).
    dict_value_column : str, optional
        Name of a second column. When given, the dictionary aggregator is
        used instead of the list aggregator.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])`` when only one column is
        given, otherwise
        ``(AggregatorPropertySet, [src_column, dict_value_column])``.

    Examples
    --------

    To combine values from two columns that belong to one group into
    one dictionary value:

    >>> xf.groupby(["document"],
    ...            {"word_count": aggregate.CONCAT("word", "count")})

    To combine values from one column that belong to one group into a list value:

    >>> xf.groupby(["user"],
    ...            {"friends": aggregate.CONCAT("friend")})

    """
    if dict_value_column is not None:
        # NOTE(review): the last argument stays 1 even though two columns are
        # passed, whereas ARGMAX/ARGMIN pass 2 for two columns -- confirm.
        properties = AggregatorPropertySet(agg_concat_dict, dict, 'concat', 1)
        return properties, [src_column, dict_value_column]

    properties = AggregatorPropertySet(agg_concat_list, list, 'concat', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def VALUES(src_column):
    """
    Builtin aggregator that combines distinct values from one column in one
    group into a list value.

    Parameters
    ----------
    src_column : str
        Name of the column whose distinct values are collected.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    To combine values from one column that belong to one group into a list value:

    >>> xf.groupby(["user"],
    ...            {"friends": aggregate.VALUES("friend")})

    """
    properties = AggregatorPropertySet(agg_values, list, 'values', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def VALUES_COUNT(src_column):
    """
    Builtin aggregator that combines distinct values from one column in one
    group into a dictionary value of unique values and their counts.

    Parameters
    ----------
    src_column : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column])``: the aggregator
        description paired with the list of columns it is applied to.

    Examples
    --------

    To combine values from one column that belong to one group into a
    dictionary of friend: count values:

    >>> xf.groupby(["user"],
    ...            {"friends": aggregate.VALUES_COUNT("friend")})

    """
    properties = AggregatorPropertySet(agg_values_count, dict, 'values-count', 1)
    return properties, [src_column]


# noinspection PyPep8Naming
def QUANTILE(src_column, *args):
    """
    Builtin approximate quantile aggregator for groupby.

    Parameters
    ----------
    src_column : str
        Name of the column whose quantiles are computed within each group.
    *args
        The quantiles to query: a single value, a single iterable of
        values, or several values passed as separate arguments.

    Returns
    -------
    tuple
        ``(AggregatorPropertySet, [src_column], query)``, where `query` is
        the requested quantiles rendered as a string such as
        ``'[0.25,0.5,0.75]'``. Unlike the other aggregators in this module,
        this one returns three items.

    Examples
    --------

    To extract the median

    >>> xf.groupby("user",
    ...            {'rating_quantiles': aggregate.QUANTILE('rating', 0.5)})

    To extract a few quantiles

    >>> xf.groupby("user",
    ...            {'rating_quantiles': aggregate.QUANTILE('rating', [0.25, 0.5, 0.75])})

    Or equivalently

    >>> xf.groupby("user",
    ...            {'rating_quantiles': aggregate.QUANTILE('rating', 0.25, 0.5, 0.75)})

    The returned quantiles are guaranteed to have 0.5% accuracy. That is to say,
    if the requested quantile is 0.50, the resultant quantile value may be
    between 0.495 and 0.505 of the true quantile.
    """
    # One positional argument may be a scalar or an iterable of quantiles;
    # several positional arguments are the quantiles themselves.
    quantiles = args[0] if len(args) == 1 else list(args)

    # A string counts as a single value. Testing for it explicitly matters on
    # Python 3, where str defines __iter__ and '0.5' would otherwise be split
    # into the characters '0', '.', '5'.
    is_scalar = isinstance(quantiles, str) or not hasattr(quantiles, '__iter__')
    if is_scalar:
        quantiles = [quantiles]

    query = ",".join(str(q) for q in quantiles)
    properties = AggregatorPropertySet(agg_quantile, float, 'quantile', 1)
    return properties, [src_column], '[' + query + ']'