# Source code for xframes.xplot


import traceback
import operator
import math
import datetime
import logging

from xframes.deps import HAS_MATPLOTLIB

if HAS_MATPLOTLIB:
    import matplotlib.pyplot as plt

import xframes


class XPlot(object):
    """
    Plotting library for XFrames.

    Creates simple data plots.

    Parameters
    ----------
    axes : list, optional
        The size of the axes.  Should be a four-element list.
        [x_origin, y_origin, x_length, y_length]
        Defaults to [0.0, 0.0, 1.5, 1.0]

    alpha : float, optional
        The opacity of the plot.
    """
[docs]    def __init__(self, axes=None, alpha=None):
        """
        Create a plotting object.

        Parameters
        ----------
        axes : list, optional
            The size of the axes.  Should be a four-element list.
            [x_origin, y_origin, x_length, y_length]
            Defaults to [0.0, 0.0, 1.5, 1.0]

        alpha : float, optional
            The opacity of the plot.
        """
        self.axes = axes if axes else [0.0, 0.0, 1.5, 1.0]
        self.alpha = alpha or 0.5

    def make_barh(self, items, xlabel, ylabel, append_counts_to_label=False, title=None):
        """
        Draw and show a horizontal bar chart with one bar per item.

        Nothing is drawn if matplotlib is not available or if `items` is
        None or empty.  Any exception raised while building the plot is
        logged as a warning and is not propagated to the caller.

        Parameters
        ----------
        items : list of tuples
            (label, value) pairs.  Each value is converted with int(); each
            label is converted with str() and truncated to 30 characters.

        xlabel : str
            A label for the X axis.

        ylabel : str
            A label for the Y axis.

        append_counts_to_label : bool, optional
            If true, the value is appended to each label in parentheses.

        title : str, optional
            A plot title.
        """
        if not HAS_MATPLOTLIB:
            return
        if items is None or len(items) == 0:
            return

        def to_text(label):
            # On Python 2 the labels are byte strings and are decoded for
            # display.  'replace' keeps a label readable even when the
            # 30-character truncation cut a multi-byte character in half.
            # On Python 3 the labels are already text and pass through.
            if isinstance(label, bytes):
                return label.decode('utf8', 'replace')
            return label

        try:
            y_pos = list(range(len(items)))
            vals = [int(item[1]) for item in items]
            labels = [str(item[0])[:30] for item in items]
            if append_counts_to_label:
                labels = ['{} ({:,})'.format(label, val) for label, val in zip(labels, vals)]
            labels = [to_text(label) for label in labels]
            plt.barh(y_pos, vals, align='center', alpha=self.alpha)
            plt.yticks(y_pos, labels)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            if title:
                plt.title(title)
            plt.show()
        except Exception as e:
            # Plotting is best-effort: report the problem and carry on.
            logging.warning("Make_barh: got an exception!")
            logging.warning(traceback.format_exc())
            logging.warning(e)

    def make_bar(self, items, xlabel, ylabel, title=None):
        """
        Draw and show a vertical bar chart of pre-binned data.

        Nothing is drawn if matplotlib is not available or if `items` is
        None or empty.  Any exception raised while building the plot is
        logged as a warning and is not propagated to the caller.

        Parameters
        ----------
        items : list of tuples
            (bin_start, count) pairs, one per bin, in bin order.  The tick
            labels assume equally spaced bins: the spacing is taken from the
            first two bin starts.  Bin starts may be numbers or datetimes.

        xlabel : str
            A label for the X axis.

        ylabel : str
            A label for the Y axis.

        title : str, optional
            A plot title.
        """
        if not HAS_MATPLOTLIB:
            return
        if items is None or len(items) == 0:
            return
        try:
            n_bins = len(items)
            counts = [col[1] for col in items]
            bin_starts = [col[0] for col in items]
            x_pos = list(range(n_bins))
            plt.bar(x_pos, counts, align='center', alpha=self.alpha)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)

            # Aim for roughly eight ticks, each on a bin boundary.
            n_ticks = min(n_bins, 8)
            step = max(n_bins // n_ticks, 1)
            if n_bins > 1:
                bin_width = bin_starts[1] - bin_starts[0]
                # Position n_bins (one past the last bar) marks the upper end.
                tick_pos = list(range(0, n_bins + 1, step))
                # Derive every label from its own tick position so that the
                # labels always match the positions in number and in value.
                # int * bin_width also works when bin_width is a timedelta.
                tick_values = [bin_starts[0] + pos * bin_width for pos in tick_pos]
            else:
                # A single bin has no spacing to measure; label its start only.
                tick_pos = [0]
                tick_values = [bin_starts[0]]
            # Labels are truncated to five characters to keep the axis compact.
            tick_labels = [str(val)[:5] for val in tick_values]
            plt.xticks(tick_pos, tick_labels)
            if title:
                plt.title(title)
            plt.show()
        except Exception as e:
            # Plotting is best-effort: report the problem and carry on.
            logging.warning("Make_bar: got an exception!")
            logging.warning(traceback.format_exc())
            logging.warning(e)

[docs]    def top_values(self, xf, x_col, y_col, k=15, title=None, xlabel=None, ylabel=None):
        """
        Plot the top values of a column of data.

        Parameters
        ----------
        xf : XFrame
            An XFrame containing the columns to be plotted.

        x_col : str
            A column name: the top values in this column are plotted.  These values must be numerical.

        y_col : str
            A column name: the values in this colum will be used to label the corresponding values
            in the x column.

        k : int, optional
            The number of values to plot.  Defaults to 15.

        title : str, optional
            A plot title.

        xlabel : str, optional
            A label for the X axis.

        ylabel : str, optional
            A label for the Y axis.

        Examples
        --------
        (Come up with an example)
        """
        top_rows = xf.topk(x_col, k=k)
        items = [(row[y_col], row[x_col]) for row in top_rows]
        xlabel = xlabel or x_col
        ylabel = ylabel or y_col

        self.make_barh(items, xlabel, ylabel, title=title)

[docs]    def frequent_values(self, column, k=15, title=None,
                        append_counts_to_label=False,
                        normalize=False,
                        xlabel=None, ylabel=None,
                        epsilon=None, delta=None, num_items=None):
        """
        Plots the number of occurances of specific values in a column.  

        The most frequent values are plotted.

        Parameters
        ----------
        column : XArray
            The column to plot.  The number of distinct occurrances of each value is
            calculated and plotted.  

        k : int, optional
            The number of different values to plot.  Defaults to 15.

        title : str, optional
            A plot title.

        append_counts_to_label : boolean, optional
            If true, append the bar count to the label

        normalize : bool, optional
            If true, plot percentages instead of counts.  Defaults to False.

        xlabel : str, optional
            A label for the X axis.

        ylabel : str, optional
            A label for the Y axis.

        epsilon : float, optional
            Governs accuracy of frequency counter.

        delta : float, optional
            Governs accuracy of frequency counter.

        num_items : float, optional
            Governs accuracy of frequency counter.

        Returns
        -------
        list of tuples
            List of (value, count) for the most frequent "k" values

        Examples
        --------
        (Need examples)

        """
        sk = column.sketch_summary()
        if epsilon:
            sk.set_frequency_sketch_parms(epsilon=epsilon)
        if delta:
            sk.set_frequency_sketch_parms(delta=delta)
        if num_items:
            sk.set_frequency_sketch_parms(num_items=num_items)

        fi = sk.frequent_items()
        if len(fi) > 0:
            sorted_fi = sorted(fi.iteritems(), key=operator.itemgetter(1), reverse=True)
        else:
            return []
        frequent = [x for x in sorted_fi[:k] if x[1] > 1]
        if normalize:
            total_count = float(sum([f[1] for f in frequent]))
            frequent = [(k, round(v * 100.0 /total_count)) for k, v in frequent]
        if len(frequent) > 0:
            default_xlabel = 'Percentage' if normalize else 'Count'
            xlabel = xlabel or default_xlabel
            ylabel = ylabel or 'Value'
            title = title or "Frequent Values"
            self.make_barh(frequent, xlabel, ylabel, append_counts_to_label=append_counts_to_label, title=title)
        return frequent

    @staticmethod
    def create_histogram_buckets(vals, bins, min_val, max_val):
        if max_val == min_val:
            return None, None
        interval = max_val - min_val
        n_buckets = bins or 50
        bucket_vals = [0] * n_buckets
        usetd = isinstance(interval, datetime.timedelta)
        if usetd:
            delta = interval.total_seconds() / n_buckets
            for i in range(0, n_buckets):
                bucket_vals[i] = min_val + datetime.timedelta(seconds=(i * delta))
        else:
            delta = float(interval) / n_buckets
            for i in range(0, n_buckets):
                bucket_vals[i] = min_val + (i * delta)

        def iterate_values(value_iterator):
            bucket_counts = [0] * n_buckets
            for val in value_iterator:
                if val is None:
                    continue
                if isinstance(val, float ) and math.isnan(val):
                    continue
                if usetd:
                    b = int((val - min_val).total_seconds() / delta)
                else:
                    b = int((val - min_val) / delta)
                if b >= n_buckets:
                    b = n_buckets - 1
                elif b < 0:
                    b = 0
                bucket_counts[b] += 1
            yield bucket_counts

        def merge_accumulators(acc1, acc2):
            return [a1 + a2 for a1, a2 in zip(acc1, acc2)]

        accumulators = vals._impl._rdd.mapPartitions(iterate_values)
        bucket_counts = accumulators.reduce(merge_accumulators)
        return bucket_vals, bucket_counts

[docs]    def histogram(self,
                  column,
                  title=None,
                  bins=None,
                  sketch=None, 
                  xlabel=None, ylabel=None,
                  lower_cutoff=0.0, upper_cutoff=1.0,
                  lower_bound = None, upper_bound=None):
        """ 
        Plot a histogram.

        All values greater than the cutoff (given as a quantile) are set equal to the cutoff.

        Parameters
        ----------
        column : XArray
            A column to display.

        title : str, optional
            A plot title.

        bins : int, optional
            The number of bins to use.  Defaults to 50.

        sketch : Sketch, optional
            The column sketch.  If this is available, then it saves time not to recompute it.

        xlabel : str, optional
            A label for the X axis.

        ylabel : str, optional
            A label for the Y axis.

        lower_cutoff : float, optional
            This is a quantile value, between 0 and 1.  
            Values below this cutoff are placed in the first bin.
            Defaults to 0.

        upper_cutoff : float, optional
            This is a quantile value, between 0 and 1.  
            Values above this cutoff are placed in the last bin.
            Defaults to 1.0.

        lower_bound : float, optional
            Values below this bound are placed in the first bin.

        upper_bound : float, optional
            Values below this bound are placed in the last bin.

        bins : int, optional
            The number of bins to use.  Defaults to 50.

        Examples
        --------
        (Need examples)
        """
        if lower_cutoff < 0.0 or lower_cutoff > 1.0:
            raise ValueError('lower cutoff must be between 0.0 and 1.0')
        if upper_cutoff < 0.0 or upper_cutoff > 1.0:
            raise ValueError('upper cutoff must be between 0.0 and 1.0')
        if lower_cutoff >= upper_cutoff:
            raise ValueError('lower cutoff must be less than upper cutoff')

        bins = bins or 50
        sk = sketch or column.sketch_summary()
        q_epsilon = 0.01
        q_lower = None
        q_upper = None
        if lower_cutoff > 0.0:
            q_lower = float(sk.quantile(lower_cutoff)) - q_epsilon
        if upper_cutoff < 1.0:
            q_upper = float(sk.quantile(upper_cutoff)) + q_epsilon
        if lower_bound is not None:
            q_lower = lower_bound
        if upper_bound is not None:
            q_upper = upper_bound
        xlabel = xlabel or 'Value'
        ylabel = ylabel or 'Count'
        vals = column.dropna()

        def enforce_lower_cutoff(x):
            return max(x, q_lower)
        def enforce_upper_cutoff(x):
            return min(x, q_upper)
        if q_lower is not None:
            vals = vals.apply(enforce_lower_cutoff)
            hist_min = q_lower
        else:
            hist_min = sk.min()
        if q_upper is not None:
            vals = vals.apply(enforce_upper_cutoff)
            hist_max = q_upper
        else:
            hist_max = sk.max()
        bucket_counts, bucket_vals = self.create_histogram_buckets(vals, bins, hist_min, hist_max)
        column = [(x, y) for x, y in zip(bucket_counts, bucket_vals)]
        self.make_bar(column, xlabel=xlabel, ylabel=ylabel, title=title)

[docs]    def col_info(self, column, column_name=None, table_name=None, title=None, topk=None, bins=None, cutoff=False):
        """ 
        Print column summary information.

        The number of the most frequent values is shown.
        If the column to summarize is numerical or datetime, then a histogram is also shown.

        Parameters
        ----------
        column : XArray
            The column to summarize.

        column_name : str
            The column name.

        table_name : str, optional
            The table name; used to labeling only.  The table that us used for the data
            is given in the constructor.

        title : str, optional
            The plot title.

        topk: int, optional
            The number of frequent items to show.

        bins : int, optional
            The number of bins in a histogram.

        cutoff : float, optional
            The number to use as an upper cutoff, if the plot is a histogram.

        Examples
        --------
        (Need examples)
        """

        title = title or table_name
        column_name = column_name or ''
        table_name = table_name or ''
        print 'Table Name:  ', table_name
        print 'Column Name: ', column_name
        print 'Column Type: ', column.dtype().__name__
        sk = column.sketch_summary()
        print 'Rows:        ', sk.size()
        unique_items = sk.num_unique()
        print 'Unique Items:', unique_items
        print 'Approximate Frequent Items:'
        fi = sk.frequent_items()
        topk = topk or 15
        if len(fi) == 0:
            print '    None'
            top = None
        else:
            sorted_fi = sorted(fi.iteritems(), key=operator.itemgetter(1), reverse=True)
            top = [x for x in sorted_fi[:topk] if x[1] > 1]
            for key in top:
                print '   {:10}  {:10}'.format(key[1], key[0])
        col_type = column.dtype()
        if col_type is int or col_type is float:
            # number: show a histogram
            print 'Num Undefined:', sk.num_undefined()
            print 'Min:          ', sk.min()
            print 'Max:          ', sk.max()
            print 'Mean:         ', sk.mean()
            if unique_items > 1:
                print 'StDev:        ', sk.std()
                print 'Distribution Plot'
                upper_cutoff = cutoff or 1.0
                self.histogram(column, title=title, bins=bins, sketch=sk, upper_cutoff=upper_cutoff)

        if col_type is datetime.datetime:
            # datetime: show a histogram
            print 'Num Undefined:', sk.num_undefined()
            print 'Min:          ', sk.min()
            print 'Max:          ', sk.max()
            if unique_items > 1:
                print 'Distribution Plot'
                upper_cutoff = cutoff or 1.0
                self.histogram(column, title=title, bins=bins, sketch=sk, upper_cutoff=upper_cutoff)

        # ordinal: show a bar chart of frequent values
        # set x_col and y_col
        if top is not None:
            vals = xframes.XArray([key[0] for key in top], dtype=col_type)
            counts = xframes.XArray([key[1] for key in top], dtype=int)
            x_col = 'Count'
            y_col = column_name
            tmp = xframes.XFrame({x_col: counts, y_col: vals})
            tmp.show().top_values(x_col, y_col, title=title, k=topk)