import traceback
import operator
import math
import datetime
import logging
from xframes.deps import HAS_MATPLOTLIB
if HAS_MATPLOTLIB:
import matplotlib.pyplot as plt
import xframes
class XPlot(object):
    """
    Plotting library for XFrames.

    Creates simple data plots.

    Parameters
    ----------
    axes : list, optional
        The size of the axes.  Should be a four-element list.
        [x_origin, y_origin, x_length, y_length]
        Defaults to [0.0, 0.0, 1.5, 1.0]

    alpha : float, optional
        The opacity of the plot.  Defaults to 0.5.
    """

    def __init__(self, axes=None, alpha=None):
        """
        Create a plotting object.

        Parameters
        ----------
        axes : list, optional
            The size of the axes.  Should be a four-element list.
            [x_origin, y_origin, x_length, y_length]
            Defaults to [0.0, 0.0, 1.5, 1.0]

        alpha : float, optional
            The opacity of the plot.  Defaults to 0.5.
        """
        self.axes = axes if axes else [0.0, 0.0, 1.5, 1.0]
        # Test against None so that an explicit alpha of 0.0 is honoured.
        self.alpha = 0.5 if alpha is None else alpha

    @staticmethod
    def _safe_decode(label):
        """Decode a byte-string label as UTF-8; text labels pass through unchanged."""
        if not isinstance(label, bytes):
            return label
        try:
            return label.decode('utf8')
        except UnicodeDecodeError:
            # Truncating a UTF-8 byte string can split a multi-byte character.
            return 'string decode error'

    @staticmethod
    def _bar_ticks(vals, max_ticks=8):
        """
        Compute x tick positions and labels for a bar chart of evenly spaced buckets.

        Parameters
        ----------
        vals : list
            The starting value of each bucket, in bucket order.  The spacing
            is taken from the first two entries.

        max_ticks : int, optional
            The number of tick intervals to aim for.  Defaults to 8.

        Returns
        -------
        tuple of (list, list)
            The tick positions (bar indexes) and one label for each position.
        """
        bins = len(vals)
        if bins == 0:
            return [], []
        # Datetime labels keep the date part; everything else is cut to five characters.
        width = 10 if isinstance(vals[0], datetime.datetime) else 5
        if bins == 1:
            return [0], [str(vals[0])[:width]]
        delta = vals[1] - vals[0]
        n_ticks = min(bins, max_ticks)
        step = max(bins // n_ticks, 1)
        tick_pos = list(range(0, bins + 1, step))
        # Derive each label from its own position, so that labels always match
        # the positions in both number and value.  Multiplying by an integer
        # also works when delta is a timedelta.
        tick_labels = [str(vals[0] + pos * delta)[:width] for pos in tick_pos]
        return tick_pos, tick_labels

    def make_barh(self, items, xlabel, ylabel, append_counts_to_label=False, title=None):
        """
        Draw and show a horizontal bar chart.

        Does nothing if matplotlib is not available or if there are no items.
        Exceptions raised while plotting are logged as warnings, not propagated.

        Parameters
        ----------
        items : list of tuples
            Pairs of (label, value).  Each value is converted to int and each
            label is converted to a string and truncated to 30 characters.

        xlabel : str
            A label for the X axis.

        ylabel : str
            A label for the Y axis.

        append_counts_to_label : bool, optional
            If true, append the bar value to its label.

        title : str, optional
            A plot title.
        """
        if not HAS_MATPLOTLIB:
            return
        if items is None or len(items) == 0:
            return
        try:
            y_pos = list(range(len(items)))
            vals = [int(item[1]) for item in items]
            labels = [str(item[0])[:30] for item in items]
            if append_counts_to_label:
                labels = ['{} ({:,})'.format(label, val) for label, val in zip(labels, vals)]
            labels = [self._safe_decode(label) for label in labels]
            plt.barh(y_pos, vals, align='center', alpha=self.alpha)
            plt.yticks(y_pos, labels)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            if title:
                plt.title(title)
            plt.show()
        except Exception as e:
            # Plotting is best effort: report the problem and carry on.
            logging.warning("Make_barh: got an exception!")
            logging.warning(traceback.format_exc())
            logging.warning(e)

    def make_bar(self, items, xlabel, ylabel, title=None):
        """
        Draw and show a vertical bar chart of evenly spaced buckets.

        Does nothing if matplotlib is not available or if there are no items.
        Exceptions raised while plotting are logged as warnings, not propagated.

        Parameters
        ----------
        items : list of tuples
            Pairs of (bucket_start, count), in bucket order.

        xlabel : str
            A label for the X axis.

        ylabel : str
            A label for the Y axis.

        title : str, optional
            A plot title.
        """
        if not HAS_MATPLOTLIB:
            return
        if items is None or len(items) == 0:
            return
        try:
            counts = [col[1] for col in items]
            vals = [col[0] for col in items]
            # Work out the ticks before drawing, so that a failure here does
            # not leave half-drawn bars on the current figure.
            tick_pos, tick_labels = self._bar_ticks(vals)
            x_pos = list(range(len(counts)))
            plt.bar(x_pos, counts, align='center', alpha=self.alpha)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.xticks(tick_pos, tick_labels)
            if title:
                plt.title(title)
            plt.show()
        except Exception as e:
            # Plotting is best effort: report the problem and carry on.
            logging.warning("Make_bar: got an exception!")
            logging.warning(traceback.format_exc())
            logging.warning(e)

    def top_values(self, xf, x_col, y_col, k=15, title=None, xlabel=None, ylabel=None):
        """
        Plot the top values of a column of data.

        Parameters
        ----------
        xf : XFrame
            An XFrame containing the columns to be plotted.

        x_col : str
            A column name: the top values in this column are plotted.  These values must be numerical.

        y_col : str
            A column name: the values in this column will be used to label the corresponding values
            in the x column.

        k : int, optional
            The number of values to plot.  Defaults to 15.

        title : str, optional
            A plot title.

        xlabel : str, optional
            A label for the X axis.  Defaults to `x_col`.

        ylabel : str, optional
            A label for the Y axis.  Defaults to `y_col`.

        Examples
        --------
        (Come up with an example)
        """
        top_rows = xf.topk(x_col, k=k)
        items = [(row[y_col], row[x_col]) for row in top_rows]
        xlabel = xlabel or x_col
        ylabel = ylabel or y_col
        self.make_barh(items, xlabel, ylabel, title=title)

    def frequent_values(self, column, k=15, title=None,
                        append_counts_to_label=False,
                        normalize=False,
                        xlabel=None, ylabel=None,
                        epsilon=None, delta=None, num_items=None):
        """
        Plots the number of occurrences of specific values in a column.

        The most frequent values are plotted.  Values that occur only once are left out.

        Parameters
        ----------
        column : XArray
            The column to plot.  The number of distinct occurrences of each value is
            calculated and plotted.

        k : int, optional
            The number of different values to plot.  Defaults to 15.

        title : str, optional
            A plot title.  Defaults to "Frequent Values".

        append_counts_to_label : boolean, optional
            If true, append the bar count to the label

        normalize : bool, optional
            If true, plot percentages instead of counts.  The percentages are relative
            to the total of the plotted values, not of the whole column.
            Defaults to False.

        xlabel : str, optional
            A label for the X axis.  Defaults to "Percentage" or "Count".

        ylabel : str, optional
            A label for the Y axis.  Defaults to "Value".

        epsilon : float, optional
            Governs accuracy of frequency counter.

        delta : float, optional
            Governs accuracy of frequency counter.

        num_items : float, optional
            Governs accuracy of frequency counter.

        Returns
        -------
        list of tuples
            List of (value, count) for the most frequent "k" values

        Examples
        --------
        (Need examples)
        """
        sk = column.sketch_summary()
        if epsilon:
            sk.set_frequency_sketch_parms(epsilon=epsilon)
        if delta:
            sk.set_frequency_sketch_parms(delta=delta)
        if num_items:
            sk.set_frequency_sketch_parms(num_items=num_items)
        fi = sk.frequent_items()
        if len(fi) == 0:
            return []
        sorted_fi = sorted(fi.items(), key=operator.itemgetter(1), reverse=True)
        frequent = [item for item in sorted_fi[:k] if item[1] > 1]
        if normalize:
            total_count = float(sum(count for _, count in frequent))
            # The loop names deliberately differ from the parameter k: on
            # Python 2 list comprehension variables leak into the enclosing scope.
            frequent = [(value, round(count * 100.0 / total_count)) for value, count in frequent]
        if len(frequent) > 0:
            default_xlabel = 'Percentage' if normalize else 'Count'
            xlabel = xlabel or default_xlabel
            ylabel = ylabel or 'Value'
            title = title or "Frequent Values"
            self.make_barh(frequent, xlabel, ylabel,
                           append_counts_to_label=append_counts_to_label, title=title)
        return frequent

    @staticmethod
    def create_histogram_buckets(vals, bins, min_val, max_val):
        """
        Count how many values fall into each of a set of equal-width buckets.

        Parameters
        ----------
        vals : XArray
            The values to count.  They are read through `vals._impl._rdd`.

        bins : int
            The number of buckets.  If this is None or zero, 50 buckets are used.

        min_val : number or datetime
            The start of the first bucket.

        max_val : number or datetime
            The end of the last bucket.

        Returns
        -------
        tuple of (list, list)
            The starting value of each bucket and the count in each bucket.
            Returns (None, None) if `min_val` equals `max_val`.
        """
        if max_val == min_val:
            return None, None
        interval = max_val - min_val
        n_buckets = bins or 50
        # Subtracting datetimes gives a timedelta: the bucket width is then held in seconds.
        usetd = isinstance(interval, datetime.timedelta)
        if usetd:
            delta = interval.total_seconds() / n_buckets
            bucket_vals = [min_val + datetime.timedelta(seconds=(i * delta))
                           for i in range(n_buckets)]
        else:
            delta = float(interval) / n_buckets
            bucket_vals = [min_val + (i * delta) for i in range(n_buckets)]

        def iterate_values(value_iterator):
            # Produces a single list of bucket counts for one partition.
            bucket_counts = [0] * n_buckets
            for val in value_iterator:
                if val is None:
                    continue
                if isinstance(val, float) and math.isnan(val):
                    continue
                if usetd:
                    b = int((val - min_val).total_seconds() / delta)
                else:
                    b = int((val - min_val) / delta)
                # Out-of-range values are counted in the first or last bucket.
                if b >= n_buckets:
                    b = n_buckets - 1
                elif b < 0:
                    b = 0
                bucket_counts[b] += 1
            yield bucket_counts

        def merge_accumulators(acc1, acc2):
            return [a1 + a2 for a1, a2 in zip(acc1, acc2)]

        accumulators = vals._impl._rdd.mapPartitions(iterate_values)
        bucket_counts = accumulators.reduce(merge_accumulators)
        return bucket_vals, bucket_counts

    def histogram(self,
                  column,
                  title=None,
                  bins=None,
                  sketch=None,
                  xlabel=None, ylabel=None,
                  lower_cutoff=0.0, upper_cutoff=1.0,
                  lower_bound=None, upper_bound=None):
        """
        Plot a histogram.

        All values greater than the cutoff (given as a quantile) are set equal to the cutoff.
        Nothing is plotted, and a warning is logged, if the range of values is empty.

        Parameters
        ----------
        column : XArray
            A column to display.

        title : str, optional
            A plot title.

        bins : int, optional
            The number of bins to use.  Defaults to 50.

        sketch : Sketch, optional
            The column sketch.  If this is available, then it saves time not to recompute it.

        xlabel : str, optional
            A label for the X axis.  Defaults to "Value".

        ylabel : str, optional
            A label for the Y axis.  Defaults to "Count".

        lower_cutoff : float, optional
            This is a quantile value, between 0 and 1.
            Values below this cutoff are placed in the first bin.
            Defaults to 0.

        upper_cutoff : float, optional
            This is a quantile value, between 0 and 1.
            Values above this cutoff are placed in the last bin.
            Defaults to 1.0.

        lower_bound : float, optional
            Values below this bound are placed in the first bin.
            If given, this takes precedence over `lower_cutoff`.

        upper_bound : float, optional
            Values above this bound are placed in the last bin.
            If given, this takes precedence over `upper_cutoff`.

        Raises
        ------
        ValueError
            If either cutoff is outside the range 0.0 to 1.0, or if the lower
            cutoff is not less than the upper cutoff.

        Examples
        --------
        (Need examples)
        """
        if lower_cutoff < 0.0 or lower_cutoff > 1.0:
            raise ValueError('lower cutoff must be between 0.0 and 1.0')
        if upper_cutoff < 0.0 or upper_cutoff > 1.0:
            raise ValueError('upper cutoff must be between 0.0 and 1.0')
        if lower_cutoff >= upper_cutoff:
            raise ValueError('lower cutoff must be less than upper cutoff')
        bins = bins or 50
        sk = sketch or column.sketch_summary()
        # The quantile limits are widened by this small fixed margin.
        q_epsilon = 0.01
        q_lower = None
        q_upper = None
        if lower_cutoff > 0.0:
            q_lower = float(sk.quantile(lower_cutoff)) - q_epsilon
        if upper_cutoff < 1.0:
            q_upper = float(sk.quantile(upper_cutoff)) + q_epsilon
        # Explicit bounds override the limits derived from the quantiles.
        if lower_bound is not None:
            q_lower = lower_bound
        if upper_bound is not None:
            q_upper = upper_bound
        xlabel = xlabel or 'Value'
        ylabel = ylabel or 'Count'
        vals = column.dropna()

        def enforce_lower_cutoff(x):
            return max(x, q_lower)

        def enforce_upper_cutoff(x):
            return min(x, q_upper)

        if q_lower is not None:
            vals = vals.apply(enforce_lower_cutoff)
            hist_min = q_lower
        else:
            hist_min = sk.min()
        if q_upper is not None:
            vals = vals.apply(enforce_upper_cutoff)
            hist_max = q_upper
        else:
            hist_max = sk.max()
        bucket_vals, bucket_counts = self.create_histogram_buckets(vals, bins, hist_min, hist_max)
        if bucket_vals is None:
            # No buckets are produced when the minimum equals the maximum.
            logging.warning('histogram: minimum and maximum are equal; nothing to plot')
            return
        buckets = list(zip(bucket_vals, bucket_counts))
        self.make_bar(buckets, xlabel=xlabel, ylabel=ylabel, title=title)

    @staticmethod
    def _print_field(label, value):
        """Print a label and a value separated by a single space."""
        # A single parenthesized argument prints the same on Python 2 and Python 3.
        print('%s %s' % (label, value))

    def col_info(self, column, column_name=None, table_name=None, title=None, topk=None, bins=None, cutoff=False):
        """
        Print column summary information.

        The number of the most frequent values is shown.
        If the column to summarize is numerical or datetime, then a histogram is also shown.

        Parameters
        ----------
        column : XArray
            The column to summarize.

        column_name : str
            The column name.

        table_name : str, optional
            The table name; used for labeling only.

        title : str, optional
            The plot title.  Defaults to the table name.

        topk : int, optional
            The number of frequent items to show.  Defaults to 15.

        bins : int, optional
            The number of bins in a histogram.

        cutoff : float, optional
            The quantile to use as an upper cutoff, if the plot is a histogram.
            By default no upper cutoff is applied.

        Examples
        --------
        (Need examples)
        """
        title = title or table_name
        column_name = column_name or ''
        table_name = table_name or ''
        col_type = column.dtype()
        self._print_field('Table Name:    ', table_name)
        self._print_field('Column Name:   ', column_name)
        self._print_field('Column Type:   ', col_type.__name__)
        sk = column.sketch_summary()
        self._print_field('Rows:          ', sk.size())
        unique_items = sk.num_unique()
        self._print_field('Unique Items:', unique_items)
        print('Approximate Frequent Items:')
        fi = sk.frequent_items()
        topk = topk or 15
        if len(fi) == 0:
            print('  None')
            top = None
        else:
            sorted_fi = sorted(fi.items(), key=operator.itemgetter(1), reverse=True)
            top = [item for item in sorted_fi[:topk] if item[1] > 1]
            for item in top:
                print('  {:10}  {:10}'.format(item[1], item[0]))

        is_numeric = col_type is int or col_type is float
        is_datetime = col_type is datetime.datetime
        if is_numeric or is_datetime:
            # number or datetime: show a histogram
            self._print_field('Num Undefined:', sk.num_undefined())
            self._print_field('Min:          ', sk.min())
            self._print_field('Max:          ', sk.max())
            if is_numeric:
                self._print_field('Mean:         ', sk.mean())
            if unique_items > 1:
                if is_numeric:
                    self._print_field('StDev:        ', sk.std())
                print('Distribution Plot')
                upper_cutoff = cutoff or 1.0
                self.histogram(column, title=title, bins=bins, sketch=sk, upper_cutoff=upper_cutoff)

        # ordinal: show a bar chart of frequent values
        if top:
            vals = xframes.XArray([item[0] for item in top], dtype=col_type)
            counts = xframes.XArray([item[1] for item in top], dtype=int)
            x_col = 'Count'
            y_col = column_name
            tmp = xframes.XFrame({x_col: counts, y_col: vals})
            # top_values takes the frame as its first argument.
            self.top_values(tmp, x_col, y_col, title=title, k=topk)