"""
This module defines the XArray class which provides the
ability to create, access and manipulate a remote scalable array object.
XArray acts similarly to pandas.Series but without indexing.
The data is immutable, homogeneous, and is stored in a Spark RDD.
"""
import inspect
import math
import time
import array
import warnings
import datetime
from xframes.deps import pandas, HAS_PANDAS
from xframes.deps import HAS_NUMPY
from xframes.xarray_impl import XArrayImpl
from xframes.utils import make_internal_url
from xframes.object_utils import check_input_uri, check_output_uri
from xframes.type_utils import infer_type_of_list, is_numeric_val, classify_auto
import xframes
if HAS_NUMPY:
import numpy
"""
Copyright (c) 2014, Dato, Inc.
All rights reserved.
Copyright (c) 2017, Charles Hayden
All rights reserved.
"""
__all__ = ['XArray']
def _create_sequential_xarray(size, start=0, reverse=False):
if not isinstance(size, int):
raise TypeError('Size must be int.')
if not isinstance(start, int):
raise TypeError('Size must be int.')
if not isinstance(reverse, bool):
raise TypeError('Reverse must me bool.')
return XArray(impl=XArrayImpl.create_sequential_xarray(size, start, reverse))
# noinspection PyUnresolvedReferences,PyRedeclaration
[docs]class XArray(object):
"""
An immutable, homogeneously typed array object backed by Spark RDD.
XArray is able to hold data that are much larger than the machine's main
memory. It fully supports missing values and random access (although random
access is inefficient). The data backing an XArray is located on the cluster
hosting Spark.
"""
[docs] def __init__(self, data=None, dtype=None, ignore_cast_failure=False, impl=None):
"""
Construct a new XArray. The source of data includes: list,
numpy.ndarray, pandas.Series, and urls.
Parameters
----------
data : list | numpy.ndarray | pandas.Series | string
The input data. If this is a list, numpy.ndarray, or pandas.Series,
the data in the list is converted and stored in an XArray.
Alternatively if this is a string, it is interpreted as a path (or
url) to a text file. Each line of the text file is loaded as a
separate row. If `data` is a directory where an XArray was previously
saved, this is loaded as an XArray read directly out of that
directory.
dtype : {int, float, str, list, array.array, dict, datetime.datetime}, optional
The data type of the XArray. If not specified, we attempt to
infer it from the input. If it is a numpy array or a Pandas series, the
data type of the array or series is used. If it is a list, the data type is
inferred from the inner list. If it is a URL or path to a text file, we
default the data type to str.
ignore_cast_failure : bool, optional
If True, ignores casting failures but warns when elements cannot be
cast into the specified data type.
Notes
-----
- If `data` is pandas.Series, the index will be ignored.
The following functionality is currently not implemented:
- numpy.ndarray as row data
- pandas.Series data
- count_words, count_ngrams
- sketch sub_sketch_keys
See Also
--------
xframes.XArray.from_const : Constructs an XArray of a given size with a const value.
xframes.XArray.from_sequence : Constructs an XArray by generating a sequence of consecutive numbers.
xframes.XArray.from_rdd : Create a new XArray from a Spark RDD or Spark DataFrame.
xframes.XArray.set_trace : Controls entry and exit tracing.
xframes.XArray.spark_context : Returns the spark context.
xframes.XArray.spark_sql_context : Returns the spark sql context.
xframes.XArray.hive_context : Returns the spark hive context.
Examples
--------
>>> xa = XArray(data=[1,2,3,4,5], dtype=int)
>>> xa = XArray('s3://testdatasets/a_to_z.txt.gz')
>>> xa = XArray([[1,2,3], [3,4,5]])
>>> xa = XArray(data=[{'a':1, 'b': 2}, {'b':2, 'c': 1}])
>>> xa = XArray(data=[datetime.datetime(2011, 10, 20, 9, 30, 10)])
"""
if dtype is not None and not isinstance(dtype, type):
raise TypeError("Dtype must be a type, e.g. use int rather than 'int'.")
if impl:
self._impl = impl
return
if isinstance(data, XArray):
self._impl = data._impl
return
# we need to perform type inference
dtype = dtype or classify_auto(data)
if data is None:
self._impl = XArrayImpl()
elif HAS_PANDAS and isinstance(data, pandas.Series):
self._impl = XArrayImpl.load_from_iterable(data.values, dtype, ignore_cast_failure)
elif HAS_NUMPY and isinstance(data, numpy.ndarray):
self._impl = XArrayImpl.load_from_iterable(data, dtype, ignore_cast_failure)
elif isinstance(data, (list, array.array)):
self._impl = XArrayImpl.load_from_iterable(data, dtype, ignore_cast_failure)
elif hasattr(data, '__iter__'):
self._impl = XArrayImpl.load_from_iterable(data, dtype, ignore_cast_failure)
elif isinstance(data, str):
internal_url = make_internal_url(data)
check_input_uri(internal_url)
self._impl = XArrayImpl.load_autodetect(internal_url, dtype)
else:
raise TypeError('Unexpected data source: {}. '
"Possible data source types are: 'list', "
"'numpy.ndarray', 'pandas.Series', and 'string(url)'.".format(type(data).__name__))
[docs] def dump_debug_info(self):
"""
Print information about the Spark RDD associated with this XArray.
"""
return self.impl().dump_debug_info()
@classmethod
[docs] def read_text(cls, path, delimiter=None, nrows=None, verbose=False):
"""
Constructs an XArray from a text file or a path to multiple text files.
Parameters
----------
path : string
Location of the text file or directory to load. If 'path' is a directory
or a "glob" pattern, all matching files will be loaded.
delimiter : string, optional
This describes the delimiter used for separating records. Must be a
single character. Defaults to newline.
nrows : int, optional
If set, only this many rows will be read from the file.
verbose : bool, optional
If True, print the progress while reading files.
Returns
-------
:class:`.XArray`
Examples
--------
Read a regular text file, with default options.
>>> path = 'http://s3.amazonaws.com/gl-testdata/rating_data_example.csv'
>>> xa = xframes.XArray.read_text(path)
>>> xa
[25904, 25907, 25923, 25924, 25928, ... ]
Read only the first 100 lines of the text file:
>>> xa = xframes.XArray.read_text(path, nrows=100)
>>> xa
[25904, 25907, 25923, 25924, 25928, ... ]
"""
check_input_uri(path)
url = make_internal_url(path)
return cls(impl=XArrayImpl.read_from_text(url, delimiter=delimiter, nrows=nrows, verbose=verbose))
@classmethod
[docs] def from_const(cls, value, size):
"""
Constructs an XArray of size with a const value.
Parameters
----------
value : [int | float | str | array.array | datetime.datetime | list | dict]
The value to fill the XArray.
size : int
The size of the XArray. Must be positive.
Examples
--------
Construct an XArray consisting of 10 zeroes:
>>> xframes.XArray.from_const(0, 10)
"""
if not isinstance(size, int):
raise TypeError('Size must be a int.')
if size <= 0:
raise ValueError('Size must be positive.')
if not isinstance(value, (int, float, str, array.array, datetime.datetime, list, dict)):
raise TypeError("Cannot create xarray of value type '{}'.".format(type(value).__name__))
return cls(impl=XArrayImpl.load_from_const(value, size))
@classmethod
[docs] def from_sequence(cls, start, stop=None):
"""
Constructs an XArray by generating a sequence of consecutive numbers.
Parameters
----------
start : int
If `stop` is not given, the sequence consists of numbers 0 .. `start`-1.
Otherwise, the sequence starts with `start`.
stop : int, optional
If given, the sequence consists of the numbers `start`, `start`+1 ... `end`-1.
The sequence will not contain this value.
Examples
--------
>>> from_sequence(1000)
Construct an XArray of integer values from 0 to 999
This is equivalent, but more efficient than:
>>> XArray(range(1000))
>>> from_sequence(10, 1000)
Construct an XArray of integer values from 10 to 999
This is equivalent, but more efficient than:
>>> XArray(range(10, 1000))
"""
if not isinstance(start, int) or (stop is not None and not isinstance(stop, int)):
raise TypeError("'Start' and 'stop' must be int.")
if stop is None:
return _create_sequential_xarray(start)
size = stop - start
# this matches the behavior of range
# i.e. range(100,10) just returns an empty array
if size < 0:
size = 0
return _create_sequential_xarray(size, start)
def _get_content_identifier(self):
"""
Returns the unique identifier of the content that backs the XArray
Notes
-----
Meant for internal use only.
"""
return self._impl.get_content_identifier()
# noinspection PyShadowingBuiltins
[docs] def save(self, filename, format=None):
"""
Saves the XArray to file.
The saved XArray will be in a directory named with the `filename`
parameter.
Parameters
----------
filename : string
A local path or a remote URL. If format is 'text', it will be
saved as a text file. If format is 'binary', a directory will be
created at the location which will contain the XArray.
format : {'binary', 'text', 'csv'}, optional
Format in which to save the XFrame. Binary saved XArrays can be
loaded much faster and without any format conversion losses.
The values 'text' and 'csv' are synonymous: Each XArray row will be written
as a single line in an output text file. If not
given, will try to infer the format from filename given. If file
name ends with 'csv', or 'txt', then save as 'csv' format,
otherwise save as 'binary' format.
"""
if format is None:
if filename.endswith('.txt'):
format = 'text'
elif filename.endswith('.csv'):
format = 'csv'
else:
format = 'binary'
url = make_internal_url(filename)
check_output_uri(url)
if format == 'binary':
self._impl.save(url)
elif format == 'text':
self._impl.save_as_text(url)
elif format == 'csv':
self._impl.save_as_csv(url)
[docs] def to_rdd(self, number_of_partitions=4):
"""
Convert the current XArray to the Spark RDD.
Parameters
----------
number_of_partitions: int, optional
The number of partitions to create in the rdd. Defaults to 4.
Returns
-------
out: RDD
The internal RDD used to stores XArray instances.
"""
if not isinstance(number_of_partitions, int):
raise ValueError('Number_of_partitions parameter expects an integer type.')
if number_of_partitions == 0:
raise ValueError('Number_of_partitions can not be initialized to zero.')
return self._impl.to_rdd(number_of_partitions)
@classmethod
[docs] def from_rdd(cls, rdd, dtype, lineage=None):
"""
Convert a Spark RDD into an XArray
Parameters
----------
rdd : pyspark.rdd.RDD
The Spark RDD containing the XArray values.
dtype : type
The values in `rdd` should have the data type `dtype`.
lineage: dict, optional
The lineage to apply to the rdd.
Returns
-------
class:`.XArray`
This incorporates the given RDD.
"""
return cls(impl=XArrayImpl.from_rdd(rdd, dtype, lineage=lineage))
def __repr__(self):
"""
A string description of the XArray.
Returns
-------
str
A string representation of the XArray.
"""
ret = 'dtype: {}\n'.format(self.dtype().__name__)
ret += 'Rows: {}\n'.format(self.size())
ret += str(self)
return ret
def __str__(self):
"""
A string containing the first 100 elements of the array.
Returns
-------
str
Returns a string containing the first 100 elements of the array.
"""
h = self._impl.head_as_list(100)
headln = str(h)
if self.size() > 100:
# cut the last close bracket
# and replace it with ...
headln = headln[0:-1] + ', ... ]'
return headln
def __nonzero__(self):
"""
Returns True if the array is not empty.
"""
return self.size() != 0
def __len__(self):
"""
Returns the length of the array
"""
return self.size()
def __iter__(self):
"""
Provides an iterator to the contents of the array.
"""
def generator():
elems_at_a_time = 262144
self._impl.begin_iterator()
ret = self._impl.iterator_get_next(elems_at_a_time)
while True:
for j in ret:
yield j
if len(ret) == elems_at_a_time:
ret = self._impl.iterator_get_next(elems_at_a_time)
else:
break
return generator()
def __add__(self, other):
"""
If other is a scalar value, adds it to the current array, returning
the new result. If other is an XArray, performs an element-wise
addition of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '+'))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '+'))
def __sub__(self, other):
"""
If other is a scalar value, subtracts it from the current array, returning
the new result. If other is an XArray, performs an element-wise
subtraction of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '-'))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '-'))
def __mul__(self, other):
"""
If other is a scalar value, multiplies it to the current array, returning
the new result. If other is an XArray, performs an element-wise
multiplication of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '*'))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '*'))
def __div__(self, other):
"""
If other is a scalar value, divides each element of the current array
by the value, returning the result. If other is an XArray, performs
an element-wise division of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '/'))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '/'))
def __pow__(self, other):
"""
Oher must be a scalar value, raises to the current array to thet power, returning
the new result.
"""
if is_numeric_val(other):
return XArray(impl=self._impl.left_scalar_operator(other, '**'))
def __lt__(self, other):
"""
If other is a scalar value, compares each element of the current array
by the value, returning the result. If other is an XArray, performs
an element-wise comparison of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '<'))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '<'))
def __gt__(self, other):
"""
If other is a scalar value, compares each element of the current array
by the value, returning the result. If other is an XArray, performs
an element-wise comparison of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '>'))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '>'))
def __le__(self, other):
"""
If other is a scalar value, compares each element of the current array
by the value, returning the result. If other is an XArray, performs
an element-wise comparison of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '<='))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '<='))
def __ge__(self, other):
"""
If other is a scalar value, compares each element of the current array
by the value, returning the result. If other is an XArray, performs
an element-wise comparison of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '>='))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '>='))
def __radd__(self, other):
"""
Adds a scalar value to the current array.
Returned array has the same type as the array on the right hand side
"""
return XArray(impl=self._impl.right_scalar_operator(other, '+'))
def __rsub__(self, other):
"""
Subtracts a scalar value from the current array.
Returned array has the same type as the array on the right hand side
"""
return XArray(impl=self._impl.right_scalar_operator(other, '-'))
def __rmul__(self, other):
"""
Multiplies a scalar value to the current array.
Returned array has the same type as the array on the right hand side
"""
return XArray(impl=self._impl.right_scalar_operator(other, '*'))
def __rdiv__(self, other):
"""
Divides a scalar value by each element in the array
Returned array has the same type as the array on the right hand side
"""
return XArray(impl=self._impl.right_scalar_operator(other, '/'))
def __neg__(self):
"""
Negates each element in the array.
"""
return XArray(impl=self._impl.unary_operator('-'))
def __pos__(self):
"""
Implements the unary plus operator.
"""
return XArray(impl=self._impl.unary_operator('+'))
def __abs__(self):
"""
Takes the absolute value of each element in the array
"""
return XArray(impl=self._impl.unary_operator('abs'))
def __eq__(self, other):
"""
If other is a scalar value, compares each element of the current array
by the value, returning the new result. If other is an XArray, performs
an element-wise comparison of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '=='))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '=='))
def __ne__(self, other):
"""
If other is a scalar value, compares each element of the current array
by the value, returning the new result. If other is an XArray, performs
an element-wise comparison of the two arrays.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '!='))
else:
return XArray(impl=self._impl.left_scalar_operator(other, '!='))
def __and__(self, other):
"""
Perform a logical element-wise 'and' against another XArray.
Note that this is not the "and" operator, which cannot be overridden,
but the "&" operator.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '&'))
else:
raise TypeError('XArray can only perform logical and against another XArray.')
def __or__(self, other):
"""
Perform a logical element-wise 'or' against another XArray.
Note that this is not the "or" operator, which cannot be overridden,
but the "|" operator.
"""
if isinstance(other, XArray):
return XArray(impl=self._impl.vector_operator(other._impl, '|'))
else:
raise TypeError('XArray can only perform logical or against another XArray.')
def __getitem__(self, other):
"""
If the key is an XArray of identical length, this function performs a
logical filter: i.e. it subselects all the elements in this array
where the corresponding value in the other array evaluates to true.
If the key is an integer this returns a single row of
the XArray. If the key is a slice, this returns an XArray with the
sliced rows.
"""
if isinstance(other, XArray):
if len(other) != len(self):
raise IndexError('Cannot perform logical indexing on arrays of different length.')
return XArray(impl=self._impl.logical_filter(other._impl))
elif isinstance(other, int):
if other < 0:
other += len(self)
if other >= len(self):
raise IndexError('XArray index out of range.')
return list(XArray(impl=self._impl.copy_range(other, 1, other + 1)))[0]
elif isinstance(other, slice):
start = other.start
stop = other.stop
step = other.step
if start is None:
start = 0
if stop is None:
stop = len(self)
if step is None:
step = 1
# handle negative indices
if start < 0:
start += len(self)
if stop < 0:
stop += len(self)
return XArray(impl=self._impl.copy_range(start, step, stop))
else:
raise IndexError('Invalid type to use for indexing.')
def _materialize(self):
"""
For a XArray that is lazily evaluated, force persist this xarray
to disk, committing all lazy evaluated operations.
"""
self._impl.materialize()
def _is_materialized(self):
"""
Returns whether or not the xarray has been materialized.
"""
return self._impl.is_materialized()
[docs] def size(self):
"""
The size of the XArray.
"""
return self._impl.size()
[docs] def impl(self):
"""
Get the impl. For internal use.
"""
return self._impl
[docs] def dtype(self):
"""
The data type of the XArray.
Returns
-------
type
The type of the XArray.
Examples
--------
>>> xa = XArray(['The quick brown fox jumps over the lazy dog.'])
>>> xa.dtype()
str
>>> xa = XArray(range(10))
>>> xa.dtype()
int
"""
return self._impl.dtype()
[docs] def lineage(self):
"""
The lineage: the files that went into building this array.
Returns
-------
dict
* key 'table': set[filename]
The files that were used to build the XArray
* key 'column': dict{column_name: set[filename]}
The set of files that were used to build each column
"""
return self._impl.lineage_as_dict()
[docs] def head(self, n=10):
"""
Returns an XArray which contains the first n rows of this XArray.
Parameters
----------
n : int
The number of rows to fetch.
Returns
-------
:class:`.XArray`
A new XArray which contains the first n rows of the current XArray.
Examples
--------
>>> XArray(range(10)).head(5)
dtype: int
Rows: 5
[0, 1, 2, 3, 4]
"""
return XArray(impl=self._impl.head(n))
[docs] def vector_slice(self, start, end=None):
"""
If this XArray contains vectors or recursive types, this returns a new XArray
containing each individual vector sliced, between start and end, exclusive.
Parameters
----------
start : int
The start position of the slice.
end : int, optional.
The end position of the slice. Note that the end position
is NOT included in the slice. Thus a g.vector_slice(1,3) will extract
entries in position 1 and 2.
Returns
-------
:class:`.XArray`
Each individual vector sliced according to the arguments.
Examples
--------
If g is a vector of floats:
>>> g = XArray([[1,2,3],[2,3,4]])
>>> g
dtype: array
Rows: 2
[array('d', [1.0, 2.0, 3.0]), array('d', [2.0, 3.0, 4.0])]
>>> g.vector_slice(0) # extracts the first element of each vector
dtype: float
Rows: 2
[1.0, 2.0]
>>> g.vector_slice(0, 2) # extracts the first two elements of each vector
dtype: array.array
Rows: 2
[array('d', [1.0, 2.0]), array('d', [2.0, 3.0])]
If a vector cannot be sliced, the result will be None:
>>> g = XArray([[1],[1,2],[1,2,3]])
>>> g
dtype: array.array
Rows: 3
[array('d', [1.0]), array('d', [1.0, 2.0]), array('d', [1.0, 2.0, 3.0])]
>>> g.vector_slice(2)
dtype: float
Rows: 3
[None, None, 3.0]
>>> g.vector_slice(0,2)
dtype: list
Rows: 3
[None, array('d', [1.0, 2.0]), array('d', [1.0, 2.0])]
If g is a vector of mixed types (float, int, str, array, list, etc.):
>>> g = XArray([['a',1,1.0],['b',2,2.0]])
>>> g
dtype: list
Rows: 2
[['a', 1, 1.0], ['b', 2, 2.0]]
>>> g.vector_slice(0) # extracts the first element of each vector
dtype: list
Rows: 2
[['a'], ['b']]
"""
if isinstance(self.dtype(), array.array) and not isinstance(self.dtype(), list):
raise RuntimeError("Only 'array' and 'list' type can be sliced.")
if end is None:
end = start + 1
return XArray(impl=self._impl.vector_slice(start, end))
def _count_words(self, to_lower=True):
"""
Count words in the XArray. Return an XArray of dictionary type where
each element contains the word count for each word that appeared in the
corresponding input element. The words are split on all whitespace and
punctuation characters. Only works if this XArray is of string type.
Parameters
----------
to_lower : bool, optional
If True, all words are converted to lower case before counting.
Returns
-------
:class:`.XArray`
The XArray of dictionary type, where each element contains the word
count for each word that appeared in corresponding input element.
See Also
--------
xframes.XArray.count_ngrams
Examples
--------
>>> xa = xframes.XArray(['The quick brown fox jumps.',
"Word word WORD, word!!!word"])
>>> xa.count_words()
dtype: dict
Rows: 2
[{'quick': 1, 'brown': 1, 'jumps': 1, 'fox': 1, 'the': 1}, {'word': 5}]
"""
if not isinstance(self.dtype(), basestring):
raise TypeError('Only XArray of string type is supported for counting bag of words.')
# construct options, will extend over time
options = dict()
options['to_lower'] = True if to_lower else False
return XArray(impl=self._impl.count_bag_of_words(options))
def _count_ngrams(self, n=2, method="word", to_lower=True, ignore_space=True):
"""
Return an XArray of ``dict`` type where each element contains the count
for each of the n-grams that appear in the corresponding input element.
The n-grams can be specified to be either character n-grams or word
n-grams. The input XArray must contain strings.
Parameters
----------
n : int, optional
The number of words in each n-gram. An `n` value of 1 returns word
counts.
method : {'word', 'character'}, optional
If "word", the function performs a count of word n-grams. If
"character", does a character n-gram count.
to_lower : bool, optional
If True, all words are converted to lower case before counting.
ignore_space : bool, optional
If method is "character", indicates if spaces between words are
counted as part of the n-gram. For instance, with the input XArray
element of "fun games", if this parameter is set to False one
tri-gram would be 'n g'. If `ignore_space` is set to True, there
would be no such tri-gram (there would still be 'nga'). This
parameter has no effect if the method is set to "word".
Returns
-------
:class:`.XArray`
An XArray of dictionary type, where each key is the n-gram string
and each value is its count.
See Also
--------
xframes.XArray.count_words
Notes
-----
- Ignoring case (with `to_lower`) involves a full string copy of the
XArray data. To increase speed for large documents, set `to_lower`
to False.
- Punctuation and spaces are both delimiters when counting word n-grams.
When counting character n-grams, punctuation is always ignored.
References
----------
- `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_
Examples
--------
Counting word n-grams:
>>> from xframes import XArray
>>> xa = XArray(['I like big dogs. I LIKE BIG DOGS.'])
>>> xa.count_ngrams(xa, 3)
dtype: dict
Rows: 1
[{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]
Counting character n-grams:
>>> xa = XArray(['Fun. Is. Fun'])
>>> xa.count_ngrams(xa, 3, 'character')
dtype: dict
Rows: 1
{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
"""
if not issubclass(self.dtype(), str):
raise TypeError('Only XArray of string type is supported for counting n-grams.')
if not isinstance(n, int):
raise TypeError("Input 'n' must be of type int.")
if n < 1:
raise ValueError("Input 'n' must be greater than 0.")
if n > 5:
warnings.warn('It is unusual for n-grams to be of size larger than 5.')
# construct options, will extend over time
options = dict()
options['to_lower'] = True if to_lower else False
options['ignore_space'] = True if ignore_space else False
if method == 'word':
return XArray(impl=self._impl.count_ngrams(n, options))
elif method == 'character':
return XArray(impl=self._impl.count_character_ngrams(n, options))
else:
raise ValueError("Invalid 'method' input value. Please input either " +
"'word' or 'character' ")
[docs] def dict_trim_by_keys(self, keys, exclude=True):
"""
Filter an XArray of dictionary type by the given keys. By default, all
keys that are in the provided list in `keys` are *excluded* from the
returned XArray.
Parameters
----------
keys : list
A collection of keys to trim down the elements in the XArray.
exclude : bool, optional
If True, all keys that are in the input key list are removed. If
False, only keys that are in the input key list are retained.
Returns
-------
:class:`.XArray`
A XArray of dictionary type, with each dictionary element trimmed
according to the input criteria.
See Also
--------
xframes.XArray.dict_trim_by_values
Examples
--------
>>> xa = xframes.XArray([{"this":1, "is":1, "dog":2},
{"this": 2, "are": 2, "cat": 1}])
>>> xa.dict_trim_by_keys(["this", "is", "and", "are"], exclude=True)
dtype: dict
Rows: 2
[{'dog': 2}, {'cat': 1}]
"""
if isinstance(keys, str) or (not hasattr(keys, "__iter__")):
keys = [keys]
return XArray(impl=self._impl.dict_trim_by_keys(keys, exclude))
[docs] def dict_trim_by_values(self, lower=None, upper=None):
"""
Filter dictionary values to a given range (inclusive). Trimming is only
performed on values which can be compared to the bound values. Fails on
XArrays whose data type is not ``dict``.
Parameters
----------
lower : int or long or float, optional
The lowest dictionary value that would be retained in the result. If
not given, lower bound is not applied.
upper : int or long or float, optional
The highest dictionary value that would be retained in the result.
If not given, upper bound is not applied.
Returns
-------
:class:`.XArray`
An XArray of dictionary type, with each dict element trimmed
according to the input criteria.
See Also
--------
xframes.XArray.dict_trim_by_keys
Examples
--------
>>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
{"this": 2, "are": 1, "cat": 5}])
>>> xa.dict_trim_by_values(2,5)
dtype: dict
Rows: 2
[{'is': 5}, {'this': 2, 'cat': 5}]
>>> xa.dict_trim_by_values(upper=5)
dtype: dict
Rows: 2
[{'this': 1, 'is': 5}, {'this': 2, 'are': 1, 'cat': 5}]
"""
if lower is not None and not is_numeric_val(lower):
raise TypeError('Lower bound has to be a numeric value.')
if upper is not None and not is_numeric_val(upper):
raise TypeError('Upper bound has to be a numeric value.')
return XArray(impl=self._impl.dict_trim_by_values(lower, upper))
[docs] def dict_keys(self):
"""
Create an XArray that contains all the keys from each dictionary
element as a list. Fails on XArrays whose data type is not ``dict``.
Returns
-------
:class:`.XArray`
A XArray of list type, where each element is a list of keys
from the input XArray element.
See Also
--------
xframes.XArray.dict_values
Examples
---------
>>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
{"this": 2, "are": 1, "cat": 5}])
>>> xa.dict_keys()
dtype: list
Rows: 2
[['this', 'is', 'dog'], ['this', 'are', 'cat']]
"""
return xframes.XFrame(impl=self._impl.dict_keys())
[docs] def dict_values(self):
"""
Create an XArray that contains all the values from each dictionary
element as a list. Fails on XArrays whose data type is not ``dict``.
Returns
-------
:class:`.XArray`
A XArray of list type, where each element is a list of values
from the input XArray element.
See Also
--------
xframes.XArray.dict_keys
Examples
--------
>>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
{"this": 2, "are": 1, "cat": 5}])
>>> xa.dict_values()
dtype: list
Rows: 2
[[1, 5, 7], [2, 1, 5]]
"""
return xframes.XFrame(impl=self._impl.dict_values())
[docs] def dict_has_any_keys(self, keys):
"""
Create a boolean XArray by checking the keys of an XArray of
dictionaries. An element of the output XArray is True if the
corresponding input element's dictionary has any of the given keys.
Fails on XArrays whose data type is not ``dict``.
Parameters
----------
keys : list
A list of key values to check each dictionary against.
Returns
-------
:class:`.XArray`
A XArray of int type, where each element indicates whether the
input XArray element contains any key in the input list.
See Also
--------
xframes.XArray.dict_has_all_keys
Examples
--------
>>> xa = xframes.XArray([{"this":1, "is":5, "dog":7}, {"animal":1},
{"this": 2, "are": 1, "cat": 5}])
>>> xa.dict_has_any_keys(["is", "this", "are"])
dtype: int
Rows: 3
[1, 1, 0]
"""
if isinstance(keys, str) or not hasattr(keys, "__iter__"):
keys = [keys]
return XArray(impl=self._impl.dict_has_any_keys(keys))
[docs] def dict_has_all_keys(self, keys):
"""
Create a boolean XArray by checking the keys of an XArray of
dictionaries.
An element of the output XArray is True if the
corresponding input element's dictionary has all of the given keys.
Fails on XArrays whose data type is not ``dict``.
Parameters
----------
keys : list
A list of key values to check each dictionary against.
Returns
-------
:class:`.XArray`
An XArray of int type, where each element indicates whether the
input XArray element contains all keys in the input list.
See Also
--------
xframes.XArray.dict_has_any_keys
Examples
--------
>>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
{"this": 2, "are": 1, "cat": 5}])
>>> xa.dict_has_all_keys(["is", "this"])
dtype: int
Rows: 2
[1, 0]
"""
if isinstance(keys, str) or (not hasattr(keys, "__iter__")):
keys = [keys]
return XArray(impl=self._impl.dict_has_all_keys(keys))
[docs] def apply(self, fn, dtype=None, skip_undefined=True, seed=None):
"""
Transform each element of the XArray by a given function.
The result XArray is of type `dtype`. `fn` should be a function that returns
exactly one value which can be cast into the type specified by
`dtype`. If `dtype` is not specified, the first 100 elements of the
XArray are used to make a guess about the data type.
Parameters
----------
fn : function
The function to transform each element. Must return exactly one
value which can be cast into the type specified by `dtype`.
dtype : {int, float, str, list, array.array, dict}, optional
The data type of the new XArray. If not supplied, the first 100 elements
of the array are used to guess the target data type.
skip_undefined : bool, optional
If True, will not apply `fn` to any missing values.
seed : int, optional
Used as the seed if a random number generator is included in `fn`.
Returns
-------
:class:`.XArray`
The XArray transformed by `fn`. Each element of the XArray is of
type `dtype`.
See Also
--------
xframes.XFrame.apply
Applies a function to a column of an XFrame. Note that the functions differ in these
two cases: on an XArray the function receives one value, on an XFrame it receives a dict of the
column name/value pairs.
Examples
--------
>>> xa = xframes.XArray([1,2,3])
>>> xa.apply(lambda x: x*2)
dtype: int
Rows: 3
[2, 4, 6]
"""
if not inspect.isfunction(fn):
raise TypeError('Input must be a function.')
if dtype is None:
h = self._impl.head_as_list(100)
dryrun = [fn(i) for i in h if i is not None]
dtype = infer_type_of_list(dryrun)
if not seed:
seed = time.time()
return XArray(impl=self._impl.transform(fn, dtype, skip_undefined, seed))
[docs] def flat_map(self, fn=None, dtype=None, skip_undefined=True, seed=None):
"""
Transform each element of the XArray by a given function, which must return
a list.
Each item in the result XArray is made up of a list element.
The result XArray is of type `dtype`. `fn` should be a function that returns
a list of values which can be cast into the type specified by
`dtype`. If `dtype` is not specified, the first 100 elements of the
XArray are used to make a guess about the data type.
Parameters
----------
fn : function
The function to transform each element. Must return a list of
values which can be cast into the type specified by `dtype`.
dtype : {None, int, float, str, list, array.array, dict}, optional
The data type of the new XArray. If None, the first 100 elements
of the array are used to guess the target data type.
skip_undefined : bool, optional
If True, will not apply `fn` to any undefined values.
seed : int, optional
Used as the seed if a random number generator is included in `fn`.
Returns
-------
:class:`.XArray`
The XArray transformed by `fn` and flattened. Each element of the XArray is of
type `dtype`.
See Also
--------
xframes.XFrame.flat_map
Examples
--------
>>> xa = xframes.XArray([[1], [1, 2], [1, 2, 3]])
>>> xa.apply(lambda x: x*2)
dtype: int
Rows: 3
[2, 2, 4, 2, 4, 6]
"""
if fn is None:
def fn(x):
return x
if not inspect.isfunction(fn):
raise TypeError('Input must be a function.')
if dtype is None:
h = self._impl.head_as_list(100)
dryrun = [fn(i) for i in h if i is not None]
dryrun = [item for lst in dryrun for item in lst]
dtype = infer_type_of_list(dryrun)
if not seed:
seed = time.time()
return XArray(impl=self._impl.flat_map(fn, dtype, skip_undefined, seed))
[docs] def filter(self, fn, skip_undefined=True, seed=None):
"""
Filter this XArray by a function.
Returns a new XArray filtered by a function. If `fn` evaluates an
element to true, this element is copied to the new XArray. If not, it
isn't. Throws an exception if the return type of `fn` is not castable
to a boolean value.
Parameters
----------
fn : function
Function that filters the XArray. Must evaluate to bool or int.
skip_undefined : bool, optional
If True, will not apply fn to any undefined values.
seed : int, optional
Used as the seed if a random number generator is included in fn.
Returns
-------
:class:`.XArray`
The XArray filtered by fn. Each element of the XArray is of
type int.
Examples
--------
>>> xa = xframes.XArray([1,2,3])
>>> xa.filter(lambda x: x < 3)
dtype: int
Rows: 2
[1, 2]
"""
if not inspect.isfunction(fn):
raise TypeError('Input must be a function.')
if not seed:
seed = time.time()
return XArray(impl=self._impl.filter(fn, skip_undefined, seed))
[docs] def sample(self, fraction, max_partitions=None, seed=None):
"""
Create an XArray which contains a subsample of the current XArray.
Parameters
----------
fraction : float
The fraction of the rows to fetch. Must be between 0 and 1.
max_partitions : int, optional
After sampling, coalesce to this number of partition. If not given,
do not perform this step.
seed : int
The random seed for the random number generator.
Returns
-------
:class:`.XArray`
The new XArray which contains the subsampled rows.
Examples
--------
>>> xa = xframes.XArray(range(10))
>>> xa.sample(.3)
dtype: int
Rows: 3
[2, 6, 9]
"""
if fraction > 1 or fraction < 0:
raise ValueError('Invalid sampling rate: {}.'.format(fraction))
if self.size() == 0:
return XArray()
if not seed:
seed = int(time.time())
return XArray(impl=self._impl.sample(fraction, max_partitions, seed))
def _save_as_text(self, url):
"""
Save the XArray to disk as text file.
"""
raise NotImplementedError
[docs] def all(self):
"""
Return True if every element of the XArray evaluates to True.
For numeric XArrays zeros and missing values (None) evaluate to False,
while all non-zero, non-missing values evaluate to True. For string,
list, and dictionary XArrays, empty values (zero length strings, lists
or dictionaries) or missing values (None) evaluate to False. All
other values evaluate to True.
Returns True on an empty XArray.
Returns
-------
bool
See Also
--------
xframes.XArray.any
Examples
--------
>>> xframes.XArray([1, None]).all()
False
>>> xframes.XArray([1, 0]).all()
False
>>> xframes.XArray([1, 2]).all()
True
>>> xframes.XArray(["hello", "world"]).all()
True
>>> xframes.XArray(["hello", ""]).all()
False
>>> xframes.XArray([]).all()
True
"""
return self._impl.all()
[docs] def any(self):
"""
Return True if any element of the XArray evaluates to True.
For numeric XArrays any non-zero value evaluates to True. For string, list, and
dictionary XArrays, any element of non-zero length evaluates to True.
Returns False on an empty XArray.
Returns
-------
bool
See Also
--------
xframes.XArray.all
Examples
--------
>>> xframes.XArray([1, None]).any()
True
>>> xframes.XArray([1, 0]).any()
True
>>> xframes.XArray([0, 0]).any()
False
>>> xframes.XArray(["hello", "world"]).any()
True
>>> xframes.XArray(["hello", ""]).any()
True
>>> xframes.XArray(["", ""]).any()
False
>>> xframes.XArray([]).any()
False
"""
return self._impl.any()
[docs] def max(self):
"""
Get maximum numeric value in XArray.
Returns None on an empty XArray. Raises an exception if called on an
XArray with non-numeric type.
Returns
-------
type of XArray
Maximum value of XArray
See Also
--------
xframes.XArray.min
Examples
--------
>>> xframes.XArray([14, 62, 83, 72, 77, 96, 5, 25, 69, 66]).max()
96
"""
return self._impl.max()
[docs] def min(self):
"""
Get minimum numeric value in XArray.
Returns None on an empty XArray. Raises an exception if called on an
XArray with non-numeric type.
Returns
-------
type of XArray
Minimum value of XArray
See Also
--------
xframes.XArray.max
Examples
--------
>>> xframes.XArray([14, 62, 83, 72, 77, 96, 5, 25, 69, 66]).min()
"""
return self._impl.min()
[docs] def sum(self):
"""
Sum of all values in this XArray.
Raises an exception if called on an XArray of strings.
If the XArray contains numeric arrays (list or array.array) and
all the lists or arrays are the same length, the sum over all the arrays will be
returned.
If the XArray contains dictionaries whose values are numeric, then the sum of values whose
keys appear in every row.
Returns None on an empty XArray. For large values, this may
overflow without warning.
Returns
-------
type of XArray
Sum of all values in XArray
"""
return self._impl.sum()
[docs] def mean(self):
"""
Mean of all the values in the XArray.
Returns None on an empty XArray. Raises an exception if called on an
XArray with non-numeric type.
Returns
-------
float
Mean of all values in XArray.
"""
return self._impl.mean()
[docs] def std(self, ddof=0):
"""
Standard deviation of all the values in the XArray.
Returns None on an empty XArray. Raises an exception if called on an
XArray with non-numeric type or if `ddof` >= length of XArray.
Parameters
----------
ddof : int, optional
"delta degrees of freedom" in the variance calculation.
Returns
-------
float
The standard deviation of all the values.
"""
return self._impl.std(ddof)
[docs] def var(self, ddof=0):
"""
Variance of all the values in the XArray.
Returns None on an empty XArray. Raises an exception if called on an
XArray with non-numeric type or if `ddof` >= length of XArray.
Parameters
----------
ddof : int, optional
"delta degrees of freedom" in the variance calculation.
Returns
-------
float
Variance of all values in XArray.
"""
return self._impl.var(ddof)
[docs] def num_missing(self):
"""
Number of missing elements in the XArray.
Returns
-------
int
Number of missing values.
"""
return self._impl.num_missing()
[docs] def nnz(self):
"""
Number of non-zero elements in the XArray.
Returns
-------
int
Number of non-zero elements.
"""
return self._impl.nnz()
[docs] def datetime_to_str(self, str_format='%Y-%m-%dT%H:%M:%S%ZP'):
"""
Create a new XArray with all the values cast to str. The string format is
specified by the 'str_format' parameter.
Parameters
----------
str_format : str
The format to output the string. Default format is "%Y-%m-%dT%H:%M:%S%ZP".
Returns
-------
:class:`.XArray` of str
The XArray converted to the type 'str'.
Examples
--------
>>> dt = datetime.datetime(2011, 10, 20, 9, 30, 10, tzinfo=GMT(-5))
>>> xa = xframes.XArray([dt])
>>> xa.datetime_to_str('%e %b %Y %T %ZP')
dtype: str
Rows: 1
[20 Oct 2011 09:30:10 GMT-05:00]
See Also
----------
xframes.XArray.str_to_datetime
"""
if not issubclass(self.dtype(), datetime.datetime):
raise TypeError('Datetime_to_str expects XArray of datetime as input XArray.')
return XArray(impl=self._impl.datetime_to_str(str_format))
[docs] def str_to_datetime(self, str_format=None):
"""
Create a new XArray whose column type is datetime. The string format is
specified by the 'str_format' parameter.
Parameters
----------
str_format : str, optional
The string format of the input XArray.
If not given, dateutil parser is used.
Returns
-------
:class:`.XArray` of datetime.datetime
The XArray converted to the type 'datetime'.
Examples
--------
>>> xa = xframes.XArray(['20-Oct-2011 09:30:10 GMT-05:30'])
>>> xa.str_to_datetime('%d-%b-%Y %H:%M:%S %ZP')
dtype: datetime.datetime
Rows: 1
datetime.datetime(2011, 10, 20, 9, 30, 10)
>>> xa = xframes.XArray(['Aug 23, 2015'])
>>> xa.str_to_datetime()
dtype: datetime.datetime
Rows: 1
datetime.datetime(2015, 8, 23, 0, 0, 0)
See Also
----------
xframes.XArray.datetime_to_str
"""
if not issubclass(self.dtype(), basestring):
raise TypeError("'Str_to_datetime' expects XArray of str as input XArray.")
return XArray(impl=self._impl.str_to_datetime(str_format))
[docs] def astype(self, dtype, undefined_on_failure=False):
"""
Create a new XArray with all values cast to the given type. Throws an
exception if the types are not castable to the given type.
Parameters
----------
dtype : {int, float, str, list, array.array, dict, datetime.datetime}
The type to cast the elements to in XArray
undefined_on_failure: bool, optional
If set to True, runtime cast failures will be emitted as missing
values rather than failing.
Returns
-------
:class:`.XArray` of dtype
The XArray converted to the type `dtype`.
Notes
-----
- The string parsing techniques used to handle conversion to dictionary
and list types are quite generic and permit a variety of interesting
formats to be interpreted. For instance, a JSON string can usually be
interpreted as a list or a dictionary type. See the examples below.
- For datetime-to-string and string-to-datetime conversions,
use xa.datetime_to_str() and xa.str_to_datetime() functions.
Examples
--------
>>> xa = xframes.XArray(['1','2','3','4'])
>>> xa.astype(int)
dtype: int
Rows: 4
[1, 2, 3, 4]
Given an XArray of strings that look like dicts, convert to a dictionary
type:
>>> xa = xframes.XArray(['{1:2 3:4}', '{a:b c:d}'])
>>> xa.astype(dict)
dtype: dict
Rows: 2
[{1: 2, 3: 4}, {'a': 'b', 'c': 'd'}]
"""
return XArray(impl=self._impl.astype(dtype, undefined_on_failure))
[docs] def clip(self, lower=None, upper=None):
"""
Create a new XArray with each value clipped to be within the given
bounds.
In this case, "clipped" means that values below the lower bound will be
set to the lower bound value. Values above the upper bound will be set
to the upper bound value. This function can operate on XArrays of
numeric type as well as array type, in which case each individual
element in each array is clipped. By default `lower` and `upper` are
set to ``None`` which indicates the respective bound should be
ignored. The method fails if invoked on an XArray of non-numeric type.
Parameters
----------
lower : int, optional
The lower bound used to clip. Ignored if equal to ``None``
(the default).
upper : int, optional
The upper bound used to clip. Ignored if equal to ``None``
(the default).
Returns
-------
:class:`.XArray`
See Also
--------
xframes.XArray.clip_lower
xframes.XArray.clip_upper
Examples
--------
>>> xa = xframes.XArray([1,2,3])
>>> xa.clip(2,2)
dtype: int
Rows: 3
[2, 2, 2]
"""
return XArray(impl=self._impl.clip(lower, upper))
[docs] def clip_lower(self, threshold):
"""
Create new XArray with all values clipped to the given lower bound. This
function can operate on numeric arrays, as well as vector arrays, in
which case each individual element in each vector is clipped. Throws an
exception if the XArray is empty or the types are non-numeric.
Parameters
----------
threshold : float
The lower bound used to clip values.
Returns
-------
:class:`.XArray`
See Also
--------
xframes.XArray.clip
xframes.XArray.clip_upper
Examples
--------
>>> xa = xframes.XArray([1,2,3])
>>> xa.clip_lower(2)
dtype: int
Rows: 3
[2, 2, 3]
"""
return XArray(impl=self._impl.clip(threshold, None))
[docs] def clip_upper(self, threshold):
"""
Create new XArray with all values clipped to the given upper bound. This
function can operate on numeric arrays, as well as vector arrays, in
which case each individual element in each vector is clipped.
Parameters
----------
threshold : float
The upper bound used to clip values.
Returns
-------
:class:`.XArray`
See Also
--------
xframes.XArray.clip
xframes.XArray.clip_lower
Examples
--------
>>> xa = xframes.XArray([1,2,3])
>>> xa.clip_upper(2)
dtype: int
Rows: 3
[1, 2, 2]
"""
return XArray(impl=self._impl.clip(None, threshold))
[docs] def tail(self, n=10):
"""
Creates an XArray that contains the last n elements in the given XArray.
Parameters
----------
n : int
The number of elements.
Returns
-------
:class:`.XArray`
A new XArray which contains the last n rows of the current XArray.
"""
return XArray(impl=self._impl.tail(n))
[docs] def countna(self):
"""
Count the number of missing values in the XArray.
A missing value is represented in a float XArray as 'NaN' or None. A missing value in other types of
XArrays is None.
Returns
-------
int
The count of missing values.
"""
return self._impl.count_missing_values()
[docs] def dropna(self):
"""
Create new XArray containing only the non-missing values of the
XArray.
A missing value is represented in a float XArray as 'NaN' on None. A missing value in other types of
XArrays is None.
Returns
-------
:class:`.XArray`
The new XArray with missing values removed.
"""
return XArray(impl=self._impl.drop_missing_values())
[docs] def fillna(self, value):
"""
Create new XArray with all missing values (None or NaN) filled in
with the given value.
The size of the new XArray will be the same as the original XArray. If
the given value is not the same type as the values in the XArray,
`fillna` will attempt to convert the value to the original XArray's
type. If this fails, an error will be raised.
Parameters
----------
value : type convertible to XArray's type
The value used to replace all missing values.
Returns
-------
:class:`.XArray`
A new XArray with all missing values filled.
"""
return XArray(impl=self._impl.fill_missing_values(value))
[docs] def topk_index(self, topk=10, reverse=False):
"""
Create an XArray indicating which elements are in the top k.
Entries are '1' if the corresponding element in the current XArray is a
part of the top k elements, and '0' if that corresponding element is
not. Order is descending by default.
Parameters
----------
topk : int
The number of elements to determine if 'top'
reverse: bool
If True, return the topk elements in ascending order
Returns
-------
:class:`.XArray` of int
Notes
-----
This is used internally by XFrame's topk function.
"""
if not isinstance(topk, int):
raise TypeError("'Topk_index': topk must be an integer ({})".format(topk))
return XArray(impl=self._impl.topk_index(topk, reverse))
[docs] def sketch_summary(self, sub_sketch_keys=None):
"""
Summary statistics that can be calculated with one pass over the XArray.
Returns a :class:`~xframes.Sketch` object which can be further queried for many
descriptive statistics over this XArray. Many of the statistics are
approximate. See the :class:`~xframes.Sketch` documentation for more
detail.
Parameters
----------
sub_sketch_keys: int | str | list of int | list of str, optional
For XArray of dict type, also constructs sketches for a given set of keys,
For XArray of array type, also constructs sketches for the given indexes.
The sub sketches may be queried using: :py:func:`~xframes.Sketch.element_sub_sketch()`
Defaults to None in which case no subsketches will be constructed.
Returns
-------
:class:`.Sketch`
Sketch object that contains descriptive statistics for this XArray.
Many of the statistics are approximate.
"""
from xframes.sketch import Sketch
if sub_sketch_keys is not None:
if not issubclass(self.dtype(), (dict, array.array)):
raise TypeError("'Sub_sketch'_keys is only supported for " +
'XArray of dictionary or array type')
if not hasattr(sub_sketch_keys, "__iter__"):
sub_sketch_keys = [sub_sketch_keys]
value_types = set([type(i) for i in sub_sketch_keys])
if len(value_types) != 1:
raise ValueError("'Sub_sketch_keys' member values need to have the same type.")
value_type = value_types.pop()
if issubclass(self.dtype(), dict) and not isinstance(value_type, basestring):
raise TypeError("Only string value(s) can be passed to 'sub_sketch_keys' " +
'for XArray of dictionary type. ' +
'For dictionary types, sketch summary is ' +
'computed by casting keys to string values.')
if issubclass(self.dtype(), array.array) and not isinstance(value_type, int):
raise TypeError("Only int value(s) can be passed to 'sub_sketch_keys' " +
'for XArray of array type')
return Sketch(self, sub_sketch_keys=sub_sketch_keys)
[docs] def append(self, other):
"""
Append an XArray to the current XArray. Creates a new XArray with the
rows from both XArrays. Both XArrays must be of the same data type.
Parameters
----------
other : :class:`.XArray`
Another XArray whose rows are appended to current XArray.
Returns
-------
:class:`.XArray`
A new XArray that contains rows from both XArrays, with rows from
the other XArray coming after all rows from the current XArray.
See Also
--------
xframes.XFrame.append
Appends XFrames
Examples
--------
>>> xa = xframes.XArray([1, 2, 3])
>>> xa2 = xframes.XArray([4, 5, 6])
>>> xa.append(xa2)
dtype: int
Rows: 6
[1, 2, 3, 4, 5, 6]
"""
if not isinstance(other, XArray):
raise RuntimeError('XArray append can only work with XArray.')
if self.dtype() is not other.dtype():
raise RuntimeError('Data types in both XArrays have to be the same.')
return XArray(impl=self._impl.append(other.impl()))
[docs] def unique(self):
"""
Get all unique values in the current XArray.
Will not necessarily preserve the order of the given XArray in the new XArray.
Raises a TypeError if the XArray is of dictionary type.
Returns
-------
:class:`.XArray`
A new XArray that contains the unique values of the current XArray.
See Also
--------
xframes.XFrame.unique
Unique rows in XFrames.
"""
return XArray(impl=self._impl.unique())
[docs] def item_length(self):
"""
Length of each element in the current XArray.
Only works on XArrays of string, dict, array, or list type. If a given element
is a missing value, then the output elements is also a missing value.
This function is equivalent to the following but more performant:
xa_item_len = xa.apply(lambda x: len(x) if x is not None else None)
Returns
-------
:class:`.XArray`
A new XArray, each element in the XArray is the len of the corresponding
items in original XArray.
Examples
--------
>>> xa = XArray([
... {"is_restaurant": 1, "is_electronics": 0},
... {"is_restaurant": 1, "is_retail": 1, "is_electronics": 0},
... {"is_restaurant": 0, "is_retail": 1, "is_electronics": 0},
... {"is_restaurant": 0},
... {"is_restaurant": 1, "is_electronics": 1},
... None])
>>> xa.item_length()
dtype: int
Rows: 6
[2, 3, 3, 1, 2, None]
"""
if not issubclass(self.dtype(), (str, list, dict, array.array)):
raise TypeError("Item_length() is only applicable for XArray of type 'str', 'list', " +
"'dict' and 'array'.")
return XArray(impl=self._impl.item_length())
[docs] def split_datetime(self, column_name_prefix='X', limit=None):
"""
Splits an XArray of datetime type to multiple columns, return a
new XFrame that contains expanded columns. A XArray of datetime will be
split by default into an XFrame of 6 columns, one for each
year/month/day/hour/minute/second element.
column naming:
When splitting a XArray of datetime type, new columns are named:
prefix.year, prefix.month, etc. The prefix is set by the parameter
"column_name_prefix" and defaults to 'X'. If column_name_prefix is
None or empty, then no prefix is used.
Parameters
----------
column_name_prefix: str, optional
If provided, expanded column names would start with the given prefix.
Defaults to "X".
limit: str, list[str], optional
Limits the set of datetime elements to expand.
Elements may be 'year','month','day','hour','minute',
and 'second'.
Returns
-------
:class:`.XFrame`
A new XFrame that contains all expanded columns
Examples
--------
To expand only day and year elements of a datetime XArray
>>> xa = XArray(
[datetime.datetime(2011, 1, 21, 7, 7, 21),
datetime.datetime(2010, 2, 5, 7, 8, 21])
>>> xa.split_datetime(column_name_prefix=None,limit=['day','year'])
Columns:
day int
year int
Rows: 2
Data:
+-------+--------+
| day | year |
+-------+--------+
| 21 | 2011 |
| 5 | 2010 |
+-------+--------+
[2 rows x 2 columns]
"""
if not issubclass(self.dtype(), datetime.datetime):
raise TypeError('Only column of datetime type can be split.')
if column_name_prefix is None:
column_name_prefix = ''
if not isinstance(column_name_prefix, str):
raise TypeError("'Column_name_prefix' must be a string.")
# convert limit to column_keys
if limit is not None:
if isinstance(limit, str):
limit = [limit]
if not hasattr(limit, '__iter__'):
raise TypeError("'Limit' must be a list.")
for lim in limit:
if not isinstance(lim, str):
raise TypeError("'Limit' must contain string values.")
for item in limit:
if item not in ['year', 'month', 'day', 'hour', 'minute', 'second']:
raise ValueError("'Limit' values may be 'year', 'month', 'day', 'hour', 'minute', or 'second': {}"
.format(item))
if limit is not None:
column_types = list()
for _ in limit:
column_types.append(int)
else:
limit = ['year', 'month', 'day', 'hour', 'minute', 'second']
column_types = [int, int, int, int, int, int]
return xframes.XFrame(impl=self._impl.split_datetime(column_name_prefix, limit, column_types))
# noinspection PyTypeChecker
[docs] def unpack(self, column_name_prefix='X', column_types=None, na_value=None, limit=None):
"""
Convert an XFrame of list, array, or dict type to an XFrame with
multiple columns.
`unpack` expands an XArray using the values of each list/array/dict as
elements in a new XFrame of multiple columns. For example, an XArray of
lists each of length 4 will be expanded into an XFrame of 4 columns,
one for each list element. An XArray of lists/tuples/arrays of varying size
will be expand to a number of columns equal to the longest list/array.
An XArray of dictionaries will be expanded into as many columns as
there are keys.
When unpacking an XArray of list or array type, new columns are named:
`column_name_prefix`.0, `column_name_prefix`.1, etc. If unpacking a
column of dict type, unpacked columns are named
`column_name_prefix`.key1, `column_name_prefix`.key2, etc.
When unpacking an XArray of list or dictionary types, missing values in
the original element remain as missing values in the resultant columns.
If the `na_value` parameter is specified, all values equal to this
given value are also replaced with missing values. In an XArray of
array.array type, NaN is interpreted as a missing value.
:py:func:`xframes.XFrame.pack_columns()` is the reverse effect of unpack
Parameters
----------
column_name_prefix: str, optional
If provided, unpacked column names would start with the given prefix.
column_types: list[type], optional
Column types for the unpacked columns. If not provided, column
types are automatically inferred from first 100 rows. Defaults to
None.
na_value: optional
Convert all values that are equal to `na_value` to
missing value if specified.
limit: list, optional
Limits the set of list/array/dict keys to unpack.
For list/array XArrays, 'limit' must contain integer indices.
For dict XArray, 'limit' must contain dictionary keys.
Returns
-------
:class:`.XFrame`
A new XFrame that contains all unpacked columns
Examples
--------
To unpack a dict XArray
>>> xa = XArray([{ 'word': 'a', 'count': 1},
... { 'word': 'cat', 'count': 2},
... { 'word': 'is', 'count': 3},
... { 'word': 'coming','count': 4}])
Normal case of unpacking XArray of type dict:
>>> xa.unpack(column_name_prefix=None)
Columns:
count int
word str
<BLANKLINE>
Rows: 4
<BLANKLINE>
Data:
+-------+--------+
| count | word |
+-------+--------+
| 1 | a |
| 2 | cat |
| 3 | is |
| 4 | coming |
+-------+--------+
[4 rows x 2 columns]
<BLANKLINE>
Unpack only keys with 'word':
>>> xa.unpack(limit=['word'])
Columns:
X.word str
<BLANKLINE>
Rows: 4
<BLANKLINE>
Data:
+--------+
| X.word |
+--------+
| a |
| cat |
| is |
| coming |
+--------+
[4 rows x 1 columns]
<BLANKLINE>
>>> xa2 = XArray([
... [1, 0, 1],
... [1, 1, 1],
... [0, 1]])
Convert all zeros to missing values:
>>> xa2.unpack(column_types=[int, int, int], na_value=0)
Columns:
X.0 int
X.1 int
X.2 int
<BLANKLINE>
Rows: 3
<BLANKLINE>
Data:
+------+------+------+
| X.0 | X.1 | X.2 |
+------+------+------+
| 1 | None | 1 |
| 1 | 1 | 1 |
| None | 1 | None |
+------+------+------+
[3 rows x 3 columns]
<BLANKLINE>
"""
def is_missing(val):
if val is None:
return True
if isinstance(val, float) and math.isnan(val):
return True
return False
def type_from_typecode(typecode):
if typecode in 'cbBuhHiIlL':
return int
if typecode in 'fd':
return float
return None
# noinspection PyShadowingNames
def make_column_types(head_rows, keys):
column_types = {}
for row in head_rows:
for key in row.keys():
val = row[key]
if key not in column_types and not is_missing(val):
column_types[key] = type(val)
return [column_types[key] for key in keys]
if not issubclass(self.dtype(), (dict, array.array, list, tuple)):
raise TypeError('Only XArray of dict/list/tuple/array type supports unpack: {}.'.format(
self.dtype().__name__))
if column_name_prefix is None:
column_name_prefix = ""
if not isinstance(column_name_prefix, str):
raise TypeError("'Column_name_prefix' must be a string.")
# validdate 'limit'
if limit is not None:
if not hasattr(limit, '__iter__'):
raise TypeError("'Limit' must be a list.")
name_types = set([type(i) for i in limit])
if len(name_types) != 1:
raise TypeError("'Limit' contains values that are different types.")
# limit value should be numeric if unpacking xarray.array value
if not issubclass(self.dtype(), dict) and not issubclass(name_types.pop(), int):
raise TypeError("'Limit' must contain integer values.")
if len(set(limit)) != len(limit):
raise ValueError("'Limit' contains duplicate values.")
if column_types is not None:
if not hasattr(column_types, '__iter__'):
raise TypeError("'column_types' must be a list.")
for column_type in column_types:
if column_type not in (int, float, str, list, dict, array.array):
raise TypeError("'Column_types' contains unsupported types. " +
"Supported types are ['float', 'int', 'list', " +
"'dict', 'str', 'array.array'].")
if limit is not None:
if len(limit) != len(column_types):
raise ValueError("'Limit' and 'column_types' do not have the same length.")
elif issubclass(self.dtype(), dict):
raise ValueError("If 'column_types' is given, " +
"'limit' has to be provided to unpack dict type.")
else:
limit = range(len(column_types))
else:
head_rows = self.head(100).dropna()
lengths = [len(i) for i in head_rows]
if len(lengths) == 0 or max(lengths) == 0:
raise RuntimeError('Cannot infer number of items from the XArray. ' +
'XArray may be empty. ' +
'Please explicitly provide column types.')
# infer column types for dict type at server side,
# for list and array, infer from client side
if not issubclass(self.dtype(), dict):
length = max(lengths)
if limit is None:
limit = range(length)
else:
# adjust the length
length = len(limit)
if issubclass(self.dtype(), array.array):
typ = type_from_typecode(head_rows[0].typecode)
column_types = [typ for _ in range(length)]
else:
column_types = list()
for i in limit:
t = [(x[i] if ((x is not None) and len(x) > i) else None)
for x in head_rows]
column_types.append(infer_type_of_list(t))
else: # self.dtype() is dict
if limit is None:
key_set = set()
for row in head_rows:
key_set |= set(row.keys())
# translate to indexes
limit = list(key_set)
if column_types is None:
column_types = make_column_types(head_rows, limit)
return xframes.XFrame(impl=self._impl.unpack(column_name_prefix, limit, column_types, na_value))
[docs] def sort(self, ascending=True):
"""
Sort all values in this XArray.
Sort only works for xarray of type str, int and float, otherwise TypeError
will be raised. Creates a new, sorted XArray.
Parameters
----------
ascending: boolean, optional
If True, the xarray values are sorted in ascending order, otherwise,
descending order.
Returns
-------
:class:`.XArray`
The sorted XArray.
Examples
--------
>>> xa = XArray([3,2,1])
>>> xa.sort()
dtype: int
Rows: 3
[1, 2, 3]
"""
if not issubclass(self.dtype(), (int, float, str, datetime.datetime)):
raise TypeError("Only xarray with type ('int', 'float', 'str', and 'datetime.datetime)' can be sorted.")
return XArray(impl=self._impl.sort(ascending))