Source code for xframes.xarray

"""
This module defines the XArray class which provides the
ability to create, access and manipulate a remote scalable array object.

XArray acts similarly to pandas.Series but without indexing.
The data is immutable, homogeneous, and is stored in a Spark RDD.
"""


import inspect
import math
import time
import array
import warnings
import datetime

from xframes.deps import pandas, HAS_PANDAS
from xframes.deps import HAS_NUMPY
from xframes.xarray_impl import XArrayImpl
from xframes.utils import make_internal_url
from xframes.object_utils import check_input_uri, check_output_uri
from xframes.type_utils import infer_type_of_list, is_numeric_val, classify_auto
import xframes

if HAS_NUMPY:
    import numpy

"""
Copyright (c) 2014, Dato, Inc.
All rights reserved.

Copyright (c) 2017, Charles Hayden
All rights reserved.
"""


__all__ = ['XArray']


def _create_sequential_xarray(size, start=0, reverse=False):
    """
    Build an XArray holding a run of consecutive integers.

    Parameters
    ----------
    size : int
        Number of elements; handed to the implementation unchanged.
    start : int, optional
        First value of the run; handed to the implementation unchanged.
    reverse : bool, optional
        Handed to the implementation unchanged.

    Returns
    -------
    :class:`.XArray`
        Wraps the result of ``XArrayImpl.create_sequential_xarray``.

    Raises
    ------
    TypeError
        If `size` or `start` is not an int, or `reverse` is not a bool.
    """
    if not isinstance(size, int):
        raise TypeError('Size must be int.')

    # Name the argument that is actually wrong (this used to blame `size`).
    if not isinstance(start, int):
        raise TypeError('Start must be int.')

    if not isinstance(reverse, bool):
        raise TypeError('Reverse must be bool.')

    return XArray(impl=XArrayImpl.create_sequential_xarray(size, start, reverse))


# noinspection PyUnresolvedReferences,PyRedeclaration
[docs]class XArray(object): """ An immutable, homogeneously typed array object backed by Spark RDD. XArray is able to hold data that are much larger than the machine's main memory. It fully supports missing values and random access (although random access is inefficient). The data backing an XArray is located on the cluster hosting Spark. """
def __init__(self, data=None, dtype=None, ignore_cast_failure=False, impl=None):
    """
    Construct a new XArray.

    The source of data includes: list, numpy.ndarray, pandas.Series, and urls.

    Parameters
    ----------
    data : list | numpy.ndarray | pandas.Series | string
        The input data. If this is a list, numpy.ndarray, or pandas.Series,
        the data in the list is converted and stored in an XArray.
        Alternatively if this is a string, it is interpreted as a path (or url)
        to a text file. Each line of the text file is loaded as a separate row.
        If `data` is a directory where an XArray was previously saved, this is
        loaded as an XArray read directly out of that directory.

    dtype : {int, float, str, list, array.array, dict, datetime.datetime}, optional
        The data type of the XArray. If not specified, we attempt to
        infer it from the input. If it is a numpy array or a Pandas series, the
        data type of the array or series is used. If it is a list, the data type is
        inferred from the inner list. If it is a URL or path to a text file, we
        default the data type to str.

    ignore_cast_failure : bool, optional
        If True, ignores casting failures but warns when elements cannot be
        cast into the specified data type.

    impl : object, optional
        An existing implementation object to wrap. When truthy it is adopted
        as-is and `data` is ignored. For internal use.

    Raises
    ------
    TypeError
        If `dtype` is given but is not a type, or if `data` is not one of
        the supported sources.

    Notes
    -----
    - If `data` is pandas.Series, the index will be ignored.

    The following functionality is currently not implemented:
        - numpy.ndarray as row data
        - pandas.Series data
        - count_words, count_ngrams
        - sketch sub_sketch_keys

    See Also
    --------
    xframes.XArray.from_const
        Constructs an XArray of a given size with a const value.

    xframes.XArray.from_sequence
        Constructs an XArray by generating a sequence of consecutive numbers.

    xframes.XArray.from_rdd
        Create a new XArray from a Spark RDD or Spark DataFrame.

    xframes.XArray.set_trace
        Controls entry and exit tracing.

    xframes.XArray.spark_context
        Returns the spark context.

    xframes.XArray.spark_sql_context
        Returns the spark sql context.

    xframes.XArray.hive_context
        Returns the spark hive context.

    Examples
    --------
    >>> xa = XArray(data=[1,2,3,4,5], dtype=int)
    >>> xa = XArray('s3://testdatasets/a_to_z.txt.gz')
    >>> xa = XArray([[1,2,3], [3,4,5]])
    >>> xa = XArray(data=[{'a':1, 'b': 2}, {'b':2, 'c': 1}])
    >>> xa = XArray(data=[datetime.datetime(2011, 10, 20, 9, 30, 10)])
    """
    if dtype is not None and not isinstance(dtype, type):
        raise TypeError("Dtype must be a type, e.g. use int rather than 'int'.")

    if impl:
        self._impl = impl
        return

    if isinstance(data, XArray):
        # Share the implementation of the array being copied.
        self._impl = data._impl
        return

    # we need to perform type inference
    dtype = dtype or classify_auto(data)

    if data is None:
        self._impl = XArrayImpl()
    elif isinstance(data, str):
        # This test must come before the generic '__iter__' test: on
        # Python 3 a str is itself iterable, so a path or URL would
        # otherwise be loaded as a sequence of single characters.
        internal_url = make_internal_url(data)
        check_input_uri(internal_url)
        self._impl = XArrayImpl.load_autodetect(internal_url, dtype)
    elif HAS_PANDAS and isinstance(data, pandas.Series):
        # Only the values are used; the index is dropped.
        self._impl = XArrayImpl.load_from_iterable(data.values, dtype, ignore_cast_failure)
    elif ((HAS_NUMPY and isinstance(data, numpy.ndarray))
          or isinstance(data, (list, array.array))
          or hasattr(data, '__iter__')):
        self._impl = XArrayImpl.load_from_iterable(data, dtype, ignore_cast_failure)
    else:
        raise TypeError('Unexpected data source: {}. '
                        "Possible data source types are: 'list', "
                        "'numpy.ndarray', 'pandas.Series', and 'string(url)'.".format(type(data).__name__))
def dump_debug_info(self):
    """
    Print information about the Spark RDD associated with this XArray.

    Returns
    -------
    object
        Whatever the implementation's ``dump_debug_info`` returns.
    """
    backend = self.impl()
    return backend.dump_debug_info()
@classmethod
[docs] def read_text(cls, path, delimiter=None, nrows=None, verbose=False): """ Constructs an XArray from a text file or a path to multiple text files. Parameters ---------- path : string Location of the text file or directory to load. If 'path' is a directory or a "glob" pattern, all matching files will be loaded. delimiter : string, optional This describes the delimiter used for separating records. Must be a single character. Defaults to newline. nrows : int, optional If set, only this many rows will be read from the file. verbose : bool, optional If True, print the progress while reading files. Returns ------- :class:`.XArray` Examples -------- Read a regular text file, with default options. >>> path = 'http://s3.amazonaws.com/gl-testdata/rating_data_example.csv' >>> xa = xframes.XArray.read_text(path) >>> xa [25904, 25907, 25923, 25924, 25928, ... ] Read only the first 100 lines of the text file: >>> xa = xframes.XArray.read_text(path, nrows=100) >>> xa [25904, 25907, 25923, 25924, 25928, ... ] """ check_input_uri(path) url = make_internal_url(path) return cls(impl=XArrayImpl.read_from_text(url, delimiter=delimiter, nrows=nrows, verbose=verbose))
@classmethod
[docs] def from_const(cls, value, size): """ Constructs an XArray of size with a const value. Parameters ---------- value : [int | float | str | array.array | datetime.datetime | list | dict] The value to fill the XArray. size : int The size of the XArray. Must be positive. Examples -------- Construct an XArray consisting of 10 zeroes: >>> xframes.XArray.from_const(0, 10) """ if not isinstance(size, int): raise TypeError('Size must be a int.') if size <= 0: raise ValueError('Size must be positive.') if not isinstance(value, (int, float, str, array.array, datetime.datetime, list, dict)): raise TypeError("Cannot create xarray of value type '{}'.".format(type(value).__name__)) return cls(impl=XArrayImpl.load_from_const(value, size))
@classmethod
[docs] def from_sequence(cls, start, stop=None): """ Constructs an XArray by generating a sequence of consecutive numbers. Parameters ---------- start : int If `stop` is not given, the sequence consists of numbers 0 .. `start`-1. Otherwise, the sequence starts with `start`. stop : int, optional If given, the sequence consists of the numbers `start`, `start`+1 ... `end`-1. The sequence will not contain this value. Examples -------- >>> from_sequence(1000) Construct an XArray of integer values from 0 to 999 This is equivalent, but more efficient than: >>> XArray(range(1000)) >>> from_sequence(10, 1000) Construct an XArray of integer values from 10 to 999 This is equivalent, but more efficient than: >>> XArray(range(10, 1000)) """ if not isinstance(start, int) or (stop is not None and not isinstance(stop, int)): raise TypeError("'Start' and 'stop' must be int.") if stop is None: return _create_sequential_xarray(start) size = stop - start # this matches the behavior of range # i.e. range(100,10) just returns an empty array if size < 0: size = 0 return _create_sequential_xarray(size, start)
def _get_content_identifier(self):
    """
    Returns the unique identifier of the content that backs the XArray.

    Notes
    -----
    Meant for internal use only.
    """
    backend = self._impl
    return backend.get_content_identifier()

# noinspection PyShadowingBuiltins
def save(self, filename, format=None):
    """
    Saves the XArray to file.

    The saved XArray will be in a directory named with the `filename`
    parameter.

    Parameters
    ----------
    filename : string
        A local path or a remote URL.  If format is 'text', it will be saved as
        a text file. If format is 'binary', a directory will be created at the
        location which will contain the XArray.

    format : {'binary', 'text', 'csv'}, optional
        Format in which to save the XArray. Binary saved XArrays can be
        loaded much faster and without any format conversion losses.
        Each format is written by its own implementation method
        (``save``, ``save_as_text``, ``save_as_csv``).
        If not given, the format is inferred from `filename`: a name ending
        in '.txt' is saved as 'text', a name ending in '.csv' is saved as
        'csv', and anything else is saved as 'binary'.

    Raises
    ------
    ValueError
        If `format` is given but is not one of the supported values.
    """
    if format is None:
        if filename.endswith('.txt'):
            format = 'text'
        elif filename.endswith('.csv'):
            format = 'csv'
        else:
            format = 'binary'

    # Reject unknown formats up front; previously they fell through every
    # branch below and the call returned without writing anything.
    if format not in ('binary', 'text', 'csv'):
        raise ValueError("Unsupported format '{}'. "
                         "Supported formats are 'binary', 'text' and 'csv'.".format(format))

    url = make_internal_url(filename)
    check_output_uri(url)

    if format == 'binary':
        self._impl.save(url)
    elif format == 'text':
        self._impl.save_as_text(url)
    else:
        self._impl.save_as_csv(url)
def to_rdd(self, number_of_partitions=4):
    """
    Convert the current XArray to the Spark RDD.

    Parameters
    ----------
    number_of_partitions : int, optional
        The number of partitions to create in the rdd.
        Must be positive. Defaults to 4.

    Returns
    -------
    out : RDD
        The result of the implementation's ``to_rdd``.

    Raises
    ------
    ValueError
        If `number_of_partitions` is not an int, is zero, or is negative.
    """
    if not isinstance(number_of_partitions, int):
        raise ValueError('Number_of_partitions parameter expects an integer type.')
    if number_of_partitions == 0:
        raise ValueError('Number_of_partitions can not be initialized to zero.')
    # A negative partition count is as meaningless as zero; it used to be
    # passed through to the implementation unchecked.
    if number_of_partitions < 0:
        raise ValueError('Number_of_partitions can not be negative.')
    return self._impl.to_rdd(number_of_partitions)
@classmethod
[docs] def from_rdd(cls, rdd, dtype, lineage=None): """ Convert a Spark RDD into an XArray Parameters ---------- rdd : pyspark.rdd.RDD The Spark RDD containing the XArray values. dtype : type The values in `rdd` should have the data type `dtype`. lineage: dict, optional The lineage to apply to the rdd. Returns ------- class:`.XArray` This incorporates the given RDD. """ return cls(impl=XArrayImpl.from_rdd(rdd, dtype, lineage=lineage))
def __repr__(self):
    """
    A string description of the XArray.

    Returns
    -------
    str
        A string representation of the XArray: dtype, row count and
        the ``str()`` rendering of the leading elements.
    """
    ret = 'dtype: {}\n'.format(self.dtype().__name__)
    ret += 'Rows: {}\n'.format(self.size())
    ret += str(self)
    return ret

def __str__(self):
    """
    A string containing the first 100 elements of the array.

    Returns
    -------
    str
        Returns a string containing the first 100 elements of the array.
    """
    h = self._impl.head_as_list(100)
    headln = str(h)
    if self.size() > 100:
        # cut the last close bracket
        # and replace it with ...
        headln = headln[0:-1] + ', ... ]'
    return headln

def __nonzero__(self):
    """
    Returns True if the array is not empty.
    """
    return self.size() != 0

# Python 3 spelling of the truth-value hook.
__bool__ = __nonzero__

def __len__(self):
    """
    Returns the length of the array
    """
    return self.size()

def __iter__(self):
    """
    Provides an iterator to the contents of the array.

    Elements are pulled from the implementation in fixed-size batches;
    a batch shorter than the requested size ends the iteration.
    """
    def generator():
        elems_at_a_time = 262144
        self._impl.begin_iterator()
        ret = self._impl.iterator_get_next(elems_at_a_time)
        while True:
            for j in ret:
                yield j

            if len(ret) == elems_at_a_time:
                ret = self._impl.iterator_get_next(elems_at_a_time)
            else:
                break

    return generator()

def _dispatch_binary_operator(self, other, op):
    """
    Apply binary operator `op`: element-wise if `other` is an XArray,
    otherwise against `other` as a scalar on the right-hand side.
    """
    if isinstance(other, XArray):
        return XArray(impl=self._impl.vector_operator(other._impl, op))
    return XArray(impl=self._impl.left_scalar_operator(other, op))

def __add__(self, other):
    """
    If other is a scalar value, adds it to the current array, returning
    the new result. If other is an XArray, performs an element-wise
    addition of the two arrays.
    """
    return self._dispatch_binary_operator(other, '+')

def __sub__(self, other):
    """
    If other is a scalar value, subtracts it from the current array, returning
    the new result. If other is an XArray, performs an element-wise
    subtraction of the two arrays.
    """
    return self._dispatch_binary_operator(other, '-')

def __mul__(self, other):
    """
    If other is a scalar value, multiplies it to the current array, returning
    the new result. If other is an XArray, performs an element-wise
    multiplication of the two arrays.
    """
    return self._dispatch_binary_operator(other, '*')

def __div__(self, other):
    """
    If other is a scalar value, divides each element of the current array
    by the value, returning the result. If other is an XArray, performs
    an element-wise division of the two arrays.
    """
    return self._dispatch_binary_operator(other, '/')

# Python 3 routes the "/" operator through __truediv__.
__truediv__ = __div__

def __pow__(self, other):
    """
    Other must be a scalar value, raises the current array to that power,
    returning the new result.

    Raises
    ------
    TypeError
        If `other` is not a numeric value.
    """
    # Used to fall off the end and return None for a non-numeric exponent.
    if not is_numeric_val(other):
        raise TypeError('XArray can only be raised to the power of a numeric value.')
    return XArray(impl=self._impl.left_scalar_operator(other, '**'))

def __lt__(self, other):
    """
    If other is a scalar value, compares each element of the current array
    by the value, returning the result. If other is an XArray, performs
    an element-wise comparison of the two arrays.
    """
    return self._dispatch_binary_operator(other, '<')

def __gt__(self, other):
    """
    If other is a scalar value, compares each element of the current array
    by the value, returning the result. If other is an XArray, performs
    an element-wise comparison of the two arrays.
    """
    return self._dispatch_binary_operator(other, '>')

def __le__(self, other):
    """
    If other is a scalar value, compares each element of the current array
    by the value, returning the result. If other is an XArray, performs
    an element-wise comparison of the two arrays.
    """
    return self._dispatch_binary_operator(other, '<=')

def __ge__(self, other):
    """
    If other is a scalar value, compares each element of the current array
    by the value, returning the result. If other is an XArray, performs
    an element-wise comparison of the two arrays.
    """
    return self._dispatch_binary_operator(other, '>=')

def __radd__(self, other):
    """
    Adds a scalar value to the current array.
    Returned array has the same type as the array on the right hand side
    """
    return XArray(impl=self._impl.right_scalar_operator(other, '+'))

def __rsub__(self, other):
    """
    Subtracts a scalar value from the current array.
    Returned array has the same type as the array on the right hand side
    """
    return XArray(impl=self._impl.right_scalar_operator(other, '-'))

def __rmul__(self, other):
    """
    Multiplies a scalar value to the current array.
    Returned array has the same type as the array on the right hand side
    """
    return XArray(impl=self._impl.right_scalar_operator(other, '*'))

def __rdiv__(self, other):
    """
    Divides a scalar value by each element in the array
    Returned array has the same type as the array on the right hand side
    """
    return XArray(impl=self._impl.right_scalar_operator(other, '/'))

# Python 3 counterpart of __rdiv__.
__rtruediv__ = __rdiv__

def __neg__(self):
    """
    Negates each element in the array.
    """
    return XArray(impl=self._impl.unary_operator('-'))

def __pos__(self):
    """
    Implements the unary plus operator.
    """
    return XArray(impl=self._impl.unary_operator('+'))

def __abs__(self):
    """
    Takes the absolute value of each element in the array
    """
    return XArray(impl=self._impl.unary_operator('abs'))

def __eq__(self, other):
    """
    If other is a scalar value, compares each element of the current array
    by the value, returning the new result. If other is an XArray, performs
    an element-wise comparison of the two arrays.
    """
    return self._dispatch_binary_operator(other, '==')

def __ne__(self, other):
    """
    If other is a scalar value, compares each element of the current array
    by the value, returning the new result. If other is an XArray, performs
    an element-wise comparison of the two arrays.
    """
    return self._dispatch_binary_operator(other, '!=')

def __and__(self, other):
    """
    Perform a logical element-wise 'and' against another XArray.

    Note that this is not the "and" operator, which cannot be
    overridden, but the "&" operator.

    Raises
    ------
    TypeError
        If `other` is not an XArray.
    """
    if isinstance(other, XArray):
        return XArray(impl=self._impl.vector_operator(other._impl, '&'))
    else:
        raise TypeError('XArray can only perform logical and against another XArray.')

def __or__(self, other):
    """
    Perform a logical element-wise 'or' against another XArray.

    Note that this is not the "or" operator, which cannot be
    overridden, but the "|" operator.

    Raises
    ------
    TypeError
        If `other` is not an XArray.
    """
    if isinstance(other, XArray):
        return XArray(impl=self._impl.vector_operator(other._impl, '|'))
    else:
        raise TypeError('XArray can only perform logical or against another XArray.')

def __getitem__(self, other):
    """
    If the key is an XArray of identical length, this function performs a
    logical filter: i.e. it subselects all the elements in this array
    where the corresponding value in the other array evaluates to true.
    If the key is an integer this returns a single row of
    the XArray. If the key is a slice, this returns an XArray with the
    sliced rows.

    Raises
    ------
    IndexError
        If an integer key is out of range (in either direction), if an
        XArray key has a different length, or if the key is of an
        unsupported type.
    """
    if isinstance(other, int):
        length = len(self)
        index = other + length if other < 0 else other
        # A negative index beyond -len is still negative after the shift;
        # it used to be passed on to copy_range unchecked.
        if index < 0 or index >= length:
            raise IndexError('XArray index out of range.')
        return list(XArray(impl=self._impl.copy_range(index, 1, index + 1)))[0]

    if isinstance(other, slice):
        start = other.start
        stop = other.stop
        step = other.step
        if start is None:
            start = 0
        if stop is None:
            stop = len(self)
        if step is None:
            step = 1
        # handle negative indices
        if start < 0:
            start += len(self)
        if stop < 0:
            stop += len(self)
        return XArray(impl=self._impl.copy_range(start, step, stop))

    if isinstance(other, XArray):
        if len(other) != len(self):
            raise IndexError('Cannot perform logical indexing on arrays of different length.')
        return XArray(impl=self._impl.logical_filter(other._impl))

    raise IndexError('Invalid type to use for indexing.')

def _materialize(self):
    """
    For a XArray that is lazily evaluated, force persist this xarray
    to disk, committing all lazy evaluated operations.
    """
    self._impl.materialize()

def _is_materialized(self):
    """
    Returns whether or not the xarray has been materialized.
    """
    return self._impl.is_materialized()
def size(self):
    """
    The size of the XArray.

    Returns
    -------
    The value reported by the implementation's ``size``.
    """
    backend = self._impl
    return backend.size()
def impl(self):
    """
    Get the impl.

    For internal use.
    """
    backend = self._impl
    return backend
def dtype(self):
    """
    The data type of the XArray.

    Returns
    -------
    type
        The type of the XArray.

    Examples
    --------
    >>> xa = XArray(['The quick brown fox jumps over the lazy dog.'])
    >>> xa.dtype()
    str
    >>> xa = XArray(range(10))
    >>> xa.dtype()
    int
    """
    backend = self._impl
    return backend.dtype()
def lineage(self):
    """
    The lineage: the files that went into building this array.

    Returns
    -------
    dict
        * key 'table': set[filename]
            The files that were used to build the XArray
        * key 'column': dict{column_name: set[filename]}
            The set of files that were used to build each column
    """
    backend = self._impl
    return backend.lineage_as_dict()
def head(self, n=10):
    """
    Returns an XArray which contains the first n rows of this XArray.

    Parameters
    ----------
    n : int
        The number of rows to fetch.

    Returns
    -------
    :class:`.XArray`
        A new XArray which contains the first n rows of the current XArray.

    Examples
    --------
    >>> XArray(range(10)).head(5)
    dtype: int
    Rows: 5
    [0, 1, 2, 3, 4]
    """
    leading = self._impl.head(n)
    return XArray(impl=leading)
def vector_slice(self, start, end=None):
    """
    If this XArray contains vectors or recursive types, this returns a new XArray
    containing each individual vector sliced, between start and end, exclusive.

    Parameters
    ----------
    start : int
        The start position of the slice.

    end : int, optional.
        The end position of the slice. Note that the end position
        is NOT included in the slice. Thus a g.vector_slice(1,3) will extract
        entries in position 1 and 2. When omitted, `start` + 1 is used.

    Returns
    -------
    :class:`.XArray`
        Each individual vector sliced according to the arguments.

    Raises
    ------
    RuntimeError
        If the data type of this XArray is neither ``array.array`` nor ``list``.

    Examples
    --------

    If g is a vector of floats:

    >>> g = XArray([[1,2,3],[2,3,4]])
    >>> g
    dtype: array
    Rows: 2
    [array('d', [1.0, 2.0, 3.0]), array('d', [2.0, 3.0, 4.0])]

    >>> g.vector_slice(0) # extracts the first element of each vector
    dtype: float
    Rows: 2
    [1.0, 2.0]

    >>> g.vector_slice(0, 2) # extracts the first two elements of each vector
    dtype: array.array
    Rows: 2
    [array('d', [1.0, 2.0]), array('d', [2.0, 3.0])]

    If a vector cannot be sliced, the result will be None:

    >>> g = XArray([[1],[1,2],[1,2,3]])
    >>> g
    dtype: array.array
    Rows: 3
    [array('d', [1.0]), array('d', [1.0, 2.0]), array('d', [1.0, 2.0, 3.0])]

    >>> g.vector_slice(2)
    dtype: float
    Rows: 3
    [None, None, 3.0]

    >>> g.vector_slice(0,2)
    dtype: list
    Rows: 3
    [None, array('d', [1.0, 2.0]), array('d', [1.0, 2.0])]

    If g is a vector of mixed types (float, int, str, array, list, etc.):

    >>> g = XArray([['a',1,1.0],['b',2,2.0]])
    >>> g
    dtype: list
    Rows: 2
    [['a', 1, 1.0], ['b', 2, 2.0]]

    >>> g.vector_slice(0) # extracts the first element of each vector
    dtype: list
    Rows: 2
    [['a'], ['b']]
    """
    # dtype() yields a type object, so it has to be compared against the
    # allowed types; an isinstance() test on it can never succeed.
    if self.dtype() not in (array.array, list):
        raise RuntimeError("Only 'array' and 'list' type can be sliced.")
    if end is None:
        end = start + 1
    return XArray(impl=self._impl.vector_slice(start, end))
def _count_words(self, to_lower=True):
    """
    Count words in the XArray.

    Return an XArray of dictionary type where
    each element contains the word count for each word that appeared in the
    corresponding input element. The words are split on all whitespace and
    punctuation characters. Only works if this XArray is of string type.

    Parameters
    ----------
    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    Returns
    -------
    :class:`.XArray`
        The XArray of dictionary type, where each element contains the word
        count for each word that appeared in corresponding input element.

    Raises
    ------
    TypeError
        If the data type of this XArray is not a string type.

    See Also
    --------
    xframes.XArray.count_ngrams

    Examples
    --------
    >>> xa = xframes.XArray(['The quick brown fox jumps.',
    ...                      "Word word WORD, word!!!word"])
    >>> xa._count_words()
    dtype: dict
    Rows: 2
    [{'quick': 1, 'brown': 1, 'jumps': 1, 'fox': 1, 'the': 1}, {'word': 5}]
    """
    # dtype() yields a type object, so test it with issubclass (the same
    # check _count_ngrams uses) instead of isinstance against `basestring`.
    if not issubclass(self.dtype(), str):
        raise TypeError('Only XArray of string type is supported for counting bag of words.')

    # construct options, will extend over time
    options = dict()
    options['to_lower'] = bool(to_lower)
    return XArray(impl=self._impl.count_bag_of_words(options))

def _count_ngrams(self, n=2, method="word", to_lower=True, ignore_space=True):
    """
    Return an XArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.

    The n-grams can be specified to be either character n-grams or word
    n-grams.  The input XArray must contain strings.

    Parameters
    ----------
    n : int, optional
        The number of words in each n-gram. An `n` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    ignore_space : bool, optional
        If method is "character", indicates if spaces between words are
        counted as part of the n-gram. For instance, with the input XArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If `ignore_space` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    Returns
    -------
    :class:`.XArray`
        An XArray of dictionary type, where each key is the n-gram string
        and each value is its count.

    Raises
    ------
    TypeError
        If the data type of this XArray is not a string type, or `n` is
        not an int.
    ValueError
        If `n` is smaller than 1, or `method` is neither 'word' nor
        'character'.

    See Also
    --------
    xframes.XArray.count_words

    Notes
    -----
    - Ignoring case (with `to_lower`) involves a full string copy of the
      XArray data. To increase speed for large documents, set `to_lower` to
      False.

    - Punctuation and spaces are both delimiters when counting word n-grams.
      When counting character n-grams, punctuation is always ignored.

    - A warning is issued when `n` is larger than 5.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_

    Examples
    --------
    Counting word n-grams:

    >>> from xframes import XArray
    >>> xa = XArray(['I like big dogs. I LIKE BIG DOGS.'])
    >>> xa._count_ngrams(3)
    dtype: dict
    Rows: 1
    [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

    Counting character n-grams:

    >>> xa = XArray(['Fun. Is. Fun'])
    >>> xa._count_ngrams(3, 'character')
    dtype: dict
    Rows: 1
    [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
    """
    if not issubclass(self.dtype(), str):
        raise TypeError('Only XArray of string type is supported for counting n-grams.')

    if not isinstance(n, int):
        raise TypeError("Input 'n' must be of type int.")

    if n < 1:
        raise ValueError("Input 'n' must be greater than 0.")

    if n > 5:
        warnings.warn('It is unusual for n-grams to be of size larger than 5.')

    # construct options, will extend over time
    options = dict()
    options['to_lower'] = bool(to_lower)
    options['ignore_space'] = bool(ignore_space)

    if method == 'word':
        return XArray(impl=self._impl.count_ngrams(n, options))
    elif method == 'character':
        return XArray(impl=self._impl.count_character_ngrams(n, options))
    else:
        raise ValueError("Invalid 'method' input value. Please input either " +
                         "'word' or 'character' ")
def dict_trim_by_keys(self, keys, exclude=True):
    """
    Filter an XArray of dictionary type by the given keys.

    By default, all keys that are in the provided list in `keys` are
    *excluded* from the returned XArray.

    Parameters
    ----------
    keys : list
        A collection of keys to trim down the elements in the XArray.
        A single string, or any value without ``__iter__``, is treated as
        a one-element list.

    exclude : bool, optional
        If True, all keys that are in the input key list are removed. If
        False, only keys that are in the input key list are retained.

    Returns
    -------
    :class:`.XArray`
        A XArray of dictionary type, with each dictionary element trimmed
        according to the input criteria.

    See Also
    --------
    xframes.XArray.dict_trim_by_values

    Examples
    --------
    >>> xa = xframes.XArray([{"this":1, "is":1, "dog":2},
    ...                      {"this": 2, "are": 2, "cat": 1}])
    >>> xa.dict_trim_by_keys(["this", "is", "and", "are"], exclude=True)
    dtype: dict
    Rows: 2
    [{'dog': 2}, {'cat': 1}]
    """
    is_key_collection = hasattr(keys, "__iter__") and not isinstance(keys, str)
    key_list = keys if is_key_collection else [keys]
    trimmed = self._impl.dict_trim_by_keys(key_list, exclude)
    return XArray(impl=trimmed)
def dict_trim_by_values(self, lower=None, upper=None):
    """
    Filter dictionary values to a given range (inclusive).

    Trimming is only performed on values which can be compared to the bound values.
    Fails on XArrays whose data type is not ``dict``.

    Parameters
    ----------
    lower : int or long or float, optional
        The lowest dictionary value that would be retained in the result. If
        not given, lower bound is not applied.

    upper : int or long or float, optional
        The highest dictionary value that would be retained in the result.
        If not given, upper bound is not applied.

    Returns
    -------
    :class:`.XArray`
        An XArray of dictionary type, with each dict element trimmed
        according to the input criteria.

    Raises
    ------
    TypeError
        If a bound is given and is not a numeric value.

    See Also
    --------
    xframes.XArray.dict_trim_by_keys

    Examples
    --------
    >>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
    ...                      {"this": 2, "are": 1, "cat": 5}])
    >>> xa.dict_trim_by_values(2,5)
    dtype: dict
    Rows: 2
    [{'is': 5}, {'this': 2, 'cat': 5}]

    >>> xa.dict_trim_by_values(upper=5)
    dtype: dict
    Rows: 2
    [{'this': 1, 'is': 5}, {'this': 2, 'are': 1, 'cat': 5}]
    """
    # The lower bound is validated before the upper bound.
    for label, bound in (('Lower', lower), ('Upper', upper)):
        if bound is None:
            continue
        if not is_numeric_val(bound):
            raise TypeError('{} bound has to be a numeric value.'.format(label))

    trimmed = self._impl.dict_trim_by_values(lower, upper)
    return XArray(impl=trimmed)
def dict_keys(self):
    """
    Create an XArray that contains all the keys from each dictionary
    element as a list.

    Fails on XArrays whose data type is not ``dict``.

    Returns
    -------
    :class:`.XArray`
        A XArray of list type, where each element is a list of keys
        from the input XArray element.

    See Also
    --------
    xframes.XArray.dict_values

    Examples
    --------
    >>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
    ...                      {"this": 2, "are": 1, "cat": 5}])
    >>> xa.dict_keys()
    dtype: list
    Rows: 2
    [['this', 'is', 'dog'], ['this', 'are', 'cat']]
    """
    # NOTE(review): the result is wrapped in xframes.XFrame, although the
    # description above speaks of an XArray -- confirm which is intended.
    keys_impl = self._impl.dict_keys()
    return xframes.XFrame(impl=keys_impl)
def dict_values(self):
    """
    Create an XArray that contains all the values from each dictionary
    element as a list.

    Fails on XArrays whose data type is not ``dict``.

    Returns
    -------
    :class:`.XArray`
        A XArray of list type, where each element is a list of values
        from the input XArray element.

    See Also
    --------
    xframes.XArray.dict_keys

    Examples
    --------
    >>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
    ...                      {"this": 2, "are": 1, "cat": 5}])
    >>> xa.dict_values()
    dtype: list
    Rows: 2
    [[1, 5, 7], [2, 1, 5]]
    """
    # NOTE(review): the result is wrapped in xframes.XFrame, although the
    # description above speaks of an XArray -- confirm which is intended.
    values_impl = self._impl.dict_values()
    return xframes.XFrame(impl=values_impl)
def dict_has_any_keys(self, keys):
    """
    Create a boolean XArray by checking the keys of an XArray of
    dictionaries.

    An element of the output XArray is True if the
    corresponding input element's dictionary has any of the given keys.
    Fails on XArrays whose data type is not ``dict``.

    Parameters
    ----------
    keys : list
        A list of key values to check each dictionary against.
        A single string, or any value without ``__iter__``, is treated as
        a one-element list.

    Returns
    -------
    :class:`.XArray`
        A XArray of int type, where each element indicates whether the
        input XArray element contains any key in the input list.

    See Also
    --------
    xframes.XArray.dict_has_all_keys

    Examples
    --------
    >>> xa = xframes.XArray([{"this":1, "is":5, "dog":7}, {"animal":1},
    ...                      {"this": 2, "are": 1, "cat": 5}])
    >>> xa.dict_has_any_keys(["is", "this", "are"])
    dtype: int
    Rows: 3
    [1, 0, 1]
    """
    is_key_collection = hasattr(keys, "__iter__") and not isinstance(keys, str)
    key_list = keys if is_key_collection else [keys]
    flags = self._impl.dict_has_any_keys(key_list)
    return XArray(impl=flags)
def dict_has_all_keys(self, keys):
    """
    Create a boolean XArray by checking the keys of an XArray of
    dictionaries.

    An element of the output XArray is True if the
    corresponding input element's dictionary has all of the given keys.
    Fails on XArrays whose data type is not ``dict``.

    Parameters
    ----------
    keys : list
        A list of key values to check each dictionary against.
        A single string, or any value without ``__iter__``, is treated as
        a one-element list.

    Returns
    -------
    :class:`.XArray`
        An XArray of int type, where each element indicates whether the
        input XArray element contains all keys in the input list.

    See Also
    --------
    xframes.XArray.dict_has_any_keys

    Examples
    --------
    >>> xa = xframes.XArray([{"this":1, "is":5, "dog":7},
    ...                      {"this": 2, "are": 1, "cat": 5}])
    >>> xa.dict_has_all_keys(["is", "this"])
    dtype: int
    Rows: 2
    [1, 0]
    """
    is_key_collection = hasattr(keys, "__iter__") and not isinstance(keys, str)
    key_list = keys if is_key_collection else [keys]
    flags = self._impl.dict_has_all_keys(key_list)
    return XArray(impl=flags)
def apply(self, fn, dtype=None, skip_undefined=True, seed=None):
    """
    Transform each element of the XArray by a given function.

    The result XArray is of type `dtype`. `fn` should be a function that
    returns exactly one value which can be cast into the type specified by
    `dtype`. If `dtype` is not specified, the first 100 elements of the
    XArray are used to make a guess about the data type.

    Parameters
    ----------
    fn : function
        The function to transform each element. Must return exactly one
        value which can be cast into the type specified by `dtype`.

    dtype : {int, float, str, list, array.array, dict}, optional
        The data type of the new XArray. If not supplied, the first 100
        elements of the array are used to guess the target
        data type.

    skip_undefined : bool, optional
        If True, will not apply `fn` to any missing values.

    seed : int, optional
        Used as the seed if a random number generator is included in `fn`.
        When omitted, the current time is used.

    Returns
    -------
    :class:`.XArray`
        The XArray transformed by `fn`. Each element of the XArray is of
        type `dtype`.

    Raises
    ------
    TypeError
        If `fn` is not a function.

    See Also
    --------
    xframes.XFrame.apply
        Applies a function to a column of an XFrame.
        Note that the functions differ in these two cases: on an XArray
        the function receives one value, on an XFrame it receives a dict
        of the column name/value pairs.

    Examples
    --------
    >>> xa = xframes.XArray([1,2,3])
    >>> xa.apply(lambda x: x*2)
    dtype: int
    Rows: 3
    [2, 4, 6]
    """
    if not inspect.isfunction(fn):
        raise TypeError('Input must be a function.')

    if dtype is None:
        # Dry-run fn over the leading non-missing elements to infer the type.
        sample = self._impl.head_as_list(100)
        dryrun = [fn(item) for item in sample if item is not None]
        dtype = infer_type_of_list(dryrun)

    # Only fall back to the clock when no seed was supplied: a plain
    # truthiness test would also discard an explicit seed of 0.
    if seed is None:
        seed = time.time()

    return XArray(impl=self._impl.transform(fn, dtype, skip_undefined, seed))
def flat_map(self, fn=None, dtype=None, skip_undefined=True, seed=None):
    """
    Transform each element of the XArray by a given function, which must
    return a list.

    Each item in the result XArray is made up of a list element.
    The result XArray is of type `dtype`.  `fn` should be a function that
    returns a list of values which can be cast into the type specified by
    `dtype`.  If `dtype` is not specified, `fn` is run on the non-missing
    values among the first 100 elements of the XArray and the type is
    guessed from the flattened results.

    Parameters
    ----------
    fn : function, optional
        The function to transform each element.  Must return a list of
        values which can be cast into the type specified by `dtype`.
        If omitted, the identity function is used, so the lists already
        stored in the XArray are flattened.

    dtype : {None, int, float, str, list, array.array, dict}, optional
        The data type of the new XArray.  If None, the first 100 elements
        of the array are used to guess the target data type.

    skip_undefined : bool, optional
        If True, will not apply `fn` to any undefined values.

    seed : int, optional
        Used as the seed if a random number generator is included in `fn`.
        If omitted (None), the current time is used.  An explicit seed of
        0 is passed through unchanged.

    Returns
    -------
    :class:`.XArray`
        The XArray transformed by `fn` and flattened.  Each element of the
        XArray is of type `dtype`.

    Raises
    ------
    TypeError
        If `fn` is given but is not a function.

    See Also
    --------
    xframes.XFrame.flat_map

    Examples
    --------
    >>> xa = xframes.XArray([[1], [1, 2], [1, 2, 3]])
    >>> xa.flat_map(lambda x: [2 * i for i in x])
    dtype: int
    Rows: 6
    [2, 2, 4, 2, 4, 6]
    """
    if fn is None:
        # Default to identity: flatten the lists as they are.
        def fn(x):
            return x
    if not inspect.isfunction(fn):
        raise TypeError('Input must be a function.')
    if dtype is None:
        # Dry-run fn on the defined values of the leading rows, then flatten
        # one level so the element type (not "list") is inferred.
        head_rows = self._impl.head_as_list(100)
        dryrun = [fn(i) for i in head_rows if i is not None]
        dryrun = [item for lst in dryrun for item in lst]
        dtype = infer_type_of_list(dryrun)
    # Only a missing seed gets the time-based default; "not seed" would also
    # throw away a caller-supplied seed of 0 and break reproducibility.
    if seed is None:
        seed = time.time()
    return XArray(impl=self._impl.flat_map(fn, dtype, skip_undefined, seed))
def filter(self, fn, skip_undefined=True, seed=None):
    """
    Filter this XArray by a function.

    Returns a new XArray filtered by a function.  If `fn` evaluates an
    element to true, this element is copied to the new XArray.  If not, it
    isn't.  Throws an exception if the return type of `fn` is not castable
    to a boolean value.

    Parameters
    ----------
    fn : function
        Function that filters the XArray.  Must evaluate to bool or int.

    skip_undefined : bool, optional
        If True, will not apply `fn` to any undefined values.

    seed : int, optional
        Used as the seed if a random number generator is included in `fn`.
        If omitted (None), the current time is used.  An explicit seed of
        0 is passed through unchanged.

    Returns
    -------
    :class:`.XArray`
        The XArray filtered by `fn`: the elements for which `fn` evaluated
        to true.

    Raises
    ------
    TypeError
        If `fn` is not a function.

    Examples
    --------
    >>> xa = xframes.XArray([1,2,3])
    >>> xa.filter(lambda x: x < 3)
    dtype: int
    Rows: 2
    [1, 2]
    """
    if not inspect.isfunction(fn):
        raise TypeError('Input must be a function.')
    # Only a missing seed gets the time-based default; "not seed" would also
    # throw away a caller-supplied seed of 0 and break reproducibility.
    if seed is None:
        seed = time.time()
    return XArray(impl=self._impl.filter(fn, skip_undefined, seed))
def sample(self, fraction, max_partitions=None, seed=None):
    """
    Create an XArray which contains a subsample of the current XArray.

    Parameters
    ----------
    fraction : float
        The fraction of the rows to fetch.  Must be between 0 and 1.

    max_partitions : int, optional
        After sampling, coalesce to this number of partition.
        If not given, do not perform this step.

    seed : int, optional
        The random seed for the random number generator.  If omitted
        (None), the current time truncated to an int is used.  An explicit
        seed of 0 is passed through unchanged.

    Returns
    -------
    :class:`.XArray`
        The new XArray which contains the subsampled rows.  An empty
        XArray yields a new empty XArray.

    Raises
    ------
    ValueError
        If `fraction` is less than 0 or greater than 1.

    Examples
    --------
    >>> xa = xframes.XArray(range(10))
    >>> xa.sample(.3)
    dtype: int
    Rows: 3
    [2, 6, 9]
    """
    if fraction > 1 or fraction < 0:
        raise ValueError('Invalid sampling rate: {}.'.format(fraction))
    if self.size() == 0:
        # Nothing to sample from.
        return XArray()
    # Only a missing seed gets the time-based default; "not seed" would also
    # throw away a caller-supplied seed of 0 and break reproducibility.
    if seed is None:
        seed = int(time.time())
    return XArray(impl=self._impl.sample(fraction, max_partitions, seed))
def _save_as_text(self, url):
    """
    Save the XArray to disk as a text file.

    This operation is not implemented; calling it always raises
    NotImplementedError and `url` is not used.
    """
    raise NotImplementedError()
def all(self):
    """
    Tell whether every element of the XArray evaluates to True.

    For numeric XArrays, zeros and missing values (None) evaluate to False,
    while all non-zero, non-missing values evaluate to True.  For string,
    list, and dictionary XArrays, empty values (zero length strings, lists
    or dictionaries) and missing values (None) evaluate to False; all other
    values evaluate to True.

    Returns True on an empty XArray.

    Returns
    -------
    bool

    See Also
    --------
    xframes.XArray.any

    Examples
    --------
    >>> xframes.XArray([1, None]).all()
    False
    >>> xframes.XArray([1, 0]).all()
    False
    >>> xframes.XArray([1, 2]).all()
    True
    >>> xframes.XArray(["hello", "world"]).all()
    True
    >>> xframes.XArray(["hello", ""]).all()
    False
    >>> xframes.XArray([]).all()
    True
    """
    every_true = self._impl.all()
    return every_true
def any(self):
    """
    Tell whether at least one element of the XArray evaluates to True.

    For numeric XArrays any non-zero value evaluates to True.  For string,
    list, and dictionary XArrays, any element of non-zero length evaluates
    to True.

    Returns False on an empty XArray.

    Returns
    -------
    bool

    See Also
    --------
    xframes.XArray.all

    Examples
    --------
    >>> xframes.XArray([1, None]).any()
    True
    >>> xframes.XArray([1, 0]).any()
    True
    >>> xframes.XArray([0, 0]).any()
    False
    >>> xframes.XArray(["hello", "world"]).any()
    True
    >>> xframes.XArray(["hello", ""]).any()
    True
    >>> xframes.XArray(["", ""]).any()
    False
    >>> xframes.XArray([]).any()
    False
    """
    some_true = self._impl.any()
    return some_true
def max(self):
    """
    Return the largest numeric value in the XArray.

    Returns None on an empty XArray.  Raises an exception if called on an
    XArray with non-numeric type.

    Returns
    -------
    type of XArray
        Maximum value of XArray

    See Also
    --------
    xframes.XArray.min

    Examples
    --------
    >>> xframes.XArray([14, 62, 83, 72, 77, 96, 5, 25, 69, 66]).max()
    96
    """
    largest = self._impl.max()
    return largest
def min(self):
    """
    Return the smallest numeric value in the XArray.

    Returns None on an empty XArray.  Raises an exception if called on an
    XArray with non-numeric type.

    Returns
    -------
    type of XArray
        Minimum value of XArray

    See Also
    --------
    xframes.XArray.max

    Examples
    --------
    >>> xframes.XArray([14, 62, 83, 72, 77, 96, 5, 25, 69, 66]).min()
    5
    """
    smallest = self._impl.min()
    return smallest
def sum(self):
    """
    Return the sum of all values in this XArray.

    Raises an exception if called on an XArray of strings.  If the XArray
    contains numeric arrays (list or array.array) and all the lists or
    arrays are the same length, the sum over all the arrays will be
    returned.  If the XArray contains dictionaries whose values are
    numeric, the result is the sum of values whose keys appear in every
    row.  Returns None on an empty XArray.

    For large values, this may overflow without warning.

    Returns
    -------
    type of XArray
        Sum of all values in XArray
    """
    total = self._impl.sum()
    return total
def mean(self):
    """
    Return the mean of all the values in the XArray.

    Returns None on an empty XArray.  Raises an exception if called on an
    XArray with non-numeric type.

    Returns
    -------
    float
        Mean of all values in XArray.
    """
    average = self._impl.mean()
    return average
def std(self, ddof=0):
    """
    Return the standard deviation of all the values in the XArray.

    Returns None on an empty XArray.  Raises an exception if called on an
    XArray with non-numeric type or if `ddof` >= length of XArray.

    Parameters
    ----------
    ddof : int, optional
        "delta degrees of freedom" in the variance calculation.

    Returns
    -------
    float
        The standard deviation of all the values.
    """
    deviation = self._impl.std(ddof)
    return deviation
def var(self, ddof=0):
    """
    Return the variance of all the values in the XArray.

    Returns None on an empty XArray.  Raises an exception if called on an
    XArray with non-numeric type or if `ddof` >= length of XArray.

    Parameters
    ----------
    ddof : int, optional
        "delta degrees of freedom" in the variance calculation.

    Returns
    -------
    float
        Variance of all values in XArray.
    """
    variance = self._impl.var(ddof)
    return variance
def num_missing(self):
    """
    Return the number of missing elements in the XArray.

    Returns
    -------
    int
        Number of missing values.
    """
    missing_count = self._impl.num_missing()
    return missing_count
def nnz(self):
    """
    Return the number of non-zero elements in the XArray.

    Returns
    -------
    int
        Number of non-zero elements.
    """
    nonzero_count = self._impl.nnz()
    return nonzero_count
def datetime_to_str(self, str_format='%Y-%m-%dT%H:%M:%S%ZP'):
    """
    Create a new XArray with all the datetime values rendered as str.

    The string format is specified by the `str_format` parameter.

    Parameters
    ----------
    str_format : str
        The format to output the string.  Default format is
        "%Y-%m-%dT%H:%M:%S%ZP".

    Returns
    -------
    :class:`.XArray` of str
        The XArray converted to the type 'str'.

    Raises
    ------
    TypeError
        If the XArray's data type is not datetime.datetime (or a subclass).

    Examples
    --------
    >>> dt = datetime.datetime(2011, 10, 20, 9, 30, 10, tzinfo=GMT(-5))
    >>> xa = xframes.XArray([dt])
    >>> xa.datetime_to_str('%e %b %Y %T %ZP')
    dtype: str
    Rows: 1
    [20 Oct 2011 09:30:10 GMT-05:00]

    See Also
    --------
    xframes.XArray.str_to_datetime
    """
    holds_datetimes = issubclass(self.dtype(), datetime.datetime)
    if holds_datetimes:
        return XArray(impl=self._impl.datetime_to_str(str_format))
    raise TypeError('Datetime_to_str expects XArray of datetime as input XArray.')
def str_to_datetime(self, str_format=None):
    """
    Create a new XArray whose column type is datetime.

    The string format is specified by the `str_format` parameter.

    Parameters
    ----------
    str_format : str, optional
        The string format of the input XArray.
        If not given, dateutil parser is used.

    Returns
    -------
    :class:`.XArray` of datetime.datetime
        The XArray converted to the type 'datetime'.

    Raises
    ------
    TypeError
        If the XArray's data type is not a string type.

    Examples
    --------
    >>> xa = xframes.XArray(['20-Oct-2011 09:30:10 GMT-05:30'])
    >>> xa.str_to_datetime('%d-%b-%Y %H:%M:%S %ZP')
    dtype: datetime.datetime
    Rows: 1
    datetime.datetime(2011, 10, 20, 9, 30, 10)

    >>> xa = xframes.XArray(['Aug 23, 2015'])
    >>> xa.str_to_datetime()
    dtype: datetime.datetime
    Rows: 1
    datetime.datetime(2015, 8, 23, 0, 0, 0)

    See Also
    --------
    xframes.XArray.datetime_to_str
    """
    holds_strings = issubclass(self.dtype(), basestring)
    if holds_strings:
        return XArray(impl=self._impl.str_to_datetime(str_format))
    raise TypeError("'Str_to_datetime' expects XArray of str as input XArray.")
def astype(self, dtype, undefined_on_failure=False):
    """
    Create a new XArray with all values cast to the given type.

    Throws an exception if the types are not castable to the given type.

    Parameters
    ----------
    dtype : {int, float, str, list, array.array, dict, datetime.datetime}
        The type to cast the elements to in XArray

    undefined_on_failure : bool, optional
        If set to True, runtime cast failures will be emitted as missing
        values rather than failing.

    Returns
    -------
    :class:`.XArray` of dtype
        The XArray converted to the type `dtype`.

    Notes
    -----
    - The string parsing techniques used to handle conversion to dictionary
      and list types are quite generic and permit a variety of interesting
      formats to be interpreted.  For instance, a JSON string can usually be
      interpreted as a list or a dictionary type.  See the examples below.
    - For datetime-to-string and string-to-datetime conversions,
      use xa.datetime_to_str() and xa.str_to_datetime() functions.

    Examples
    --------
    >>> xa = xframes.XArray(['1','2','3','4'])
    >>> xa.astype(int)
    dtype: int
    Rows: 4
    [1, 2, 3, 4]

    Given an XArray of strings that look like dicts, convert to a
    dictionary type:

    >>> xa = xframes.XArray(['{1:2 3:4}', '{a:b c:d}'])
    >>> xa.astype(dict)
    dtype: dict
    Rows: 2
    [{1: 2, 3: 4}, {'a': 'b', 'c': 'd'}]
    """
    converted = self._impl.astype(dtype, undefined_on_failure)
    return XArray(impl=converted)
def clip(self, lower=None, upper=None):
    """
    Create a new XArray with each value clipped to be within the given
    bounds.

    Here "clipped" means that values below the lower bound are set to the
    lower bound value, and values above the upper bound are set to the
    upper bound value.  This function can operate on XArrays of numeric
    type as well as array type, in which case each individual element in
    each array is clipped.  By default `lower` and `upper` are ``None``,
    which indicates the respective bound should be ignored.  The method
    fails if invoked on an XArray of non-numeric type.

    Parameters
    ----------
    lower : int, optional
        The lower bound used to clip.  Ignored if equal to ``None``
        (the default).

    upper : int, optional
        The upper bound used to clip.  Ignored if equal to ``None``
        (the default).

    Returns
    -------
    :class:`.XArray`

    See Also
    --------
    xframes.XArray.clip_lower
    xframes.XArray.clip_upper

    Examples
    --------
    >>> xa = xframes.XArray([1,2,3])
    >>> xa.clip(2,2)
    dtype: int
    Rows: 3
    [2, 2, 2]
    """
    clipped = self._impl.clip(lower, upper)
    return XArray(impl=clipped)
def clip_lower(self, threshold):
    """
    Create a new XArray with all values clipped to the given lower bound.

    This function can operate on numeric arrays, as well as vector arrays,
    in which case each individual element in each vector is clipped.
    Throws an exception if the XArray is empty or the types are
    non-numeric.

    Parameters
    ----------
    threshold : float
        The lower bound used to clip values.

    Returns
    -------
    :class:`.XArray`

    See Also
    --------
    xframes.XArray.clip
    xframes.XArray.clip_upper

    Examples
    --------
    >>> xa = xframes.XArray([1,2,3])
    >>> xa.clip_lower(2)
    dtype: int
    Rows: 3
    [2, 2, 3]
    """
    # Same impl call as clip(), with the upper bound left unset.
    clipped = self._impl.clip(threshold, None)
    return XArray(impl=clipped)
def clip_upper(self, threshold):
    """
    Create a new XArray with all values clipped to the given upper bound.

    This function can operate on numeric arrays, as well as vector arrays,
    in which case each individual element in each vector is clipped.

    Parameters
    ----------
    threshold : float
        The upper bound used to clip values.

    Returns
    -------
    :class:`.XArray`

    See Also
    --------
    xframes.XArray.clip
    xframes.XArray.clip_lower

    Examples
    --------
    >>> xa = xframes.XArray([1,2,3])
    >>> xa.clip_upper(2)
    dtype: int
    Rows: 3
    [1, 2, 2]
    """
    # Same impl call as clip(), with the lower bound left unset.
    clipped = self._impl.clip(None, threshold)
    return XArray(impl=clipped)
def tail(self, n=10):
    """
    Create an XArray that contains the last n elements of this XArray.

    Parameters
    ----------
    n : int
        The number of elements.

    Returns
    -------
    :class:`.XArray`
        A new XArray which contains the last n rows of the current XArray.
    """
    last_rows = self._impl.tail(n)
    return XArray(impl=last_rows)
def countna(self):
    """
    Count the number of missing values in the XArray.

    A missing value is represented in a float XArray as 'NaN' or None.
    A missing value in other types of XArrays is None.

    Returns
    -------
    int
        The count of missing values.
    """
    missing_count = self._impl.count_missing_values()
    return missing_count
def dropna(self):
    """
    Create a new XArray containing only the non-missing values of this
    XArray.

    A missing value is represented in a float XArray as 'NaN' or None.
    A missing value in other types of XArrays is None.

    Returns
    -------
    :class:`.XArray`
        The new XArray with missing values removed.
    """
    kept = self._impl.drop_missing_values()
    return XArray(impl=kept)
def fillna(self, value):
    """
    Create a new XArray with all missing values (None or NaN) filled in
    with the given value.

    The size of the new XArray will be the same as the original XArray.
    If the given value is not the same type as the values in the XArray,
    `fillna` will attempt to convert the value to the original XArray's
    type.  If this fails, an error will be raised.

    Parameters
    ----------
    value : type convertible to XArray's type
        The value used to replace all missing values.

    Returns
    -------
    :class:`.XArray`
        A new XArray with all missing values filled.
    """
    filled = self._impl.fill_missing_values(value)
    return XArray(impl=filled)
def topk_index(self, topk=10, reverse=False):
    """
    Create an XArray indicating which elements are in the top k.

    Entries are '1' if the corresponding element in the current XArray is
    a part of the top k elements, and '0' if that corresponding element is
    not.  Order is descending by default.

    Parameters
    ----------
    topk : int
        The number of elements to determine if 'top'

    reverse : bool
        If True, return the topk elements in ascending order

    Returns
    -------
    :class:`.XArray` of int

    Raises
    ------
    TypeError
        If `topk` is not an int.

    Notes
    -----
    This is used internally by XFrame's topk function.
    """
    if isinstance(topk, int):
        flags = self._impl.topk_index(topk, reverse)
        return XArray(impl=flags)
    raise TypeError("'Topk_index': topk must be an integer ({})".format(topk))
def sketch_summary(self, sub_sketch_keys=None):
    """
    Summary statistics that can be calculated with one pass over the XArray.

    Returns a :class:`~xframes.Sketch` object which can be further queried
    for many descriptive statistics over this XArray.  Many of the
    statistics are approximate.  See the :class:`~xframes.Sketch`
    documentation for more detail.

    Parameters
    ----------
    sub_sketch_keys : int | str | list of int | list of str, optional
        For XArray of dict type, also constructs sketches for a given set
        of keys.  For XArray of array type, also constructs sketches for
        the given indexes.  A single key or index (any value without an
        ``__iter__`` attribute) is treated as a one-element list.  The sub
        sketches may be queried using
        :py:func:`~xframes.Sketch.element_sub_sketch()`.
        Defaults to None, in which case no subsketches will be constructed.

    Returns
    -------
    :class:`.Sketch`
        Sketch object that contains descriptive statistics for this XArray.
        Many of the statistics are approximate.

    Raises
    ------
    TypeError
        If `sub_sketch_keys` is given and the XArray is not of dict or
        array type, if the keys for a dict XArray are not strings, or if
        the keys for an array XArray are not ints.
    ValueError
        If the members of `sub_sketch_keys` are not all of one type.
    """
    from xframes.sketch import Sketch

    if sub_sketch_keys is not None:
        dtype = self.dtype()
        if not issubclass(dtype, (dict, array.array)):
            raise TypeError("'Sub_sketch_keys' is only supported for " +
                            'XArray of dictionary or array type')
        if not hasattr(sub_sketch_keys, "__iter__"):
            sub_sketch_keys = [sub_sketch_keys]
        value_types = set([type(i) for i in sub_sketch_keys])
        if len(value_types) != 1:
            raise ValueError("'Sub_sketch_keys' member values need to have the same type.")
        value_type = value_types.pop()
        # value_type is a class, so it must be tested with issubclass;
        # isinstance(value_type, ...) is always False and rejected every key.
        if issubclass(dtype, dict) and not issubclass(value_type, basestring):
            raise TypeError("Only string value(s) can be passed to 'sub_sketch_keys' " +
                            'for XArray of dictionary type. ' +
                            'For dictionary types, sketch summary is ' +
                            'computed by casting keys to string values.')
        if issubclass(dtype, array.array) and not issubclass(value_type, int):
            raise TypeError("Only int value(s) can be passed to 'sub_sketch_keys' " +
                            'for XArray of array type')
    return Sketch(self, sub_sketch_keys=sub_sketch_keys)
def append(self, other):
    """
    Append an XArray to the current XArray.

    Creates a new XArray with the rows from both XArrays.  Both XArrays
    must be of the same data type.

    Parameters
    ----------
    other : :class:`.XArray`
        Another XArray whose rows are appended to current XArray.

    Returns
    -------
    :class:`.XArray`
        A new XArray that contains rows from both XArrays, with rows from
        the other XArray coming after all rows from the current XArray.

    Raises
    ------
    RuntimeError
        If `other` is not an XArray, or if the two XArrays do not have the
        same data type.

    See Also
    --------
    xframes.XFrame.append
        Appends XFrames

    Examples
    --------
    >>> xa = xframes.XArray([1, 2, 3])
    >>> xa2 = xframes.XArray([4, 5, 6])
    >>> xa.append(xa2)
    dtype: int
    Rows: 6
    [1, 2, 3, 4, 5, 6]
    """
    if not isinstance(other, XArray):
        raise RuntimeError('XArray append can only work with XArray.')

    same_type = self.dtype() is other.dtype()
    if not same_type:
        raise RuntimeError('Data types in both XArrays have to be the same.')

    combined = self._impl.append(other.impl())
    return XArray(impl=combined)
def unique(self):
    """
    Get all unique values in the current XArray.

    Will not necessarily preserve the order of the given XArray in the new
    XArray.  Raises a TypeError if the XArray is of dictionary type.

    Returns
    -------
    :class:`.XArray`
        A new XArray that contains the unique values of the current XArray.

    See Also
    --------
    xframes.XFrame.unique
        Unique rows in XFrames.
    """
    distinct = self._impl.unique()
    return XArray(impl=distinct)
def item_length(self):
    """
    Compute the length of each element in the current XArray.

    Only works on XArrays of string, dict, array, or list type.  If a given
    element is a missing value, then the output element is also a missing
    value.  This function is equivalent to the following but more
    performant:

        xa_item_len = xa.apply(lambda x: len(x) if x is not None else None)

    Returns
    -------
    :class:`.XArray`
        A new XArray, each element in the XArray is the len of the
        corresponding item in the original XArray.

    Raises
    ------
    TypeError
        If the XArray's data type is not str, list, dict or array.array.

    Examples
    --------
    >>> xa = XArray([
    ...  {"is_restaurant": 1, "is_electronics": 0},
    ...  {"is_restaurant": 1, "is_retail": 1, "is_electronics": 0},
    ...  {"is_restaurant": 0, "is_retail": 1, "is_electronics": 0},
    ...  {"is_restaurant": 0},
    ...  {"is_restaurant": 1, "is_electronics": 1},
    ...  None])
    >>> xa.item_length()
    dtype: int
    Rows: 6
    [2, 3, 3, 1, 2, None]
    """
    sized_types = (str, list, dict, array.array)
    if issubclass(self.dtype(), sized_types):
        lengths = self._impl.item_length()
        return XArray(impl=lengths)
    raise TypeError("Item_length() is only applicable for XArray of type 'str', 'list', "
                    "'dict' and 'array'.")
def split_datetime(self, column_name_prefix='X', limit=None):
    """
    Split an XArray of datetime type into multiple columns, returning a new
    XFrame that contains the expanded columns.

    By default an XArray of datetime is split into an XFrame of 6 columns,
    one for each year/month/day/hour/minute/second element.

    Column naming: the new columns are named prefix.year, prefix.month,
    etc.  The prefix is set by the parameter `column_name_prefix` and
    defaults to 'X'.  If `column_name_prefix` is None or empty, then no
    prefix is used.

    Parameters
    ----------
    column_name_prefix : str, optional
        If provided, expanded column names would start with the given
        prefix.  Defaults to "X".

    limit : str, list[str], optional
        Limits the set of datetime elements to expand.
        Elements may be 'year', 'month', 'day', 'hour', 'minute', and
        'second'.  A single string is treated as a one-element list.

    Returns
    -------
    :class:`.XFrame`
        A new XFrame that contains all expanded columns

    Raises
    ------
    TypeError
        If the XArray is not of datetime type, if `column_name_prefix` is
        not a string, or if `limit` is not iterable or holds non-string
        values.
    ValueError
        If `limit` holds a string that is not one of the six element names.

    Examples
    --------
    To expand only day and year elements of a datetime XArray

    >>> xa = XArray(
    ...     [datetime.datetime(2011, 1, 21, 7, 7, 21),
    ...      datetime.datetime(2010, 2, 5, 7, 8, 21)])
    >>> xa.split_datetime(column_name_prefix=None,limit=['day','year'])
    Columns:
        day    int
        year   int
    Rows: 2
    Data:
    +-------+--------+
    |  day  |  year  |
    +-------+--------+
    |   21  |  2011  |
    |   5   |  2010  |
    +-------+--------+
    [2 rows x 2 columns]
    """
    if not issubclass(self.dtype(), datetime.datetime):
        raise TypeError('Only column of datetime type can be split.')

    if column_name_prefix is None:
        column_name_prefix = ''
    if not isinstance(column_name_prefix, str):
        raise TypeError("'Column_name_prefix' must be a string.")

    if limit is None:
        # No restriction requested: expand every datetime element.
        limit = ['year', 'month', 'day', 'hour', 'minute', 'second']
        column_types = [int, int, int, int, int, int]
    else:
        if isinstance(limit, str):
            limit = [limit]
        if not hasattr(limit, '__iter__'):
            raise TypeError("'Limit' must be a list.")
        # Type check runs over the whole list before any name is validated.
        if not all(isinstance(element, str) for element in limit):
            raise TypeError("'Limit' must contain string values.")
        allowed = ('year', 'month', 'day', 'hour', 'minute', 'second')
        for item in limit:
            if item not in allowed:
                raise ValueError(
                    "'Limit' values may be 'year', 'month', 'day', 'hour', 'minute', or 'second': {}"
                    .format(item))
        # Every expanded column is an int.
        column_types = [int for _ in limit]

    expanded = self._impl.split_datetime(column_name_prefix, limit, column_types)
    return xframes.XFrame(impl=expanded)
# noinspection PyTypeChecker
def unpack(self, column_name_prefix='X', column_types=None, na_value=None, limit=None):
    """
    Convert an XArray of list, array, or dict type to an XFrame with
    multiple columns.

    `unpack` expands an XArray using the values of each list/array/dict as
    elements in a new XFrame of multiple columns.  For example, an XArray
    of lists each of length 4 will be expanded into an XFrame of 4 columns,
    one for each list element.  An XArray of lists/tuples/arrays of varying
    size will be expanded to a number of columns equal to the longest
    list/array.  An XArray of dictionaries will be expanded into as many
    columns as there are keys.

    When unpacking an XArray of list or array type, new columns are named:
    `column_name_prefix`.0, `column_name_prefix`.1, etc.  If unpacking a
    column of dict type, unpacked columns are named
    `column_name_prefix`.key1, `column_name_prefix`.key2, etc.

    When unpacking an XArray of list or dictionary types, missing values in
    the original element remain as missing values in the resultant columns.
    If the `na_value` parameter is specified, all values equal to this
    given value are also replaced with missing values.  In an XArray of
    array.array type, NaN is interpreted as a missing value.

    :py:func:`xframes.XFrame.pack_columns()` is the reverse effect of unpack

    Parameters
    ----------
    column_name_prefix : str, optional
        If provided, unpacked column names would start with the given
        prefix.  None is treated as the empty string.

    column_types : list[type], optional
        Column types for the unpacked columns.  If not provided, column
        types are automatically inferred from the first 100 rows.
        Defaults to None.  Each entry must be one of int, float, str,
        list, dict or array.array.  When given for a dict XArray, `limit`
        must be given as well.

    na_value : optional
        Convert all values that are equal to `na_value` to missing value
        if specified.

    limit : list, optional
        Limits the set of list/array/dict keys to unpack.
        For list/array XArrays, 'limit' must contain integer indices.
        For dict XArray, 'limit' must contain dictionary keys.
        All values must be of one type and must not repeat.

    Returns
    -------
    :class:`.XFrame`
        A new XFrame that contains all unpacked columns

    Raises
    ------
    TypeError
        If the XArray is not of dict/list/tuple/array type, or if
        `column_name_prefix`, `limit` or `column_types` fail validation.
    ValueError
        If `limit` contains duplicates, if `limit` and `column_types`
        differ in length, or if `column_types` is given without `limit`
        for a dict XArray.
    RuntimeError
        If `column_types` is not given and the non-missing rows among the
        first 100 are absent or all empty, so nothing can be inferred.

    Examples
    --------
    To unpack a dict XArray

    >>> xa = XArray([{ 'word': 'a',     'count': 1},
    ...              { 'word': 'cat',   'count': 2},
    ...              { 'word': 'is',    'count': 3},
    ...              { 'word': 'coming','count': 4}])

    Normal case of unpacking XArray of type dict:

    >>> xa.unpack(column_name_prefix=None)
    Columns:
        count   int
        word    str
    <BLANKLINE>
    Rows: 4
    <BLANKLINE>
    Data:
    +-------+--------+
    | count |  word  |
    +-------+--------+
    |   1   |   a    |
    |   2   |  cat   |
    |   3   |   is   |
    |   4   | coming |
    +-------+--------+
    [4 rows x 2 columns]
    <BLANKLINE>

    Unpack only keys with 'word':

    >>> xa.unpack(limit=['word'])
    Columns:
        X.word  str
    <BLANKLINE>
    Rows: 4
    <BLANKLINE>
    Data:
    +--------+
    | X.word |
    +--------+
    |   a    |
    |  cat   |
    |   is   |
    | coming |
    +--------+
    [4 rows x 1 columns]
    <BLANKLINE>

    >>> xa2 = XArray([
    ...               [1, 0, 1],
    ...               [1, 1, 1],
    ...               [0, 1]])

    Convert all zeros to missing values:

    >>> xa2.unpack(column_types=[int, int, int], na_value=0)
    Columns:
        X.0     int
        X.1     int
        X.2     int
    <BLANKLINE>
    Rows: 3
    <BLANKLINE>
    Data:
    +------+------+------+
    | X.0  | X.1  | X.2  |
    +------+------+------+
    |  1   | None |  1   |
    |  1   |  1   |  1   |
    | None |  1   | None |
    +------+------+------+
    [3 rows x 3 columns]
    <BLANKLINE>
    """
    # True for None and for float NaN.
    def is_missing(val):
        if val is None:
            return True
        if isinstance(val, float) and math.isnan(val):
            return True
        return False

    # Map an array.array typecode to int or float; None if unrecognized.
    def type_from_typecode(typecode):
        if typecode in 'cbBuhHiIlL':
            return int
        if typecode in 'fd':
            return float
        return None

    # For each key, use the type of the first non-missing value seen for it
    # in head_rows, and return the types in the order of `keys`.
    # NOTE(review): a key in `keys` that never has a non-missing value in
    # head_rows is absent from the dict, so the final lookup raises KeyError.
    # noinspection PyShadowingNames
    def make_column_types(head_rows, keys):
        column_types = {}
        for row in head_rows:
            for key in row.keys():
                val = row[key]
                if key not in column_types and not is_missing(val):
                    column_types[key] = type(val)
        return [column_types[key] for key in keys]

    if not issubclass(self.dtype(), (dict, array.array, list, tuple)):
        raise TypeError('Only XArray of dict/list/tuple/array type supports unpack: {}.'.format(
            self.dtype().__name__))

    if column_name_prefix is None:
        column_name_prefix = ""
    if not isinstance(column_name_prefix, str):
        raise TypeError("'Column_name_prefix' must be a string.")

    # validate 'limit'
    if limit is not None:
        if not hasattr(limit, '__iter__'):
            raise TypeError("'Limit' must be a list.")

        # An empty 'limit' yields zero types and is rejected by this check too.
        name_types = set([type(i) for i in limit])
        if len(name_types) != 1:
            raise TypeError("'Limit' contains values that are different types.")

        # limit values must be integers unless unpacking a dict XArray
        if not issubclass(self.dtype(), dict) and not issubclass(name_types.pop(), int):
            raise TypeError("'Limit' must contain integer values.")

        if len(set(limit)) != len(limit):
            raise ValueError("'Limit' contains duplicate values.")

    if column_types is not None:
        # Caller supplied the types: validate them and derive 'limit' if needed.
        if not hasattr(column_types, '__iter__'):
            raise TypeError("'column_types' must be a list.")

        for column_type in column_types:
            if column_type not in (int, float, str, list, dict, array.array):
                raise TypeError("'Column_types' contains unsupported types. " +
                                "Supported types are ['float', 'int', 'list', " +
                                "'dict', 'str', 'array.array'].")

        if limit is not None:
            if len(limit) != len(column_types):
                raise ValueError("'Limit' and 'column_types' do not have the same length.")
        elif issubclass(self.dtype(), dict):
            raise ValueError("If 'column_types' is given, " +
                             "'limit' has to be provided to unpack dict type.")
        else:
            # list/array: one positional index per supplied column type
            limit = range(len(column_types))

    else:
        # No types supplied: infer them from the non-missing rows among the first 100.
        head_rows = self.head(100).dropna()
        lengths = [len(i) for i in head_rows]
        if len(lengths) == 0 or max(lengths) == 0:
            raise RuntimeError('Cannot infer number of items from the XArray. ' +
                               'XArray may be empty. ' +
                               'Please explicitly provide column types.')

        # infer column types for dict type at server side,
        # for list and array, infer from client side
        if not issubclass(self.dtype(), dict):
            length = max(lengths)
            if limit is None:
                limit = range(length)
            else:
                # adjust the length
                length = len(limit)

            if issubclass(self.dtype(), array.array):
                # every column gets the type implied by the first sampled row's typecode
                typ = type_from_typecode(head_rows[0].typecode)
                column_types = [typ for _ in range(length)]
            else:
                column_types = list()
                for i in limit:
                    # i-th item of each sampled row, None where the row is too short
                    t = [(x[i] if ((x is not None) and len(x) > i) else None) for x in head_rows]
                    column_types.append(infer_type_of_list(t))
        else:  # self.dtype() is dict
            if limit is None:
                # unpack the union of keys seen in the sampled rows
                key_set = set()
                for row in head_rows:
                    key_set |= set(row.keys())
                # translate to indexes
                limit = list(key_set)
            if column_types is None:
                column_types = make_column_types(head_rows, limit)

    return xframes.XFrame(impl=self._impl.unpack(column_name_prefix, limit, column_types, na_value))
def sort(self, ascending=True):
    """
    Sort all values in this XArray.

    Sort only works for XArrays of type str, int, float and
    datetime.datetime; otherwise TypeError will be raised.  Creates a new,
    sorted XArray.

    Parameters
    ----------
    ascending : boolean, optional
       If True, the XArray values are sorted in ascending order, otherwise,
       descending order.

    Returns
    -------
    :class:`.XArray`
        The sorted XArray.

    Raises
    ------
    TypeError
        If the XArray's data type is not int, float, str or
        datetime.datetime.

    Examples
    --------
    >>> xa = XArray([3,2,1])
    >>> xa.sort()
    dtype: int
    Rows: 3
    [1, 2, 3]
    """
    sortable_types = (int, float, str, datetime.datetime)
    if issubclass(self.dtype(), sortable_types):
        ordered = self._impl.sort(ascending)
        return XArray(impl=ordered)
    raise TypeError("Only xarray with type ('int', 'float', 'str', and 'datetime.datetime)' can be sorted.")