mirror of
https://github.com/saymrwulf/zipline.git
synced 2026-05-14 20:58:10 +00:00
* BLD: Try github actions again * new requirements for p36 * fix code different across numpy versions * silence the correct warnings for tests to run * MAINT: Use loc instead of deprecated ix * comment out windows for now Co-authored-by: Richard Frank <rich@quantopian.com>
630 lines
21 KiB
Python
630 lines
21 KiB
Python
from itertools import product
|
|
from operator import eq, ne
|
|
import warnings
|
|
|
|
import numpy as np
|
|
from toolz import take
|
|
|
|
from zipline.lib.labelarray import LabelArray
|
|
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
|
|
from zipline.testing.predicates import assert_equal
|
|
from zipline.utils.compat import unicode
|
|
|
|
|
|
def rotN(l, N):
|
|
"""
|
|
Rotate a list of elements.
|
|
|
|
Pulls N elements off the end of the list and appends them to the front.
|
|
|
|
>>> rotN(['a', 'b', 'c', 'd'], 2)
|
|
['c', 'd', 'a', 'b']
|
|
>>> rotN(['a', 'b', 'c', 'd'], 3)
|
|
['d', 'a', 'b', 'c']
|
|
"""
|
|
assert len(l) >= N, "Can't rotate list by longer than its length."
|
|
return l[N:] + l[:N]
|
|
|
|
|
|
def all_ufuncs():
|
|
ufunc_type = type(np.isnan)
|
|
return (f for f in vars(np).values() if isinstance(f, ufunc_type))
|
|
|
|
|
|
class LabelArrayTestCase(ZiplineTestCase):
|
|
|
|
@classmethod
|
|
def init_class_fixtures(cls):
|
|
super(LabelArrayTestCase, cls).init_class_fixtures()
|
|
|
|
cls.rowvalues = row = ['', 'a', 'b', 'ab', 'a', '', 'b', 'ab', 'z']
|
|
cls.strs = np.array([rotN(row, i) for i in range(3)], dtype=object)
|
|
|
|
def test_fail_on_direct_construction(self):
|
|
# See https://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray # noqa
|
|
|
|
with self.assertRaises(TypeError) as e:
|
|
np.ndarray.__new__(LabelArray, (5, 5))
|
|
|
|
self.assertEqual(
|
|
str(e.exception),
|
|
"Direct construction of LabelArrays is not supported."
|
|
)
|
|
|
|
@parameter_space(
|
|
__fail_fast=True,
|
|
compval=['', 'a', 'z', 'not in the array'],
|
|
shape=[(27,), (3, 9), (3, 3, 3)],
|
|
array_astype=(bytes, unicode, object),
|
|
missing_value=('', 'a', 'not in the array', None),
|
|
)
|
|
def test_compare_to_str(self,
|
|
compval,
|
|
shape,
|
|
array_astype,
|
|
missing_value):
|
|
|
|
strs = self.strs.reshape(shape).astype(array_astype)
|
|
if missing_value is None:
|
|
# As of numpy 1.9.2, object array != None returns just False
|
|
# instead of an array, with a deprecation warning saying the
|
|
# behavior will change in the future. Work around that by just
|
|
# using the ufunc.
|
|
notmissing = np.not_equal(strs, missing_value)
|
|
else:
|
|
if not isinstance(missing_value, array_astype):
|
|
missing_value = array_astype(missing_value, 'utf-8')
|
|
notmissing = (strs != missing_value)
|
|
|
|
arr = LabelArray(strs, missing_value=missing_value)
|
|
|
|
if not isinstance(compval, array_astype):
|
|
compval = array_astype(compval, 'utf-8')
|
|
|
|
# arr.missing_value should behave like NaN.
|
|
check_arrays(
|
|
arr == compval,
|
|
(strs == compval) & notmissing,
|
|
)
|
|
check_arrays(
|
|
arr != compval,
|
|
(strs != compval) & notmissing,
|
|
)
|
|
|
|
np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
|
|
check_arrays(
|
|
arr.startswith(compval),
|
|
np_startswith(strs) & notmissing,
|
|
)
|
|
|
|
np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
|
|
check_arrays(
|
|
arr.endswith(compval),
|
|
np_endswith(strs) & notmissing,
|
|
)
|
|
|
|
np_contains = np.vectorize(lambda elem: compval in elem)
|
|
check_arrays(
|
|
arr.has_substring(compval),
|
|
np_contains(strs) & notmissing,
|
|
)
|
|
|
|
@parameter_space(
|
|
__fail_fast=True,
|
|
f=[
|
|
lambda s: str(len(s)),
|
|
lambda s: s[0],
|
|
lambda s: ''.join(reversed(s)),
|
|
lambda s: '',
|
|
]
|
|
)
|
|
def test_map(self, f):
|
|
data = np.array(
|
|
[['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
|
|
['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
|
|
['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
|
|
['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
|
|
dtype=object,
|
|
)
|
|
la = LabelArray(data, missing_value=None)
|
|
|
|
numpy_transformed = np.vectorize(f)(data)
|
|
la_transformed = la.map(f).as_string_array()
|
|
|
|
assert_equal(numpy_transformed, la_transformed)
|
|
|
|
@parameter_space(missing=['A', None])
|
|
def test_map_ignores_missing_value(self, missing):
|
|
data = np.array([missing, 'B', 'C'], dtype=object)
|
|
la = LabelArray(data, missing_value=missing)
|
|
|
|
def increment_char(c):
|
|
return chr(ord(c) + 1)
|
|
|
|
result = la.map(increment_char)
|
|
expected = LabelArray([missing, 'C', 'D'], missing_value=missing)
|
|
assert_equal(result.as_string_array(), expected.as_string_array())
|
|
|
|
@parameter_space(
|
|
__fail_fast=True,
|
|
f=[
|
|
lambda s: 0,
|
|
lambda s: 0.0,
|
|
lambda s: object(),
|
|
]
|
|
)
|
|
def test_map_requires_f_to_return_a_string_or_none(self, f):
|
|
la = LabelArray(self.strs, missing_value=None)
|
|
|
|
with self.assertRaises(TypeError):
|
|
la.map(f)
|
|
|
|
def test_map_can_only_return_none_if_missing_value_is_none(self):
|
|
|
|
# Should work.
|
|
la = LabelArray(self.strs, missing_value=None)
|
|
result = la.map(lambda x: None)
|
|
|
|
check_arrays(
|
|
result,
|
|
LabelArray(np.full_like(self.strs, None), missing_value=None),
|
|
)
|
|
|
|
la = LabelArray(self.strs, missing_value="__MISSING__")
|
|
with self.assertRaises(TypeError):
|
|
la.map(lambda x: None)
|
|
|
|
@parameter_space(
|
|
__fail_fast=True,
|
|
missing_value=('', 'a', 'not in the array', None),
|
|
)
|
|
def test_compare_to_str_array(self, missing_value):
|
|
strs = self.strs
|
|
shape = strs.shape
|
|
arr = LabelArray(strs, missing_value=missing_value)
|
|
|
|
if missing_value is None:
|
|
# As of numpy 1.9.2, object array != None returns just False
|
|
# instead of an array, with a deprecation warning saying the
|
|
# behavior will change in the future. Work around that by just
|
|
# using the ufunc.
|
|
notmissing = np.not_equal(strs, missing_value)
|
|
else:
|
|
notmissing = (strs != missing_value)
|
|
|
|
check_arrays(arr.not_missing(), notmissing)
|
|
check_arrays(arr.is_missing(), ~notmissing)
|
|
|
|
# The arrays are equal everywhere, but comparisons against the
|
|
# missing_value should always produce False
|
|
check_arrays(strs == arr, notmissing)
|
|
check_arrays(strs != arr, np.zeros_like(strs, dtype=bool))
|
|
|
|
def broadcastable_row(value, dtype):
|
|
return np.full((shape[0], 1), value, dtype=strs.dtype)
|
|
|
|
def broadcastable_col(value, dtype):
|
|
return np.full((1, shape[1]), value, dtype=strs.dtype)
|
|
|
|
# Test comparison between arr and a like-shap 2D array, a column
|
|
# vector, and a row vector.
|
|
for comparator, dtype, value in product((eq, ne),
|
|
(bytes, unicode, object),
|
|
set(self.rowvalues)):
|
|
check_arrays(
|
|
comparator(arr, np.full_like(strs, value)),
|
|
comparator(strs, value) & notmissing,
|
|
)
|
|
check_arrays(
|
|
comparator(arr, broadcastable_row(value, dtype=dtype)),
|
|
comparator(strs, value) & notmissing,
|
|
)
|
|
check_arrays(
|
|
comparator(arr, broadcastable_col(value, dtype=dtype)),
|
|
comparator(strs, value) & notmissing,
|
|
)
|
|
|
|
@parameter_space(
|
|
__fail_fast=True,
|
|
slice_=[
|
|
0, 1, -1,
|
|
slice(None),
|
|
slice(0, 0),
|
|
slice(0, 3),
|
|
slice(1, 4),
|
|
slice(0),
|
|
slice(None, 1),
|
|
slice(0, 4, 2),
|
|
(slice(None), 1),
|
|
(slice(None), slice(None)),
|
|
(slice(None), slice(1, 2)),
|
|
]
|
|
)
|
|
def test_slicing_preserves_attributes(self, slice_):
|
|
arr = LabelArray(self.strs.reshape((9, 3)), missing_value='')
|
|
sliced = arr[slice_]
|
|
self.assertIsInstance(sliced, LabelArray)
|
|
self.assertIs(sliced.categories, arr.categories)
|
|
self.assertIs(sliced.reverse_categories, arr.reverse_categories)
|
|
self.assertIs(sliced.missing_value, arr.missing_value)
|
|
|
|
def test_infer_categories(self):
|
|
"""
|
|
Test that categories are inferred in sorted order if they're not
|
|
explicitly passed.
|
|
"""
|
|
arr1d = LabelArray(self.strs, missing_value='')
|
|
codes1d = arr1d.as_int_array()
|
|
self.assertEqual(arr1d.shape, self.strs.shape)
|
|
self.assertEqual(arr1d.shape, codes1d.shape)
|
|
|
|
categories = arr1d.categories
|
|
unique_rowvalues = set(self.rowvalues)
|
|
|
|
# There should be an entry in categories for each unique row value, and
|
|
# each integer stored in the data array should be an index into
|
|
# categories.
|
|
self.assertEqual(list(categories), sorted(set(self.rowvalues)))
|
|
self.assertEqual(
|
|
set(codes1d.ravel()),
|
|
set(range(len(unique_rowvalues)))
|
|
)
|
|
for idx, value in enumerate(arr1d.categories):
|
|
check_arrays(
|
|
self.strs == value,
|
|
arr1d.as_int_array() == idx,
|
|
)
|
|
|
|
# It should be equivalent to pass the same set of categories manually.
|
|
arr1d_explicit_categories = LabelArray(
|
|
self.strs,
|
|
missing_value='',
|
|
categories=arr1d.categories,
|
|
)
|
|
check_arrays(arr1d, arr1d_explicit_categories)
|
|
|
|
for shape in (9, 3), (3, 9), (3, 3, 3):
|
|
strs2d = self.strs.reshape(shape)
|
|
arr2d = LabelArray(strs2d, missing_value='')
|
|
codes2d = arr2d.as_int_array()
|
|
|
|
self.assertEqual(arr2d.shape, shape)
|
|
check_arrays(arr2d.categories, categories)
|
|
|
|
for idx, value in enumerate(arr2d.categories):
|
|
check_arrays(strs2d == value, codes2d == idx)
|
|
|
|
def test_reject_ufuncs(self):
|
|
"""
|
|
The internal values of a LabelArray should be opaque to numpy ufuncs.
|
|
|
|
Test that all unfuncs fail.
|
|
"""
|
|
labels = LabelArray(self.strs, '')
|
|
ints = np.arange(len(labels))
|
|
|
|
with warnings.catch_warnings():
|
|
# Some ufuncs return NotImplemented, but warn that they will fail
|
|
# in the future. Both outcomes are fine, so ignore the warnings.
|
|
warnings.filterwarnings(
|
|
'ignore',
|
|
message="unorderable dtypes.*",
|
|
category=DeprecationWarning,
|
|
)
|
|
warnings.filterwarnings(
|
|
'ignore',
|
|
message="elementwise comparison failed.*",
|
|
category=FutureWarning,
|
|
)
|
|
for func in all_ufuncs():
|
|
# Different ufuncs vary between returning NotImplemented and
|
|
# raising a TypeError when provided with unknown dtypes.
|
|
# This is a bit unfortunate, but still better than silently
|
|
# accepting an int array.
|
|
try:
|
|
if func.nin == 1:
|
|
ret = func(labels)
|
|
elif func.nin == 2:
|
|
ret = func(labels, ints)
|
|
else:
|
|
self.fail("Who added a ternary ufunc !?!")
|
|
except (TypeError, ValueError):
|
|
pass
|
|
else:
|
|
self.assertIs(ret, NotImplemented)
|
|
|
|
@parameter_space(
|
|
__fail_fast=True,
|
|
val=['', 'a', 'not in the array', None],
|
|
missing_value=['', 'a', 'not in the array', None],
|
|
)
|
|
def test_setitem_scalar(self, val, missing_value):
|
|
arr = LabelArray(self.strs, missing_value=missing_value)
|
|
|
|
if not arr.has_label(val):
|
|
self.assertTrue(
|
|
(val == 'not in the array')
|
|
or (val is None and missing_value is not None)
|
|
)
|
|
for slicer in [(0, 0), (0, 1), 1]:
|
|
with self.assertRaises(ValueError):
|
|
arr[slicer] = val
|
|
return
|
|
|
|
arr[0, 0] = val
|
|
self.assertEqual(arr[0, 0], val)
|
|
|
|
arr[0, 1] = val
|
|
self.assertEqual(arr[0, 1], val)
|
|
|
|
arr[1] = val
|
|
if val == missing_value:
|
|
self.assertTrue(arr.is_missing()[1].all())
|
|
else:
|
|
self.assertTrue((arr[1] == val).all())
|
|
self.assertTrue((arr[1].as_string_array() == val).all())
|
|
|
|
arr[:, -1] = val
|
|
if val == missing_value:
|
|
self.assertTrue(arr.is_missing()[:, -1].all())
|
|
else:
|
|
self.assertTrue((arr[:, -1] == val).all())
|
|
self.assertTrue((arr[:, -1].as_string_array() == val).all())
|
|
|
|
arr[:] = val
|
|
if val == missing_value:
|
|
self.assertTrue(arr.is_missing().all())
|
|
else:
|
|
self.assertFalse(arr.is_missing().any())
|
|
self.assertTrue((arr == val).all())
|
|
|
|
def test_setitem_array(self):
|
|
arr = LabelArray(self.strs, missing_value=None)
|
|
orig_arr = arr.copy()
|
|
|
|
# Write a row.
|
|
self.assertFalse(
|
|
(arr[0] == arr[1]).all(),
|
|
"This test doesn't test anything because rows 0"
|
|
" and 1 are already equal!"
|
|
)
|
|
arr[0] = arr[1]
|
|
for i in range(arr.shape[1]):
|
|
self.assertEqual(arr[0, i], arr[1, i])
|
|
|
|
# Write a column.
|
|
self.assertFalse(
|
|
(arr[:, 0] == arr[:, 1]).all(),
|
|
"This test doesn't test anything because columns 0"
|
|
" and 1 are already equal!"
|
|
)
|
|
arr[:, 0] = arr[:, 1]
|
|
for i in range(arr.shape[0]):
|
|
self.assertEqual(arr[i, 0], arr[i, 1])
|
|
|
|
# Write the whole array.
|
|
arr[:] = orig_arr
|
|
check_arrays(arr, orig_arr)
|
|
|
|
@staticmethod
|
|
def check_roundtrip(arr):
|
|
assert_equal(
|
|
arr.as_string_array(),
|
|
LabelArray(
|
|
arr.as_string_array(),
|
|
arr.missing_value,
|
|
).as_string_array(),
|
|
)
|
|
|
|
@staticmethod
|
|
def create_categories(width, plus_one):
|
|
length = int(width / 8) + plus_one
|
|
return [
|
|
''.join(cs)
|
|
for cs in take(
|
|
2 ** width + plus_one,
|
|
product([chr(c) for c in range(256)], repeat=length),
|
|
)
|
|
]
|
|
|
|
def test_narrow_code_storage(self):
|
|
create_categories = self.create_categories
|
|
check_roundtrip = self.check_roundtrip
|
|
|
|
# uint8
|
|
categories = create_categories(8, plus_one=False)
|
|
arr = LabelArray(
|
|
categories,
|
|
missing_value=categories[0],
|
|
categories=categories,
|
|
)
|
|
self.assertEqual(arr.itemsize, 1)
|
|
check_roundtrip(arr)
|
|
|
|
# uint8 inference
|
|
arr = LabelArray(categories, missing_value=categories[0])
|
|
self.assertEqual(arr.itemsize, 1)
|
|
check_roundtrip(arr)
|
|
|
|
# just over uint8
|
|
categories = create_categories(8, plus_one=True)
|
|
arr = LabelArray(
|
|
categories,
|
|
missing_value=categories[0],
|
|
categories=categories,
|
|
)
|
|
self.assertEqual(arr.itemsize, 2)
|
|
check_roundtrip(arr)
|
|
|
|
# fits in uint16
|
|
categories = create_categories(16, plus_one=False)
|
|
arr = LabelArray(
|
|
categories,
|
|
missing_value=categories[0],
|
|
categories=categories,
|
|
)
|
|
self.assertEqual(arr.itemsize, 2)
|
|
check_roundtrip(arr)
|
|
|
|
# uint16 inference
|
|
arr = LabelArray(categories, missing_value=categories[0])
|
|
self.assertEqual(arr.itemsize, 2)
|
|
check_roundtrip(arr)
|
|
|
|
# just over uint16
|
|
categories = create_categories(16, plus_one=True)
|
|
arr = LabelArray(
|
|
categories,
|
|
missing_value=categories[0],
|
|
categories=categories,
|
|
)
|
|
self.assertEqual(arr.itemsize, 4)
|
|
check_roundtrip(arr)
|
|
|
|
# uint32 inference
|
|
arr = LabelArray(categories, missing_value=categories[0])
|
|
self.assertEqual(arr.itemsize, 4)
|
|
check_roundtrip(arr)
|
|
|
|
# NOTE: we could do this for 32 and 64; however, no one has enough RAM
|
|
# or time for that.
|
|
|
|
def test_known_categories_without_missing_at_boundary(self):
|
|
# This tests the case where we have exactly 256 unique categories but
|
|
# we didn't include the missing value in the categories.
|
|
categories = self.create_categories(8, plus_one=False)
|
|
|
|
arr = LabelArray(
|
|
categories,
|
|
None,
|
|
categories=categories,
|
|
)
|
|
self.check_roundtrip(arr)
|
|
# the missing value pushes us into 2 byte storage
|
|
self.assertEqual(arr.itemsize, 2)
|
|
|
|
def test_narrow_condense_back_to_valid_size(self):
|
|
categories = ['a'] * (2 ** 8 + 1)
|
|
arr = LabelArray(categories, missing_value=categories[0])
|
|
assert_equal(arr.itemsize, 1)
|
|
self.check_roundtrip(arr)
|
|
|
|
# longer than int16 but still fits when deduped
|
|
categories = self.create_categories(16, plus_one=False)
|
|
categories.append(categories[0])
|
|
arr = LabelArray(categories, missing_value=categories[0])
|
|
assert_equal(arr.itemsize, 2)
|
|
self.check_roundtrip(arr)
|
|
|
|
def test_map_shrinks_code_storage_if_possible(self):
|
|
arr = LabelArray(
|
|
# Drop the last value so we fit in a uint16 with None as a missing
|
|
# value.
|
|
self.create_categories(16, plus_one=False)[:-1],
|
|
missing_value=None,
|
|
)
|
|
|
|
self.assertEqual(arr.itemsize, 2)
|
|
|
|
def either_A_or_B(s):
|
|
return ('A', 'B')[sum(ord(c) for c in s) % 2]
|
|
|
|
result = arr.map(either_A_or_B)
|
|
|
|
self.assertEqual(set(result.categories), {'A', 'B', None})
|
|
self.assertEqual(result.itemsize, 1)
|
|
|
|
assert_equal(
|
|
np.vectorize(either_A_or_B)(arr.as_string_array()),
|
|
result.as_string_array(),
|
|
)
|
|
|
|
def test_map_never_increases_code_storage_size(self):
|
|
# This tests a pathological case where a user maps an impure function
|
|
# that returns a different label on every invocation, which in a naive
|
|
# implementation could cause us to need to **increase** the size of our
|
|
# codes after a map.
|
|
#
|
|
# This doesn't happen, however, because we guarantee that the user's
|
|
# mapping function will be called on each unique category exactly once,
|
|
# which means we can never increase the number of categories in the
|
|
# LabelArray after mapping.
|
|
|
|
# Using all but one of the categories so that we still fit in a uint8
|
|
# with an extra category for None as a missing value.
|
|
categories = self.create_categories(8, plus_one=False)[:-1]
|
|
|
|
larger_categories = self.create_categories(16, plus_one=False)
|
|
|
|
# Double the length of the categories so that we have to increase the
|
|
# required size after our map.
|
|
categories_twice = categories + categories
|
|
|
|
arr = LabelArray(categories_twice, missing_value=None)
|
|
assert_equal(arr.itemsize, 1)
|
|
|
|
gen_unique_categories = iter(larger_categories)
|
|
|
|
def new_string_every_time(c):
|
|
# Return a new unique category every time so that every result is
|
|
# different.
|
|
return next(gen_unique_categories)
|
|
|
|
result = arr.map(new_string_every_time)
|
|
|
|
# Result should still be of size 1.
|
|
assert_equal(result.itemsize, 1)
|
|
|
|
# Result should be the first `len(categories)` entries from the larger
|
|
# categories, repeated twice.
|
|
expected = LabelArray(
|
|
larger_categories[:len(categories)] * 2,
|
|
missing_value=None,
|
|
)
|
|
assert_equal(result.as_string_array(), expected.as_string_array())
|
|
|
|
def manual_narrow_condense_back_to_valid_size_slow(self):
|
|
"""This test is really slow so we don't want it run by default.
|
|
"""
|
|
# tests that we don't try to create an 'int24' (which is meaningless)
|
|
categories = self.create_categories(24, plus_one=False)
|
|
categories.append(categories[0])
|
|
arr = LabelArray(categories, missing_value=categories[0])
|
|
assert_equal(arr.itemsize, 4)
|
|
self.check_roundtrip(arr)
|
|
|
|
def test_copy_categories_list(self):
|
|
"""regression test for #1927
|
|
"""
|
|
categories = ['a', 'b', 'c']
|
|
|
|
LabelArray(
|
|
[None, 'a', 'b', 'c'],
|
|
missing_value=None,
|
|
categories=categories,
|
|
)
|
|
|
|
# before #1927 we didn't take a copy and would insert the missing value
|
|
# (None) into the list
|
|
assert_equal(categories, ['a', 'b', 'c'])
|
|
|
|
def test_fortran_contiguous_input(self):
|
|
|
|
strs = np.array([['a', 'b', 'c', 'd'],
|
|
['a', 'b', 'c', 'd'],
|
|
['a', 'b', 'c', 'd']], dtype=object)
|
|
strs_F = strs.T
|
|
self.assertTrue(strs_F.flags.f_contiguous)
|
|
|
|
arr = LabelArray(
|
|
strs_F,
|
|
missing_value=None,
|
|
categories=['a', 'b', 'c', 'd', None],
|
|
)
|
|
assert_equal(arr.as_string_array(), strs_F)
|
|
|
|
arr = LabelArray(
|
|
strs_F,
|
|
missing_value=None,
|
|
)
|
|
assert_equal(arr.as_string_array(), strs_F)
|