mirror of
https://github.com/saymrwulf/zipline.git
synced 2026-05-16 21:10:11 +00:00
1769 lines
61 KiB
Python
1769 lines
61 KiB
Python
"""
Tests for Factor terms.
"""
|
|
from functools import partial
|
|
from itertools import product
|
|
from nose_parameterized import parameterized
|
|
from unittest import TestCase, skipIf
|
|
|
|
from toolz import compose
|
|
import numpy as np
|
|
from numpy import (
|
|
apply_along_axis,
|
|
arange,
|
|
array,
|
|
datetime64,
|
|
empty,
|
|
eye,
|
|
inf,
|
|
log1p,
|
|
nan,
|
|
ones,
|
|
ones_like,
|
|
rot90,
|
|
where,
|
|
)
|
|
from numpy.random import randn, seed
|
|
import pandas as pd
|
|
from scipy.stats.mstats import winsorize as scipy_winsorize
|
|
|
|
from zipline.errors import BadPercentileBounds, UnknownRankMethod
|
|
from zipline.lib.labelarray import LabelArray
|
|
from zipline.lib.rank import masked_rankdata_2d
|
|
from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
|
|
from zipline.pipeline import Classifier, Factor, Filter, Pipeline
|
|
from zipline.pipeline.data import DataSet, Column, EquityPricing
|
|
from zipline.pipeline.factors import (
|
|
CustomFactor,
|
|
DailyReturns,
|
|
Returns,
|
|
PercentChange,
|
|
)
|
|
from zipline.pipeline.factors.factor import (
|
|
summary_funcs,
|
|
winsorize as zp_winsorize,
|
|
)
|
|
from zipline.testing import (
|
|
check_allclose,
|
|
check_arrays,
|
|
parameter_space,
|
|
permute_rows,
|
|
)
|
|
from zipline.testing.fixtures import (
|
|
WithUSEquityPricingPipelineEngine,
|
|
ZiplineTestCase,
|
|
)
|
|
from zipline.testing.predicates import assert_equal
|
|
from zipline.utils.numpy_utils import (
|
|
as_column,
|
|
categorical_dtype,
|
|
datetime64ns_dtype,
|
|
float64_dtype,
|
|
ignore_nanwarnings,
|
|
int64_dtype,
|
|
NaTns,
|
|
)
|
|
from zipline.utils.math_utils import nanmean, nanstd
|
|
from zipline.utils.pandas_utils import new_pandas, skip_pipeline_new_pandas
|
|
|
|
from .base import BaseUSEquityPipelineTestCase
|
|
|
|
|
|
class F(Factor):
    """Bare float64 Factor with no inputs; its values are injected by tests."""
    window_length = 0
    inputs = ()
    dtype = float64_dtype
|
|
|
|
|
|
class OtherF(Factor):
    """Second bare float64 Factor, distinct from ``F``, with no inputs."""
    window_length = 0
    inputs = ()
    dtype = float64_dtype
|
|
|
|
|
|
class C(Classifier):
    """Bare int64 Classifier with no inputs; -1 marks a missing label."""
    window_length = 0
    inputs = ()
    missing_value = -1
    dtype = int64_dtype
|
|
|
|
|
|
class OtherC(Classifier):
    """Second bare int64 Classifier, distinct from ``C``; -1 marks missing."""
    window_length = 0
    inputs = ()
    missing_value = -1
    dtype = int64_dtype
|
|
|
|
|
|
class Mask(Filter):
    """Bare Filter with no inputs; its values are injected by tests."""
    window_length = 0
    inputs = ()
|
|
|
|
|
|
# Decorator that expands a test into one case per factor dtype.  Each case
# passes a label and the dtype itself as the two positional arguments.
for_each_factor_dtype = parameterized.expand(
    [
        ('datetime64[ns]', datetime64ns_dtype),
        ('float', float64_dtype),
    ],
)
|
|
|
|
|
|
def scipy_winsorize_with_nan_handling(array, limits):
    """
    Winsorize the non-NaN entries of a 1D array, leaving NaNs in place.

    scipy.stats.mstats.winsorize is only ever handed the non-NaN values, so
    NaNs do not take part in the percentile computation.

    Parameters
    ----------
    array : np.ndarray
        One-dimensional float array, possibly containing NaNs.
    limits : float or tuple
        Forwarded unchanged to scipy's ``winsorize``.

    Returns
    -------
    np.ndarray
        New array of the same length: non-NaN values winsorized, NaNs kept at
        their original positions.  An all-NaN input yields a plain copy.
    """
    nan_total = np.isnan(array).sum()
    if nan_total == len(array):
        # Nothing to winsorize.
        return array.copy()

    # argsort places NaNs last, so they form the tail of the ordered values.
    order = array.argsort()
    # The argsort of a permutation is its inverse permutation.
    inverse_order = order.argsort()

    ordered = array[order]
    finite_part = ordered[:-nan_total] if nan_total else ordered

    clipped = scipy_winsorize(finite_part, limits).data
    padded = np.hstack([clipped, np.full(nan_total, np.nan)])

    # Scatter the values back to where they came from.
    return padded[inverse_order]
|
|
|
|
|
|
class FactorTestCase(BaseUSEquityPipelineTestCase):
|
|
|
|
def init_instance_fixtures(self):
    """Run the parent fixture setup, then store a shared ``F`` as ``self.f``."""
    parent = super(FactorTestCase, self)
    parent.init_instance_fixtures()
    self.f = F()
|
|
|
|
def test_bad_input(self):
    """An unrecognized rank method name raises UnknownRankMethod."""
    self.assertRaises(
        UnknownRankMethod,
        self.f.rank,
        "not a real rank method",
    )
|
|
|
|
@parameter_space(method_name=['isnan', 'notnan', 'isfinite'])
def test_float64_only_ops(self, method_name):
    """Float-only predicates raise TypeError on a datetime64 Factor."""

    class NotFloat(Factor):
        window_length = 0
        inputs = ()
        dtype = datetime64ns_dtype

    # Attribute lookup happens before assertRaises is entered, so only the
    # call itself is expected to raise.
    bound_method = getattr(NotFloat(), method_name)
    self.assertRaises(TypeError, bound_method)
|
|
|
|
@parameter_space(custom_missing_value=[-1, 0])
def test_isnull_int_dtype(self, custom_missing_value):
    """isnull/notnull on an int64 Factor follow its declared missing_value."""

    class CustomMissingValue(Factor):
        dtype = int64_dtype
        window_length = 0
        missing_value = custom_missing_value
        inputs = ()

    factor = CustomMissingValue()

    # Put the missing value on the main diagonal of a 5x5 grid.
    on_diagonal = eye(5, dtype=bool)
    grid = arange(25).reshape(5, 5)
    grid[on_diagonal] = custom_missing_value

    computed = {
        'isnull': factor.isnull(),
        'notnull': factor.notnull(),
    }
    wanted = {
        'isnull': on_diagonal,
        'notnull': ~on_diagonal,
    }
    self.check_terms(
        computed,
        wanted,
        initial_workspace={factor: grid},
        mask=self.build_mask(ones((5, 5))),
    )
|
|
|
|
def test_isnull_datetime_dtype(self):
    """isnull/notnull on a datetime64 Factor flag exactly the NaT entries."""

    class DatetimeFactor(Factor):
        inputs = ()
        window_length = 0
        dtype = datetime64ns_dtype

    factor = DatetimeFactor()

    # Put NaT on the main diagonal of a 5x5 grid of datetimes.
    on_diagonal = eye(5, dtype=bool)
    grid = arange(25).reshape(5, 5).astype('datetime64[ns]')
    grid[on_diagonal] = NaTns

    computed = {
        'isnull': factor.isnull(),
        'notnull': factor.notnull(),
    }
    wanted = {
        'isnull': on_diagonal,
        'notnull': ~on_diagonal,
    }
    self.check_terms(
        computed,
        wanted,
        initial_workspace={factor: grid},
        mask=self.build_mask(ones((5, 5))),
    )
|
|
|
|
@for_each_factor_dtype
def test_rank_ascending(self, name, factor_dtype):
    """Ascending ranks match hand-computed values for every rank method."""
    factor = F(dtype=factor_dtype)

    # Same values as: arange(25).reshape(5, 5).transpose() % 4
    raw = array([[0, 1, 2, 3, 0],
                 [1, 2, 3, 0, 1],
                 [2, 3, 0, 1, 2],
                 [3, 0, 1, 2, 3],
                 [0, 1, 2, 3, 0]], dtype=factor_dtype)

    ranks_by_method = {
        'ordinal': array([[1., 3., 4., 5., 2.],
                          [2., 4., 5., 1., 3.],
                          [3., 5., 1., 2., 4.],
                          [4., 1., 2., 3., 5.],
                          [1., 3., 4., 5., 2.]]),
        'average': array([[1.5, 3., 4., 5., 1.5],
                          [2.5, 4., 5., 1., 2.5],
                          [3.5, 5., 1., 2., 3.5],
                          [4.5, 1., 2., 3., 4.5],
                          [1.5, 3., 4., 5., 1.5]]),
        'min': array([[1., 3., 4., 5., 1.],
                      [2., 4., 5., 1., 2.],
                      [3., 5., 1., 2., 3.],
                      [4., 1., 2., 3., 4.],
                      [1., 3., 4., 5., 1.]]),
        'max': array([[2., 3., 4., 5., 2.],
                      [3., 4., 5., 1., 3.],
                      [4., 5., 1., 2., 4.],
                      [5., 1., 2., 3., 5.],
                      [2., 3., 4., 5., 2.]]),
        'dense': array([[1., 2., 3., 4., 1.],
                        [2., 3., 4., 1., 2.],
                        [3., 4., 1., 2., 3.],
                        [4., 1., 2., 3., 4.],
                        [1., 2., 3., 4., 1.]]),
    }

    def verify(terms):
        # Each term is keyed by the rank method whose output it should match.
        self.check_terms(
            terms,
            expected={key: ranks_by_method[key] for key in terms},
            initial_workspace={factor: raw},
            mask=self.build_mask(ones((5, 5))),
        )

    # ``ascending`` omitted, then passed explicitly.
    verify({
        method: factor.rank(method=method)
        for method in ranks_by_method
    })
    verify({
        method: factor.rank(method=method, ascending=True)
        for method in ranks_by_method
    })

    # With no method given, the result should match 'ordinal'.
    verify({'ordinal': factor.rank()})
    verify({'ordinal': factor.rank(ascending=True)})
|
|
|
|
@for_each_factor_dtype
def test_rank_descending(self, name, factor_dtype):
    """Descending ranks match hand-computed values for every rank method."""
    factor = F(dtype=factor_dtype)

    # Same values as: arange(25).reshape(5, 5).transpose() % 4
    raw = array([[0, 1, 2, 3, 0],
                 [1, 2, 3, 0, 1],
                 [2, 3, 0, 1, 2],
                 [3, 0, 1, 2, 3],
                 [0, 1, 2, 3, 0]], dtype=factor_dtype)

    ranks_by_method = {
        'ordinal': array([[4., 3., 2., 1., 5.],
                          [3., 2., 1., 5., 4.],
                          [2., 1., 5., 4., 3.],
                          [1., 5., 4., 3., 2.],
                          [4., 3., 2., 1., 5.]]),
        'average': array([[4.5, 3., 2., 1., 4.5],
                          [3.5, 2., 1., 5., 3.5],
                          [2.5, 1., 5., 4., 2.5],
                          [1.5, 5., 4., 3., 1.5],
                          [4.5, 3., 2., 1., 4.5]]),
        'min': array([[4., 3., 2., 1., 4.],
                      [3., 2., 1., 5., 3.],
                      [2., 1., 5., 4., 2.],
                      [1., 5., 4., 3., 1.],
                      [4., 3., 2., 1., 4.]]),
        'max': array([[5., 3., 2., 1., 5.],
                      [4., 2., 1., 5., 4.],
                      [3., 1., 5., 4., 3.],
                      [2., 5., 4., 3., 2.],
                      [5., 3., 2., 1., 5.]]),
        'dense': array([[4., 3., 2., 1., 4.],
                        [3., 2., 1., 4., 3.],
                        [2., 1., 4., 3., 2.],
                        [1., 4., 3., 2., 1.],
                        [4., 3., 2., 1., 4.]]),
    }

    def verify(terms):
        # Each term is keyed by the rank method whose output it should match.
        self.check_terms(
            terms,
            expected={key: ranks_by_method[key] for key in terms},
            initial_workspace={factor: raw},
            mask=self.build_mask(ones((5, 5))),
        )

    verify({
        method: factor.rank(method=method, ascending=False)
        for method in ranks_by_method
    })

    # With no method given, the result should match 'ordinal'.
    verify({'ordinal': factor.rank(ascending=False)})
|
|
|
|
@for_each_factor_dtype
def test_rank_after_mask(self, name, factor_dtype):
    """Ranking with a mask excludes masked cells and re-ranks the rest."""
    factor = F(dtype=factor_dtype)
    masker = Mask()

    # Same values as: arange(25).reshape(5, 5).transpose() % 4
    raw = array([[0, 1, 2, 3, 0],
                 [1, 2, 3, 0, 1],
                 [2, 3, 0, 1, 2],
                 [3, 0, 1, 2, 3],
                 [0, 1, 2, 3, 0]], dtype=factor_dtype)
    # False on the main diagonal, True everywhere else.
    off_diagonal = ~eye(5, dtype=bool)

    computed = {
        "ascending_nomask": factor.rank(ascending=True),
        "ascending_mask": factor.rank(ascending=True, mask=masker),
        "descending_nomask": factor.rank(ascending=False),
        "descending_mask": factor.rank(ascending=False, mask=masker),
    }

    wanted = {
        "ascending_nomask": array([[1., 3., 4., 5., 2.],
                                   [2., 4., 5., 1., 3.],
                                   [3., 5., 1., 2., 4.],
                                   [4., 1., 2., 3., 5.],
                                   [1., 3., 4., 5., 2.]]),
        "descending_nomask": array([[4., 3., 2., 1., 5.],
                                    [3., 2., 1., 5., 4.],
                                    [2., 1., 5., 4., 3.],
                                    [1., 5., 4., 3., 2.],
                                    [4., 3., 2., 1., 5.]]),
        # The diagonal becomes nan.  Within each row, every rank that was
        # greater than the diagonal entry's unmasked rank drops by one.
        "ascending_mask": array([[nan, 2., 3., 4., 1.],
                                 [2., nan, 4., 1., 3.],
                                 [2., 4., nan, 1., 3.],
                                 [3., 1., 2., nan, 4.],
                                 [1., 2., 3., 4., nan]]),
        "descending_mask": array([[nan, 3., 2., 1., 4.],
                                  [2., nan, 1., 4., 3.],
                                  [2., 1., nan, 4., 3.],
                                  [1., 4., 3., nan, 2.],
                                  [4., 3., 2., 1., nan]]),
    }

    self.check_terms(
        computed,
        wanted,
        {factor: raw, masker: off_diagonal},
        mask=self.build_mask(ones((5, 5))),
    )
|
|
|
|
@for_each_factor_dtype
def test_grouped_rank_ascending(self, name, factor_dtype=float64_dtype):
    """Ascending ranks within classifier groups match hand-computed values.

    Every check is run against both an int64 classifier and a string
    (categorical) classifier carrying the same labels.
    """
    factor = F(dtype=factor_dtype)
    int_groups = C()
    str_groups = C(dtype=categorical_dtype, missing_value=None)

    # Same values as: arange(25).reshape(5, 5).transpose() % 4
    raw = array([[0, 1, 2, 3, 0],
                 [1, 2, 3, 0, 1],
                 [2, 3, 0, 1, 2],
                 [3, 0, 1, 2, 3],
                 [0, 1, 2, 3, 0]], dtype=factor_dtype)

    # Same values as: arange(25).reshape(5, 5).transpose() % 2
    int_labels = array([[0, 1, 0, 1, 0],
                        [1, 0, 1, 0, 1],
                        [0, 1, 0, 1, 0],
                        [1, 0, 1, 0, 1],
                        [0, 1, 0, 1, 0]], dtype=int64_dtype)
    str_labels = LabelArray(
        int_labels.astype(str).astype(object),
        missing_value=None,
    )

    ranks_by_method = {
        'ordinal': array(
            [[1., 1., 3., 2., 2.],
             [1., 2., 3., 1., 2.],
             [2., 2., 1., 1., 3.],
             [2., 1., 1., 2., 3.],
             [1., 1., 3., 2., 2.]]
        ),
        'average': array(
            [[1.5, 1., 3., 2., 1.5],
             [1.5, 2., 3., 1., 1.5],
             [2.5, 2., 1., 1., 2.5],
             [2.5, 1., 1., 2., 2.5],
             [1.5, 1., 3., 2., 1.5]]
        ),
        'min': array(
            [[1., 1., 3., 2., 1.],
             [1., 2., 3., 1., 1.],
             [2., 2., 1., 1., 2.],
             [2., 1., 1., 2., 2.],
             [1., 1., 3., 2., 1.]]
        ),
        'max': array(
            [[2., 1., 3., 2., 2.],
             [2., 2., 3., 1., 2.],
             [3., 2., 1., 1., 3.],
             [3., 1., 1., 2., 3.],
             [2., 1., 3., 2., 2.]]
        ),
        'dense': array(
            [[1., 1., 2., 2., 1.],
             [1., 2., 2., 1., 1.],
             [2., 2., 1., 1., 2.],
             [2., 1., 1., 2., 2.],
             [1., 1., 2., 2., 1.]]
        ),
    }

    workspace = {
        factor: raw,
        int_groups: int_labels,
        str_groups: str_labels,
    }

    def verify(terms):
        # Each term is keyed by the rank method whose output it should match.
        self.check_terms(
            terms,
            expected={key: ranks_by_method[key] for key in terms},
            initial_workspace=workspace,
            mask=self.build_mask(ones((5, 5))),
        )

    # Leaving ``ascending`` unspecified should behave like ascending=True.
    verify({
        method: factor.rank(method=method, groupby=int_groups)
        for method in ranks_by_method
    })
    verify({
        method: factor.rank(method=method, groupby=str_groups)
        for method in ranks_by_method
    })
    verify({
        method: factor.rank(method=method, groupby=int_groups, ascending=True)
        for method in ranks_by_method
    })
    verify({
        method: factor.rank(method=method, groupby=str_groups, ascending=True)
        for method in ranks_by_method
    })

    # With no method given, the result should match 'ordinal'.
    verify({'ordinal': factor.rank(groupby=int_groups)})
    verify({'ordinal': factor.rank(groupby=str_groups)})
    verify({'ordinal': factor.rank(groupby=int_groups, ascending=True)})
    verify({'ordinal': factor.rank(groupby=str_groups, ascending=True)})
|
|
|
|
@for_each_factor_dtype
def test_grouped_rank_descending(self, name, factor_dtype):
    """Descending ranks within classifier groups match hand-computed values.

    Every check is run against both an int64 classifier and a string
    (categorical) classifier carrying the same labels.
    """
    factor = F(dtype=factor_dtype)
    int_groups = C()
    str_groups = C(dtype=categorical_dtype, missing_value=None)

    # Same values as: arange(25).reshape(5, 5).transpose() % 4
    raw = array([[0, 1, 2, 3, 0],
                 [1, 2, 3, 0, 1],
                 [2, 3, 0, 1, 2],
                 [3, 0, 1, 2, 3],
                 [0, 1, 2, 3, 0]], dtype=factor_dtype)

    # Same values as: arange(25).reshape(5, 5).transpose() % 2
    int_labels = array([[0, 1, 0, 1, 0],
                        [1, 0, 1, 0, 1],
                        [0, 1, 0, 1, 0],
                        [1, 0, 1, 0, 1],
                        [0, 1, 0, 1, 0]], dtype=int64_dtype)
    str_labels = LabelArray(
        int_labels.astype(str).astype(object),
        missing_value=None,
    )

    ranks_by_method = {
        'ordinal': array(
            [[2., 2., 1., 1., 3.],
             [2., 1., 1., 2., 3.],
             [1., 1., 3., 2., 2.],
             [1., 2., 3., 1., 2.],
             [2., 2., 1., 1., 3.]]
        ),
        'average': array(
            [[2.5, 2., 1., 1., 2.5],
             [2.5, 1., 1., 2., 2.5],
             [1.5, 1., 3., 2., 1.5],
             [1.5, 2., 3., 1., 1.5],
             [2.5, 2., 1., 1., 2.5]]
        ),
        'min': array(
            [[2., 2., 1., 1., 2.],
             [2., 1., 1., 2., 2.],
             [1., 1., 3., 2., 1.],
             [1., 2., 3., 1., 1.],
             [2., 2., 1., 1., 2.]]
        ),
        'max': array(
            [[3., 2., 1., 1., 3.],
             [3., 1., 1., 2., 3.],
             [2., 1., 3., 2., 2.],
             [2., 2., 3., 1., 2.],
             [3., 2., 1., 1., 3.]]
        ),
        'dense': array(
            [[2., 2., 1., 1., 2.],
             [2., 1., 1., 2., 2.],
             [1., 1., 2., 2., 1.],
             [1., 2., 2., 1., 1.],
             [2., 2., 1., 1., 2.]]
        ),
    }

    workspace = {
        factor: raw,
        int_groups: int_labels,
        str_groups: str_labels,
    }

    def verify(terms):
        # Each term is keyed by the rank method whose output it should match.
        self.check_terms(
            terms,
            expected={key: ranks_by_method[key] for key in terms},
            initial_workspace=workspace,
            mask=self.build_mask(ones((5, 5))),
        )

    verify({
        method: factor.rank(method=method, groupby=int_groups, ascending=False)
        for method in ranks_by_method
    })
    verify({
        method: factor.rank(method=method, groupby=str_groups, ascending=False)
        for method in ranks_by_method
    })

    # With no method given, the result should match 'ordinal'.
    verify({'ordinal': factor.rank(groupby=int_groups, ascending=False)})
    verify({'ordinal': factor.rank(groupby=str_groups, ascending=False)})
|
|
|
|
@parameterized.expand([
    (100, 15),
    (101, 4),
    (102, 100),
])
def test_returns(self, seed_value, window_length):
    """Returns.compute yields (last - first) / first over the window."""
    term = Returns(window_length=window_length)

    # Seeded so the random window is reproducible.
    seed(seed_value)
    window = abs(randn(window_length, 3))

    oldest, newest = window[0], window[-1]
    wanted = (newest - oldest) / oldest

    result = empty((3,), dtype=float)
    term.compute(datetime64(1, 'ns'), arange(3), result, window)

    check_allclose(wanted, result)
|
|
|
|
@parameterized.expand([
    (100, 15),
    (101, 4),
    (102, 100),
])
def test_percentchange(self, seed_value, window_length):
    """PercentChange.compute handles sign changes, zero and nan baselines.

    Also checks that construction fails with ValueError when there are no
    inputs or when the window length is 1.
    """
    term = PercentChange(
        inputs=[EquityPricing.close],
        window_length=window_length,
    )

    # Only the first and last rows are fixed; the middle rows are random.
    seed(seed_value)  # Seeded so the filler rows are reproducible.
    filler = randn(window_length - 2, 8)
    oldest = array([1, 2, 2, 1, -1, -1, 0, nan])
    newest = array([2, 1, 2, -2, 2, -2, 1, 1])
    window = np.vstack([oldest, filler, newest])

    # Hand-computed; consistent with (newest - oldest) / abs(oldest).
    wanted = array([1, -0.5, 0, -3, 3, -1, inf, nan])

    result = empty((8,), dtype=float)
    term.compute(datetime64(1, 'ns'), arange(8), result, window)

    check_allclose(wanted, result)

    # Invalid constructions.
    self.assertRaises(
        ValueError,
        PercentChange,
        inputs=(),
        window_length=2,
    )
    self.assertRaises(
        ValueError,
        PercentChange,
        inputs=[EquityPricing.close],
        window_length=1,
    )
|
|
|
|
def gen_ranking_cases():
    """
    Build the parameter grid for test_masked_rankdata_2d.

    Returns an iterator of
    ``(seed, method, use_mask, set_missing, ascending)`` tuples covering
    every combination of the values below.
    """
    both = (True, False)
    return product(
        range(10000, 100000, 10000),  # seeds: 10000, 20000, ..., 90000
        ('ordinal', 'average'),       # rank methods
        both,                         # use_mask
        both,                         # set_missing
        both,                         # ascending
    )
|
|
|
|
@parameterized.expand(gen_ranking_cases())
def test_masked_rankdata_2d(self,
                            seed_value,
                            method,
                            use_mask,
                            set_missing,
                            ascending):
    """
    Ranking float data and its datetime64 reinterpretation must agree.

    The datetime array is a copy of the float array viewed as
    datetime64[ns], so both hold the same bytes.  The two are ranked with
    the same mask, method and direction, and the results are compared.

    Parameters
    ----------
    seed_value : int
        Seed for the random data; also used to scale it.
    method : str
        Rank method passed to masked_rankdata_2d.
    use_mask : bool
        If True, mask out the main diagonal; otherwise mask nothing.
    set_missing : bool
        If True, column 2 is set to the dtype's missing value (nan / NaT).
    ascending : bool
        Rank direction passed to masked_rankdata_2d.
    """
    eyemask = ~eye(5, dtype=bool)
    nomask = ones((5, 5), dtype=bool)

    seed(seed_value)
    asfloat = (randn(5, 5) * seed_value)
    asdatetime = (asfloat).copy().view('datetime64[ns]')

    mask = eyemask if use_mask else nomask
    if set_missing:
        asfloat[:, 2] = nan
        asdatetime[:, 2] = NaTns

    # Forward the parameterized direction.  It was previously hard-coded to
    # True, so descending ranking was never exercised by this test.
    float_result = masked_rankdata_2d(
        data=asfloat,
        mask=mask,
        missing_value=nan,
        method=method,
        ascending=ascending,
    )
    datetime_result = masked_rankdata_2d(
        data=asdatetime,
        mask=mask,
        missing_value=NaTns,
        method=method,
        ascending=ascending,
    )

    check_arrays(float_result, datetime_result)
|
|
|
|
def test_normalizations_hand_computed(self):
    """
    Test the hand-computed example in factor.demean.
    """
    factor = self.f
    masker = Mask()
    int_groups = C()
    str_groups = C(dtype=categorical_dtype, missing_value=None)

    values = array(
        [[1.0, 2.0, 3.0, 4.0],
         [1.5, 2.5, 3.5, 1.0],
         [2.0, 3.0, 4.0, 1.5],
         [2.5, 3.5, 1.0, 2.0]],
    )
    # False on the main diagonal.
    keep = array(
        [[False, True, True, True],
         [True, False, True, True],
         [True, True, False, True],
         [True, True, True, False]],
        dtype=bool,
    )
    # Columns 0-1 belong to group 1, columns 2-3 to group 2.
    int_labels = array(
        [[1, 1, 2, 2],
         [1, 1, 2, 2],
         [1, 1, 2, 2],
         [1, 1, 2, 2]],
        dtype=int64_dtype,
    )
    str_labels = LabelArray(
        int_labels.astype(str).astype(object),
        missing_value=None,
    )

    grouped_want = array(
        [[-0.500, 0.500, -0.500, 0.500],
         [-0.500, 0.500, 1.250, -1.250],
         [-0.500, 0.500, 1.250, -1.250],
         [-0.500, 0.500, -0.500, 0.500]],
    )
    grouped_masked_want = array(
        [[nan, 0.000, -0.500, 0.500],
         [0.000, nan, 1.250, -1.250],
         [-0.500, 0.500, nan, 0.000],
         [-0.500, 0.500, 0.000, nan]]
    )

    computed = {
        'vanilla': factor.demean(),
        'masked': factor.demean(mask=masker),
        'grouped': factor.demean(groupby=int_groups),
        'grouped_str': factor.demean(groupby=str_groups),
        'grouped_masked': factor.demean(mask=masker, groupby=int_groups),
        'grouped_masked_str': factor.demean(
            mask=masker,
            groupby=str_groups,
        ),
    }
    wanted = {
        'vanilla': array(
            [[-1.500, -0.500, 0.500, 1.500],
             [-0.625, 0.375, 1.375, -1.125],
             [-0.625, 0.375, 1.375, -1.125],
             [0.250, 1.250, -1.250, -0.250]],
        ),
        'masked': array(
            [[nan, -1.000, 0.000, 1.000],
             [-0.500, nan, 1.500, -1.000],
             [-0.166, 0.833, nan, -0.666],
             [0.166, 1.166, -1.333, nan]],
        ),
        'grouped': grouped_want,
        'grouped_masked': grouped_masked_want,
        # The classifier's dtype should make no difference to the output.
        'grouped_str': grouped_want,
        'grouped_masked_str': grouped_masked_want,
    }

    self.check_terms(
        computed,
        wanted,
        initial_workspace={
            factor: values,
            int_groups: int_labels,
            str_groups: str_labels,
            masker: keep,
        },
        mask=self.build_mask(self.ones_mask(shape=values.shape)),
        # The hand-computed numbers truncate repeating decimals at three
        # places, so compare loosely.  This only guards against the
        # example being badly wrong.
        check=partial(check_allclose, atol=0.001),
    )
|
|
|
|
def test_winsorize_hand_computed(self):
    """
    Test the hand-computed example in factor.winsorize.
    """
    factor = self.f
    masker = Mask()
    int_groups = C()
    str_groups = C(dtype=categorical_dtype, missing_value=None)

    # Shorthand for the nan-filled portions of the rows below.
    nan3 = [nan, nan, nan]
    nan9 = [nan] * 9

    values = array([
        [1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [1., 2., 3., 4., 5., 6.] + nan3,
        [1., 8., 27., 64., 125., 216.] + nan3,
        [6., 5., 4., 3., 2., 1.] + nan3,
        nan9,
    ])
    # Row 0 is unmasked; row i (i >= 1) masks out column i - 1.
    keep = array(
        [[1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 0, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 0, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 0, 1, 1, 1, 1, 1]],
        dtype=bool,
    )
    # Columns 3-5 belong to group 2, the rest to group 1.
    int_labels = array(
        [[1, 1, 1, 2, 2, 2, 1, 1, 1],
         [1, 1, 1, 2, 2, 2, 1, 1, 1],
         [1, 1, 1, 2, 2, 2, 1, 1, 1],
         [1, 1, 1, 2, 2, 2, 1, 1, 1],
         [1, 1, 1, 2, 2, 2, 1, 1, 1]],
        dtype=int64_dtype,
    )
    str_labels = LabelArray(
        int_labels.astype(str).astype(object),
        missing_value=None,
    )

    # All grouped variants share the same percentile bounds.
    grouped_winsorize = partial(
        factor.winsorize,
        min_percentile=0.34,
        max_percentile=0.66,
    )

    computed = {
        'winsor_1': factor.winsorize(
            min_percentile=0.33,
            max_percentile=0.67
        ),
        'winsor_2': factor.winsorize(
            min_percentile=0.49,
            max_percentile=1
        ),
        'winsor_3': factor.winsorize(
            min_percentile=0,
            max_percentile=.67
        ),
        'masked': factor.winsorize(
            min_percentile=0.33,
            max_percentile=0.67,
            mask=masker
        ),
        'grouped': grouped_winsorize(groupby=int_groups),
        'grouped_str': grouped_winsorize(groupby=str_groups),
        'grouped_masked': grouped_winsorize(
            mask=masker,
            groupby=int_groups
        ),
        'grouped_masked_str': grouped_winsorize(
            mask=masker,
            groupby=str_groups
        ),
    }

    grouped_want = array([
        [3., 3., 3., 5., 5., 5., 7., 7., 7.],
        [2., 2., 2., 5., 5., 5.] + nan3,
        [8., 8., 8., 125., 125., 125.] + nan3,
        [5., 5., 5., 2., 2., 2.] + nan3,
        nan9,
    ])
    grouped_masked_want = array([
        [3., 3., 3., 5., 5., 5., 7., 7., 7.],
        [nan, 2., 3., 5., 5., 5.] + nan3,
        [1.0, nan, 27., 125., 125., 125.] + nan3,
        [6.0, 5., nan, 2., 2., 2.] + nan3,
        nan9,
    ])

    wanted = {
        'winsor_1': array([
            [3., 3., 3., 4., 5., 6., 7., 7., 7.],
            [2., 2., 3., 4., 5., 5.] + nan3,
            [8., 8., 27., 64., 125., 125.] + nan3,
            [5., 5., 4., 3., 2., 2.] + nan3,
            nan9,
        ]),
        'winsor_2': array([
            [5., 5., 5., 5., 5., 6., 7., 8., 9.],
            [3.0, 3., 3., 4., 5., 6.] + nan3,
            [27., 27., 27., 64., 125., 216.] + nan3,
            [6.0, 5., 4., 3., 3., 3.] + nan3,
            nan9,
        ]),
        'winsor_3': array([
            [1., 2., 3., 4., 5., 6., 7., 7., 7.],
            [1., 2., 3., 4., 5., 5.] + nan3,
            [1., 8., 27., 64., 125., 125.] + nan3,
            [5., 5., 4., 3., 2., 1.] + nan3,
            nan9,
        ]),
        'masked': array([
            # The first row has nothing masked out.
            [3., 3., 3., 4., 5., 6., 7., 7., 7.],
            [nan, 3., 3., 4., 5., 5.] + nan3,
            [27., nan, 27., 64., 125., 125.] + nan3,
            [5.0, 5., nan, 3., 2., 2.] + nan3,
            nan9,
        ]),
        'grouped': grouped_want,
        'grouped_masked': grouped_masked_want,
        # The classifier's dtype should make no difference to the output.
        'grouped_str': grouped_want,
        'grouped_masked_str': grouped_masked_want,
    }

    self.check_terms(
        computed,
        wanted,
        initial_workspace={
            factor: values,
            int_groups: int_labels,
            str_groups: str_labels,
            masker: keep,
        },
        mask=self.build_mask(self.ones_mask(shape=values.shape)),
        check=partial(check_allclose, atol=0.001),
    )
|
|
|
|
def test_winsorize_no_nans(self):
    """zp_winsorize clips the expected tails of NaN-free data.

    Each case is run on the sorted data and on a shuffled copy; the
    expectation is shuffled the same way.
    """
    data = array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
    permutation = array([2, 1, 6, 8, 7, 5, 3, 9, 4, 0])

    # (min_percentile, max_percentile, expected result for sorted input)
    cases = [
        # Winsorize both tails at 90%.
        (0.1, 0.9, [1., 1., 2., 3., 4., 5., 6., 7., 8., 8.]),
        # Winsorize both tails at 80%.
        (0.2, 0.8, [2., 2., 2., 3., 4., 5., 6., 7., 7., 7.]),
        # Winsorize just the upper tail.
        (0.0, 0.8, [0., 1., 2., 3., 4., 5., 6., 7., 7., 7.]),
        # Winsorize just the lower tail.
        (0.2, 1.0, [2., 2., 2., 3., 4., 5., 6., 7., 8., 9.]),
        # Don't winsorize.
        (0.0, 1.0, [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]),
    ]

    for perm in (slice(None), permutation):
        for lower, upper, sorted_want in cases:
            result = zp_winsorize(data[perm], lower, upper)
            assert_equal(result, array(sorted_want)[perm])
|
|
|
|
def test_winsorize_nans(self):
    """zp_winsorize clips only non-nan values and leaves nans in place."""
    # Five low non-nan values, three nans, then five high non-nan values.
    data = array([4.0, 3.0, 0.0, 1.0, 2.0,
                  nan, nan, nan,
                  9.0, 5.0, 6.0, 8.0, 7.0])

    # (min_percentile, max_percentile, expected result)
    cases = [
        # Both tails at 10%: 0.0 -> 1.0 and 9.0 -> 8.0.
        (0.10, 0.90, [4.0, 3.0, 1.0, 1.0, 2.0,
                      nan, nan, nan,
                      8.0, 5.0, 6.0, 8.0, 7.0]),
        # Both tails at 20%: 0.0, 1.0 -> 2.0 and 9.0, 8.0 -> 7.0.
        (0.20, 0.80, [4.0, 3.0, 2.0, 2.0, 2.0,
                      nan, nan, nan,
                      7.0, 5.0, 6.0, 7.0, 7.0]),
        # Upper tail only.
        (0, 0.8, [4.0, 3.0, 0.0, 1.0, 2.0,
                  nan, nan, nan,
                  7.0, 5.0, 6.0, 7.0, 7.0]),
        # Lower tail only.
        (0.2, 1.0, [4.0, 3.0, 2.0, 2.0, 2.0,
                    nan, nan, nan,
                    9.0, 5.0, 6.0, 8.0, 7.0]),
    ]

    for lower, upper, want in cases:
        result = zp_winsorize(data, lower, upper)
        assert_equal(result, array(want))
|
|
|
|
def test_winsorize_bad_bounds(self):
    """
    Test out of bounds input for factor.winsorize.
    """
    factor = self.f

    invalid_bounds = (
        (-.1, 1),
        (0, 95),
        (5, 95),
        (5, 5),
        (.6, .4),
    )
    for lower, upper in invalid_bounds:
        self.assertRaises(
            BadPercentileBounds,
            factor.winsorize,
            min_percentile=lower,
            max_percentile=upper,
        )
|
|
|
|
@skipIf(new_pandas, skip_pipeline_new_pandas)
@parameter_space(
    seed_value=[1, 2],
    normalizer_name_and_func=[
        ('demean', {}, lambda row: row - nanmean(row)),
        ('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
        (
            'winsorize',
            {"min_percentile": 0.25, "max_percentile": 0.75},
            lambda row: scipy_winsorize_with_nan_handling(
                row,
                limits=0.25,
            )
        ),
    ],
    add_nulls_to_factor=(False, True,),
)
def test_normalizations_randomized(self,
                                   seed_value,
                                   normalizer_name_and_func,
                                   add_nulls_to_factor):
    """
    Compare each normalizer against a naive row-wise reference.

    ``normalizer_name_and_func`` is a triple of the Factor method name, the
    keyword arguments to call it with, and a reference function applied to
    one row (or one group within a row) at a time.
    """
    name, kwargs, reference = normalizer_name_and_func

    shape = (20, 20)

    everything = self.ones_mask(shape=shape)   # All Trues.
    off_main = self.eye_mask(shape=shape)      # Falses on main diagonal.
    off_anti = rot90(off_main)                 # Falses on other diagonal.
    off_both = off_main & off_anti             # Falses on both diagonals.

    # Block of random data, optionally with nans on the main diagonal.
    values = self.randn_data(seed=seed_value, shape=shape)
    if add_nulls_to_factor:
        values = where(off_main, values, nan)

    # Labels cycling through 0, 1, 2, ...
    labels = (
        (self.arange_data(shape=shape, dtype=int64_dtype) + seed_value) % 3
    )

    def with_nulls(keep):
        # Copy of ``labels`` with -1 wherever ``keep`` is False.
        return where(keep, labels, -1)

    labels_null_main = with_nulls(off_main)
    labels_null_anti = with_nulls(off_anti)
    labels_null_both = with_nulls(off_both)

    def reference_where(keep, groups):
        # Grouped reference result with nan wherever ``keep`` is False.
        return where(keep, grouped_apply(values, groups, reference), nan)

    factor = self.f
    int_groups = C()
    groups_with_nulls = OtherC()
    masker = Mask()
    normalize = partial(getattr(factor, name), **kwargs)

    computed = {
        'vanilla': normalize(),
        'masked': normalize(mask=masker),
        'grouped': normalize(groupby=int_groups),
        'grouped_with_nulls': normalize(groupby=groups_with_nulls),
        'both': normalize(mask=masker, groupby=int_groups),
        'both_with_nulls': normalize(
            mask=masker,
            groupby=groups_with_nulls,
        ),
    }

    wanted = {
        'vanilla': apply_along_axis(reference, 1, values,),
        'masked': reference_where(off_main, off_main),
        'grouped': grouped_apply(values, labels, reference),
        # Nulls in the classifier should produce nans at the same
        # locations in the output.
        'grouped_with_nulls': reference_where(off_anti, labels_null_anti),
        # A mask combined with a classifier should act as if the
        # classifier were null wherever the mask is False.
        'both': reference_where(off_main, labels_null_main),
        'both_with_nulls': reference_where(off_both, labels_null_both),
    }

    self.check_terms(
        terms=computed,
        expected=wanted,
        initial_workspace={
            factor: values,
            int_groups: labels,
            groups_with_nulls: labels_null_anti,
            masker: off_main,
        },
        mask=self.build_mask(everything),
    )
|
|
|
|
@parameter_space(method_name=['demean', 'zscore'])
def test_cant_normalize_non_float(self, method_name):
    """Normalizers raise a descriptive TypeError on non-float Factors."""

    class DateFactor(Factor):
        window_length = 0
        inputs = ()
        dtype = datetime64ns_dtype

    date_factor = DateFactor()
    normalizer = getattr(date_factor, method_name)
    with self.assertRaises(TypeError) as caught:
        normalizer()

    template = (
        "{normalizer}() is only defined on Factors of dtype float64,"
        " but it was called on a Factor of dtype datetime64[ns]."
    )
    self.assertEqual(
        str(caught.exception),
        template.format(normalizer=method_name),
    )
|
|
|
|
@parameter_space(seed=[1, 2, 3])
def test_quantiles_unmasked(self, seed):
    """Unmasked quantiles bucket each row's values by rank."""
    shuffle = partial(permute_rows, seed)
    shape = (6, 6)

    # Rows are shuffled so the result can't depend on input order.  log1p
    # keeps the test from depending on linear spacing or integer inputs.
    values = shuffle(log1p(arange(36, dtype=float).reshape(shape)))

    factor = self.f

    def shuffled_rows_of(row):
        # Six copies of ``row`` as int64, shuffled exactly like the input,
        # so each expectation lines up with the shuffled values.
        return shuffle(array([row] * 6, dtype=int64_dtype))

    self.check_terms(
        terms={
            '2': factor.quantiles(bins=2),
            '3': factor.quantiles(bins=3),
            '6': factor.quantiles(bins=6),
        },
        initial_workspace={
            factor: values,
        },
        expected={
            # Before shuffling each row is increasing, so its first half
            # lands in the bottom bucket and its second half in the top.
            '2': shuffled_rows_of([0, 0, 0, 1, 1, 1]),
            # Likewise for three buckets.
            '3': shuffled_rows_of([0, 0, 1, 1, 2, 2]),
            # In the limit, every column gets its own bucket.
            '6': shuffled_rows_of([0, 1, 2, 3, 4, 5]),
        },
        mask=self.build_mask(self.ones_mask(shape=shape)),
    )
|
|
|
|
@parameter_space(seed=[1, 2, 3])
def test_quantiles_masked(self, seed):
    # Quantiles when one entry per row is excluded, either through a
    # ``mask`` argument or through NaNs in the factor's own data.

    # 7 x 7: with one value per row removed, the remaining 6 split evenly
    # into 2, 3 and 6 buckets.
    grid_shape = (7, 7)

    def shuffle(rows):
        # Same seed for inputs and expectations keeps them aligned.
        return permute_rows(seed, rows)

    def shuffled_ints(rows):
        return shuffle(array(rows, dtype=int64_dtype))

    # Shuffle so the result cannot depend on input order; log1p so it
    # cannot depend on linear spacing or integrality.
    values = shuffle(log1p(arange(49, dtype=float).reshape(grid_shape)))
    keep_for_nan_input = shuffle(rot90(self.eye_mask(shape=grid_shape)))
    values_with_nans = where(keep_for_nan_input, values, nan)
    mask_values = shuffle(self.eye_mask(shape=grid_shape))

    factor = F()
    factor_with_nans = OtherF()
    mask_term = Mask()

    quantile_terms = {
        '2_masked': factor.quantiles(bins=2, mask=mask_term),
        '3_masked': factor.quantiles(bins=3, mask=mask_term),
        '6_masked': factor.quantiles(bins=6, mask=mask_term),
        '2_nans': factor_with_nans.quantiles(bins=2),
        '3_nans': factor_with_nans.quantiles(bins=3),
        '6_nans': factor_with_nans.quantiles(bins=6),
    }
    workspace = {
        factor: values,
        factor_with_nans: values_with_nans,
        mask_term: mask_values,
    }

    # The tables below match test_quantiles_unmasked with a -1 spliced
    # into each row: on the main diagonal for the masked terms, and on
    # the anti-diagonal for the NaN-input terms.
    two_masked = [
        [-1, 0, 0, 0, 1, 1, 1],
        [0, -1, 0, 0, 1, 1, 1],
        [0, 0, -1, 0, 1, 1, 1],
        [0, 0, 0, -1, 1, 1, 1],
        [0, 0, 0, 1, -1, 1, 1],
        [0, 0, 0, 1, 1, -1, 1],
        [0, 0, 0, 1, 1, 1, -1],
    ]
    three_masked = [
        [-1, 0, 0, 1, 1, 2, 2],
        [0, -1, 0, 1, 1, 2, 2],
        [0, 0, -1, 1, 1, 2, 2],
        [0, 0, 1, -1, 1, 2, 2],
        [0, 0, 1, 1, -1, 2, 2],
        [0, 0, 1, 1, 2, -1, 2],
        [0, 0, 1, 1, 2, 2, -1],
    ]
    six_masked = [
        [-1, 0, 1, 2, 3, 4, 5],
        [0, -1, 1, 2, 3, 4, 5],
        [0, 1, -1, 2, 3, 4, 5],
        [0, 1, 2, -1, 3, 4, 5],
        [0, 1, 2, 3, -1, 4, 5],
        [0, 1, 2, 3, 4, -1, 5],
        [0, 1, 2, 3, 4, 5, -1],
    ]
    two_nans = [
        [0, 0, 0, 1, 1, 1, -1],
        [0, 0, 0, 1, 1, -1, 1],
        [0, 0, 0, 1, -1, 1, 1],
        [0, 0, 0, -1, 1, 1, 1],
        [0, 0, -1, 0, 1, 1, 1],
        [0, -1, 0, 0, 1, 1, 1],
        [-1, 0, 0, 0, 1, 1, 1],
    ]
    three_nans = [
        [0, 0, 1, 1, 2, 2, -1],
        [0, 0, 1, 1, 2, -1, 2],
        [0, 0, 1, 1, -1, 2, 2],
        [0, 0, 1, -1, 1, 2, 2],
        [0, 0, -1, 1, 1, 2, 2],
        [0, -1, 0, 1, 1, 2, 2],
        [-1, 0, 0, 1, 1, 2, 2],
    ]
    six_nans = [
        [0, 1, 2, 3, 4, 5, -1],
        [0, 1, 2, 3, 4, -1, 5],
        [0, 1, 2, 3, -1, 4, 5],
        [0, 1, 2, -1, 3, 4, 5],
        [0, 1, -1, 2, 3, 4, 5],
        [0, -1, 1, 2, 3, 4, 5],
        [-1, 0, 1, 2, 3, 4, 5],
    ]
    expected_labels = {
        '2_masked': shuffled_ints(two_masked),
        '3_masked': shuffled_ints(three_masked),
        '6_masked': shuffled_ints(six_masked),
        '2_nans': shuffled_ints(two_nans),
        '3_nans': shuffled_ints(three_nans),
        '6_nans': shuffled_ints(six_nans),
    }

    self.check_terms(
        terms=quantile_terms,
        initial_workspace=workspace,
        expected=expected_labels,
        mask=self.build_mask(self.ones_mask(shape=grid_shape)),
    )
|
|
|
|
def test_quantiles_uneven_buckets(self):
    # Quantiles when the unmasked values per row (4) do not divide evenly
    # into the requested number of bins (3 and 7).
    shuffle_seed = 5
    grid_shape = (5, 5)

    def shuffle(rows):
        return permute_rows(shuffle_seed, rows)

    def shuffled_ints(rows):
        return shuffle(array(rows, dtype=int64_dtype))

    values = shuffle(log1p(arange(25, dtype=float).reshape(grid_shape)))
    mask_values = shuffle(self.eye_mask(shape=grid_shape))

    factor = F()
    mask_term = Mask()

    quantile_terms = {
        '3_masked': factor.quantiles(bins=3, mask=mask_term),
        '7_masked': factor.quantiles(bins=7, mask=mask_term),
    }
    workspace = {
        factor: values,
        mask_term: mask_values,
    }

    # -1 marks the masked-out diagonal entry of each row.
    three_bins = [
        [-1, 0, 0, 1, 2],
        [0, -1, 0, 1, 2],
        [0, 0, -1, 1, 2],
        [0, 0, 1, -1, 2],
        [0, 0, 1, 2, -1],
    ]
    seven_bins = [
        [-1, 0, 2, 4, 6],
        [0, -1, 2, 4, 6],
        [0, 2, -1, 4, 6],
        [0, 2, 4, -1, 6],
        [0, 2, 4, 6, -1],
    ]
    expected_labels = {
        '3_masked': shuffled_ints(three_bins),
        '7_masked': shuffled_ints(seven_bins),
    }

    self.check_terms(
        terms=quantile_terms,
        initial_workspace=workspace,
        expected=expected_labels,
        mask=self.build_mask(self.ones_mask(shape=grid_shape)),
    )
|
|
|
|
def test_quantile_helpers(self):
    # Each named helper must yield the very same term object as the
    # equivalent quantiles(bins=N) call, with and without a mask, and the
    # masked and unmasked variants must be distinct objects.
    factor = self.f
    mask_term = Mask()

    helper_bins = (
        ('quartiles', 4),
        ('quintiles', 5),
        ('deciles', 10),
    )
    for helper_name, bins in helper_bins:
        helper = getattr(factor, helper_name)

        self.assertIs(helper(), factor.quantiles(bins=bins))
        self.assertIs(
            helper(mask=mask_term),
            factor.quantiles(bins=bins, mask=mask_term),
        )
        self.assertIsNot(helper(), helper(mask=mask_term))
|
|
|
|
@parameter_space(seed=[1, 2, 3])
def test_clip(self, seed):
    # Factor.clip must agree with np.clip on uniformly distributed data.
    grid_shape = (5, 5)
    draw_floor = -10
    draw_ceiling = +10

    samples = np.random.RandomState(seed).uniform(
        draw_floor,
        draw_ceiling,
        size=grid_shape,
    )

    # Clip at the quartiles, and confirm those bounds sit strictly inside
    # the range the samples were drawn from.
    lower_bound, upper_bound = np.percentile(samples, [25, 75])
    self.assertGreater(lower_bound, draw_floor)
    self.assertLess(upper_bound, draw_ceiling)

    factor = F()
    clip_terms = {
        'clip': factor.clip(lower_bound, upper_bound)
    }
    workspace = {
        factor: samples,
    }
    expected_values = {
        'clip': np.clip(samples, lower_bound, upper_bound),
    }

    self.check_terms(
        terms=clip_terms,
        initial_workspace=workspace,
        expected=expected_values,
        mask=self.build_mask(self.ones_mask(shape=grid_shape)),
    )
|
|
|
|
|
|
class ReprTestCase(TestCase):
    """
    Tests for term reprs.
    """

    def test_demean(self):
        self.assertEqual(
            F().demean().graph_repr(),
            "GroupedRowTransform('demean')",
        )

    def test_zscore(self):
        self.assertEqual(
            F().zscore().graph_repr(),
            "GroupedRowTransform('zscore')",
        )

    def test_winsorize(self):
        winsorized = F().winsorize(min_percentile=.05, max_percentile=.95)
        self.assertEqual(
            winsorized.graph_repr(),
            "GroupedRowTransform('winsorize')",
        )

    def test_recarray_field_repr(self):
        class MultipleOutputs(CustomFactor):
            outputs = ['a', 'b']
            inputs = ()
            window_length = 5

            def recursive_repr(self):
                return "CustomRepr()"

        # Each output field's graph repr should be the parent's recursive
        # repr followed by ".<field name>".
        field_a = MultipleOutputs().a
        field_b = MultipleOutputs().b

        self.assertEqual(field_a.graph_repr(), "CustomRepr().a")
        self.assertEqual(field_b.graph_repr(), "CustomRepr().b")

    def test_latest_repr(self):

        class SomeDataSet(DataSet):
            a = Column(dtype=float64_dtype)
            b = Column(dtype=float64_dtype)

        # The graph repr of ``.latest`` does not mention the column.
        latest_a = SomeDataSet.a.latest
        self.assertEqual(latest_a.graph_repr(), "Latest")

        latest_b = SomeDataSet.b.latest
        self.assertEqual(latest_b.graph_repr(), "Latest")

    def test_recursive_repr(self):

        class DS(DataSet):
            a = Column(dtype=float64_dtype)
            b = Column(dtype=float64_dtype)

        class Input(CustomFactor):
            inputs = ()
            window_safe = True

        class HasInputs(CustomFactor):
            inputs = [Input(window_length=3), DS.a, DS.b]
            window_length = 3

        # The nested factor input is abbreviated as "Input(...)"; the
        # dataset columns are spelled out.
        self.assertEqual(
            repr(HasInputs()),
            "HasInputs([Input(...), DS.a, DS.b], 3)",
        )

    def test_rank_repr(self):
        ranked = DailyReturns().rank()

        self.assertEqual(
            repr(ranked),
            "Rank(DailyReturns(...), method='ordinal')",
        )
        self.assertEqual(ranked.recursive_repr(), "Rank(...)")

    def test_rank_repr_with_mask(self):
        ranked = DailyReturns().rank(mask=Mask())

        self.assertEqual(
            repr(ranked),
            "Rank(DailyReturns(...), method='ordinal', mask=Mask(...))",
        )
        self.assertEqual(ranked.recursive_repr(), "Rank(...)")
|
|
|
|
|
|
class TestWindowSafety(TestCase):
    # Checks on the ``window_safe`` flag of derived terms.

    def test_zscore_is_window_safe(self):
        zscored = F().zscore()
        self.assertTrue(zscored.window_safe)

    @parameter_space(__fail_fast=True, is_window_safe=[True, False])
    def test_window_safety_propagates_to_recarray_fields(self, is_window_safe):

        class MultipleOutputs(CustomFactor):
            outputs = ['a', 'b']
            inputs = ()
            window_length = 5
            window_safe = is_window_safe

        parent = MultipleOutputs()

        # Every output field must report the same flag as its parent.
        for output_field in (parent.a, parent.b):
            self.assertEqual(output_field.window_safe, parent.window_safe)

    def test_demean_is_window_safe_if_input_is_window_safe(self):
        default_input = F().demean()
        unsafe_input = F(window_safe=False).demean()
        safe_input = F(window_safe=True).demean()

        self.assertFalse(default_input.window_safe)
        self.assertFalse(unsafe_input.window_safe)
        self.assertTrue(safe_input.window_safe)

    def test_winsorize_is_window_safe_if_input_is_window_safe(self):
        def winsorized(factor):
            # Same percentile bounds for all three cases below.
            return factor.winsorize(
                min_percentile=.05,
                max_percentile=.95
            )

        self.assertFalse(winsorized(F()).window_safe)
        self.assertFalse(winsorized(F(window_safe=False)).window_safe)
        self.assertTrue(winsorized(F(window_safe=True)).window_safe)
|
|
|
|
|
|
class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
    @parameter_space(dtype_=(float64_dtype, datetime64ns_dtype))
    def test_reversability(self, dtype_):
        # postprocess should leave the flattened data unchanged, and
        # to_workspace_value should rebuild the 2D workspace array from
        # pipeline-style output.
        class F(Factor):
            inputs = ()
            dtype = dtype_
            window_length = 0

        factor = F()
        missing = factor.missing_value

        # 3 rows x 2 columns; the second column is missing on the first
        # two rows.
        workspace_grid = array(
            [
                [0, missing],
                [1, missing],
                [2, 3],
            ],
            dtype=dtype_,
        )

        assert_equal(
            factor.postprocess(workspace_grid.ravel()),
            workspace_grid.ravel(),
        )

        # The long-form series carries only the non-missing cells, so the
        # last row label appears twice (once per column label).
        row_labels = [
            pd.Timestamp(day)
            for day in (
                '2014-01-01',
                '2014-01-02',
                '2014-01-03',
                '2014-01-03',
            )
        ]
        column_labels = [0, 0, 0, 1]
        long_form = pd.Series(
            data=array([0, 1, 2, 3], dtype=dtype_),
            index=pd.MultiIndex.from_arrays([row_labels, column_labels]),
        )

        rebuilt = factor.to_workspace_value(long_form, pd.Index([0, 1]))
        assert_equal(rebuilt, workspace_grid)
|
|
|
|
|
|
class TestSpecialCases(WithUSEquityPricingPipelineEngine,
                       ZiplineTestCase):
    ASSET_FINDER_COUNTRY_CODE = 'US'

    def check_equivalent_terms(self, terms):
        # Run all ``terms`` in one pipeline over the last ten trading days
        # and require every output column to match the first one.
        self.assertTrue(len(terms) > 1, "Need at least two terms to compare")

        first_day, last_day = self.trading_days[[-10, -1]]
        output = self.pipeline_engine.run_pipeline(
            Pipeline(terms),
            first_day,
            last_day,
        )

        reference = output.iloc[:, 0]
        for column_name in terms:
            # Column names necessarily differ, so they are not compared.
            assert_equal(
                output.loc[:, column_name],
                reference,
                check_names=False,
            )

    def test_daily_returns_is_special_case_of_returns(self):
        equivalent_terms = {
            'daily': DailyReturns(),
            'manual_daily': Returns(window_length=2),
        }
        self.check_equivalent_terms(equivalent_terms)
|
|
|
|
|
|
class SummaryTestCase(BaseUSEquityPipelineTestCase, ZiplineTestCase):
|
|
|
|
@parameter_space(
    seed=[1, 2, 3],
    mask=[
        np.zeros((10, 5), dtype=bool),
        ones((10, 5), dtype=bool),
        eye(10, 5, dtype=bool),
        ~eye(10, 5, dtype=bool),
    ]
)
def test_summary_methods(self, seed, mask):
    """Test that summary funcs work the same as numpy NaN-aware funcs."""
    grid_shape = (10, 5)

    # Random values where ``mask`` is True, NaN everywhere else.
    samples = np.where(
        mask,
        np.random.RandomState(seed).randn(*grid_shape),
        np.nan,
    )

    factor = F()
    workspace = {factor: samples}

    summary_names = (
        'mean',
        'sum',
        'median',
        'min',
        'max',
        'stddev',
        'notnull_count',
    )
    terms = {name: getattr(factor, name)() for name in summary_names}

    # numpy reference implementation for each summary that has a direct
    # NaN-aware counterpart.
    numpy_reducers = (
        ('mean', np.nanmean),
        ('sum', np.nansum),
        ('median', np.nanmedian),
        ('min', np.nanmin),
        ('max', np.nanmax),
        ('stddev', np.nanstd),
    )
    with ignore_nanwarnings():
        expected = {
            name: as_column(reducer(samples, axis=1))
            for name, reducer in numpy_reducers
        }
        expected['notnull_count'] = as_column(
            (~np.isnan(samples)).sum(axis=1),
        )

    # Make sure we have test coverage for all summary funcs.
    self.assertEqual(set(expected), summary_funcs.names)

    self.check_terms(
        terms=terms,
        expected=expected,
        initial_workspace=workspace,
        mask=self.build_mask(ones(grid_shape)),
    )
|
|
|
|
@parameter_space(
    seed=[4, 5, 6],
    mask=[
        np.zeros((10, 5), dtype=bool),
        ones((10, 5), dtype=bool),
        eye(10, 5, dtype=bool),
        ~eye(10, 5, dtype=bool),
    ]
)
def test_built_in_vs_summary(self, seed, mask):
    """Test that summary funcs match normalization functions."""
    grid_shape = (10, 5)

    # Random values where ``mask`` is True, NaN everywhere else.
    samples = np.where(
        mask,
        np.random.RandomState(seed).randn(*grid_shape),
        np.nan,
    )

    factor = F()
    workspace = {factor: samples}

    centered = factor - factor.mean()
    terms = {
        'demean': factor.demean(),
        'alt_demean': centered,

        'zscore': factor.zscore(),
        'alt_zscore': centered / factor.stddev(),

        'mean': factor.mean(),
        'alt_mean': factor.sum() / factor.notnull_count(),
    }

    computed = self.run_terms(
        terms,
        initial_workspace=workspace,
        mask=self.build_mask(ones(grid_shape)),
    )

    # NOTE(review): 'mean' and 'alt_mean' are computed but never compared
    # here -- confirm whether an assertion for that pair was intended.
    for built_in, by_hand in (
        ('demean', 'alt_demean'),
        ('zscore', 'alt_zscore'),
    ):
        assert_equal(computed[built_in], computed[by_hand])
|
|
|
|
@parameter_space(
    seed=[100, 200, 300],
    mask=[
        np.zeros((10, 5), dtype=bool),
        ones((10, 5), dtype=bool),
        eye(10, 5, dtype=bool),
        ~eye(10, 5, dtype=bool),
    ]
)
def test_complex_expression(self, seed, mask):
    # Min-max rescaling built from summary terms must match the same
    # arithmetic done directly in numpy.
    grid_shape = (10, 5)

    # Random values where ``mask`` is True, NaN everywhere else.
    samples = np.where(
        mask,
        np.random.RandomState(seed).randn(*grid_shape),
        np.nan,
    )

    factor = F()
    workspace = {factor: samples}

    row_range = factor.max() - factor.min()
    terms = {
        'rescaled': (factor - factor.min()) / row_range,
    }

    with ignore_nanwarnings():
        row_mins = as_column(np.nanmin(samples, axis=1))
        row_maxes = as_column(np.nanmax(samples, axis=1))

    expected = {
        'rescaled': (samples - row_mins) / (row_maxes - row_mins),
    }

    self.check_terms(
        terms,
        expected,
        initial_workspace=workspace,
        mask=self.build_mask(ones(grid_shape)),
    )
|
|
|
|
@parameter_space(
    seed=[40, 41, 42],
    mask=[
        np.zeros((10, 5), dtype=bool),
        ones((10, 5), dtype=bool),
        eye(10, 5, dtype=bool),
        ~eye(10, 5, dtype=bool),
    ],
    # Three ways to mask:
    # 1. Don't mask.
    # 2. Mask by passing mask parameter to summary methods.
    # 3. Mask by having non-True values in the root mask.
    mask_mode=('none', 'param', 'root'),
)
def test_summaries_after_fillna(self, seed, mask, mask_mode):
    # Summaries taken after fillna(-1) should see the -1s only when
    # nothing masks those cells out.
    grid_shape = (10, 5)
    draws = np.random.RandomState(seed).randn(*grid_shape)

    # One copy with NaN outside ``mask``, one with -1 in the same cells.
    sparse = np.where(mask, draws, np.nan)
    filled = np.where(mask, sparse, -1)

    factor = F()
    workspace = {factor: sparse}

    # In 'param' mode every summary method is called with mask=Mask().
    summary_kwargs = {}
    if mask_mode == 'param':
        mask_term = Mask()
        summary_kwargs['mask'] = mask_term
        workspace[mask_term] = mask

    filled_factor = factor.fillna(-1)
    summary_names = (
        'mean',
        'sum',
        'median',
        'min',
        'max',
        'stddev',
        'notnull_count',
    )
    terms = {
        name: getattr(filled_factor, name)(**summary_kwargs)
        for name in summary_names
    }

    numpy_reducers = (
        ('mean', np.nanmean),
        ('sum', np.nansum),
        ('median', np.nanmedian),
        ('min', np.nanmin),
        ('max', np.nanmax),
        ('stddev', np.nanstd),
    )
    with ignore_nanwarnings():
        # Unmasked: the summaries should see the -1 fill values.
        # Masked either way: the filled cells are ignored, which matches
        # the NaN-aware numpy result on the unfilled data.
        reference = filled if mask_mode == 'none' else sparse

        expected = {
            name: as_column(reducer(reference, axis=1))
            for name, reducer in numpy_reducers
        }
        expected['notnull_count'] = as_column(
            (~np.isnan(reference)).sum(axis=1),
        )

    # Make sure we have test coverage for all summary funcs.
    self.assertEqual(set(expected), summary_funcs.names)

    root_cells = mask if mask_mode == 'root' else ones_like(mask)

    self.check_terms(
        terms=terms,
        expected=expected,
        initial_workspace=workspace,
        mask=self.build_mask(root_cells),
    )
|
|
|
|
def test_repr(self):
    # Both repr() and recursive_repr() of a summary term should read as
    # "<parent recursive repr>.<method>()".

    class MyFactor(CustomFactor):
        window_length = 1
        inputs = ()

        def recursive_repr(self):
            return "MyFactor()"

    parent = MyFactor()

    for method_name in summary_funcs.names:
        summary_term = getattr(parent, method_name)()
        expected_text = "MyFactor().{}()".format(method_name)

        self.assertEqual(repr(summary_term), expected_text)
        self.assertEqual(summary_term.recursive_repr(), expected_text)
|