# zipline/tests/pipeline/test_statistical.py
"""
Tests for statistical pipeline terms.
"""
import numpy as np
from numpy import (
arange,
full,
full_like,
nan,
where,
)
from pandas import (
DataFrame,
date_range,
Int64Index,
Timestamp,
)
from pandas.util.testing import assert_frame_equal
from scipy.stats import linregress, pearsonr, spearmanr
from empyrical.stats import beta_aligned as empyrical_beta
from zipline.assets import Equity, ExchangeInfo
from zipline.errors import IncompatibleTerms, NonExistentAssetInTimeFrame
from zipline.pipeline import CustomFactor, Pipeline
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.data.testing import TestingDataSet
from zipline.pipeline.domain import US_EQUITIES
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.factors import (
Returns,
RollingLinearRegressionOfReturns,
RollingPearsonOfReturns,
RollingSpearmanOfReturns,
SimpleBeta,
)
from zipline.pipeline.factors.statistical import (
vectorized_beta,
vectorized_pearson_r,
)
from zipline.pipeline.loaders.frame import DataFrameLoader
from zipline.pipeline.sentinels import NotSpecified
from zipline.testing import (
AssetID,
AssetIDPlusDay,
check_arrays,
make_alternating_boolean_array,
make_cascading_boolean_array,
parameter_space,
)
import zipline.testing.fixtures as zf
from zipline.testing.predicates import assert_equal
from zipline.utils.numpy_utils import (
as_column,
bool_dtype,
datetime64ns_dtype,
float64_dtype,
)
class StatisticalBuiltInsTestCase(zf.WithAssetFinder,
                                  zf.WithTradingCalendars,
                                  zf.ZiplineTestCase):
    """
    Tests for the built-in statistical factors
    (`RollingPearsonOfReturns`, `RollingSpearmanOfReturns`,
    `RollingLinearRegressionOfReturns`, `SimpleBeta`) run through a real
    `SimplePipelineEngine` over deterministic mock close prices.
    """
    # Three equities with sids 1-3; sids are 1-based, so column index for a
    # sid is `int(sid) - 1` in the expected-result arrays below.
    sids = ASSET_FINDER_EQUITY_SIDS = Int64Index([1, 2, 3])
    START_DATE = Timestamp('2015-01-31', tz='UTC')
    END_DATE = Timestamp('2015-03-01', tz='UTC')
    ASSET_FINDER_EQUITY_SYMBOLS = ('A', 'B', 'C')
    ASSET_FINDER_COUNTRY_CODE = 'US'

    @classmethod
    def init_class_fixtures(cls):
        """
        Build class-level fixtures: a date range, a deterministic 'close'
        price frame, a pipeline engine bound to that data, and the
        mask factors (plus their expected boolean results) shared by the
        masked/unmasked test variants.
        """
        super(StatisticalBuiltInsTestCase, cls).init_class_fixtures()
        day = cls.trading_calendar.day
        cls.dates = dates = date_range(
            '2015-02-01', '2015-02-28', freq=day, tz='UTC',
        )
        # Using these start and end dates because they are a contigous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        cls.start_date_index = start_date_index = 14
        cls.end_date_index = end_date_index = 18
        cls.pipeline_start_date = dates[start_date_index]
        cls.pipeline_end_date = dates[end_date_index]
        cls.num_days = num_days = end_date_index - start_date_index + 1
        sids = cls.sids
        cls.assets = assets = cls.asset_finder.retrieve_all(sids)
        # The "target" asset for correlations/regressions is the first one.
        cls.my_asset_column = my_asset_column = 0
        cls.my_asset = assets[my_asset_column]
        cls.num_assets = num_assets = len(assets)
        # Monotonically increasing prices: value = row * n_sids + col.
        cls.raw_data = raw_data = DataFrame(
            data=arange(len(dates) * len(sids), dtype=float64_dtype).reshape(
                len(dates), len(sids),
            ),
            index=dates,
            columns=assets,
        )
        # Using mock 'close' data here because the correlation and regression
        # built-ins use USEquityPricing.close as the input to their `Returns`
        # factors. Since there is no way to change that when constructing an
        # instance of these built-ins, we need to test with mock 'close' data
        # to most accurately reflect their true behavior and results.
        close_loader = DataFrameLoader(USEquityPricing.close, raw_data)
        cls.run_pipeline = SimplePipelineEngine(
            {USEquityPricing.close: close_loader}.__getitem__,
            cls.asset_finder,
            default_domain=US_EQUITIES,
        ).run_pipeline
        # Masks exercised by each test: a cascading (triangular) mask, an
        # alternating (checkerboard) mask, and no mask at all.
        cls.cascading_mask = \
            AssetIDPlusDay() < (sids[-1] + dates[start_date_index].day)
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.expected_no_mask_result = full(
            shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype,
        )

    @parameter_space(returns_length=[2, 3], correlation_length=[3, 4])
    def test_correlation_factors(self, returns_length, correlation_length):
        """
        Tests for the built-in factors `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns`.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length)
        masks = (self.cascading_mask, self.alternating_mask, NotSpecified)
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            pearson_factor = RollingPearsonOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )
            spearman_factor = RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )

            columns = {
                'pearson_factor': pearson_factor,
                'spearman_factor': spearman_factor,
            }
            pipeline = Pipeline(columns=columns)
            if mask is not NotSpecified:
                pipeline.add(mask, 'mask')

            results = run_pipeline(pipeline, start_date, end_date)
            pearson_results = results['pearson_factor'].unstack()
            spearman_results = results['spearman_factor'].unstack()
            if mask is not NotSpecified:
                mask_results = results['mask'].unstack()
                check_arrays(mask_results.values, expected_mask)

            # Run a separate pipeline that calculates returns starting
            # (correlation_length - 1) days prior to our start date. This is
            # because we need (correlation_length - 1) extra days of returns to
            # compute our expected correlations.
            results = run_pipeline(
                Pipeline(columns={'returns': returns}),
                dates[start_date_index - (correlation_length - 1)],
                dates[end_date_index],
            )
            returns_results = results['returns'].unstack()

            # On each day, calculate the expected correlation coefficients
            # between the asset we are interested in and each other asset. Each
            # correlation is calculated over `correlation_length` days.
            expected_pearson_results = full_like(pearson_results, nan)
            expected_spearman_results = full_like(spearman_results, nan)
            for day in range(num_days):
                todays_returns = returns_results.iloc[
                    day:day + correlation_length
                ]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    # sids are 1-based, so subtract 1 for the column index.
                    asset_column = int(asset) - 1
                    expected_pearson_results[day, asset_column] = pearsonr(
                        my_asset_returns, other_asset_returns,
                    )[0]
                    expected_spearman_results[day, asset_column] = spearmanr(
                        my_asset_returns, other_asset_returns,
                    )[0]

            # Masked-out cells must come back as NaN.
            expected_pearson_results = DataFrame(
                data=where(expected_mask, expected_pearson_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(pearson_results, expected_pearson_results)

            expected_spearman_results = DataFrame(
                data=where(expected_mask, expected_spearman_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(spearman_results, expected_spearman_results)

    @parameter_space(returns_length=[2, 3], regression_length=[3, 4])
    def test_regression_of_returns_factor(self,
                                          returns_length,
                                          regression_length):
        """
        Tests for the built-in factor `RollingLinearRegressionOfReturns`.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ['beta', 'alpha', 'r_value', 'p_value', 'stderr']

        returns = Returns(window_length=returns_length)
        masks = self.cascading_mask, self.alternating_mask, NotSpecified
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=returns_length,
                regression_length=regression_length,
                mask=mask,
            )

            columns = {
                output: getattr(regression_factor, output)
                for output in outputs
            }
            pipeline = Pipeline(columns=columns)
            if mask is not NotSpecified:
                pipeline.add(mask, 'mask')

            results = run_pipeline(pipeline, start_date, end_date)
            if mask is not NotSpecified:
                mask_results = results['mask'].unstack()
                check_arrays(mask_results.values, expected_mask)

            output_results = {}
            expected_output_results = {}
            for output in outputs:
                output_results[output] = results[output].unstack()
                expected_output_results[output] = full_like(
                    output_results[output], nan,
                )

            # Run a separate pipeline that calculates returns starting
            # (regression_length - 1) days prior to our start date. This is
            # because we need (regression_length - 1) extra days of returns to
            # compute our expected regressions.
            results = run_pipeline(
                Pipeline(columns={'returns': returns}),
                dates[start_date_index - (regression_length - 1)],
                dates[end_date_index],
            )
            returns_results = results['returns'].unstack()

            # On each day, calculate the expected regression results for Y ~ X
            # where Y is the asset we are interested in and X is each other
            # asset. Each regression is calculated over `regression_length`
            # days of data.
            for day in range(num_days):
                todays_returns = returns_results.iloc[
                    day:day + regression_length
                ]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    asset_column = int(asset) - 1
                    expected_regression_results = linregress(
                        y=other_asset_returns, x=my_asset_returns,
                    )
                    # `linregress` returns its outputs in the same order as
                    # `outputs` above, so unpack positionally.
                    for i, output in enumerate(outputs):
                        expected_output_results[output][day, asset_column] = \
                            expected_regression_results[i]

            for output in outputs:
                output_result = output_results[output]
                expected_output_result = DataFrame(
                    where(expected_mask, expected_output_results[output], nan),
                    index=dates[start_date_index:end_date_index + 1],
                    columns=assets,
                )
                assert_frame_equal(output_result, expected_output_result)

    def test_simple_beta_matches_regression(self):
        # `SimpleBeta` should agree with the beta output of the more general
        # `RollingLinearRegressionOfReturns` over the same window.
        run_pipeline = self.run_pipeline
        simple_beta = SimpleBeta(target=self.my_asset, regression_length=10)
        complex_beta = RollingLinearRegressionOfReturns(
            target=self.my_asset,
            returns_length=2,
            regression_length=10,
        ).beta
        pipe = Pipeline({'simple': simple_beta, 'complex': complex_beta})
        results = run_pipeline(
            pipe,
            self.pipeline_start_date,
            self.pipeline_end_date,
        )
        assert_equal(results['simple'], results['complex'], check_names=False)

    def test_simple_beta_allowed_missing_calculation(self):
        # `allowed_missing_percentage` is converted to an absolute count of
        # allowed missing observations out of `regression_length` (here 100).
        for percentage, expected in [(0.651, 65),
                                     (0.659, 65),
                                     (0.66, 66),
                                     (0.0, 0),
                                     (1.0, 100)]:
            beta = SimpleBeta(
                target=self.my_asset,
                regression_length=100,
                allowed_missing_percentage=percentage,
            )
            self.assertEqual(beta.params['allowed_missing_count'], expected)

    def test_correlation_and_regression_with_bad_asset(self):
        """
        Test that `RollingPearsonOfReturns`, `RollingSpearmanOfReturns` and
        `RollingLinearRegressionOfReturns` raise the proper exception when
        given a nonexistent target asset.
        """
        # sid 0 does not exist in this test fixture's asset finder.
        my_asset = Equity(
            0,
            exchange_info=ExchangeInfo('TEST', 'TEST FULL', 'US'),
        )
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        # This filter is arbitrary; the important thing is that we test each
        # factor both with and without a specified mask.
        my_asset_filter = AssetID().eq(1)

        for mask in (NotSpecified, my_asset_filter):
            pearson_factor = RollingPearsonOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=3,
                mask=mask,
            )
            spearman_factor = RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=3,
                mask=mask,
            )
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=3,
                regression_length=3,
                mask=mask,
            )

            with self.assertRaises(NonExistentAssetInTimeFrame):
                run_pipeline(
                    Pipeline(columns={'pearson_factor': pearson_factor}),
                    start_date,
                    end_date,
                )
            with self.assertRaises(NonExistentAssetInTimeFrame):
                run_pipeline(
                    Pipeline(columns={'spearman_factor': spearman_factor}),
                    start_date,
                    end_date,
                )
            with self.assertRaises(NonExistentAssetInTimeFrame):
                run_pipeline(
                    Pipeline(columns={'regression_factor': regression_factor}),
                    start_date,
                    end_date,
                )

    def test_require_length_greater_than_one(self):
        # A correlation/regression over a single observation is meaningless,
        # so a length of 1 must be rejected at construction time.
        my_asset = Equity(
            0,
            exchange_info=ExchangeInfo('TEST', 'TEST FULL', 'US'),
        )

        with self.assertRaises(ValueError):
            RollingPearsonOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=1,
            )

        with self.assertRaises(ValueError):
            RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=1,
            )

        with self.assertRaises(ValueError):
            RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=3,
                regression_length=1,
            )

    def test_simple_beta_input_validation(self):
        # Wrong type for `target`.
        with self.assertRaises(TypeError) as e:
            SimpleBeta(
                target="SPY",
                regression_length=100,
                allowed_missing_percentage=0.5,
            )
        result = str(e.exception)
        expected = (
            r"SimpleBeta\(\) expected a value of type"
            " .*Asset for argument 'target',"
            " but got str instead."
        )
        # NOTE(review): `assertRegexpMatches` is the deprecated alias of
        # `assertRegex`; kept here for Python 2 compatibility.
        self.assertRegexpMatches(result, expected)

        # `regression_length` below the minimum of 3.
        with self.assertRaises(ValueError) as e:
            SimpleBeta(
                target=self.my_asset,
                regression_length=1,
                allowed_missing_percentage=0.5,
            )
        result = str(e.exception)
        expected = (
            "SimpleBeta() expected a value greater than or equal to 3"
            " for argument 'regression_length', but got 1 instead."
        )
        self.assertEqual(result, expected)

        # `allowed_missing_percentage` outside [0.0, 1.0].
        with self.assertRaises(ValueError) as e:
            SimpleBeta(
                target=self.my_asset,
                regression_length=100,
                allowed_missing_percentage=50,
            )
        result = str(e.exception)
        expected = (
            "SimpleBeta() expected a value inclusively between 0.0 and 1.0 "
            "for argument 'allowed_missing_percentage', but got 50 instead."
        )
        self.assertEqual(result, expected)

    def test_simple_beta_target(self):
        # The `target` property should return the exact asset passed in.
        beta = SimpleBeta(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        self.assertIs(beta.target, self.my_asset)

    def test_simple_beta_repr(self):
        beta = SimpleBeta(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        result = repr(beta)
        # allowed_missing=25 because 50% of regression_length (50) is 25.
        expected = "SimpleBeta({}, length=50, allowed_missing=25)".format(
            self.my_asset,
        )
        self.assertEqual(result, expected)

    def test_simple_beta_graph_repr(self):
        beta = SimpleBeta(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        result = beta.graph_repr()
        # NOTE(review): the `.format(...)` call is a no-op here — the string
        # has no replacement fields; the 'A' symbol is hard-coded.
        expected = "SimpleBeta('A', 50, 25)".format(self.my_asset)
        self.assertEqual(result, expected)
class StatisticalMethodsTestCase(zf.WithSeededRandomPipelineEngine,
                                 zf.ZiplineTestCase):
    """
    Tests for the `Factor.pearsonr`, `Factor.spearmanr` and
    `Factor.linear_regression` methods, checking both that they are
    consistent with the corresponding built-in factors (when given a Slice
    target) and that they produce correct results when given another 2D
    factor as the target. Input data comes from the seeded-random pipeline
    engine fixture.
    """
    sids = ASSET_FINDER_EQUITY_SIDS = Int64Index([1, 2, 3])
    START_DATE = Timestamp('2015-01-31', tz='UTC')
    END_DATE = Timestamp('2015-03-01', tz='UTC')
    ASSET_FINDER_COUNTRY_CODE = 'US'
    SEEDED_RANDOM_PIPELINE_DEFAULT_DOMAIN = US_EQUITIES

    @classmethod
    def init_class_fixtures(cls):
        super(StatisticalMethodsTestCase, cls).init_class_fixtures()

        # Using these start and end dates because they are a contigous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        cls.dates = dates = cls.trading_days
        cls.start_date_index = start_date_index = 14
        cls.end_date_index = end_date_index = 18
        cls.pipeline_start_date = cls.trading_days[start_date_index]
        cls.pipeline_end_date = cls.trading_days[end_date_index]

        sids = cls.sids
        cls.assets = assets = cls.asset_finder.retrieve_all(sids)
        # Target asset for Slice-based tests is the first asset.
        cls.my_asset_column = my_asset_column = 0
        cls.my_asset = assets[my_asset_column]
        cls.num_days = num_days = end_date_index - start_date_index + 1
        cls.num_assets = num_assets = len(assets)

        # Mask factors and their expected boolean outputs, shared by the
        # masked test variants.
        cls.cascading_mask = \
            AssetIDPlusDay() < (sids[-1] + dates[start_date_index].day)
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.expected_no_mask_result = full(
            shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype,
        )

        # Random input for factors.
        cls.col = TestingDataSet.float_col

    @parameter_space(returns_length=[2, 3], correlation_length=[3, 4])
    def test_factor_correlation_methods(self,
                                        returns_length,
                                        correlation_length):
        """
        Ensure that `Factor.pearsonr` and `Factor.spearmanr` are consistent
        with the built-in factors `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns`.
        """
        my_asset = self.my_asset
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[my_asset]

        pearson = returns.pearsonr(
            target=returns_slice, correlation_length=correlation_length,
        )
        spearman = returns.spearmanr(
            target=returns_slice, correlation_length=correlation_length,
        )
        expected_pearson = RollingPearsonOfReturns(
            target=my_asset,
            returns_length=returns_length,
            correlation_length=correlation_length,
        )
        expected_spearman = RollingSpearmanOfReturns(
            target=my_asset,
            returns_length=returns_length,
            correlation_length=correlation_length,
        )

        # These built-ins construct their own Returns factor to use as inputs,
        # so the only way to set our own inputs is to do so after the fact.
        # This should not be done in practice. It is necessary here because we
        # want Returns to use our random data as an input, but by default it is
        # using USEquityPricing.close.
        expected_pearson.inputs = [returns, returns_slice]
        expected_spearman.inputs = [returns, returns_slice]

        columns = {
            'pearson': pearson,
            'spearman': spearman,
            'expected_pearson': expected_pearson,
            'expected_spearman': expected_spearman,
        }

        results = run_pipeline(Pipeline(columns=columns), start_date, end_date)
        pearson_results = results['pearson'].unstack()
        spearman_results = results['spearman'].unstack()
        expected_pearson_results = results['expected_pearson'].unstack()
        expected_spearman_results = results['expected_spearman'].unstack()

        assert_frame_equal(pearson_results, expected_pearson_results)
        assert_frame_equal(spearman_results, expected_spearman_results)

    def test_correlation_methods_bad_type(self):
        """
        Make sure we cannot call the Factor correlation methods on factors or
        slices that are not of float or int dtype.
        """
        # These are arbitrary for the purpose of this test.
        returns_length = 2
        correlation_length = 10

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[self.my_asset]

        class BadTypeFactor(CustomFactor):
            # datetime dtype is neither float nor int, so correlation
            # methods must reject it.
            inputs = []
            window_length = 1
            dtype = datetime64ns_dtype
            window_safe = True

            def compute(self, today, assets, out):
                pass

        bad_type_factor = BadTypeFactor()
        bad_type_factor_slice = bad_type_factor[self.my_asset]

        with self.assertRaises(TypeError):
            bad_type_factor.pearsonr(
                target=returns_slice, correlation_length=correlation_length,
            )
        with self.assertRaises(TypeError):
            bad_type_factor.spearmanr(
                target=returns_slice, correlation_length=correlation_length,
            )
        with self.assertRaises(TypeError):
            returns.pearsonr(
                target=bad_type_factor_slice,
                correlation_length=correlation_length,
            )
        with self.assertRaises(TypeError):
            returns.spearmanr(
                target=bad_type_factor_slice,
                correlation_length=correlation_length,
            )

    @parameter_space(returns_length=[2, 3], regression_length=[3, 4])
    def test_factor_regression_method(self, returns_length, regression_length):
        """
        Ensure that `Factor.linear_regression` is consistent with the built-in
        factor `RollingLinearRegressionOfReturns`.
        """
        my_asset = self.my_asset
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[my_asset]

        regression = returns.linear_regression(
            target=returns_slice, regression_length=regression_length,
        )
        expected_regression = RollingLinearRegressionOfReturns(
            target=my_asset,
            returns_length=returns_length,
            regression_length=regression_length,
        )

        # This built-in constructs its own Returns factor to use as an input,
        # so the only way to set our own input is to do so after the fact. This
        # should not be done in practice. It is necessary here because we want
        # Returns to use our random data as an input, but by default it is
        # using USEquityPricing.close.
        expected_regression.inputs = [returns, returns_slice]

        columns = {
            'regression': regression,
            'expected_regression': expected_regression,
        }

        results = run_pipeline(Pipeline(columns=columns), start_date, end_date)
        regression_results = results['regression'].unstack()
        expected_regression_results = results['expected_regression'].unstack()

        assert_frame_equal(regression_results, expected_regression_results)

    def test_regression_method_bad_type(self):
        """
        Make sure we cannot call the Factor linear regression method on factors
        or slices that are not of float or int dtype.
        """
        # These are arbitrary for the purpose of this test.
        returns_length = 2
        regression_length = 10

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[self.my_asset]

        class BadTypeFactor(CustomFactor):
            # datetime dtype is neither float nor int, so linear_regression
            # must reject it.
            window_length = 1
            inputs = []
            dtype = datetime64ns_dtype
            window_safe = True

            def compute(self, today, assets, out):
                pass

        bad_type_factor = BadTypeFactor()
        bad_type_factor_slice = bad_type_factor[self.my_asset]

        with self.assertRaises(TypeError):
            bad_type_factor.linear_regression(
                target=returns_slice, regression_length=regression_length,
            )
        with self.assertRaises(TypeError):
            returns.linear_regression(
                target=bad_type_factor_slice,
                regression_length=regression_length,
            )

    @parameter_space(correlation_length=[2, 3, 4])
    def test_factor_correlation_methods_two_factors(self, correlation_length):
        """
        Tests for `Factor.pearsonr` and `Factor.spearmanr` when passed another
        2D factor instead of a Slice.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # Ensure that the correlation methods cannot be called with two 2D
        # factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(2),
        )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.pearsonr(
                target=returns_masked_2, correlation_length=correlation_length,
            )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.spearmanr(
                target=returns_masked_2, correlation_length=correlation_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        pearson_factor = returns_5.pearsonr(
            target=returns_10, correlation_length=correlation_length,
        )
        spearman_factor = returns_5.spearmanr(
            target=returns_10, correlation_length=correlation_length,
        )

        columns = {
            'pearson_factor': pearson_factor,
            'spearman_factor': spearman_factor,
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)
        pearson_results = results['pearson_factor'].unstack()
        spearman_results = results['spearman_factor'].unstack()

        # Run a separate pipeline that calculates returns starting
        # (correlation_length - 1) days prior to our start date. This is
        # because we need (correlation_length - 1) extra days of returns to
        # compute our expected correlations.
        columns = {'returns_5': returns_5, 'returns_10': returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (correlation_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results['returns_5'].unstack()
        returns_10_results = results['returns_10'].unstack()

        # On each day, calculate the expected correlation coefficients
        # between each asset's 5 and 10 day rolling returns. Each correlation
        # is calculated over `correlation_length` days.
        expected_pearson_results = full_like(pearson_results, nan)
        expected_spearman_results = full_like(spearman_results, nan)
        for day in range(num_days):
            todays_returns_5 = returns_5_results.iloc[
                day:day + correlation_length
            ]
            todays_returns_10 = returns_10_results.iloc[
                day:day + correlation_length
            ]
            for asset, asset_returns_5 in todays_returns_5.iteritems():
                # sids are 1-based, so subtract 1 for the column index.
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_pearson_results[day, asset_column] = pearsonr(
                    asset_returns_5, asset_returns_10,
                )[0]
                expected_spearman_results[day, asset_column] = spearmanr(
                    asset_returns_5, asset_returns_10,
                )[0]

        expected_pearson_results = DataFrame(
            data=expected_pearson_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(pearson_results, expected_pearson_results)

        expected_spearman_results = DataFrame(
            data=expected_spearman_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(spearman_results, expected_spearman_results)

    @parameter_space(regression_length=[2, 3, 4])
    def test_factor_regression_method_two_factors(self, regression_length):
        """
        Tests for `Factor.linear_regression` when passed another 2D factor
        instead of a Slice.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ['beta', 'alpha', 'r_value', 'p_value', 'stderr']

        # Ensure that the `linear_regression` method cannot be called with two
        # 2D factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(2),
        )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.linear_regression(
                target=returns_masked_2, regression_length=regression_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        regression_factor = returns_5.linear_regression(
            target=returns_10, regression_length=regression_length,
        )

        columns = {
            output: getattr(regression_factor, output)
            for output in outputs
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)

        output_results = {}
        expected_output_results = {}
        for output in outputs:
            output_results[output] = results[output].unstack()
            expected_output_results[output] = full_like(
                output_results[output], nan,
            )

        # Run a separate pipeline that calculates returns starting
        # (regression_length - 1) days prior to our start date. This is because
        # we need (regression_length - 1) extra days of returns to compute our
        # expected regressions.
        columns = {'returns_5': returns_5, 'returns_10': returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (regression_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results['returns_5'].unstack()
        returns_10_results = results['returns_10'].unstack()

        # On each day, for each asset, calculate the expected regression
        # results of Y ~ X where Y is the asset's rolling 5 day returns and X
        # is the asset's rolling 10 day returns. Each regression is calculated
        # over `regression_length` days of data.
        for day in range(num_days):
            todays_returns_5 = returns_5_results.iloc[
                day:day + regression_length
            ]
            todays_returns_10 = returns_10_results.iloc[
                day:day + regression_length
            ]
            for asset, asset_returns_5 in todays_returns_5.iteritems():
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_regression_results = linregress(
                    y=asset_returns_5, x=asset_returns_10,
                )
                # `linregress` outputs align positionally with `outputs`.
                for i, output in enumerate(outputs):
                    expected_output_results[output][day, asset_column] = \
                        expected_regression_results[i]

        for output in outputs:
            output_result = output_results[output]
            expected_output_result = DataFrame(
                expected_output_results[output],
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(output_result, expected_output_result)
class VectorizedBetaTestCase(zf.ZiplineTestCase):
    """
    Tests for `zipline.pipeline.factors.statistical.vectorized_beta`,
    cross-checked against `empyrical.stats.beta_aligned` and against known
    true betas of synthetically generated data.
    """

    def compare_with_empyrical(self, dependents, independent):
        """
        Assert that `vectorized_beta` (with effectively unlimited
        allowed_missing) matches empyrical's column-by-column beta, and
        return the vectorized result for further checks.
        """
        INFINITY = 1000000  # close enough
        result = vectorized_beta(
            dependents, independent, allowed_missing=INFINITY,
        )
        expected = np.array([
            empyrical_beta(dependents[:, i].ravel(), independent.ravel())
            for i in range(dependents.shape[1])
        ])
        assert_equal(result, expected, array_decimal=7)
        return result

    @parameter_space(seed=[1, 2, 3], __fail_fast=True)
    def test_matches_empyrical_beta_aligned(self, seed):
        # Clean (no-NaN) data: betas recovered from noisy linear data should
        # match both empyrical and the known true betas.
        rand = np.random.RandomState(seed)

        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        result = self.compare_with_empyrical(dependents, independent)
        self.assertTrue((np.abs(result - true_betas) < 0.01).all())

    @parameter_space(
        seed=[1, 2],
        pct_dependent=[0.3],
        pct_independent=[0.75],
        __fail_fast=True,
    )
    def test_nan_handling_matches_empyrical(self,
                                            seed,
                                            pct_dependent,
                                            pct_independent):
        rand = np.random.RandomState(seed)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5]) * 10
        independent = as_column(np.linspace(-5., 10., 50))
        noise = as_column(rand.uniform(-.1, .1, 50))
        dependents = 1.0 + true_betas * independent + noise

        # Fill 20% of the input arrays with nans randomly.
        dependents[rand.uniform(0, 1, dependents.shape) < pct_dependent] = nan
        independent[independent > np.nanmean(independent)] = nan

        # Sanity check that we actually inserted some nans.
        # NOTE(review): the dependents-side sanity check below is commented
        # out in the original source; left as-is.
        # self.assertTrue(np.count_nonzero(np.isnan(dependents)) > 0)
        self.assertTrue(np.count_nonzero(np.isnan(independent)) > 0)

        result = self.compare_with_empyrical(dependents, independent)

        # compare_with_empyrical uses requred_observations=0, so we shouldn't
        # have any nans in the output even though we had some in the input.
        self.assertTrue(not np.isnan(result).any())

    @parameter_space(nan_offset=[-1, 0, 1])
    def test_produce_nans_when_too_much_missing_data(self, nan_offset):
        rand = np.random.RandomState(42)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array.
        nan_grid = np.array([[1, 0, 0, 0, 0],
                             [1, 1, 0, 0, 0],
                             [1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 0],
                             [1, 1, 1, 1, 1]], dtype=bool)
        # Column i has (5 - i) nans: [5, 4, 3, 2, 1].
        num_nans = nan_grid.sum(axis=0)

        # Move the grid around in the parameterized tests. The positions
        # shouldn't matter.
        dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        for allowed_missing in range(7):
            results = vectorized_beta(dependents, independent, allowed_missing)
            for i, expected in enumerate(true_betas):
                result = results[i]
                # Beta should be nan exactly when the column's nan count
                # exceeds the allowed budget.
                expect_nan = num_nans[i] > allowed_missing
                true_beta = true_betas[i]
                if expect_nan:
                    self.assertTrue(np.isnan(result))
                else:
                    self.assertTrue(np.abs(result - true_beta) < 0.01)

    def test_allowed_missing_doesnt_double_count(self):
        # Test that allowed_missing only counts a row as missing one
        # observation if it's missing in both the dependent and independent
        # variable.
        rand = np.random.RandomState(42)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        # Each column has three nans in the grid.
        dependent_nan_grid = np.array([[0, 1, 1, 1, 0],
                                       [0, 0, 1, 1, 1],
                                       [1, 0, 0, 1, 1],
                                       [1, 1, 0, 0, 1],
                                       [1, 1, 1, 0, 0]], dtype=bool)
        # There are also two nans in the independent data.
        independent_nan_grid = np.array([[0],
                                         [0],
                                         [1],
                                         [1],
                                         [0]], dtype=bool)

        dependents[10:15][dependent_nan_grid] = np.nan
        independent[10:15][independent_nan_grid] = np.nan

        # With only two allowed missing values, everything should come up nan,
        # because column has at least 3 nans in the dependent data.
        result2 = vectorized_beta(dependents, independent, allowed_missing=2)
        assert_equal(np.isnan(result2),
                     np.array([True, True, True, True, True]))

        # With three allowed missing values, the first and last columns should
        # produce a value, because they have nans at the same rows where the
        # independent data has nans.
        result3 = vectorized_beta(dependents, independent, allowed_missing=3)
        assert_equal(np.isnan(result3),
                     np.array([False, True, True, True, False]))

        # With four allowed missing values, everything but the middle column
        # should produce a value. The middle column will have 5 nans because
        # the dependent nans have no overlap with the independent nans.
        result4 = vectorized_beta(dependents, independent, allowed_missing=4)
        assert_equal(np.isnan(result4),
                     np.array([False, False, True, False, False]))

        # With five allowed missing values, everything should produce a value.
        result5 = vectorized_beta(dependents, independent, allowed_missing=5)
        assert_equal(np.isnan(result5),
                     np.array([False, False, False, False, False]))
class VectorizedCorrelationTestCase(zf.ZiplineTestCase):
    """
    Tests for `zipline.pipeline.factors.statistical.vectorized_pearson_r`,
    cross-checked against a naive column-by-column scipy implementation.
    """

    def naive_columnwise_func(self, func, left, right):
        """
        Reference implementation: apply a scipy correlation function
        (`pearsonr`/`spearmanr`) column by column, dropping rows that are
        NaN in either input, and return the per-column coefficients.
        """
        out = np.empty_like(left[0])
        self.assertEqual(left.shape, right.shape)

        for col in range(left.shape[1]):
            left_col = left[:, col]
            right_col = right[:, col]
            # Drop any row that's missing in either column.
            missing = np.isnan(left_col) | np.isnan(right_col)
            left_col = left_col[~missing]
            right_col = right_col[~missing]
            r, pvalue = func(left_col, right_col)
            out[col] = r

        return out

    def naive_columnwise_pearson(self, left, right):
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rand = np.random.RandomState(seed)
        betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rand.uniform(-2, 2, 30))
        dependents = 1.0 + betas * independents + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array. Column i gets i + 1 nans.
        nan_grid = np.array([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1],
                             [0, 0, 0, 0, 1]], dtype=bool)

        if nans == 'dependent' or nans == 'both':
            dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan
        if nans == 'independent' or nans == 'both':
            independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)

        for allowed_missing in list(range(7)) + [10000]:
            results = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for i, result in enumerate(results):
                # column i has i + 1 missing values.
                if i + 1 > allowed_missing:
                    self.assertTrue(np.isnan(result))
                else:
                    assert_equal(result, expected[i])

    def test_broadcasting(self):
        # An N x 1 independent column should broadcast against an N x k
        # dependent array exactly as if it had been tiled to N x k.
        _independent = as_column(np.array([1, 2, 3, 4, 5]))
        dependent = _independent * [2.5, 1.0, -3.5]

        def do_check(independent):
            result = vectorized_pearson_r(
                dependent, independent, allowed_missing=0
            )
            # Perfect positive/positive/negative linear relationships.
            assert_equal(result, np.array([1.0, 1.0, -1.0]))

        # We should get the same result from passing a N x 1 array or an N x 3
        # array with the column tiled 3 times.
        do_check(_independent)
        do_check(np.tile(_independent, 3))