Merge pull request #2071 from quantopian/speedup-pearson

PERF: Speedup RollingPearson
This commit is contained in:
Scott Sanderson 2020-01-21 10:57:37 -05:00 committed by GitHub
commit 7eeaafbc69
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 167 additions and 10 deletions

View file

@@ -34,7 +34,10 @@ from zipline.pipeline.factors import (
RollingSpearmanOfReturns,
SimpleBeta,
)
from zipline.pipeline.factors.statistical import vectorized_beta
from zipline.pipeline.factors.statistical import (
vectorized_beta,
vectorized_pearson_r,
)
from zipline.pipeline.loaders.frame import DataFrameLoader
from zipline.pipeline.sentinels import NotSpecified
from zipline.testing import (
@@ -1059,3 +1062,84 @@ class VectorizedBetaTestCase(zf.ZiplineTestCase):
result5 = vectorized_beta(dependents, independent, allowed_missing=5)
assert_equal(np.isnan(result5),
np.array([False, False, False, False, False]))
class VectorizedCorrelationTestCase(zf.ZiplineTestCase):
    """
    Tests for :func:`vectorized_pearson_r`, checking it against naive
    per-column scipy computations.
    """

    def naive_columnwise_func(self, func, left, right):
        """
        Apply a scipy correlation function column by column.

        For each column, rows where either input is NaN are dropped before
        calling ``func``; only the r statistic (first element of the result)
        is kept.

        Parameters
        ----------
        func : callable
            A scipy-style correlation function returning ``(r, pvalue)``.
        left, right : np.array[N, M]
            Arrays of matching shape whose columns are correlated pairwise.

        Returns
        -------
        np.array[M]
            Per-column correlation statistics.
        """
        out = np.empty_like(left[0])
        self.assertEqual(left.shape, right.shape)
        for col in range(left.shape[1]):
            left_col = left[:, col]
            right_col = right[:, col]
            # Drop rows where either side is missing, mirroring the
            # allowed_missing semantics of the vectorized implementation.
            missing = np.isnan(left_col) | np.isnan(right_col)
            left_col = left_col[~missing]
            right_col = right_col[~missing]
            r, _pvalue = func(left_col, right_col)
            out[col] = r
        return out

    def naive_columnwise_pearson(self, left, right):
        """Column-wise Pearson r via scipy, as a reference implementation."""
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        """Column-wise Spearman rho via scipy, as a reference implementation."""
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rand = np.random.RandomState(seed)
        betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rand.uniform(-2, 2, 30))
        dependents = 1.0 + betas * independents + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array, so that column i has exactly i + 1 missing observations.
        nan_grid = np.array([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1],
                             [0, 0, 0, 0, 1]], dtype=bool)

        if nans == 'dependent' or nans == 'both':
            dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan
        if nans == 'independent' or nans == 'both':
            independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)

        for allowed_missing in list(range(7)) + [10000]:
            results = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for i, result in enumerate(results):
                # Column i has i + 1 missing values, so it should produce
                # NaN whenever that exceeds the allowance.
                if i + 1 > allowed_missing:
                    self.assertTrue(np.isnan(result))
                else:
                    assert_equal(result, expected[i])

    def test_broadcasting(self):
        _independent = as_column(np.array([1, 2, 3, 4, 5]))
        # Perfect positive/positive/negative linear relationships.
        dependent = _independent * [2.5, 1.0, -3.5]

        def do_check(independent):
            result = vectorized_pearson_r(
                dependent, independent, allowed_missing=0
            )
            assert_equal(result, np.array([1.0, 1.0, -1.0]))

        # We should get the same result from passing a N x 1 array or an
        # N x 3 array with the column tiled 3 times.
        do_check(_independent)
        do_check(np.tile(_independent, 3))

View file

@@ -1,8 +1,8 @@
from numexpr import evaluate
import numpy as np
from numpy import broadcast_arrays
from scipy.stats import (
linregress,
pearsonr,
spearmanr,
)
@@ -88,13 +88,12 @@ class RollingPearson(_RollingCorrelation):
window_safe = True
def compute(self, today, assets, out, base_data, target_data):
# If `target_data` is a Slice or single column of data, broadcast it
# out to the same shape as `base_data`, then compute column-wise. This
# is efficient because each column of the broadcasted array only refers
# to a single memory location.
target_data = broadcast_arrays(target_data, base_data)[0]
for i in range(len(out)):
out[i] = pearsonr(base_data[:, i], target_data[:, i])[0]
vectorized_pearson_r(
base_data,
target_data,
allowed_missing=0,
out=out,
)
class RollingSpearman(_RollingCorrelation):
@@ -582,8 +581,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
Independent variable of the regression
allowed_missing : int
Number of allowed missing (NaN) observations per column. Columns with
more than this many non-nan observations in both ``dependents`` and
more than this many non-nan observations in either ``dependents`` or
``independents`` will output NaN as the regression coefficient.
out : np.array[M] or None, optional
Output array into which to write results. If None, a new array is
created and returned.
Returns
-------
@@ -663,3 +665,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
out[nanlocs] = nan
return out
def vectorized_pearson_r(dependents, independents, allowed_missing, out=None):
    """
    Compute Pearson's r between columns of ``dependents`` and ``independents``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be correlated against ``independents``.
    independents : np.array[N, M] or np.array[N, 1]
        Independent variable(s) of the correlation. If a single column is
        passed, it is broadcast to the shape of ``dependents``.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many missing observations in either ``dependents`` or
        ``independents`` will output NaN as the correlation coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results. If None, a new array is
        created and returned.

    Returns
    -------
    correlations : np.array[M]
        Pearson correlation coefficients for each column of ``dependents``.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearson`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    """
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    if allowed_missing > 0:
        # If we're handling nans robustly, we need to mask both arrays to
        # locations where either was nan, so that per-column means are taken
        # over exactly the same observations on both sides.
        either_nan = isnan(dependents) | isnan(independents)
        independents = np.where(either_nan, nan, independents)
        dependents = np.where(either_nan, nan, dependents)
        mean = np.nanmean
    else:
        # Otherwise, we can just use mean, which will give us a nan for any
        # column where there's ever a nan.
        mean = np.mean

    # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)).
    # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    ind_residual = independents - mean(independents, axis=0)
    dep_residual = dependents - mean(dependents, axis=0)

    ind_variance = mean(ind_residual ** 2, axis=0)
    dep_variance = mean(dep_residual ** 2, axis=0)

    covariances = mean(ind_residual * dep_residual, axis=0)

    # Columns with too many missing observations get NaN; after the masking
    # above, NaN counts in ``independents`` reflect either-side missingness.
    evaluate(
        'where(mask, nan, cov / sqrt(ind_variance * dep_variance))',
        local_dict={'cov': covariances,
                    'mask': isnan(independents).sum(axis=0) > allowed_missing,
                    'nan': np.nan,
                    'ind_variance': ind_variance,
                    'dep_variance': dep_variance},
        global_dict={},
        out=out,
    )
    return out