mirror of
https://github.com/saymrwulf/zipline.git
synced 2026-05-14 20:58:10 +00:00
Merge pull request #2071 from quantopian/speedup-pearson
PERF: Speedup RollingPearson
This commit is contained in:
commit
7eeaafbc69
2 changed files with 167 additions and 10 deletions
|
|
@ -34,7 +34,10 @@ from zipline.pipeline.factors import (
|
|||
RollingSpearmanOfReturns,
|
||||
SimpleBeta,
|
||||
)
|
||||
from zipline.pipeline.factors.statistical import vectorized_beta
|
||||
from zipline.pipeline.factors.statistical import (
|
||||
vectorized_beta,
|
||||
vectorized_pearson_r,
|
||||
)
|
||||
from zipline.pipeline.loaders.frame import DataFrameLoader
|
||||
from zipline.pipeline.sentinels import NotSpecified
|
||||
from zipline.testing import (
|
||||
|
|
@ -1059,3 +1062,84 @@ class VectorizedBetaTestCase(zf.ZiplineTestCase):
|
|||
result5 = vectorized_beta(dependents, independent, allowed_missing=5)
|
||||
assert_equal(np.isnan(result5),
|
||||
np.array([False, False, False, False, False]))
|
||||
|
||||
|
||||
class VectorizedCorrelationTestCase(ZiplineTestCase):

    def naive_columnwise_func(self, func, left, right):
        """Reference implementation: apply a scipy correlation function
        (returning ``(r, pvalue)``) to each column pair of ``left`` and
        ``right``, dropping any row where either value is NaN.
        """
        self.assertEqual(left.shape, right.shape)
        out = np.empty_like(left[0])

        for j in range(left.shape[1]):
            lhs = left[:, j]
            rhs = right[:, j]
            keep = ~(np.isnan(lhs) | np.isnan(rhs))
            r, _pvalue = func(lhs[keep], rhs[keep])
            out[j] = r

        return out

    def naive_columnwise_pearson(self, left, right):
        """Columnwise Pearson r via :func:`scipy.stats.pearsonr`."""
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        """Columnwise Spearman rho via :func:`scipy.stats.spearmanr`."""
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rand = np.random.RandomState(seed)

        betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rand.uniform(-2, 2, 30))
        dependents = 1.0 + betas * independents + noise

        # Triangular NaN pattern written into the middle of the data:
        # column i receives exactly i + 1 NaNs.
        nan_grid = np.array([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1],
                             [0, 0, 0, 0, 1]], dtype=bool)

        if nans in ('dependent', 'both'):
            dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan
        if nans in ('independent', 'both'):
            independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)
        for allowed_missing in list(range(7)) + [10000]:
            results = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for i, result in enumerate(results):
                # Column i has i + 1 missing values, so it must be NaN
                # exactly when that count exceeds ``allowed_missing``.
                if i + 1 > allowed_missing:
                    self.assertTrue(np.isnan(result))
                else:
                    assert_equal(result, expected[i])

    def test_broadcasting(self):
        base = as_column(np.array([1, 2, 3, 4, 5]))
        dependent = base * [2.5, 1.0, -3.5]

        def do_check(independent):
            result = vectorized_pearson_r(
                dependent, independent, allowed_missing=0
            )
            assert_equal(result, np.array([1.0, 1.0, -1.0]))

        # An N x 1 input and the same column tiled to N x 3 must produce
        # identical results.
        do_check(base)
        do_check(np.tile(base, 3))
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
from numexpr import evaluate
|
||||
import numpy as np
|
||||
from numpy import broadcast_arrays
|
||||
from scipy.stats import (
|
||||
linregress,
|
||||
pearsonr,
|
||||
spearmanr,
|
||||
)
|
||||
|
||||
|
|
@ -88,13 +88,12 @@ class RollingPearson(_RollingCorrelation):
|
|||
window_safe = True
|
||||
|
||||
def compute(self, today, assets, out, base_data, target_data):
    """Write the Pearson correlation of each column of ``base_data``
    against ``target_data`` into ``out``.

    ``allowed_missing=0`` means any column containing a NaN produces a
    NaN correlation. ``target_data`` may be a single column (e.g. a
    Slice); it broadcasts against ``base_data`` inside the vectorized
    kernel.
    """
    vectorized_pearson_r(
        base_data,
        target_data,
        allowed_missing=0,
        out=out,
    )
|
||||
|
||||
|
||||
class RollingSpearman(_RollingCorrelation):
|
||||
|
|
@ -582,8 +581,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
|
|||
Independent variable of the regression
|
||||
allowed_missing : int
|
||||
Number of allowed missing (NaN) observations per column. Columns with
|
||||
more than this many non-nan observations in both ``dependents`` and
|
||||
more than this many missing (NaN) observations in either ``dependents`` or
|
||||
``independents`` will output NaN as the regression coefficient.
|
||||
out : np.array[M] or None, optional
|
||||
Output array into which to write results. If None, a new array is
|
||||
created and returned.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -663,3 +665,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
|
|||
out[nanlocs] = nan
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def vectorized_pearson_r(dependents, independents, allowed_missing, out=None):
    """
    Compute Pearson's r between columns of ``dependents`` and ``independents``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be correlated against ``independents``.
    independents : np.array[N, M] or np.array[N, 1]
        Independent variable(s) of the regression. If a single column is
        passed, it is broadcast to the shape of ``dependents``.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many missing observations in either ``dependents`` or
        ``independents`` will output NaN as the correlation coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results. If None, a new array is
        created and returned.

    Returns
    -------
    correlations : np.array[M]
        Pearson correlation coefficients for each column of ``dependents``.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearson`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    """
    nan = np.nan
    isnan = np.isnan
    # N observations (rows) per column; M output correlations.
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    if allowed_missing > 0:
        # If we're handling nans robustly, we need to mask both arrays to
        # locations where either was nan, so each column's statistics are
        # computed over exactly the rows where both values are present.
        either_nan = isnan(dependents) | isnan(independents)
        independents = np.where(either_nan, nan, independents)
        dependents = np.where(either_nan, nan, dependents)
        mean = np.nanmean
    else:
        # Otherwise, we can just use mean, which will give us a nan for any
        # column where there's ever a nan.
        mean = np.mean

    # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)).
    # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    ind_residual = independents - mean(independents, axis=0)
    dep_residual = dependents - mean(dependents, axis=0)

    ind_variance = mean(ind_residual ** 2, axis=0)
    dep_variance = mean(dep_residual ** 2, axis=0)

    covariances = mean(ind_residual * dep_residual, axis=0)

    # Counting nans in ``independents`` alone is sufficient: when
    # allowed_missing > 0, both arrays were overwritten above with the union
    # of their nan locations; when allowed_missing == 0, np.mean already
    # propagates any nan in ``dependents`` into ``covariances``.
    mask = isnan(independents).sum(axis=0) > allowed_missing

    # Suppress divide/invalid warnings: positions that produce nan or inf
    # here are either masked out below or legitimately nan.
    with np.errstate(divide='ignore', invalid='ignore'):
        out[:] = np.where(
            mask,
            nan,
            covariances / np.sqrt(ind_variance * dep_variance),
        )
    return out
|
||||
|
|
|
|||
Loading…
Reference in a new issue