From e5066fcfa95f4f743e1b795404931c94988dc027 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 1 Jan 2018 20:37:36 -0500 Subject: [PATCH 1/4] PERF: Speed up RollingPearson. Use the same techniques used in `SimpleBeta` to re-implement RollingPearson (and RollingPearsonOfReturns, etc.). For a 1-month window length, this provides about a 30x speedup on my machine: ``` pipebench/perf/pearson.stats% stats statistical.py Mon Jan 1 20:00:31 2018 pipebench/perf/pearson.stats 9326197 function calls (9325541 primitive calls) in 8.407 seconds Ordered by: cumulative time List reduced from 766 to 3 due to restriction <'statistical.py'> ncalls tottime percall cumtime percall filename:lineno(function) # Old implementation. 21 0.243 0.012 7.130 0.340 statistical.py:92(compute) # New implementation. 21 0.000 0.000 0.253 0.012 statistical.py:105(compute) 21 0.170 0.008 0.253 0.012 statistical.py:685(vectorized_pearson_r) ``` The new `vectorized_pearson_r` also has the same support for missing data that was implemented in `vectorized_beta`, but I haven't pushed that logic up to `RollingPearson` yet. For backwards compatibility, the default for RollingPearson probably needs to stay at 0% allowed missing. 
--- tests/pipeline/test_statistical.py | 89 ++++++++++++++++++++++++- zipline/pipeline/factors/statistical.py | 85 +++++++++++++++++++++-- 2 files changed, 166 insertions(+), 8 deletions(-) diff --git a/tests/pipeline/test_statistical.py b/tests/pipeline/test_statistical.py index 7239d89e..6a184f40 100644 --- a/tests/pipeline/test_statistical.py +++ b/tests/pipeline/test_statistical.py @@ -33,7 +33,10 @@ from zipline.pipeline.factors import ( RollingSpearmanOfReturns, SimpleBeta, ) -from zipline.pipeline.factors.statistical import vectorized_beta +from zipline.pipeline.factors.statistical import ( + vectorized_beta, + vectorized_pearson_r, +) from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.pipeline.sentinels import NotSpecified from zipline.testing import ( @@ -1051,3 +1054,87 @@ class VectorizedBetaTestCase(ZiplineTestCase): result5 = vectorized_beta(dependents, independent, allowed_missing=5) assert_equal(np.isnan(result5), np.array([False, False, False, False, False])) + + +class VectorizedCorrelationTestCase(ZiplineTestCase): + + def naive_columnwise_func(self, func, left, right): + out = np.empty_like(left[0]) + self.assertEqual(left.shape, right.shape) + + for col in range(left.shape[1]): + left_col = left[:, col] + right_col = right[:, col] + missing = np.isnan(left_col) | np.isnan(right_col) + left_col = left_col[~missing] + right_col = right_col[~missing] + r, pvalue = func(left_col, right_col) + out[col] = r + + return out + + def naive_columnwise_pearson(self, left, right): + return self.naive_columnwise_func(pearsonr, left, right) + + def naive_columnwise_spearman(self, left, right): + return self.naive_columnwise_func(spearmanr, left, right) + + @parameter_space( + seed=[1, 2, 42], + nan_offset=[-1, 0, 1], + nans=['dependent', 'independent', 'both'], + __fail_fast=True, + ) + def test_produce_nans_when_too_much_missing_data(self, + seed, + nans, + nan_offset): + rand = np.random.RandomState(seed) + + betas = np.array([-0.5, 0.0, 
0.5, 1.0, 1.5]) + independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5) + noise = as_column(rand.uniform(-2, 2, 30)) + dependents = 1.0 + betas * independents + noise + + # Write nans in a triangular pattern into the middle of the dependent + # array. + nan_grid = np.array([[1, 1, 1, 1, 1], + [0, 1, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 0, 1, 1], + [0, 0, 0, 0, 1]], dtype=bool) + num_nans = nan_grid.sum(axis=0) + + if nans == 'dependent' or nans == 'both': + dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan + if nans == 'independent' or nans == 'both': + independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan + + expected = self.naive_columnwise_pearson(dependents, independents) + for allowed_missing in list(range(7)) + [10000]: + results = vectorized_pearson_r( + dependents, independents, allowed_missing + ) + for i, result in enumerate(results): + # column i has i + 1 missing values. + if i + 1 > allowed_missing: + self.assertTrue(np.isnan(result)) + else: + assert_equal(result, expected[i]) + + def test_broadcasting(self): + rand = np.random.RandomState(42) + + _independent = as_column(np.array([1, 2, 3, 4, 5])) + dependent = _independent * [2.5, 1.0, -3.5] + + def do_check(independent): + result = vectorized_pearson_r( + dependent, independent, allowed_missing=0 + ) + assert_equal(result, np.array([1.0, 1.0, -1.0])) + + # We should get the same result from passing a N x 1 array or an N x 3 + # array with the column tiled 3 times. 
+ do_check(_independent) + do_check(np.tile(_independent, 3)) diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 01137b88..8b055d13 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -1,3 +1,4 @@ +from numexpr import evaluate import numpy as np from numpy import broadcast_arrays from scipy.stats import ( @@ -88,13 +89,12 @@ class RollingPearson(_RollingCorrelation): window_safe = True def compute(self, today, assets, out, base_data, target_data): - # If `target_data` is a Slice or single column of data, broadcast it - # out to the same shape as `base_data`, then compute column-wise. This - # is efficient because each column of the broadcasted array only refers - # to a single memory location. - target_data = broadcast_arrays(target_data, base_data)[0] - for i in range(len(out)): - out[i] = pearsonr(base_data[:, i], target_data[:, i])[0] + vectorized_pearson_r( + base_data, + target_data, + allowed_missing=0, + out=out, + ) class RollingSpearman(_RollingCorrelation): @@ -663,3 +663,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None): out[nanlocs] = nan return out + + +def vectorized_pearson_r(dependents, independents, allowed_missing, out=None): + """ + Compute Pearson's r between columns of ``dependents`` and ``independents``. + + Parameters + ---------- + dependents : np.array[N, M] + Array with columns of data to be correlated with ``independents``. + independents : np.array[N, M] or np.array[N, 1] + Independent variable(s) of the correlation. If a single column is + passed, it is broadcast to the shape of ``dependents``. + allowed_missing : int + Number of allowed missing (NaN) observations per column. Columns with + more than this many missing (NaN) observations in either ``dependents`` + or ``independents`` will output NaN as the correlation coefficient. + out : np.array[M] or None, optional + Output array into which to write results. 
If None, a new array is + created and returned. + + Returns + ------- + correlations : np.array[M] + Pearson correlation coefficients for each column of ``dependents``. + + See Also + -------- + :class:`zipline.pipeline.factors.RollingPearson` + :class:`zipline.pipeline.factors.RollingPearsonOfReturns` + """ + nan = np.nan + isnan = np.isnan + N, M = dependents.shape + + if out is None: + out = np.full(M, nan) + + if allowed_missing > 0: + # If we're handling nans robustly, we need to mask both arrays to + # locations where either was nan. + either_nan = isnan(dependents) | isnan(independents) + independents = np.where(either_nan, nan, independents) + dependents = np.where(either_nan, nan, dependents) + mean = nanmean + else: + # Otherwise, we can just use mean, which will give us a nan for any + # column where there's ever a nan. + mean = np.mean + + # Pearson R is Cov(X, Y) / Var(X) * Var(Y). + # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + ind_residual = independents - mean(independents, axis=0) + dep_residual = dependents - mean(dependents, axis=0) + + ind_variance = mean(ind_residual ** 2, axis=0) + dep_variance = mean(dep_residual ** 2, axis=0) + + covariances = mean(ind_residual * dep_residual, axis=0) + + evaluate( + 'where(mask, nan, cov / sqrt(ind_variance * dep_variance))', + local_dict={'cov': covariances, + 'mask': isnan(independents).sum(axis=0) > allowed_missing, + 'nan': np.nan, + 'ind_variance': ind_variance, + 'dep_variance': dep_variance}, + global_dict={}, + out=out, + ) + return out From 69a4c38dfc98a851fc53ba5fcc005db13255fb67 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 1 Jan 2018 20:51:55 -0500 Subject: [PATCH 2/4] DOC: Clarify docstring for vectorized_beta. 
--- zipline/pipeline/factors/statistical.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 8b055d13..42de86f6 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -582,8 +582,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None): Independent variable of the regression allowed_missing : int Number of allowed missing (NaN) observations per column. Columns with - more than this many non-nan observations in both ``dependents`` and + more than this many missing (NaN) observations in either ``dependents`` or ``independents`` will output NaN as the regression coefficient. + out : np.array[M] or None, optional + Output array into which to write results. If None, a new array is + created and returned. Returns ------- From c3553940e3b91266bd0b86f0f9a3480f76dfd58c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 1 Jan 2018 21:17:18 -0500 Subject: [PATCH 3/4] STY: Flake8 fixes. --- tests/pipeline/test_statistical.py | 5 +---- zipline/pipeline/factors/statistical.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/pipeline/test_statistical.py b/tests/pipeline/test_statistical.py index 6a184f40..2456b6a9 100644 --- a/tests/pipeline/test_statistical.py +++ b/tests/pipeline/test_statistical.py @@ -1103,7 +1103,6 @@ class VectorizedCorrelationTestCase(ZiplineTestCase): [0, 0, 1, 1, 1], [0, 0, 0, 1, 1], [0, 0, 0, 0, 1]], dtype=bool) - num_nans = nan_grid.sum(axis=0) if nans == 'dependent' or nans == 'both': dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan if nans == 'independent' or nans == 'both': independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan expected = self.naive_columnwise_pearson(dependents, independents) for allowed_missing in list(range(7)) + [10000]: results = vectorized_pearson_r( dependents, independents, allowed_missing ) for i, result in enumerate(results): # column i has i + 1 missing values. 
- if i + 1 > allowed_missing: + if i + 1 > allowed_missing: self.assertTrue(np.isnan(result)) else: assert_equal(result, expected[i]) def test_broadcasting(self): - rand = np.random.RandomState(42) - _independent = as_column(np.array([1, 2, 3, 4, 5])) dependent = _independent * [2.5, 1.0, -3.5] diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 42de86f6..09286ba4 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -3,7 +3,6 @@ import numpy as np from numpy import broadcast_arrays from scipy.stats import ( linregress, - pearsonr, spearmanr, ) From cb61ba9650dbde1e0df9c74b305f8c3d43d082a3 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 2 Jan 2018 12:28:19 -0500 Subject: [PATCH 4/4] DOC: Fix incorrect formula in comment. --- zipline/pipeline/factors/statistical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 09286ba4..511e2680 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -715,7 +715,7 @@ def vectorized_pearson_r(dependents, independents, allowed_missing, out=None): # column where there's ever a nan. mean = np.mean - # Pearson R is Cov(X, Y) / Var(X) * Var(Y). + # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)) # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient ind_residual = independents - mean(independents, axis=0) dep_residual = dependents - mean(dependents, axis=0)