Merge pull request #2071 from quantopian/speedup-pearson

PERF: Speedup RollingPearson
This commit is contained in:
Scott Sanderson 2020-01-21 10:57:37 -05:00 committed by GitHub
commit 7eeaafbc69
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 167 additions and 10 deletions

View file

@@ -34,7 +34,10 @@ from zipline.pipeline.factors import (
RollingSpearmanOfReturns,
SimpleBeta,
)
from zipline.pipeline.factors.statistical import vectorized_beta
from zipline.pipeline.factors.statistical import (
vectorized_beta,
vectorized_pearson_r,
)
from zipline.pipeline.loaders.frame import DataFrameLoader
from zipline.pipeline.sentinels import NotSpecified
from zipline.testing import (
@@ -1059,3 +1062,84 @@ class VectorizedBetaTestCase(zf.ZiplineTestCase):
result5 = vectorized_beta(dependents, independent, allowed_missing=5)
assert_equal(np.isnan(result5),
np.array([False, False, False, False, False]))
class VectorizedCorrelationTestCase(zf.ZiplineTestCase):
    """
    Tests for :func:`vectorized_pearson_r`, checking it against naive
    per-column scipy computations.
    """

    def naive_columnwise_func(self, func, left, right):
        """
        Apply a scipy correlation function column by column.

        For each column, rows where either input is NaN are dropped before
        calling ``func``; only the r statistic (first element of the result)
        is kept.

        Parameters
        ----------
        func : callable
            A scipy-style correlation function returning ``(r, pvalue)``.
        left, right : np.array[N, M]
            Arrays of matching shape whose columns are correlated pairwise.

        Returns
        -------
        np.array[M]
            Per-column correlation statistics.
        """
        out = np.empty_like(left[0])
        self.assertEqual(left.shape, right.shape)
        for col in range(left.shape[1]):
            left_col = left[:, col]
            right_col = right[:, col]
            # Drop rows where either side is missing, mirroring the
            # allowed_missing semantics of the vectorized implementation.
            missing = np.isnan(left_col) | np.isnan(right_col)
            left_col = left_col[~missing]
            right_col = right_col[~missing]
            r, _pvalue = func(left_col, right_col)
            out[col] = r
        return out

    def naive_columnwise_pearson(self, left, right):
        """Column-wise Pearson r via scipy, as a reference implementation."""
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        """Column-wise Spearman rho via scipy, as a reference implementation."""
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rand = np.random.RandomState(seed)
        betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rand.uniform(-2, 2, 30))
        dependents = 1.0 + betas * independents + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array, so that column i has exactly i + 1 missing observations.
        nan_grid = np.array([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1],
                             [0, 0, 0, 0, 1]], dtype=bool)

        if nans == 'dependent' or nans == 'both':
            dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan
        if nans == 'independent' or nans == 'both':
            independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)

        for allowed_missing in list(range(7)) + [10000]:
            results = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for i, result in enumerate(results):
                # Column i has i + 1 missing values, so it should produce
                # NaN whenever that exceeds the allowance.
                if i + 1 > allowed_missing:
                    self.assertTrue(np.isnan(result))
                else:
                    assert_equal(result, expected[i])

    def test_broadcasting(self):
        _independent = as_column(np.array([1, 2, 3, 4, 5]))
        # Perfect positive/positive/negative linear relationships.
        dependent = _independent * [2.5, 1.0, -3.5]

        def do_check(independent):
            result = vectorized_pearson_r(
                dependent, independent, allowed_missing=0
            )
            assert_equal(result, np.array([1.0, 1.0, -1.0]))

        # We should get the same result from passing a N x 1 array or an
        # N x 3 array with the column tiled 3 times.
        do_check(_independent)
        do_check(np.tile(_independent, 3))

View file

@@ -1,8 +1,8 @@
from numexpr import evaluate
import numpy as np
from numpy import broadcast_arrays
from scipy.stats import (
linregress,
pearsonr,
spearmanr,
)
@@ -88,13 +88,12 @@ class RollingPearson(_RollingCorrelation):
window_safe = True
def compute(self, today, assets, out, base_data, target_data):
# If `target_data` is a Slice or single column of data, broadcast it
# out to the same shape as `base_data`, then compute column-wise. This
# is efficient because each column of the broadcasted array only refers
# to a single memory location.
target_data = broadcast_arrays(target_data, base_data)[0]
for i in range(len(out)):
out[i] = pearsonr(base_data[:, i], target_data[:, i])[0]
vectorized_pearson_r(
base_data,
target_data,
allowed_missing=0,
out=out,
)
class RollingSpearman(_RollingCorrelation):
@@ -582,8 +581,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
Independent variable of the regression
allowed_missing : int
Number of allowed missing (NaN) observations per column. Columns with
more than this many non-nan observations in both ``dependents`` and
more than this many non-nan observations in either ``dependents`` or
``independents`` will output NaN as the regression coefficient.
out : np.array[M] or None, optional
Output array into which to write results. If None, a new array is
created and returned.
Returns
-------
@@ -663,3 +665,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None):
out[nanlocs] = nan
return out
def vectorized_pearson_r(dependents, independents, allowed_missing, out=None):
    """
    Compute Pearson's r between columns of ``dependents`` and ``independents``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be correlated against ``independents``.
    independents : np.array[N, M] or np.array[N, 1]
        Independent variable(s) of the correlation. If a single column is
        passed, it is broadcast to the shape of ``dependents``.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many missing observations in either ``dependents`` or
        ``independents`` will output NaN as the correlation coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results. If None, a new array is
        created and returned.

    Returns
    -------
    correlations : np.array[M]
        Pearson correlation coefficients for each column of ``dependents``.

    See Also
    --------
    :class:`zipline.pipeline.factors.RollingPearson`
    :class:`zipline.pipeline.factors.RollingPearsonOfReturns`
    """
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    if allowed_missing > 0:
        # If we're handling nans robustly, we need to mask both arrays to
        # locations where either was nan, so that per-column means are taken
        # over exactly the same observations on both sides.
        either_nan = isnan(dependents) | isnan(independents)
        independents = np.where(either_nan, nan, independents)
        dependents = np.where(either_nan, nan, dependents)
        mean = np.nanmean
    else:
        # Otherwise, we can just use mean, which will give us a nan for any
        # column where there's ever a nan.
        mean = np.mean

    # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)).
    # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    ind_residual = independents - mean(independents, axis=0)
    dep_residual = dependents - mean(dependents, axis=0)

    ind_variance = mean(ind_residual ** 2, axis=0)
    dep_variance = mean(dep_residual ** 2, axis=0)

    covariances = mean(ind_residual * dep_residual, axis=0)

    # Columns with too many missing observations get NaN; after the masking
    # above, NaN counts in ``independents`` reflect either-side missingness.
    evaluate(
        'where(mask, nan, cov / sqrt(ind_variance * dep_variance))',
        local_dict={'cov': covariances,
                    'mask': isnan(independents).sum(axis=0) > allowed_missing,
                    'nan': np.nan,
                    'ind_variance': ind_variance,
                    'dep_variance': dep_variance},
        global_dict={},
        out=out,
    )
    return out