From e5066fcfa95f4f743e1b795404931c94988dc027 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 1 Jan 2018 20:37:36 -0500 Subject: [PATCH 1/4] PERF: Speed up RollingPearson. Use the same techniques used in `SimpleBeta` to re-implement RollingPearson (and RollingPearsonOfReturns, etc.). For a 1-month window length, this provides about a 30x speedup on my machine: ``` pipebench/perf/pearson.stats% stats statistical.py Mon Jan 1 20:00:31 2018 pipebench/perf/pearson.stats 9326197 function calls (9325541 primitive calls) in 8.407 seconds Ordered by: cumulative time List reduced from 766 to 3 due to restriction <'statistical.py'> ncalls tottime percall cumtime percall filename:lineno(function) # Old implementation. 21 0.243 0.012 7.130 0.340 statistical.py:92(compute) # New implementation. 21 0.000 0.000 0.253 0.012 statistical.py:105(compute) 21 0.170 0.008 0.253 0.012 statistical.py:685(vectorized_pearson_r) ``` The new `vectorized_pearson_r` also has the same support for missing data that was implemented in `vectorized_beta`, but I haven't pushed that logic up to `RollingPearson` yet. For backwards compatibility, the default for RollingPearson probably needs to stay at 0% allowed missing. 
--- tests/pipeline/test_statistical.py | 89 ++++++++++++++++++++++++- zipline/pipeline/factors/statistical.py | 85 +++++++++++++++++++++-- 2 files changed, 166 insertions(+), 8 deletions(-) diff --git a/tests/pipeline/test_statistical.py b/tests/pipeline/test_statistical.py index 7239d89e..6a184f40 100644 --- a/tests/pipeline/test_statistical.py +++ b/tests/pipeline/test_statistical.py @@ -33,7 +33,10 @@ from zipline.pipeline.factors import ( RollingSpearmanOfReturns, SimpleBeta, ) -from zipline.pipeline.factors.statistical import vectorized_beta +from zipline.pipeline.factors.statistical import ( + vectorized_beta, + vectorized_pearson_r, +) from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.pipeline.sentinels import NotSpecified from zipline.testing import ( @@ -1051,3 +1054,87 @@ class VectorizedBetaTestCase(ZiplineTestCase): result5 = vectorized_beta(dependents, independent, allowed_missing=5) assert_equal(np.isnan(result5), np.array([False, False, False, False, False])) + + +class VectorizedCorrelationTestCase(ZiplineTestCase): + + def naive_columnwise_func(self, func, left, right): + out = np.empty_like(left[0]) + self.assertEqual(left.shape, right.shape) + + for col in range(left.shape[1]): + left_col = left[:, col] + right_col = right[:, col] + missing = np.isnan(left_col) | np.isnan(right_col) + left_col = left_col[~missing] + right_col = right_col[~missing] + r, pvalue = func(left_col, right_col) + out[col] = r + + return out + + def naive_columnwise_pearson(self, left, right): + return self.naive_columnwise_func(pearsonr, left, right) + + def naive_columnwise_spearman(self, left, right): + return self.naive_columnwise_func(spearmanr, left, right) + + @parameter_space( + seed=[1, 2, 42], + nan_offset=[-1, 0, 1], + nans=['dependent', 'independent', 'both'], + __fail_fast=True, + ) + def test_produce_nans_when_too_much_missing_data(self, + seed, + nans, + nan_offset): + rand = np.random.RandomState(seed) + + betas = np.array([-0.5, 0.0, 
0.5, 1.0, 1.5]) + independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5) + noise = as_column(rand.uniform(-2, 2, 30)) + dependents = 1.0 + betas * independents + noise + + # Write nans in a triangular pattern into the middle of the dependent + # array. + nan_grid = np.array([[1, 1, 1, 1, 1], + [0, 1, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 0, 1, 1], + [0, 0, 0, 0, 1]], dtype=bool) + num_nans = nan_grid.sum(axis=0) + + if nans == 'dependent' or nans == 'both': + dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan + if nans == 'independent' or nans == 'both': + independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan + + expected = self.naive_columnwise_pearson(dependents, independents) + for allowed_missing in list(range(7)) + [10000]: + results = vectorized_pearson_r( + dependents, independents, allowed_missing + ) + for i, result in enumerate(results): + # column i has i + 1 missing values. + if i + 1 > allowed_missing: + self.assertTrue(np.isnan(result)) + else: + assert_equal(result, expected[i]) + + def test_broadcasting(self): + rand = np.random.RandomState(42) + + _independent = as_column(np.array([1, 2, 3, 4, 5])) + dependent = _independent * [2.5, 1.0, -3.5] + + def do_check(independent): + result = vectorized_pearson_r( + dependent, independent, allowed_missing=0 + ) + assert_equal(result, np.array([1.0, 1.0, -1.0])) + + # We should get the same result from passing a N x 1 array or an N x 3 + # array with the column tiled 3 times. 
+ do_check(_independent) + do_check(np.tile(_independent, 3)) diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 01137b88..8b055d13 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -1,3 +1,4 @@ +from numexpr import evaluate import numpy as np from numpy import broadcast_arrays from scipy.stats import ( @@ -88,13 +89,12 @@ class RollingPearson(_RollingCorrelation): window_safe = True def compute(self, today, assets, out, base_data, target_data): - # If `target_data` is a Slice or single column of data, broadcast it - # out to the same shape as `base_data`, then compute column-wise. This - # is efficient because each column of the broadcasted array only refers - # to a single memory location. - target_data = broadcast_arrays(target_data, base_data)[0] - for i in range(len(out)): - out[i] = pearsonr(base_data[:, i], target_data[:, i])[0] + vectorized_pearson_r( + base_data, + target_data, + allowed_missing=0, + out=out, + ) class RollingSpearman(_RollingCorrelation): @@ -663,3 +663,74 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None): out[nanlocs] = nan return out + + +def vectorized_pearson_r(dependents, independents, allowed_missing, out=None): + """ + Compute Pearson's r between columns of ``dependents`` and ``independents``. + + Parameters + ---------- + dependents : np.array[N, M] + Array with columns of data to be correlated with ``independents``. + independents : np.array[N, M] or np.array[N, 1] + Independent variable(s) of the correlation. If a single column is + passed, it is broadcast to the shape of ``dependents``. + allowed_missing : int + Number of allowed missing (NaN) observations per column. Columns with + more than this many missing (NaN) observations in either ``dependents`` + or ``independents`` will output NaN as the correlation coefficient. + out : np.array[M] or None, optional + Output array into which to write results. 
If None, a new array is + created and returned. + + Returns + ------- + correlations : np.array[M] + Pearson correlation coefficients for each column of ``dependents``. + + See Also + -------- + :class:`zipline.pipeline.factors.RollingPearson` + :class:`zipline.pipeline.factors.RollingPearsonOfReturns` + """ + nan = np.nan + isnan = np.isnan + N, M = dependents.shape + + if out is None: + out = np.full(M, nan) + + if allowed_missing > 0: + # If we're handling nans robustly, we need to mask both arrays to + # locations where either was nan. + either_nan = isnan(dependents) | isnan(independents) + independents = np.where(either_nan, nan, independents) + dependents = np.where(either_nan, nan, dependents) + mean = nanmean + else: + # Otherwise, we can just use mean, which will give us a nan for any + # column where there's ever a nan. + mean = np.mean + + # Pearson R is Cov(X, Y) / Var(X) * Var(Y). + # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + ind_residual = independents - mean(independents, axis=0) + dep_residual = dependents - mean(dependents, axis=0) + + ind_variance = mean(ind_residual ** 2, axis=0) + dep_variance = mean(dep_residual ** 2, axis=0) + + covariances = mean(ind_residual * dep_residual, axis=0) + + evaluate( + 'where(mask, nan, cov / sqrt(ind_variance * dep_variance))', + local_dict={'cov': covariances, + 'mask': isnan(independents).sum(axis=0) > allowed_missing, + 'nan': np.nan, + 'ind_variance': ind_variance, + 'dep_variance': dep_variance}, + global_dict={}, + out=out, + ) + return out From 69a4c38dfc98a851fc53ba5fcc005db13255fb67 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 1 Jan 2018 20:51:55 -0500 Subject: [PATCH 2/4] DOC: Clarify docstring for vectorized_beta. 
--- zipline/pipeline/factors/statistical.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 8b055d13..42de86f6 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -582,8 +582,11 @@ def vectorized_beta(dependents, independent, allowed_missing, out=None): Independent variable of the regression allowed_missing : int Number of allowed missing (NaN) observations per column. Columns with - more than this many non-nan observations in both ``dependents`` and + more than this many missing (NaN) observations in either ``dependents`` or ``independents`` will output NaN as the regression coefficient. + out : np.array[M] or None, optional + Output array into which to write results. If None, a new array is + created and returned. Returns ------- From c3553940e3b91266bd0b86f0f9a3480f76dfd58c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 1 Jan 2018 21:17:18 -0500 Subject: [PATCH 3/4] STY: Flake8 fixes. --- tests/pipeline/test_statistical.py | 5 +---- zipline/pipeline/factors/statistical.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/pipeline/test_statistical.py b/tests/pipeline/test_statistical.py index 6a184f40..2456b6a9 100644 --- a/tests/pipeline/test_statistical.py +++ b/tests/pipeline/test_statistical.py @@ -1103,7 +1103,6 @@ class VectorizedCorrelationTestCase(ZiplineTestCase): [0, 0, 1, 1, 1], [0, 0, 0, 1, 1], [0, 0, 0, 0, 1]], dtype=bool) - num_nans = nan_grid.sum(axis=0) if nans == 'dependent' or nans == 'both': dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan if nans == 'independent' or nans == 'both': independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan expected = self.naive_columnwise_pearson(dependents, independents) for allowed_missing in list(range(7)) + [10000]: results = vectorized_pearson_r( dependents, independents, allowed_missing ) for i, result in enumerate(results): # column i has i + 1 missing values. 
- if i + 1 > allowed_missing: + if i + 1 > allowed_missing: self.assertTrue(np.isnan(result)) else: assert_equal(result, expected[i]) def test_broadcasting(self): - rand = np.random.RandomState(42) - _independent = as_column(np.array([1, 2, 3, 4, 5])) dependent = _independent * [2.5, 1.0, -3.5] diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 42de86f6..09286ba4 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -3,7 +3,6 @@ import numpy as np from numpy import broadcast_arrays from scipy.stats import ( linregress, - pearsonr, spearmanr, ) From cb61ba9650dbde1e0df9c74b305f8c3d43d082a3 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Tue, 2 Jan 2018 12:28:19 -0500 Subject: [PATCH 4/4] DOC: Fix incorrect formula in comment. --- zipline/pipeline/factors/statistical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py index 09286ba4..511e2680 100644 --- a/zipline/pipeline/factors/statistical.py +++ b/zipline/pipeline/factors/statistical.py @@ -715,7 +715,7 @@ def vectorized_pearson_r(dependents, independents, allowed_missing, out=None): # column where there's ever a nan. mean = np.mean - # Pearson R is Cov(X, Y) / Var(X) * Var(Y). + # Pearson R is Cov(X, Y) / (StdDev(X) * StdDev(Y)) # c.f. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient ind_residual = independents - mean(independents, axis=0) dep_residual = dependents - mean(dependents, axis=0)