# zipline/tests/pipeline/test_statistical.py
"""
Tests for statistical pipeline terms.
"""
import numpy as np
from numpy import (
arange,
full,
full_like,
nan,
where,
)
from pandas import (
DataFrame,
date_range,
Int64Index,
Timestamp,
)
from pandas.util.testing import assert_frame_equal
from scipy.stats import linregress, pearsonr, spearmanr
from empyrical.stats import beta_aligned as empyrical_beta
from zipline.assets import Equity, ExchangeInfo
from zipline.errors import IncompatibleTerms, NonExistentAssetInTimeFrame
from zipline.pipeline import CustomFactor, Pipeline
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.data.testing import TestingDataSet
from zipline.pipeline.domain import US_EQUITIES
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.factors import (
Returns,
RollingLinearRegressionOfReturns,
RollingPearsonOfReturns,
RollingSpearmanOfReturns,
SimpleBeta,
)
from zipline.pipeline.factors.statistical import (
vectorized_beta,
vectorized_pearson_r,
)
from zipline.pipeline.loaders.frame import DataFrameLoader
from zipline.pipeline.sentinels import NotSpecified
from zipline.testing import (
AssetID,
AssetIDPlusDay,
check_arrays,
make_alternating_boolean_array,
make_cascading_boolean_array,
parameter_space,
)
import zipline.testing.fixtures as zf
from zipline.testing.predicates import assert_equal
from zipline.utils.numpy_utils import (
as_column,
bool_dtype,
datetime64ns_dtype,
float64_dtype,
)
class StatisticalBuiltInsTestCase(zf.WithAssetFinder,
                                  zf.WithTradingCalendars,
                                  zf.ZiplineTestCase):
    """
    Tests for the built-in statistical factors
    (`RollingPearsonOfReturns`, `RollingSpearmanOfReturns`,
    `RollingLinearRegressionOfReturns`, `SimpleBeta`) run through a real
    `SimplePipelineEngine` over deterministic mock close prices.
    """
    # Three equities with sids 1-3; sids are 1-based, so column index for a
    # sid is `int(sid) - 1` in the expected-result arrays below.
    sids = ASSET_FINDER_EQUITY_SIDS = Int64Index([1, 2, 3])
    START_DATE = Timestamp('2015-01-31', tz='UTC')
    END_DATE = Timestamp('2015-03-01', tz='UTC')
    ASSET_FINDER_EQUITY_SYMBOLS = ('A', 'B', 'C')
    ASSET_FINDER_COUNTRY_CODE = 'US'

    @classmethod
    def init_class_fixtures(cls):
        """
        Build class-level fixtures: a date range, a deterministic 'close'
        price frame, a pipeline engine bound to that data, and the
        mask factors (plus their expected boolean results) shared by the
        masked/unmasked test variants.
        """
        super(StatisticalBuiltInsTestCase, cls).init_class_fixtures()
        day = cls.trading_calendar.day
        cls.dates = dates = date_range(
            '2015-02-01', '2015-02-28', freq=day, tz='UTC',
        )
        # Using these start and end dates because they are a contigous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        cls.start_date_index = start_date_index = 14
        cls.end_date_index = end_date_index = 18
        cls.pipeline_start_date = dates[start_date_index]
        cls.pipeline_end_date = dates[end_date_index]
        cls.num_days = num_days = end_date_index - start_date_index + 1
        sids = cls.sids
        cls.assets = assets = cls.asset_finder.retrieve_all(sids)
        # The "target" asset for correlations/regressions is the first one.
        cls.my_asset_column = my_asset_column = 0
        cls.my_asset = assets[my_asset_column]
        cls.num_assets = num_assets = len(assets)
        # Monotonically increasing prices: value = row * n_sids + col.
        cls.raw_data = raw_data = DataFrame(
            data=arange(len(dates) * len(sids), dtype=float64_dtype).reshape(
                len(dates), len(sids),
            ),
            index=dates,
            columns=assets,
        )
        # Using mock 'close' data here because the correlation and regression
        # built-ins use USEquityPricing.close as the input to their `Returns`
        # factors. Since there is no way to change that when constructing an
        # instance of these built-ins, we need to test with mock 'close' data
        # to most accurately reflect their true behavior and results.
        close_loader = DataFrameLoader(USEquityPricing.close, raw_data)
        cls.run_pipeline = SimplePipelineEngine(
            {USEquityPricing.close: close_loader}.__getitem__,
            cls.asset_finder,
            default_domain=US_EQUITIES,
        ).run_pipeline
        # Masks exercised by each test: a cascading (triangular) mask, an
        # alternating (checkerboard) mask, and no mask at all.
        cls.cascading_mask = \
            AssetIDPlusDay() < (sids[-1] + dates[start_date_index].day)
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.expected_no_mask_result = full(
            shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype,
        )

    @parameter_space(returns_length=[2, 3], correlation_length=[3, 4])
    def test_correlation_factors(self, returns_length, correlation_length):
        """
        Tests for the built-in factors `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns`.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length)
        masks = (self.cascading_mask, self.alternating_mask, NotSpecified)
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            pearson_factor = RollingPearsonOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )
            spearman_factor = RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )

            columns = {
                'pearson_factor': pearson_factor,
                'spearman_factor': spearman_factor,
            }
            pipeline = Pipeline(columns=columns)
            if mask is not NotSpecified:
                pipeline.add(mask, 'mask')

            results = run_pipeline(pipeline, start_date, end_date)
            pearson_results = results['pearson_factor'].unstack()
            spearman_results = results['spearman_factor'].unstack()
            if mask is not NotSpecified:
                mask_results = results['mask'].unstack()
                check_arrays(mask_results.values, expected_mask)

            # Run a separate pipeline that calculates returns starting
            # (correlation_length - 1) days prior to our start date. This is
            # because we need (correlation_length - 1) extra days of returns to
            # compute our expected correlations.
            results = run_pipeline(
                Pipeline(columns={'returns': returns}),
                dates[start_date_index - (correlation_length - 1)],
                dates[end_date_index],
            )
            returns_results = results['returns'].unstack()

            # On each day, calculate the expected correlation coefficients
            # between the asset we are interested in and each other asset. Each
            # correlation is calculated over `correlation_length` days.
            expected_pearson_results = full_like(pearson_results, nan)
            expected_spearman_results = full_like(spearman_results, nan)
            for day in range(num_days):
                todays_returns = returns_results.iloc[
                    day:day + correlation_length
                ]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    # sids are 1-based, so subtract 1 for the column index.
                    asset_column = int(asset) - 1
                    expected_pearson_results[day, asset_column] = pearsonr(
                        my_asset_returns, other_asset_returns,
                    )[0]
                    expected_spearman_results[day, asset_column] = spearmanr(
                        my_asset_returns, other_asset_returns,
                    )[0]

            # Masked-out cells must come back as NaN.
            expected_pearson_results = DataFrame(
                data=where(expected_mask, expected_pearson_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(pearson_results, expected_pearson_results)

            expected_spearman_results = DataFrame(
                data=where(expected_mask, expected_spearman_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(spearman_results, expected_spearman_results)

    @parameter_space(returns_length=[2, 3], regression_length=[3, 4])
    def test_regression_of_returns_factor(self,
                                          returns_length,
                                          regression_length):
        """
        Tests for the built-in factor `RollingLinearRegressionOfReturns`.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ['beta', 'alpha', 'r_value', 'p_value', 'stderr']

        returns = Returns(window_length=returns_length)
        masks = self.cascading_mask, self.alternating_mask, NotSpecified
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=returns_length,
                regression_length=regression_length,
                mask=mask,
            )

            columns = {
                output: getattr(regression_factor, output)
                for output in outputs
            }
            pipeline = Pipeline(columns=columns)
            if mask is not NotSpecified:
                pipeline.add(mask, 'mask')

            results = run_pipeline(pipeline, start_date, end_date)
            if mask is not NotSpecified:
                mask_results = results['mask'].unstack()
                check_arrays(mask_results.values, expected_mask)

            output_results = {}
            expected_output_results = {}
            for output in outputs:
                output_results[output] = results[output].unstack()
                expected_output_results[output] = full_like(
                    output_results[output], nan,
                )

            # Run a separate pipeline that calculates returns starting
            # (regression_length - 1) days prior to our start date. This is
            # because we need (regression_length - 1) extra days of returns to
            # compute our expected regressions.
            results = run_pipeline(
                Pipeline(columns={'returns': returns}),
                dates[start_date_index - (regression_length - 1)],
                dates[end_date_index],
            )
            returns_results = results['returns'].unstack()

            # On each day, calculate the expected regression results for Y ~ X
            # where Y is the asset we are interested in and X is each other
            # asset. Each regression is calculated over `regression_length`
            # days of data.
            for day in range(num_days):
                todays_returns = returns_results.iloc[
                    day:day + regression_length
                ]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    asset_column = int(asset) - 1
                    expected_regression_results = linregress(
                        y=other_asset_returns, x=my_asset_returns,
                    )
                    # `linregress` returns its outputs in the same order as
                    # `outputs` above, so unpack positionally.
                    for i, output in enumerate(outputs):
                        expected_output_results[output][day, asset_column] = \
                            expected_regression_results[i]

            for output in outputs:
                output_result = output_results[output]
                expected_output_result = DataFrame(
                    where(expected_mask, expected_output_results[output], nan),
                    index=dates[start_date_index:end_date_index + 1],
                    columns=assets,
                )
                assert_frame_equal(output_result, expected_output_result)

    def test_simple_beta_matches_regression(self):
        # `SimpleBeta` should agree with the beta output of the more general
        # `RollingLinearRegressionOfReturns` over the same window.
        run_pipeline = self.run_pipeline
        simple_beta = SimpleBeta(target=self.my_asset, regression_length=10)
        complex_beta = RollingLinearRegressionOfReturns(
            target=self.my_asset,
            returns_length=2,
            regression_length=10,
        ).beta
        pipe = Pipeline({'simple': simple_beta, 'complex': complex_beta})
        results = run_pipeline(
            pipe,
            self.pipeline_start_date,
            self.pipeline_end_date,
        )
        assert_equal(results['simple'], results['complex'], check_names=False)

    def test_simple_beta_allowed_missing_calculation(self):
        # `allowed_missing_percentage` is converted to an absolute count of
        # allowed missing observations out of `regression_length` (here 100).
        for percentage, expected in [(0.651, 65),
                                     (0.659, 65),
                                     (0.66, 66),
                                     (0.0, 0),
                                     (1.0, 100)]:
            beta = SimpleBeta(
                target=self.my_asset,
                regression_length=100,
                allowed_missing_percentage=percentage,
            )
            self.assertEqual(beta.params['allowed_missing_count'], expected)

    def test_correlation_and_regression_with_bad_asset(self):
        """
        Test that `RollingPearsonOfReturns`, `RollingSpearmanOfReturns` and
        `RollingLinearRegressionOfReturns` raise the proper exception when
        given a nonexistent target asset.
        """
        # sid 0 does not exist in this test fixture's asset finder.
        my_asset = Equity(
            0,
            exchange_info=ExchangeInfo('TEST', 'TEST FULL', 'US'),
        )
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        # This filter is arbitrary; the important thing is that we test each
        # factor both with and without a specified mask.
        my_asset_filter = AssetID().eq(1)

        for mask in (NotSpecified, my_asset_filter):
            pearson_factor = RollingPearsonOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=3,
                mask=mask,
            )
            spearman_factor = RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=3,
                mask=mask,
            )
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=3,
                regression_length=3,
                mask=mask,
            )

            with self.assertRaises(NonExistentAssetInTimeFrame):
                run_pipeline(
                    Pipeline(columns={'pearson_factor': pearson_factor}),
                    start_date,
                    end_date,
                )
            with self.assertRaises(NonExistentAssetInTimeFrame):
                run_pipeline(
                    Pipeline(columns={'spearman_factor': spearman_factor}),
                    start_date,
                    end_date,
                )
            with self.assertRaises(NonExistentAssetInTimeFrame):
                run_pipeline(
                    Pipeline(columns={'regression_factor': regression_factor}),
                    start_date,
                    end_date,
                )

    def test_require_length_greater_than_one(self):
        # A correlation/regression over a single observation is meaningless,
        # so a length of 1 must be rejected at construction time.
        my_asset = Equity(
            0,
            exchange_info=ExchangeInfo('TEST', 'TEST FULL', 'US'),
        )

        with self.assertRaises(ValueError):
            RollingPearsonOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=1,
            )

        with self.assertRaises(ValueError):
            RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=3,
                correlation_length=1,
            )

        with self.assertRaises(ValueError):
            RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=3,
                regression_length=1,
            )

    def test_simple_beta_input_validation(self):
        # Wrong type for `target`.
        with self.assertRaises(TypeError) as e:
            SimpleBeta(
                target="SPY",
                regression_length=100,
                allowed_missing_percentage=0.5,
            )
        result = str(e.exception)
        expected = (
            r"SimpleBeta\(\) expected a value of type"
            " .*Asset for argument 'target',"
            " but got str instead."
        )
        # NOTE(review): `assertRegexpMatches` is the deprecated alias of
        # `assertRegex`; kept here for Python 2 compatibility.
        self.assertRegexpMatches(result, expected)

        # `regression_length` below the minimum of 3.
        with self.assertRaises(ValueError) as e:
            SimpleBeta(
                target=self.my_asset,
                regression_length=1,
                allowed_missing_percentage=0.5,
            )
        result = str(e.exception)
        expected = (
            "SimpleBeta() expected a value greater than or equal to 3"
            " for argument 'regression_length', but got 1 instead."
        )
        self.assertEqual(result, expected)

        # `allowed_missing_percentage` outside [0.0, 1.0].
        with self.assertRaises(ValueError) as e:
            SimpleBeta(
                target=self.my_asset,
                regression_length=100,
                allowed_missing_percentage=50,
            )
        result = str(e.exception)
        expected = (
            "SimpleBeta() expected a value inclusively between 0.0 and 1.0 "
            "for argument 'allowed_missing_percentage', but got 50 instead."
        )
        self.assertEqual(result, expected)

    def test_simple_beta_target(self):
        # The `target` property should return the exact asset passed in.
        beta = SimpleBeta(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        self.assertIs(beta.target, self.my_asset)

    def test_simple_beta_repr(self):
        beta = SimpleBeta(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        result = repr(beta)
        # allowed_missing=25 because 50% of regression_length (50) is 25.
        expected = "SimpleBeta({}, length=50, allowed_missing=25)".format(
            self.my_asset,
        )
        self.assertEqual(result, expected)

    def test_simple_beta_graph_repr(self):
        beta = SimpleBeta(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        result = beta.graph_repr()
        # NOTE(review): the `.format(...)` call is a no-op here — the string
        # has no replacement fields; the 'A' symbol is hard-coded.
        expected = "SimpleBeta('A', 50, 25)".format(self.my_asset)
        self.assertEqual(result, expected)
class StatisticalMethodsTestCase(zf.WithSeededRandomPipelineEngine,
                                 zf.ZiplineTestCase):
    """
    Tests for the `Factor.pearsonr`, `Factor.spearmanr` and
    `Factor.linear_regression` methods, checking both that they are
    consistent with the corresponding built-in factors (when given a Slice
    target) and that they produce correct results when given another 2D
    factor as the target. Input data comes from the seeded-random pipeline
    engine fixture.
    """
    sids = ASSET_FINDER_EQUITY_SIDS = Int64Index([1, 2, 3])
    START_DATE = Timestamp('2015-01-31', tz='UTC')
    END_DATE = Timestamp('2015-03-01', tz='UTC')
    ASSET_FINDER_COUNTRY_CODE = 'US'
    SEEDED_RANDOM_PIPELINE_DEFAULT_DOMAIN = US_EQUITIES

    @classmethod
    def init_class_fixtures(cls):
        super(StatisticalMethodsTestCase, cls).init_class_fixtures()

        # Using these start and end dates because they are a contigous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        cls.dates = dates = cls.trading_days
        cls.start_date_index = start_date_index = 14
        cls.end_date_index = end_date_index = 18
        cls.pipeline_start_date = cls.trading_days[start_date_index]
        cls.pipeline_end_date = cls.trading_days[end_date_index]

        sids = cls.sids
        cls.assets = assets = cls.asset_finder.retrieve_all(sids)
        # Target asset for Slice-based tests is the first asset.
        cls.my_asset_column = my_asset_column = 0
        cls.my_asset = assets[my_asset_column]
        cls.num_days = num_days = end_date_index - start_date_index + 1
        cls.num_assets = num_assets = len(assets)

        # Mask factors and their expected boolean outputs, shared by the
        # masked test variants.
        cls.cascading_mask = \
            AssetIDPlusDay() < (sids[-1] + dates[start_date_index].day)
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.expected_no_mask_result = full(
            shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype,
        )

        # Random input for factors.
        cls.col = TestingDataSet.float_col

    @parameter_space(returns_length=[2, 3], correlation_length=[3, 4])
    def test_factor_correlation_methods(self,
                                        returns_length,
                                        correlation_length):
        """
        Ensure that `Factor.pearsonr` and `Factor.spearmanr` are consistent
        with the built-in factors `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns`.
        """
        my_asset = self.my_asset
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[my_asset]

        pearson = returns.pearsonr(
            target=returns_slice, correlation_length=correlation_length,
        )
        spearman = returns.spearmanr(
            target=returns_slice, correlation_length=correlation_length,
        )
        expected_pearson = RollingPearsonOfReturns(
            target=my_asset,
            returns_length=returns_length,
            correlation_length=correlation_length,
        )
        expected_spearman = RollingSpearmanOfReturns(
            target=my_asset,
            returns_length=returns_length,
            correlation_length=correlation_length,
        )

        # These built-ins construct their own Returns factor to use as inputs,
        # so the only way to set our own inputs is to do so after the fact.
        # This should not be done in practice. It is necessary here because we
        # want Returns to use our random data as an input, but by default it is
        # using USEquityPricing.close.
        expected_pearson.inputs = [returns, returns_slice]
        expected_spearman.inputs = [returns, returns_slice]

        columns = {
            'pearson': pearson,
            'spearman': spearman,
            'expected_pearson': expected_pearson,
            'expected_spearman': expected_spearman,
        }

        results = run_pipeline(Pipeline(columns=columns), start_date, end_date)
        pearson_results = results['pearson'].unstack()
        spearman_results = results['spearman'].unstack()
        expected_pearson_results = results['expected_pearson'].unstack()
        expected_spearman_results = results['expected_spearman'].unstack()

        assert_frame_equal(pearson_results, expected_pearson_results)
        assert_frame_equal(spearman_results, expected_spearman_results)

    def test_correlation_methods_bad_type(self):
        """
        Make sure we cannot call the Factor correlation methods on factors or
        slices that are not of float or int dtype.
        """
        # These are arbitrary for the purpose of this test.
        returns_length = 2
        correlation_length = 10

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[self.my_asset]

        class BadTypeFactor(CustomFactor):
            # datetime dtype is neither float nor int, so correlation
            # methods must reject it.
            inputs = []
            window_length = 1
            dtype = datetime64ns_dtype
            window_safe = True

            def compute(self, today, assets, out):
                pass

        bad_type_factor = BadTypeFactor()
        bad_type_factor_slice = bad_type_factor[self.my_asset]

        with self.assertRaises(TypeError):
            bad_type_factor.pearsonr(
                target=returns_slice, correlation_length=correlation_length,
            )
        with self.assertRaises(TypeError):
            bad_type_factor.spearmanr(
                target=returns_slice, correlation_length=correlation_length,
            )
        with self.assertRaises(TypeError):
            returns.pearsonr(
                target=bad_type_factor_slice,
                correlation_length=correlation_length,
            )
        with self.assertRaises(TypeError):
            returns.spearmanr(
                target=bad_type_factor_slice,
                correlation_length=correlation_length,
            )

    @parameter_space(returns_length=[2, 3], regression_length=[3, 4])
    def test_factor_regression_method(self, returns_length, regression_length):
        """
        Ensure that `Factor.linear_regression` is consistent with the built-in
        factor `RollingLinearRegressionOfReturns`.
        """
        my_asset = self.my_asset
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[my_asset]

        regression = returns.linear_regression(
            target=returns_slice, regression_length=regression_length,
        )
        expected_regression = RollingLinearRegressionOfReturns(
            target=my_asset,
            returns_length=returns_length,
            regression_length=regression_length,
        )

        # This built-in constructs its own Returns factor to use as an input,
        # so the only way to set our own input is to do so after the fact. This
        # should not be done in practice. It is necessary here because we want
        # Returns to use our random data as an input, but by default it is
        # using USEquityPricing.close.
        expected_regression.inputs = [returns, returns_slice]

        columns = {
            'regression': regression,
            'expected_regression': expected_regression,
        }

        results = run_pipeline(Pipeline(columns=columns), start_date, end_date)
        regression_results = results['regression'].unstack()
        expected_regression_results = results['expected_regression'].unstack()

        assert_frame_equal(regression_results, expected_regression_results)

    def test_regression_method_bad_type(self):
        """
        Make sure we cannot call the Factor linear regression method on factors
        or slices that are not of float or int dtype.
        """
        # These are arbitrary for the purpose of this test.
        returns_length = 2
        regression_length = 10

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[self.my_asset]

        class BadTypeFactor(CustomFactor):
            # datetime dtype is neither float nor int, so linear_regression
            # must reject it.
            window_length = 1
            inputs = []
            dtype = datetime64ns_dtype
            window_safe = True

            def compute(self, today, assets, out):
                pass

        bad_type_factor = BadTypeFactor()
        bad_type_factor_slice = bad_type_factor[self.my_asset]

        with self.assertRaises(TypeError):
            bad_type_factor.linear_regression(
                target=returns_slice, regression_length=regression_length,
            )
        with self.assertRaises(TypeError):
            returns.linear_regression(
                target=bad_type_factor_slice,
                regression_length=regression_length,
            )

    @parameter_space(correlation_length=[2, 3, 4])
    def test_factor_correlation_methods_two_factors(self, correlation_length):
        """
        Tests for `Factor.pearsonr` and `Factor.spearmanr` when passed another
        2D factor instead of a Slice.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # Ensure that the correlation methods cannot be called with two 2D
        # factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(2),
        )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.pearsonr(
                target=returns_masked_2, correlation_length=correlation_length,
            )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.spearmanr(
                target=returns_masked_2, correlation_length=correlation_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        pearson_factor = returns_5.pearsonr(
            target=returns_10, correlation_length=correlation_length,
        )
        spearman_factor = returns_5.spearmanr(
            target=returns_10, correlation_length=correlation_length,
        )

        columns = {
            'pearson_factor': pearson_factor,
            'spearman_factor': spearman_factor,
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)
        pearson_results = results['pearson_factor'].unstack()
        spearman_results = results['spearman_factor'].unstack()

        # Run a separate pipeline that calculates returns starting
        # (correlation_length - 1) days prior to our start date. This is
        # because we need (correlation_length - 1) extra days of returns to
        # compute our expected correlations.
        columns = {'returns_5': returns_5, 'returns_10': returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (correlation_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results['returns_5'].unstack()
        returns_10_results = results['returns_10'].unstack()

        # On each day, calculate the expected correlation coefficients
        # between each asset's 5 and 10 day rolling returns. Each correlation
        # is calculated over `correlation_length` days.
        expected_pearson_results = full_like(pearson_results, nan)
        expected_spearman_results = full_like(spearman_results, nan)
        for day in range(num_days):
            todays_returns_5 = returns_5_results.iloc[
                day:day + correlation_length
            ]
            todays_returns_10 = returns_10_results.iloc[
                day:day + correlation_length
            ]
            for asset, asset_returns_5 in todays_returns_5.iteritems():
                # sids are 1-based, so subtract 1 for the column index.
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_pearson_results[day, asset_column] = pearsonr(
                    asset_returns_5, asset_returns_10,
                )[0]
                expected_spearman_results[day, asset_column] = spearmanr(
                    asset_returns_5, asset_returns_10,
                )[0]

        expected_pearson_results = DataFrame(
            data=expected_pearson_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(pearson_results, expected_pearson_results)

        expected_spearman_results = DataFrame(
            data=expected_spearman_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(spearman_results, expected_spearman_results)

    @parameter_space(regression_length=[2, 3, 4])
    def test_factor_regression_method_two_factors(self, regression_length):
        """
        Tests for `Factor.linear_regression` when passed another 2D factor
        instead of a Slice.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ['beta', 'alpha', 'r_value', 'p_value', 'stderr']

        # Ensure that the `linear_regression` method cannot be called with two
        # 2D factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5, inputs=[self.col], mask=AssetID().eq(2),
        )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.linear_regression(
                target=returns_masked_2, regression_length=regression_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        regression_factor = returns_5.linear_regression(
            target=returns_10, regression_length=regression_length,
        )

        columns = {
            output: getattr(regression_factor, output)
            for output in outputs
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)

        output_results = {}
        expected_output_results = {}
        for output in outputs:
            output_results[output] = results[output].unstack()
            expected_output_results[output] = full_like(
                output_results[output], nan,
            )

        # Run a separate pipeline that calculates returns starting
        # (regression_length - 1) days prior to our start date. This is because
        # we need (regression_length - 1) extra days of returns to compute our
        # expected regressions.
        columns = {'returns_5': returns_5, 'returns_10': returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (regression_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results['returns_5'].unstack()
        returns_10_results = results['returns_10'].unstack()

        # On each day, for each asset, calculate the expected regression
        # results of Y ~ X where Y is the asset's rolling 5 day returns and X
        # is the asset's rolling 10 day returns. Each regression is calculated
        # over `regression_length` days of data.
        for day in range(num_days):
            todays_returns_5 = returns_5_results.iloc[
                day:day + regression_length
            ]
            todays_returns_10 = returns_10_results.iloc[
                day:day + regression_length
            ]
            for asset, asset_returns_5 in todays_returns_5.iteritems():
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_regression_results = linregress(
                    y=asset_returns_5, x=asset_returns_10,
                )
                # `linregress` outputs align positionally with `outputs`.
                for i, output in enumerate(outputs):
                    expected_output_results[output][day, asset_column] = \
                        expected_regression_results[i]

        for output in outputs:
            output_result = output_results[output]
            expected_output_result = DataFrame(
                expected_output_results[output],
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(output_result, expected_output_result)
class VectorizedBetaTestCase(zf.ZiplineTestCase):
    """
    Tests for `zipline.pipeline.factors.statistical.vectorized_beta`,
    cross-checked against `empyrical.stats.beta_aligned` and against known
    true betas of synthetically generated data.
    """

    def compare_with_empyrical(self, dependents, independent):
        """
        Assert that `vectorized_beta` (with effectively unlimited
        allowed_missing) matches empyrical's column-by-column beta, and
        return the vectorized result for further checks.
        """
        INFINITY = 1000000  # close enough
        result = vectorized_beta(
            dependents, independent, allowed_missing=INFINITY,
        )
        expected = np.array([
            empyrical_beta(dependents[:, i].ravel(), independent.ravel())
            for i in range(dependents.shape[1])
        ])
        assert_equal(result, expected, array_decimal=7)
        return result

    @parameter_space(seed=[1, 2, 3], __fail_fast=True)
    def test_matches_empyrical_beta_aligned(self, seed):
        # Clean (no-NaN) data: betas recovered from noisy linear data should
        # match both empyrical and the known true betas.
        rand = np.random.RandomState(seed)

        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        result = self.compare_with_empyrical(dependents, independent)
        self.assertTrue((np.abs(result - true_betas) < 0.01).all())

    @parameter_space(
        seed=[1, 2],
        pct_dependent=[0.3],
        pct_independent=[0.75],
        __fail_fast=True,
    )
    def test_nan_handling_matches_empyrical(self,
                                            seed,
                                            pct_dependent,
                                            pct_independent):
        rand = np.random.RandomState(seed)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5]) * 10
        independent = as_column(np.linspace(-5., 10., 50))
        noise = as_column(rand.uniform(-.1, .1, 50))
        dependents = 1.0 + true_betas * independent + noise

        # Fill 20% of the input arrays with nans randomly.
        dependents[rand.uniform(0, 1, dependents.shape) < pct_dependent] = nan
        independent[independent > np.nanmean(independent)] = nan

        # Sanity check that we actually inserted some nans.
        # NOTE(review): the dependents-side sanity check below is commented
        # out in the original source; left as-is.
        # self.assertTrue(np.count_nonzero(np.isnan(dependents)) > 0)
        self.assertTrue(np.count_nonzero(np.isnan(independent)) > 0)

        result = self.compare_with_empyrical(dependents, independent)

        # compare_with_empyrical uses requred_observations=0, so we shouldn't
        # have any nans in the output even though we had some in the input.
        self.assertTrue(not np.isnan(result).any())

    @parameter_space(nan_offset=[-1, 0, 1])
    def test_produce_nans_when_too_much_missing_data(self, nan_offset):
        rand = np.random.RandomState(42)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array.
        nan_grid = np.array([[1, 0, 0, 0, 0],
                             [1, 1, 0, 0, 0],
                             [1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 0],
                             [1, 1, 1, 1, 1]], dtype=bool)
        # Column i has (5 - i) nans: [5, 4, 3, 2, 1].
        num_nans = nan_grid.sum(axis=0)

        # Move the grid around in the parameterized tests. The positions
        # shouldn't matter.
        dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        for allowed_missing in range(7):
            results = vectorized_beta(dependents, independent, allowed_missing)
            for i, expected in enumerate(true_betas):
                result = results[i]
                # Beta should be nan exactly when the column's nan count
                # exceeds the allowed budget.
                expect_nan = num_nans[i] > allowed_missing
                true_beta = true_betas[i]
                if expect_nan:
                    self.assertTrue(np.isnan(result))
                else:
                    self.assertTrue(np.abs(result - true_beta) < 0.01)

    def test_allowed_missing_doesnt_double_count(self):
        # Test that allowed_missing only counts a row as missing one
        # observation if it's missing in both the dependent and independent
        # variable.
        rand = np.random.RandomState(42)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        # Each column has three nans in the grid.
        dependent_nan_grid = np.array([[0, 1, 1, 1, 0],
                                       [0, 0, 1, 1, 1],
                                       [1, 0, 0, 1, 1],
                                       [1, 1, 0, 0, 1],
                                       [1, 1, 1, 0, 0]], dtype=bool)
        # There are also two nans in the independent data.
        independent_nan_grid = np.array([[0],
                                         [0],
                                         [1],
                                         [1],
                                         [0]], dtype=bool)

        dependents[10:15][dependent_nan_grid] = np.nan
        independent[10:15][independent_nan_grid] = np.nan

        # With only two allowed missing values, everything should come up nan,
        # because column has at least 3 nans in the dependent data.
        result2 = vectorized_beta(dependents, independent, allowed_missing=2)
        assert_equal(np.isnan(result2),
                     np.array([True, True, True, True, True]))

        # With three allowed missing values, the first and last columns should
        # produce a value, because they have nans at the same rows where the
        # independent data has nans.
        result3 = vectorized_beta(dependents, independent, allowed_missing=3)
        assert_equal(np.isnan(result3),
                     np.array([False, True, True, True, False]))

        # With four allowed missing values, everything but the middle column
        # should produce a value. The middle column will have 5 nans because
        # the dependent nans have no overlap with the independent nans.
        result4 = vectorized_beta(dependents, independent, allowed_missing=4)
        assert_equal(np.isnan(result4),
                     np.array([False, False, True, False, False]))

        # With five allowed missing values, everything should produce a value.
        result5 = vectorized_beta(dependents, independent, allowed_missing=5)
        assert_equal(np.isnan(result5),
                     np.array([False, False, False, False, False]))
class VectorizedCorrelationTestCase(zf.ZiplineTestCase):
    """
    Tests for `zipline.pipeline.factors.statistical.vectorized_pearson_r`,
    cross-checked against a naive column-by-column scipy implementation.
    """

    def naive_columnwise_func(self, func, left, right):
        """
        Reference implementation: apply a scipy correlation function
        (`pearsonr`/`spearmanr`) column by column, dropping rows that are
        NaN in either input, and return the per-column coefficients.
        """
        out = np.empty_like(left[0])
        self.assertEqual(left.shape, right.shape)

        for col in range(left.shape[1]):
            left_col = left[:, col]
            right_col = right[:, col]
            # Drop any row that's missing in either column.
            missing = np.isnan(left_col) | np.isnan(right_col)
            left_col = left_col[~missing]
            right_col = right_col[~missing]
            r, pvalue = func(left_col, right_col)
            out[col] = r

        return out

    def naive_columnwise_pearson(self, left, right):
        return self.naive_columnwise_func(pearsonr, left, right)

    def naive_columnwise_spearman(self, left, right):
        return self.naive_columnwise_func(spearmanr, left, right)

    @parameter_space(
        seed=[1, 2, 42],
        nan_offset=[-1, 0, 1],
        nans=['dependent', 'independent', 'both'],
        __fail_fast=True,
    )
    def test_produce_nans_when_too_much_missing_data(self,
                                                     seed,
                                                     nans,
                                                     nan_offset):
        rand = np.random.RandomState(seed)
        betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rand.uniform(-2, 2, 30))
        dependents = 1.0 + betas * independents + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array. Column i gets i + 1 nans.
        nan_grid = np.array([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1],
                             [0, 0, 0, 0, 1]], dtype=bool)

        if nans == 'dependent' or nans == 'both':
            dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan
        if nans == 'independent' or nans == 'both':
            independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)

        for allowed_missing in list(range(7)) + [10000]:
            results = vectorized_pearson_r(
                dependents, independents, allowed_missing
            )
            for i, result in enumerate(results):
                # column i has i + 1 missing values.
                if i + 1 > allowed_missing:
                    self.assertTrue(np.isnan(result))
                else:
                    assert_equal(result, expected[i])

    def test_broadcasting(self):
        # An N x 1 independent column should broadcast against an N x k
        # dependent array exactly as if it had been tiled to N x k.
        _independent = as_column(np.array([1, 2, 3, 4, 5]))
        dependent = _independent * [2.5, 1.0, -3.5]

        def do_check(independent):
            result = vectorized_pearson_r(
                dependent, independent, allowed_missing=0
            )
            # Perfect positive/positive/negative linear relationships.
            assert_equal(result, np.array([1.0, 1.0, -1.0]))

        # We should get the same result from passing a N x 1 array or an N x 3
        # array with the column tiled 3 times.
        do_check(_independent)
        do_check(np.tile(_independent, 3))