"""
|
|
Tests for SimplePipelineEngine
|
|
"""
|
|
from __future__ import division
|
|
from collections import OrderedDict
|
|
from itertools import product
|
|
from operator import add, sub
|
|
from unittest import skipIf
|
|
|
|
from nose_parameterized import parameterized
|
|
import numpy as np
|
|
from numpy import (
|
|
arange,
|
|
array,
|
|
concatenate,
|
|
float32,
|
|
float64,
|
|
full,
|
|
full_like,
|
|
log,
|
|
nan,
|
|
tile,
|
|
where,
|
|
zeros,
|
|
)
|
|
from numpy.testing import assert_almost_equal
|
|
from pandas import (
|
|
Categorical,
|
|
DataFrame,
|
|
date_range,
|
|
Int64Index,
|
|
MultiIndex,
|
|
Series,
|
|
Timestamp,
|
|
)
|
|
from pandas.compat.chainmap import ChainMap
|
|
from pandas.util.testing import assert_frame_equal
|
|
from six import iteritems, itervalues
|
|
from toolz import merge
|
|
|
|
from zipline.assets.synthetic import make_rotating_equity_info
|
|
from zipline.errors import NoFurtherDataError
|
|
from zipline.lib.adjustment import MULTIPLY
|
|
from zipline.lib.labelarray import LabelArray
|
|
from zipline.pipeline import CustomFactor, Pipeline
|
|
from zipline.pipeline.data import (
|
|
Column, DataSet, EquityPricing, USEquityPricing,
|
|
)
|
|
from zipline.pipeline.data.testing import TestingDataSet
|
|
from zipline.pipeline.domain import (
|
|
EquitySessionDomain,
|
|
GENERIC,
|
|
JP_EQUITIES,
|
|
US_EQUITIES,
|
|
)
|
|
from zipline.pipeline.engine import SimplePipelineEngine
|
|
from zipline.pipeline.factors import (
|
|
AverageDollarVolume,
|
|
EWMA,
|
|
EWMSTD,
|
|
ExponentialWeightedMovingAverage,
|
|
ExponentialWeightedMovingStdDev,
|
|
MaxDrawdown,
|
|
SimpleMovingAverage,
|
|
)
|
|
from zipline.pipeline.filters import CustomFilter
|
|
from zipline.pipeline.loaders.equity_pricing_loader import (
|
|
EquityPricingLoader,
|
|
)
|
|
from zipline.pipeline.loaders.frame import DataFrameLoader
|
|
from zipline.pipeline.loaders.synthetic import (
|
|
PrecomputedLoader,
|
|
make_bar_data,
|
|
expected_bar_values_2d,
|
|
)
|
|
from zipline.pipeline.sentinels import NotSpecified
|
|
from zipline.pipeline.term import InputDates
|
|
from zipline.testing import (
|
|
AssetID,
|
|
AssetIDPlusDay,
|
|
check_arrays,
|
|
make_alternating_boolean_array,
|
|
make_cascading_boolean_array,
|
|
OpenPrice,
|
|
parameter_space,
|
|
product_upper_triangle,
|
|
)
|
|
import zipline.testing.fixtures as zf
|
|
from zipline.utils.exploding_object import NamedExplodingObject
|
|
from zipline.testing.core import create_simple_domain
|
|
from zipline.testing.predicates import assert_equal
|
|
from zipline.utils.memoize import lazyval
|
|
from zipline.utils.numpy_utils import bool_dtype, datetime64ns_dtype
|
|
from zipline.utils.pandas_utils import new_pandas, skip_pipeline_new_pandas
|
|
|
|
|
|
class RollingSumDifference(CustomFactor):
    window_length = 3
    inputs = [EquityPricing.open, EquityPricing.close]

    def compute(self, today, assets, out, open, close):
        out[:] = (open - close).sum(axis=0)


class MultipleOutputs(CustomFactor):
    window_length = 1
    inputs = [EquityPricing.open, EquityPricing.close]
    outputs = ['open', 'close']

    def compute(self, today, assets, out, open, close):
        out.open[:] = open
        out.close[:] = close


class OpenCloseSumAndDiff(CustomFactor):
    """
    Used for testing a CustomFactor with multiple outputs operating over a
    non-trivial window length.
    """
    inputs = [EquityPricing.open, EquityPricing.close]

    def compute(self, today, assets, out, open, close):
        out.sum_[:] = open.sum(axis=0) + close.sum(axis=0)
        out.diff[:] = open.sum(axis=0) - close.sum(axis=0)
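# Note: OpenCloseSumAndDiff deliberately leaves `outputs` and `window_length`
# unset; the tests that use it (e.g. test_custom_factor_outputs_parameter)
# supply both at instantiation time.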


def assert_multi_index_is_product(testcase, index, *levels):
    """Assert that a MultiIndex contains the product of `*levels`."""
    testcase.assertIsInstance(
        index, MultiIndex, "%s is not a MultiIndex" % index
    )
    testcase.assertEqual(set(index), set(product(*levels)))


class ColumnArgs(tuple):
    """A tuple of Columns that defines equivalence based on the order of the
    columns' DataSets, instead of the columns themselves. This is used when
    comparing the columns passed to a loader's load_adjusted_array method,
    since we want to assert that they are ordered by DataSet.
    """
    def __new__(cls, *cols):
        return super(ColumnArgs, cls).__new__(cls, cols)

    @classmethod
    def sorted_by_ds(cls, *cols):
        return cls(*sorted(cols, key=lambda col: col.dataset))

    def by_ds(self):
        return tuple(col.dataset for col in self)

    def __eq__(self, other):
        return set(self) == set(other) and self.by_ds() == other.by_ds()

    def __hash__(self):
        return hash(frozenset(self))
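# Illustrative note (not used by the tests): two ColumnArgs holding the same
# set of columns compare equal only if their DataSets appear in the same
# order, e.g. ColumnArgs(A.col, B.col) != ColumnArgs(B.col, A.col) even
# though the column sets match.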


class RecordingPrecomputedLoader(PrecomputedLoader):
    def __init__(self, *args, **kwargs):
        super(RecordingPrecomputedLoader, self).__init__(*args, **kwargs)

        self.load_calls = []

    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        self.load_calls.append(ColumnArgs(*columns))

        return super(RecordingPrecomputedLoader, self).load_adjusted_array(
            domain, columns, dates, sids, mask,
        )


class RollingSumSum(CustomFactor):
    def compute(self, today, assets, out, *inputs):
        assert len(self.inputs) == len(inputs)
        out[:] = sum(inputs).sum(axis=0)


class WithConstantInputs(zf.WithAssetFinder):
    asset_ids = ASSET_FINDER_EQUITY_SIDS = 1, 2, 3, 4
    START_DATE = Timestamp('2014-01-01', tz='utc')
    END_DATE = Timestamp('2014-03-01', tz='utc')
    ASSET_FINDER_COUNTRY_CODE = 'US'

    @classmethod
    def init_class_fixtures(cls):
        super(WithConstantInputs, cls).init_class_fixtures()
        cls.domain = create_simple_domain(
            start=cls.START_DATE,
            end=cls.END_DATE,
            country_code=cls.ASSET_FINDER_COUNTRY_CODE,
        )
        cls.constants = {
            # Every day, assume every stock starts at 2, goes down to 1,
            # goes up to 4, and finishes at 3.
            EquityPricing.low: 1,
            EquityPricing.open: 2,
            EquityPricing.close: 3,
            EquityPricing.high: 4,
        }

        cls.dates = date_range(
            cls.START_DATE,
            cls.END_DATE,
            freq='D',
            tz='UTC',
        )
        cls.loader = PrecomputedLoader(
            constants=cls.constants,
            dates=cls.dates,
            sids=cls.asset_ids,
        )
        cls.assets = cls.asset_finder.retrieve_all(cls.asset_ids)
        cls.engine = SimplePipelineEngine(
            lambda c: cls.loader,
            cls.asset_finder,
            default_domain=cls.domain
        )
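# With these constants, (open - close) is 2 - 3 = -1 on every day, so a
# RollingSumDifference over a window of length N evaluates to -N. The
# constant-input tests below rely on that arithmetic for their expected
# values.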


class ConstantInputTestCase(WithConstantInputs,
                            zf.WithAssetFinder,
                            zf.WithTradingCalendars,
                            zf.ZiplineTestCase):

    def test_bad_dates(self):
        p = Pipeline()

        msg = "start_date must be before or equal to end_date .*"
        with self.assertRaisesRegex(ValueError, msg):
            self.engine.run_pipeline(p, self.dates[2], self.dates[1])

    def test_fail_usefully_on_insufficient_data(self):
        class SomeFactor(CustomFactor):
            inputs = [EquityPricing.close]
            window_length = 10

            def compute(self, today, assets, out, closes):
                pass

        p = Pipeline(columns={'t': SomeFactor()})

        # self.dates[9] is the earliest date we should be able to compute.
        self.engine.run_pipeline(p, self.dates[9], self.dates[9])

        # We shouldn't be able to compute dates[8], since we only know about 8
        # prior dates, and we need a window length of 10.
        with self.assertRaises(NoFurtherDataError):
            self.engine.run_pipeline(p, self.dates[8], self.dates[8])

    def test_input_dates_provided_by_default(self):

        class TestFactor(CustomFactor):
            inputs = [InputDates(), EquityPricing.close]
            window_length = 10
            dtype = datetime64ns_dtype

            def compute(self, today, assets, out, dates, closes):
                first, last = dates[[0, -1], 0]
                assert last == today.asm8
                assert len(dates) == len(closes) == self.window_length
                out[:] = first

        p = Pipeline(columns={'t': TestFactor()})
        results = self.engine.run_pipeline(p, self.dates[9], self.dates[10])

        # All results are the same, so just grab one column.
        column = results.unstack().iloc[:, 0].values
        check_arrays(column, self.dates[:2].values)

    def test_same_day_pipeline(self):
        factor = AssetID()
        asset = self.asset_ids[0]
        p = Pipeline(columns={'f': factor}, screen=factor <= asset)

        # The crux of this is that when we run the pipeline for a single day
        # (i.e. start and end dates are the same) we should accurately get
        # data for the day prior.
        result = self.engine.run_pipeline(p, self.dates[1], self.dates[1])
        self.assertEqual(result['f'][0], 1.0)

    def test_screen(self):
        asset_ids = array(self.asset_ids)
        num_dates = 5
        dates = self.dates[10:10 + num_dates]

        factor = AssetID()
        for asset_id in asset_ids:
            p = Pipeline(columns={'f': factor}, screen=factor <= asset_id)
            result = self.engine.run_pipeline(p, dates[0], dates[-1])

            expected_sids = asset_ids[asset_ids <= asset_id]
            expected_assets = self.asset_finder.retrieve_all(expected_sids)
            expected_result = DataFrame(
                index=MultiIndex.from_product([dates, expected_assets]),
                data=tile(expected_sids.astype(float), [len(dates)]),
                columns=['f'],
            )

            assert_frame_equal(result, expected_result)

    def test_single_factor(self):
        assets = self.assets
        result_shape = (num_dates, num_assets) = (5, len(assets))
        dates = self.dates[10:10 + num_dates]

        factor = RollingSumDifference()
        expected_result = -factor.window_length

        # Since every asset will pass the screen, these should be equivalent.
        pipelines = [
            Pipeline(columns={'f': factor}),
            Pipeline(
                columns={'f': factor},
                screen=factor.eq(expected_result),
            ),
        ]

        for p in pipelines:
            result = self.engine.run_pipeline(p, dates[0], dates[-1])
            self.assertEqual(set(result.columns), {'f'})
            assert_multi_index_is_product(
                self, result.index, dates, assets
            )

            check_arrays(
                result['f'].unstack().values,
                full(result_shape, expected_result, dtype=float),
            )

    def test_multiple_rolling_factors(self):
        assets = self.assets

        shape = num_dates, num_assets = (5, len(assets))
        dates = self.dates[10:10 + num_dates]

        short_factor = RollingSumDifference(window_length=3)
        long_factor = RollingSumDifference(window_length=5)
        high_factor = RollingSumDifference(
            window_length=3,
            inputs=[EquityPricing.open, EquityPricing.high],
        )

        pipeline = Pipeline(
            columns={
                'short': short_factor,
                'long': long_factor,
                'high': high_factor,
            }
        )
        results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])

        self.assertEqual(set(results.columns), {'short', 'high', 'long'})
        assert_multi_index_is_product(
            self, results.index, dates, assets
        )

        # row-wise sum over an array whose values are all (2 - 3)
        check_arrays(
            results['short'].unstack().values,
            full(shape, -short_factor.window_length, dtype=float),
        )
        check_arrays(
            results['long'].unstack().values,
            full(shape, -long_factor.window_length, dtype=float),
        )
        # row-wise sum over an array whose values are all (2 - 4)
        check_arrays(
            results['high'].unstack().values,
            full(shape, -2 * high_factor.window_length, dtype=float),
        )

    def test_numeric_factor(self):
        constants = self.constants
        num_dates = 5
        dates = self.dates[10:10 + num_dates]
        high, low = EquityPricing.high, EquityPricing.low
        open, close = EquityPricing.open, EquityPricing.close

        high_minus_low = RollingSumDifference(inputs=[high, low])
        open_minus_close = RollingSumDifference(inputs=[open, close])
        avg = (high_minus_low + open_minus_close) / 2

        results = self.engine.run_pipeline(
            Pipeline(
                columns={
                    'high_low': high_minus_low,
                    'open_close': open_minus_close,
                    'avg': avg,
                },
            ),
            dates[0],
            dates[-1],
        )

        high_low_result = results['high_low'].unstack()
        expected_high_low = 3.0 * (constants[high] - constants[low])
        assert_frame_equal(
            high_low_result,
            DataFrame(expected_high_low, index=dates, columns=self.assets),
        )

        open_close_result = results['open_close'].unstack()
        expected_open_close = 3.0 * (constants[open] - constants[close])
        assert_frame_equal(
            open_close_result,
            DataFrame(expected_open_close, index=dates, columns=self.assets),
        )

        avg_result = results['avg'].unstack()
        expected_avg = (expected_high_low + expected_open_close) / 2.0
        assert_frame_equal(
            avg_result,
            DataFrame(expected_avg, index=dates, columns=self.assets),
        )

    def test_masked_factor(self):
        """
        Test that a CustomFactor computes the correct values when passed a
        mask. The mask/filter should be applied prior to computing any values,
        as opposed to computing the factor across the entire universe of
        assets. Any assets that are filtered out should be filled with missing
        values.
        """
        dates = self.dates[5:8]
        assets = self.assets
        asset_ids = self.asset_ids
        constants = self.constants
        num_dates = len(dates)
        num_assets = len(assets)
        open = EquityPricing.open
        close = EquityPricing.close

        factor1_value = constants[open]
        factor2_value = 3.0 * (constants[open] - constants[close])

        def create_expected_results(expected_value, mask):
            expected_values = where(mask, expected_value, nan)
            return DataFrame(expected_values, index=dates, columns=assets)

        cascading_mask = AssetIDPlusDay() < (asset_ids[-1] + dates[0].day)
        expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_dates, num_assets),
        )

        alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_dates, num_assets), first_value=False,
        )

        masks = cascading_mask, alternating_mask
        expected_mask_results = (
            expected_cascading_mask_result,
            expected_alternating_mask_result,
        )
        for mask, expected_mask in zip(masks, expected_mask_results):
            # Test running a pipeline with a single masked factor.
            columns = {'factor1': OpenPrice(mask=mask), 'mask': mask}
            pipeline = Pipeline(columns=columns)
            results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])

            mask_results = results['mask'].unstack()
            check_arrays(mask_results.values, expected_mask)

            factor1_results = results['factor1'].unstack()
            factor1_expected = create_expected_results(factor1_value,
                                                       mask_results)
            assert_frame_equal(factor1_results, factor1_expected)

            # Test running a pipeline with a second factor. This ensures that
            # adding another factor to the pipeline with a different window
            # length does not cause any unexpected behavior, especially when
            # both factors share the same mask.
            columns['factor2'] = RollingSumDifference(mask=mask)
            pipeline = Pipeline(columns=columns)
            results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])

            mask_results = results['mask'].unstack()
            check_arrays(mask_results.values, expected_mask)

            factor1_results = results['factor1'].unstack()
            factor2_results = results['factor2'].unstack()
            factor1_expected = create_expected_results(factor1_value,
                                                       mask_results)
            factor2_expected = create_expected_results(factor2_value,
                                                       mask_results)
            assert_frame_equal(factor1_results, factor1_expected)
            assert_frame_equal(factor2_results, factor2_expected)

    def test_rolling_and_nonrolling(self):
        open_ = EquityPricing.open
        close = EquityPricing.close
        volume = EquityPricing.volume

        # Test for thirty days up to the last day that we think all
        # the assets existed.
        dates_to_test = self.dates[-30:]

        constants = {
            open_: 1,
            close: 2,
            volume: 3,
        }
        loader = PrecomputedLoader(
            constants=constants,
            dates=self.dates,
            sids=self.asset_ids,
        )
        engine = SimplePipelineEngine(lambda column: loader, self.asset_finder)

        sumdiff = RollingSumDifference()

        result = engine.run_pipeline(
            Pipeline(
                columns={
                    'sumdiff': sumdiff,
                    'open': open_.latest,
                    'close': close.latest,
                    'volume': volume.latest,
                },
                domain=self.domain,
            ),
            dates_to_test[0],
            dates_to_test[-1]
        )
        self.assertIsNotNone(result)
        self.assertEqual(
            {'sumdiff', 'open', 'close', 'volume'},
            set(result.columns)
        )

        result_index = self.asset_ids * len(dates_to_test)
        result_shape = (len(result_index),)
        check_arrays(
            result['sumdiff'],
            Series(
                index=result_index,
                data=full(result_shape, -3, dtype=float),
            ),
        )

        for name, const in [('open', 1), ('close', 2), ('volume', 3)]:
            check_arrays(
                result[name],
                Series(
                    index=result_index,
                    data=full(result_shape, const, dtype=float),
                ),
            )

    def test_factor_with_single_output(self):
        """
        Test passing an `outputs` parameter of length 1 to a CustomFactor.
        """
        dates = self.dates[5:10]
        assets = self.assets
        num_dates = len(dates)
        open = EquityPricing.open
        open_values = [self.constants[open]] * num_dates
        open_values_as_tuple = [(self.constants[open],)] * num_dates

        single_output = OpenPrice(outputs=['open'])
        pipeline = Pipeline(
            columns={
                'open_instance': single_output,
                'open_attribute': single_output.open,
            },
        )
        results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])

        # The instance `single_output` itself will compute a numpy.recarray
        # when added as a column to our pipeline, so we expect its output
        # values to be 1-tuples.
        open_instance_expected = {
            asset: open_values_as_tuple for asset in assets
        }
        open_attribute_expected = {asset: open_values for asset in assets}

        for colname, expected_values in (
                ('open_instance', open_instance_expected),
                ('open_attribute', open_attribute_expected)):
            column_results = results[colname].unstack()
            expected_results = DataFrame(
                expected_values, index=dates, columns=assets, dtype=float64,
            )
            assert_frame_equal(column_results, expected_results)

    def test_factor_with_multiple_outputs(self):
        dates = self.dates[5:10]
        assets = self.assets
        asset_ids = self.asset_ids
        constants = self.constants
        num_dates = len(dates)
        num_assets = len(assets)
        open = EquityPricing.open
        close = EquityPricing.close

        def create_expected_results(expected_value, mask):
            expected_values = where(mask, expected_value, nan)
            return DataFrame(expected_values, index=dates, columns=assets)

        cascading_mask = AssetIDPlusDay() < (asset_ids[-1] + dates[0].day)
        expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_dates, num_assets),
        )

        alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_dates, num_assets), first_value=False,
        )

        expected_no_mask_result = full(
            shape=(num_dates, num_assets), fill_value=True, dtype=bool_dtype,
        )

        masks = cascading_mask, alternating_mask, NotSpecified
        expected_mask_results = (
            expected_cascading_mask_result,
            expected_alternating_mask_result,
            expected_no_mask_result,
        )
        for mask, expected_mask in zip(masks, expected_mask_results):
            open_price, close_price = MultipleOutputs(mask=mask)
            pipeline = Pipeline(
                columns={'open_price': open_price, 'close_price': close_price},
            )
            if mask is not NotSpecified:
                pipeline.add(mask, 'mask')

            results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])
            for colname, case_column in (('open_price', open),
                                         ('close_price', close)):
                if mask is not NotSpecified:
                    mask_results = results['mask'].unstack()
                    check_arrays(mask_results.values, expected_mask)
                output_results = results[colname].unstack()
                output_expected = create_expected_results(
                    constants[case_column], expected_mask,
                )
                assert_frame_equal(output_results, output_expected)

    def test_instance_of_factor_with_multiple_outputs(self):
        """
        Test adding a CustomFactor instance, which has multiple outputs, as a
        pipeline column directly. Its computed values should be tuples
        containing the computed values of each of its outputs.
        """
        dates = self.dates[5:10]
        assets = self.assets
        num_dates = len(dates)
        num_assets = len(assets)
        constants = self.constants

        open_values = [constants[EquityPricing.open]] * num_assets
        close_values = [constants[EquityPricing.close]] * num_assets
        expected_values = [list(zip(open_values, close_values))] * num_dates
        expected_results = DataFrame(
            expected_values, index=dates, columns=assets, dtype=float64,
        )

        multiple_outputs = MultipleOutputs()
        pipeline = Pipeline(columns={'instance': multiple_outputs})
        results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])
        instance_results = results['instance'].unstack()
        assert_frame_equal(instance_results, expected_results)

    def test_custom_factor_outputs_parameter(self):
        dates = self.dates[5:10]
        assets = self.assets
        num_dates = len(dates)
        num_assets = len(assets)
        constants = self.constants

        def create_expected_results(expected_value):
            expected_values = full(
                (num_dates, num_assets), expected_value, float64,
            )
            return DataFrame(expected_values, index=dates, columns=assets)

        for window_length in range(1, 3):
            sum_, diff = OpenCloseSumAndDiff(
                outputs=['sum_', 'diff'], window_length=window_length,
            )
            pipeline = Pipeline(columns={'sum_': sum_, 'diff': diff})
            results = self.engine.run_pipeline(pipeline, dates[0], dates[-1])
            for colname, op in ('sum_', add), ('diff', sub):
                output_results = results[colname].unstack()
                output_expected = create_expected_results(
                    op(
                        constants[EquityPricing.open] * window_length,
                        constants[EquityPricing.close] * window_length,
                    )
                )
                assert_frame_equal(output_results, output_expected)

    def test_loader_given_multiple_columns(self):

        class Loader1DataSet1(DataSet):
            col1 = Column(float)
            col2 = Column(float32)
            domain = self.domain

        class Loader1DataSet2(DataSet):
            col1 = Column(float32)
            col2 = Column(float32)
            domain = self.domain

        class Loader2DataSet(DataSet):
            col1 = Column(float32)
            col2 = Column(float32)
            domain = self.domain

        constants1 = {Loader1DataSet1.col1: 1,
                      Loader1DataSet1.col2: 2,
                      Loader1DataSet2.col1: 3,
                      Loader1DataSet2.col2: 4}

        loader1 = RecordingPrecomputedLoader(constants=constants1,
                                             dates=self.dates,
                                             sids=self.assets)
        constants2 = {Loader2DataSet.col1: 5,
                      Loader2DataSet.col2: 6}
        loader2 = RecordingPrecomputedLoader(constants=constants2,
                                             dates=self.dates,
                                             sids=self.assets)

        engine = SimplePipelineEngine(
            lambda column:
            loader2 if column.dataset == Loader2DataSet else loader1,
            self.asset_finder,
        )

        pipe_col1 = RollingSumSum(inputs=[Loader1DataSet1.col1,
                                          Loader1DataSet2.col1,
                                          Loader2DataSet.col1],
                                  window_length=2)

        pipe_col2 = RollingSumSum(inputs=[Loader1DataSet1.col2,
                                          Loader1DataSet2.col2,
                                          Loader2DataSet.col2],
                                  window_length=3)

        pipe_col3 = RollingSumSum(inputs=[Loader2DataSet.col1],
                                  window_length=3)

        columns = OrderedDict([
            ('pipe_col1', pipe_col1),
            ('pipe_col2', pipe_col2),
            ('pipe_col3', pipe_col3),
        ])
        result = engine.run_pipeline(
            Pipeline(columns=columns, domain=self.domain),
            self.dates[2],  # index is >= the largest window length - 1
            self.dates[-1]
        )
        min_window = min(pip_col.window_length
                         for pip_col in itervalues(columns))
        col_to_val = ChainMap(constants1, constants2)
        vals = {name: (sum(col_to_val[col] for col in pipe_col.inputs)
                       * pipe_col.window_length)
                for name, pipe_col in iteritems(columns)}

        index = MultiIndex.from_product([self.dates[2:], self.assets])

        def expected_for_col(col):
            val = vals[col]
            offset = columns[col].window_length - min_window
            return concatenate(
                [
                    full(offset * index.levshape[1], nan),
                    full(
                        (index.levshape[0] - offset) * index.levshape[1],
                        val,
                        float,
                    )
                ],
            )

        expected = DataFrame(
            data={col: expected_for_col(col) for col in vals},
            index=index,
            columns=columns,
        )

        assert_frame_equal(result, expected)

        self.assertEqual(set(loader1.load_calls),
                         {ColumnArgs.sorted_by_ds(Loader1DataSet1.col1,
                                                  Loader1DataSet2.col1),
                          ColumnArgs.sorted_by_ds(Loader1DataSet1.col2,
                                                  Loader1DataSet2.col2)})
        self.assertEqual(set(loader2.load_calls),
                         {ColumnArgs.sorted_by_ds(Loader2DataSet.col1,
                                                  Loader2DataSet.col2)})


# Use very large sids that don't fit in an int32 as a regression test against
# bugs with 32 bit integer overflow in the adjustment reader.
HUGE_SID = np.iinfo('int32').max + 1
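# np.iinfo('int32').max is 2 ** 31 - 1, so HUGE_SID is 2 ** 31, the smallest
# value that no longer fits in a signed 32-bit integer.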


class FrameInputTestCase(zf.WithAssetFinder,
                         zf.WithTradingCalendars,
                         zf.ZiplineTestCase):
    asset_ids = ASSET_FINDER_EQUITY_SIDS = range(HUGE_SID, HUGE_SID + 3)
    start = START_DATE = Timestamp('2015-01-01', tz='utc')
    end = END_DATE = Timestamp('2015-01-31', tz='utc')
    ASSET_FINDER_COUNTRY_CODE = 'US'

    @classmethod
    def init_class_fixtures(cls):
        super(FrameInputTestCase, cls).init_class_fixtures()
        cls.dates = date_range(
            cls.start,
            cls.end,
            freq=cls.trading_calendar.day,
            tz='UTC',
        )
        cls.assets = cls.asset_finder.retrieve_all(cls.asset_ids)
        cls.domain = US_EQUITIES

    @lazyval
    def base_mask(self):
        return self.make_frame(True)

    def make_frame(self, data):
        return DataFrame(data, columns=self.assets, index=self.dates)

    def test_compute_with_adjustments(self):
        dates, asset_ids = self.dates, self.asset_ids
        low, high = EquityPricing.low, EquityPricing.high
        apply_idxs = [3, 10, 16]

        def apply_date(idx, offset=0):
            return dates[apply_idxs[idx] + offset]

        adjustments = DataFrame.from_records(
            [
                dict(
                    kind=MULTIPLY,
                    sid=asset_ids[1],
                    value=2.0,
                    start_date=None,
                    end_date=apply_date(0, offset=-1),
                    apply_date=apply_date(0),
                ),
                dict(
                    kind=MULTIPLY,
                    sid=asset_ids[1],
                    value=3.0,
                    start_date=None,
                    end_date=apply_date(1, offset=-1),
                    apply_date=apply_date(1),
                ),
                dict(
                    kind=MULTIPLY,
                    sid=asset_ids[1],
                    value=5.0,
                    start_date=None,
                    end_date=apply_date(2, offset=-1),
                    apply_date=apply_date(2),
                ),
            ]
        )
        low_base = DataFrame(self.make_frame(30.0))
        low_loader = DataFrameLoader(low, low_base.copy(), adjustments=None)

        # Pre-apply inverse of adjustments to the baseline.
        high_base = DataFrame(self.make_frame(30.0))
        high_base.iloc[:apply_idxs[0], 1] /= 2.0
        high_base.iloc[:apply_idxs[1], 1] /= 3.0
        high_base.iloc[:apply_idxs[2], 1] /= 5.0

        high_loader = DataFrameLoader(high, high_base, adjustments)

        # Dispatch uses the concrete specializations, not generic columns.
        get_loader = {
            USEquityPricing.low: low_loader,
            USEquityPricing.high: high_loader
        }.__getitem__

        engine = SimplePipelineEngine(get_loader, self.asset_finder)

        for window_length in range(1, 4):
            low_mavg = SimpleMovingAverage(
                inputs=[EquityPricing.low],
                window_length=window_length,
            )
            high_mavg = SimpleMovingAverage(
                inputs=[EquityPricing.high],
                window_length=window_length,
            )
            bounds = product_upper_triangle(range(window_length, len(dates)))
            for start, stop in bounds:
                results = engine.run_pipeline(
                    Pipeline(
                        columns={'low': low_mavg, 'high': high_mavg},
                        domain=self.domain,
                    ),
                    dates[start],
                    dates[stop],
                )
                self.assertEqual(set(results.columns), {'low', 'high'})
                iloc_bounds = slice(start, stop + 1)  # +1 to include end date

                low_results = results.unstack()['low']
                assert_frame_equal(low_results, low_base.iloc[iloc_bounds])

                high_results = results.unstack()['high']
                assert_frame_equal(high_results, high_base.iloc[iloc_bounds])


class SyntheticBcolzTestCase(zf.WithAdjustmentReader,
                             zf.WithAssetFinder,
                             zf.ZiplineTestCase):
    first_asset_start = Timestamp('2015-04-01', tz='UTC')
    START_DATE = Timestamp('2015-01-01', tz='utc')
    END_DATE = Timestamp('2015-08-01', tz='utc')

    @classmethod
    def make_equity_info(cls):
        cls.equity_info = ret = make_rotating_equity_info(
            num_assets=6,
            first_start=cls.first_asset_start,
            frequency=cls.trading_calendar.day,
            periods_between_starts=4,
            asset_lifetime=8,
            exchange='NYSE',
        )
        return ret

    @classmethod
    def make_exchanges_info(cls, *args, **kwargs):
        return DataFrame({'exchange': ['NYSE'], 'country_code': ['US']})

    @classmethod
    def make_equity_daily_bar_data(cls, country_code, sids):
        return make_bar_data(
            cls.equity_info,
            cls.equity_daily_bar_days,
        )

    @classmethod
    def init_class_fixtures(cls):
        super(SyntheticBcolzTestCase, cls).init_class_fixtures()
        cls.all_asset_ids = cls.asset_finder.sids
        cls.last_asset_end = cls.equity_info['end_date'].max()
        cls.pipeline_loader = EquityPricingLoader.without_fx(
            cls.bcolz_equity_daily_bar_reader,
            cls.adjustment_reader,
        )
        cls.engine = SimplePipelineEngine(
            lambda c: cls.pipeline_loader,
            cls.asset_finder,
            default_domain=US_EQUITIES,
        )

    def write_nans(self, df):
        """
        Write nans to the locations in data corresponding to the (date, asset)
        pairs for which we wouldn't have data for `asset` on `date` in a
        backtest.

        Parameters
        ----------
        df : pd.DataFrame
            A DataFrame with a DatetimeIndex as index and an object index of
            Assets as columns.

        This means that we write nans for dates after an asset's end_date and
        **on or before** an asset's start_date. The asymmetry here is because,
        on the morning of an asset's first date, we haven't yet seen any
        trades for that asset, so we wouldn't be able to show any useful data
        to the user.
        """
        # Mask out with nans all the dates on which each asset didn't exist
        index = df.index
        min_, max_ = index[[0, -1]]
        for asset in df.columns:
            if asset.start_date >= min_:
                start = index.get_loc(asset.start_date, method='bfill')
                df.loc[:start + 1, asset] = nan  # +1 to overwrite start_date
            if asset.end_date <= max_:
                end = index.get_loc(asset.end_date)
                df.ix[end + 1:, asset] = nan  # +1 to *not* overwrite end_date
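        # Note: `start` and `end` are integer positions, so the slices above
        # are positional; `.ix` is presumably kept because this suite targets
        # older pandas (it was removed in later pandas versions).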

    def test_SMA(self):
        window_length = 5
        asset_ids = self.all_asset_ids
        dates = date_range(
            self.first_asset_start + self.trading_calendar.day,
            self.last_asset_end,
            freq=self.trading_calendar.day,
        )
        dates_to_test = dates[window_length:]

        SMA = SimpleMovingAverage(
            inputs=(EquityPricing.close,),
            window_length=window_length,
        )

        results = self.engine.run_pipeline(
            Pipeline(columns={'sma': SMA}),
            dates_to_test[0],
            dates_to_test[-1],
        )

        # Shift back the raw inputs by a trading day because we expect our
        # computed results to be computed using values anchored on the
        # **previous** day's data.
        expected_raw = DataFrame(
            expected_bar_values_2d(
                dates - self.trading_calendar.day,
                asset_ids,
                self.equity_info,
                'close',
            ),
        ).rolling(window_length, min_periods=1).mean().values

        expected = DataFrame(
            # Truncate off the extra rows needed to compute the SMAs.
            expected_raw[window_length:],
            index=dates_to_test,  # dates_to_test is dates[window_length:]
            columns=self.asset_finder.retrieve_all(asset_ids),
        )
        self.write_nans(expected)
        result = results['sma'].unstack()
        assert_frame_equal(result, expected)

    def test_drawdown(self):
        # The monotonically-increasing data produced by SyntheticDailyBarWriter
        # exercises two pathological cases for MaxDrawdown. The actual
        # computed results are pretty much useless (everything is either NaN
        # or zero), but verifying that we correctly handle those corner cases
        # is valuable.
        window_length = 5
        asset_ids = self.all_asset_ids
        dates = date_range(
            self.first_asset_start + self.trading_calendar.day,
            self.last_asset_end,
            freq=self.trading_calendar.day,
        )
        dates_to_test = dates[window_length:]

        drawdown = MaxDrawdown(
            inputs=(EquityPricing.close,),
            window_length=window_length,
        )

        results = self.engine.run_pipeline(
            Pipeline(columns={'drawdown': drawdown}),
            dates_to_test[0],
            dates_to_test[-1],
        )

        # We expect NaNs when the asset was undefined, otherwise 0 everywhere,
        # since the input is always increasing.
        expected = DataFrame(
            data=zeros((len(dates_to_test), len(asset_ids)), dtype=float),
            index=dates_to_test,
            columns=self.asset_finder.retrieve_all(asset_ids),
        )
        self.write_nans(expected)
        result = results['drawdown'].unstack()

        assert_frame_equal(expected, result)


class ParameterizedFactorTestCase(zf.WithAssetFinder,
                                  zf.WithTradingCalendars,
                                  zf.ZiplineTestCase):
    sids = ASSET_FINDER_EQUITY_SIDS = Int64Index([1, 2, 3])
    START_DATE = Timestamp('2015-01-31', tz='UTC')
    END_DATE = Timestamp('2015-03-01', tz='UTC')
    ASSET_FINDER_COUNTRY_CODE = '??'

    @classmethod
    def init_class_fixtures(cls):
        super(ParameterizedFactorTestCase, cls).init_class_fixtures()
        day = cls.trading_calendar.day

        cls.dates = dates = date_range(
            '2015-02-01',
            '2015-02-28',
            freq=day,
            tz='UTC',
        )
        sids = cls.sids

        cls.raw_data = DataFrame(
            data=arange(len(dates) * len(sids), dtype=float).reshape(
                len(dates), len(sids),
            ),
            index=dates,
            columns=cls.asset_finder.retrieve_all(sids),
        )
        cls.raw_data_with_nans = cls.raw_data.where((cls.raw_data % 2) != 0)

        open_loader = DataFrameLoader(
            EquityPricing.open,
            cls.raw_data_with_nans,
        )
        close_loader = DataFrameLoader(EquityPricing.close, cls.raw_data)
        volume_loader = DataFrameLoader(
            EquityPricing.volume,
            cls.raw_data * 2,
        )

        loader_map = {
            EquityPricing.open: open_loader,
            EquityPricing.close: close_loader,
            EquityPricing.volume: volume_loader,
        }

        def get_loader(c):
            return loader_map[c.unspecialize()]

        cls.engine = SimplePipelineEngine(
            get_loader,
            cls.asset_finder,
            default_domain=EquitySessionDomain(cls.dates, '??'),
        )

    def expected_ewma(self, window_length, decay_rate):
        alpha = 1 - decay_rate
        span = (2 / alpha) - 1

        # XXX: This is a comically inefficient way to compute a windowed EWMA.
        # Don't use it outside of testing. We're using rolling-apply of an
        # ewma (which is itself a rolling-window function) because we only
        # want to look at ``window_length`` rows at a time.
        return self.raw_data.rolling(window_length).apply(
            lambda subarray: (DataFrame(subarray)
                              .ewm(span=span)
                              .mean()
                              .values[-1])
        )[window_length:]

    def expected_ewmstd(self, window_length, decay_rate):
        alpha = 1 - decay_rate
        span = (2 / alpha) - 1

        # XXX: This is a comically inefficient way to compute a windowed
        # EWMSTD. Don't use it outside of testing. We're using rolling-apply
        # of an ewma (which is itself a rolling-window function) because we
        # only want to look at ``window_length`` rows at a time.
        return self.raw_data.rolling(window_length).apply(
            lambda subarray: (DataFrame(subarray)
                              .ewm(span=span)
                              .std()
                              .values[-1])
        )[window_length:]
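    # Note on the `span` computed above: pandas parameterizes ``ewm`` by span
    # with ``alpha = 2 / (span + 1)``, so setting ``span = (2 / alpha) - 1``
    # with ``alpha = 1 - decay_rate`` makes the pandas weighting match the
    # decay_rate passed to EWMA/EWMSTD in the test below.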

    @parameterized.expand([
        (3,),
        (5,),
    ])
    def test_ewm_stats(self, window_length):

        def ewma_name(decay_rate):
            return 'ewma_%s' % decay_rate

        def ewmstd_name(decay_rate):
            return 'ewmstd_%s' % decay_rate

        decay_rates = [0.25, 0.5, 0.75]
        ewmas = {
            ewma_name(decay_rate): EWMA(
                inputs=(EquityPricing.close,),
                window_length=window_length,
                decay_rate=decay_rate,
            )
            for decay_rate in decay_rates
        }

        ewmstds = {
            ewmstd_name(decay_rate): EWMSTD(
                inputs=(EquityPricing.close,),
                window_length=window_length,
                decay_rate=decay_rate,
            )
            for decay_rate in decay_rates
        }

        all_results = self.engine.run_pipeline(
            Pipeline(columns=merge(ewmas, ewmstds)),
            self.dates[window_length],
            self.dates[-1],
        )

        for decay_rate in decay_rates:
            ewma_result = all_results[ewma_name(decay_rate)].unstack()
            ewma_expected = self.expected_ewma(window_length, decay_rate)
            assert_frame_equal(ewma_result, ewma_expected)

            ewmstd_result = all_results[ewmstd_name(decay_rate)].unstack()
            ewmstd_expected = self.expected_ewmstd(window_length, decay_rate)
            assert_frame_equal(ewmstd_result, ewmstd_expected)

    @staticmethod
    def decay_rate_to_span(decay_rate):
        alpha = 1 - decay_rate
        return (2 / alpha) - 1

    @staticmethod
    def decay_rate_to_com(decay_rate):
        alpha = 1 - decay_rate
        return (1 / alpha) - 1

    @staticmethod
    def decay_rate_to_halflife(decay_rate):
        return log(.5) / log(decay_rate)
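    # The three conversions above invert the standard ewm parameterizations:
    # with ``alpha = 1 - decay_rate``, ``alpha = 2 / (span + 1)`` gives
    # ``span = 2 / alpha - 1`` and ``alpha = 1 / (1 + com)`` gives
    # ``com = 1 / alpha - 1``; a halflife ``h`` satisfies
    # ``decay_rate ** h == 0.5``, hence ``h = log(0.5) / log(decay_rate)``.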

    def ewm_cases():
        return product([EWMSTD, EWMA], [3, 5, 10])

    @parameterized.expand(ewm_cases())
    def test_from_span(self, type_, span):
        from_span = type_.from_span(
            inputs=[EquityPricing.close],
            window_length=20,
            span=span,
        )
        implied_span = self.decay_rate_to_span(from_span.params['decay_rate'])
        assert_almost_equal(span, implied_span)

    @parameterized.expand(ewm_cases())
    def test_from_halflife(self, type_, halflife):
        from_hl = EWMA.from_halflife(
            inputs=[EquityPricing.close],
            window_length=20,
            halflife=halflife,
        )
        implied_hl = self.decay_rate_to_halflife(from_hl.params['decay_rate'])
        assert_almost_equal(halflife, implied_hl)

    @parameterized.expand(ewm_cases())
    def test_from_com(self, type_, com):
        from_com = EWMA.from_center_of_mass(
            inputs=[EquityPricing.close],
            window_length=20,
            center_of_mass=com,
        )
        implied_com = self.decay_rate_to_com(from_com.params['decay_rate'])
        assert_almost_equal(com, implied_com)

    del ewm_cases

    def test_ewm_aliasing(self):
        self.assertIs(ExponentialWeightedMovingAverage, EWMA)
        self.assertIs(ExponentialWeightedMovingStdDev, EWMSTD)
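    # Note for test_dollar_volume below: close prices are served from
    # cls.raw_data and volume from cls.raw_data * 2 (see the loaders set up in
    # init_class_fixtures), and dollar volume is close * volume, so a one-day
    # average dollar volume works out to ``raw_data ** 2 * 2``.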

    def test_dollar_volume(self):
        results = self.engine.run_pipeline(
            Pipeline(
                columns={
                    'dv1': AverageDollarVolume(window_length=1),
                    'dv5': AverageDollarVolume(window_length=5),
                    'dv1_nan': AverageDollarVolume(
                        window_length=1,
                        inputs=[EquityPricing.open, EquityPricing.volume],
                    ),
                    'dv5_nan': AverageDollarVolume(
                        window_length=5,
                        inputs=[EquityPricing.open, EquityPricing.volume],
                    ),
                }
            ),
            self.dates[5],
            self.dates[-1],
        )

        expected_1 = (self.raw_data[5:] ** 2) * 2
        assert_frame_equal(results['dv1'].unstack(), expected_1)

        expected_5 = ((self.raw_data ** 2) * 2).rolling(5).mean()[5:]
        assert_frame_equal(results['dv5'].unstack(), expected_5)

        # The following two use EquityPricing.open and .volume as inputs.
        # The former uses self.raw_data_with_nans, and the latter uses
        # .raw_data * 2. Thus we multiply instead of squaring as above.
        expected_1_nan = (self.raw_data_with_nans[5:]
                          * self.raw_data[5:] * 2).fillna(0)
        assert_frame_equal(results['dv1_nan'].unstack(), expected_1_nan)

        expected_5_nan = ((self.raw_data_with_nans * self.raw_data * 2)
                          .fillna(0)
                          .rolling(5).mean()
                          [5:])

        assert_frame_equal(results['dv5_nan'].unstack(), expected_5_nan)


class StringColumnTestCase(zf.WithSeededRandomPipelineEngine,
                           zf.ZiplineTestCase):
    ASSET_FINDER_COUNTRY_CODE = 'US'
    SEEDED_RANDOM_PIPELINE_DEFAULT_DOMAIN = US_EQUITIES

    @skipIf(new_pandas, skip_pipeline_new_pandas)
    def test_string_classifiers_produce_categoricals(self):
        """
        Test that string-based classifiers produce pandas categoricals as
        their outputs.
        """
        col = TestingDataSet.categorical_col
        pipe = Pipeline(columns={'c': col.latest})

        run_dates = self.trading_days[-10:]
        start_date, end_date = run_dates[[0, -1]]

        result = self.run_pipeline(pipe, start_date, end_date)
        assert isinstance(result.c.values, Categorical)

        expected_raw_data = self.raw_expected_values(
            col,
            start_date,
            end_date,
        )
        expected_labels = LabelArray(expected_raw_data, col.missing_value)
        expected_final_result = expected_labels.as_categorical_frame(
            index=run_dates,
            columns=self.asset_finder.retrieve_all(self.asset_finder.sids),
        )
        assert_frame_equal(result.c.unstack(), expected_final_result)


class WindowSafetyPropagationTestCase(zf.WithSeededRandomPipelineEngine,
                                      zf.ZiplineTestCase):
    ASSET_FINDER_COUNTRY_CODE = 'US'
    SEEDED_RANDOM_PIPELINE_DEFAULT_DOMAIN = US_EQUITIES
    SEEDED_RANDOM_PIPELINE_SEED = 5

    def test_window_safety_propagation(self):
        dates = self.trading_days[-30:]
        start_date, end_date = dates[[-10, -1]]

        col = TestingDataSet.float_col
        pipe = Pipeline(
            columns={
                'average_of_rank_plus_one': SimpleMovingAverage(
                    inputs=[col.latest.rank() + 1],
                    window_length=10,
                ),
                'average_of_aliased_rank_plus_one': SimpleMovingAverage(
                    inputs=[col.latest.rank().alias('some_alias') + 1],
                    window_length=10,
                ),
                'average_of_rank_plus_one_aliased': SimpleMovingAverage(
                    inputs=[(col.latest.rank() + 1).alias('some_alias')],
                    window_length=10,
                ),
            }
        )
        results = self.run_pipeline(pipe, start_date, end_date).unstack()

        expected_ranks = DataFrame(
            self.raw_expected_values(
                col,
                dates[-19],
                dates[-1],
            ),
            index=dates[-19:],
            columns=self.asset_finder.retrieve_all(
                self.ASSET_FINDER_EQUITY_SIDS,
            )
        ).rank(axis='columns')
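        # 19 rows = 10 requested output sessions plus window_length - 1 = 9
        # extra leading rows consumed by the 10-day moving average below.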

        # All three expressions should be equivalent and evaluate to this.
        expected_result = (
            (expected_ranks + 1)
            .rolling(10)
            .mean()
            .dropna(how='any')
        )

        for colname in results.columns.levels[0]:
            assert_equal(expected_result, results[colname])


class PopulateInitialWorkspaceTestCase(WithConstantInputs,
                                       zf.WithAssetFinder,
                                       zf.WithTradingCalendars,
                                       zf.ZiplineTestCase):

    @parameter_space(window_length=[3, 5], pipeline_length=[5, 10])
    def test_populate_initial_workspace(self, window_length, pipeline_length):
        column = EquityPricing.low
        base_term = column.latest

        # Take a Z-Score here so that the precomputed term is window-safe. The
        # z-score will never actually get computed because we swap it out.
        precomputed_term = (base_term.zscore()).alias('precomputed_term')

        # A term that has `precomputed_term` as an input.
        depends_on_precomputed_term = precomputed_term + 1
        # A term that requires a window of `precomputed_term`.
        depends_on_window_of_precomputed_term = SimpleMovingAverage(
            inputs=[precomputed_term],
            window_length=window_length,
        )

        precomputed_term_with_window = SimpleMovingAverage(
            inputs=(column,),
            window_length=window_length,
        ).alias('precomputed_term_with_window')
        depends_on_precomputed_term_with_window = (
            precomputed_term_with_window + 1
        )

        column_value = self.constants[column]
        precomputed_term_value = -column_value
        precomputed_term_with_window_value = -(column_value + 1)

        def populate_initial_workspace(initial_workspace,
                                       root_mask_term,
                                       execution_plan,
                                       dates,
                                       assets):
            def shape_for_term(term):
                ndates = len(execution_plan.mask_and_dates_for_term(
                    term,
                    root_mask_term,
                    initial_workspace,
                    dates,
                )[1])
                nassets = len(assets)
                return (ndates, nassets)

            ws = initial_workspace.copy()
            ws[precomputed_term] = full(
                shape_for_term(precomputed_term),
                precomputed_term_value,
                dtype=float64,
            )
            ws[precomputed_term_with_window] = full(
                shape_for_term(precomputed_term_with_window),
                precomputed_term_with_window_value,
                dtype=float64,
            )
            return ws

        def dispatcher(c):
            self.assertIsNot(
                c, column, "Shouldn't need to dispatch precomputed term input!"
            )
            return self.loader

        engine = SimplePipelineEngine(
            dispatcher,
            self.asset_finder,
            populate_initial_workspace=populate_initial_workspace,
        )

        results = engine.run_pipeline(
            Pipeline({
                'precomputed_term': precomputed_term,
                'precomputed_term_with_window': precomputed_term_with_window,
                'depends_on_precomputed_term': depends_on_precomputed_term,
                'depends_on_precomputed_term_with_window':
                    depends_on_precomputed_term_with_window,
                'depends_on_window_of_precomputed_term':
                    depends_on_window_of_precomputed_term,
            }, domain=self.domain),
            self.dates[-pipeline_length],
            self.dates[-1],
        )

        assert_equal(
            results['precomputed_term'].values,
            full_like(
                results['precomputed_term'],
                precomputed_term_value,
            ),
        )
        assert_equal(
            results['precomputed_term_with_window'].values,
            full_like(
                results['precomputed_term_with_window'],
                precomputed_term_with_window_value,
            ),
        )
        assert_equal(
            results['depends_on_precomputed_term'].values,
            full_like(
                results['depends_on_precomputed_term'],
                precomputed_term_value + 1,
            ),
        )
        assert_equal(
            results['depends_on_precomputed_term_with_window'].values,
            full_like(
                results['depends_on_precomputed_term_with_window'],
                precomputed_term_with_window_value + 1,
            ),
        )
        assert_equal(
            results['depends_on_window_of_precomputed_term'].values,
            full_like(
                results['depends_on_window_of_precomputed_term'],
                precomputed_term_value,
            ),
        )


class ChunkedPipelineTestCase(zf.WithSeededRandomPipelineEngine,
                              zf.ZiplineTestCase):

    PIPELINE_START_DATE = Timestamp('2006-01-05', tz='UTC')
    END_DATE = Timestamp('2006-12-29', tz='UTC')
    ASSET_FINDER_COUNTRY_CODE = 'US'

    def test_run_chunked_pipeline(self):
        """
        Test that running a pipeline in chunks produces the same result as if
        it were run all at once.
        """

        pipe = Pipeline(
            columns={
                'float': TestingDataSet.float_col.latest,
                'custom_factor': SimpleMovingAverage(
                    inputs=[TestingDataSet.float_col],
                    window_length=10,
                ),
            },
            domain=US_EQUITIES,
        )

        if not new_pandas:
            # Categoricals only work on old pandas.
            pipe.add(TestingDataSet.categorical_col.latest, 'categorical')

        pipeline_result = self.run_pipeline(
            pipe,
            start_date=self.PIPELINE_START_DATE,
            end_date=self.END_DATE,
        )
        chunked_result = self.run_chunked_pipeline(
            pipeline=pipe,
            start_date=self.PIPELINE_START_DATE,
            end_date=self.END_DATE,
            chunksize=22
        )
        self.assertTrue(chunked_result.equals(pipeline_result))

    def test_concatenate_empty_chunks(self):
        # Test that we correctly handle concatenating chunked pipelines when
        # some of the chunks are empty. This is slightly tricky b/c pandas
        # DataFrames lose dtype information when they're empty.

        class FalseOnOddMonths(CustomFilter):
            """Filter that returns False for all assets during odd months.
            """
            inputs = ()
            window_length = 1

            def compute(self, today, assets, out):
                out[:] = (today.month % 2 == 0)

        pipe = Pipeline(
            columns={
                'float': TestingDataSet.float_col.latest,
                'bool': TestingDataSet.bool_col.latest,
            },
            # Define a screen that's False for all assets a significant
            # portion of the time.
            screen=FalseOnOddMonths(),
            domain=US_EQUITIES,
        )

        if not new_pandas:
            # Categoricals only work on old pandas.
            pipe.add(TestingDataSet.categorical_col.latest, 'categorical')

        self.run_chunked_pipeline(
            pipeline=pipe,
            start_date=self.PIPELINE_START_DATE,
            end_date=self.END_DATE,
            # Make chunksize small enough that some chunks are guaranteed to
            # have no assets pass the screen.
            chunksize=5,
        )


class MaximumRegressionTest(zf.WithSeededRandomPipelineEngine,
                            zf.ZiplineTestCase):
    ASSET_FINDER_EQUITY_SIDS = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    def test_no_groupby_maximum(self):
        # This is a regression test for a bug where factor.top(1) would fail
        # when not passed a groupby parameter.

        factor = TestingDataSet.float_col.latest
        maximum = factor.top(1)
        pipe = Pipeline(
            {'factor': factor, 'maximum': maximum},
            domain=EquitySessionDomain(
                self.trading_days,
                self.ASSET_FINDER_COUNTRY_CODE,
            ),
        )
        result = self.run_pipeline(
            pipe, self.trading_days[-5], self.trading_days[-1]
        )

        # We should have one maximum every day.
        maxes_per_day = result.groupby(level=0)['maximum'].sum()
        self.assertTrue((maxes_per_day == 1).all())

        # The maximum computed by pipeline should match the maximum computed
        # by doing a groupby in pandas.
        groupby_max = result.groupby(level=0).factor.max()
        pipeline_max = (result.factor[result.maximum]
                        .reset_index(level=1, drop=True))

        assert_equal(groupby_max, pipeline_max)


class ResolveDomainTestCase(zf.ZiplineTestCase):

    def test_resolve_domain(self):
        # we need to pass a get_loader and an asset_finder to construct
        # SimplePipelineEngine, but do not expect to use them
        get_loader = NamedExplodingObject(
            'self._get_loader',
            'SimplePipelineEngine does not currently depend on get_loader '
            'at construction time. Update this test if it now does.'
        )
        asset_finder = NamedExplodingObject(
            'self._finder',
            'SimplePipelineEngine does not currently depend on asset_finder '
            'at construction time. Update this test if it now does.'
        )
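        # NamedExplodingObject stand-ins raise as soon as any attribute is
        # accessed, so constructing the engines below fails loudly if
        # SimplePipelineEngine actually touches either argument.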

        engine_generic = SimplePipelineEngine(
            get_loader, asset_finder, default_domain=GENERIC
        )
        engine_jp = SimplePipelineEngine(
            get_loader, asset_finder, default_domain=JP_EQUITIES
        )

        pipe_generic = Pipeline()
        pipe_us = Pipeline(domain=US_EQUITIES)

        # the engine should resolve a pipeline that already has a domain
        # to that domain
        self.assertIs(
            engine_jp.resolve_domain(pipe_us),
            US_EQUITIES
        )

        # the engine should resolve a pipeline without a domain to the
        # engine's default
        self.assertIs(
            engine_jp.resolve_domain(pipe_generic),
            JP_EQUITIES
        )

        # a generic engine should resolve to the pipeline's domain
        # if it has one
        self.assertIs(
            engine_generic.resolve_domain(pipe_us),
            US_EQUITIES
        )

        # an engine with a default of GENERIC should raise a ValueError when
        # trying to infer a pipeline whose domain is also GENERIC
        with self.assertRaises(ValueError):
            engine_generic.resolve_domain(pipe_generic)

        # infer domain from the column if the pipeline and engine have
        # a GENERIC domain
        pipe = Pipeline({'close': USEquityPricing.close.latest})
        self.assertIs(
            engine_generic.resolve_domain(pipe),
            US_EQUITIES,
        )