"""
|
|
Tests for the blaze interface to the pipeline api.
|
|
"""
|
|
from __future__ import division
|
|
|
|
from collections import OrderedDict
|
|
from datetime import timedelta, time
|
|
from functools import partial
|
|
from itertools import product, chain
|
|
from unittest import skipIf
|
|
import warnings
|
|
|
|
import blaze as bz
|
|
from datashape import dshape, var, Record
|
|
from nose_parameterized import parameterized
|
|
import numpy as np
|
|
from numpy.testing.utils import assert_array_almost_equal
|
|
from odo import odo
|
|
import pandas as pd
|
|
import pytz
|
|
from toolz import keymap, valmap, concatv
|
|
from toolz.curried import operator as op
|
|
|
|
from zipline.assets.synthetic import make_simple_equity_info
|
|
from zipline.errors import UnsupportedPipelineOutput
|
|
from zipline.pipeline import Pipeline, CustomFactor
|
|
from zipline.pipeline.data import DataSet, BoundColumn, Column
|
|
from zipline.pipeline.domain import EquitySessionDomain
|
|
from zipline.pipeline.engine import SimplePipelineEngine
|
|
from zipline.pipeline.loaders.blaze import (
|
|
from_blaze,
|
|
BlazeLoader,
|
|
NoMetaDataWarning,
|
|
)
|
|
from zipline.pipeline.loaders.blaze.core import (
|
|
ExprData,
|
|
NonPipelineField,
|
|
)
|
|
from zipline.testing import (
|
|
ZiplineTestCase,
|
|
parameter_space,
|
|
tmp_asset_finder,
|
|
)
|
|
from zipline.testing.fixtures import WithAssetFinder
|
|
from zipline.testing.predicates import (
|
|
assert_equal,
|
|
assert_frame_equal,
|
|
assert_isidentical,
|
|
)
|
|
from zipline.utils.numpy_utils import float64_dtype, int64_dtype
|
|
from zipline.utils.pandas_utils import empty_dataframe, new_pandas, \
|
|
skip_pipeline_new_pandas


nameof = op.attrgetter('name')
dtypeof = op.attrgetter('dtype')
asset_infos = (
    (make_simple_equity_info(
        tuple(map(ord, 'ABC')),
        pd.Timestamp(0),
        pd.Timestamp('2015'),
    ),),
    (make_simple_equity_info(
        tuple(map(ord, 'ABCD')),
        pd.Timestamp(0),
        pd.Timestamp('2015'),
    ),),
)
simple_asset_info = asset_infos[0][0]


def with_extra_sid():
    return parameterized.expand(asset_infos)


def with_ignore_sid():
    return parameterized.expand(
        product(chain.from_iterable(asset_infos), [True, False])
    )
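

# ``with_extra_sid`` runs a test once per universe in ``asset_infos`` (three
# assets and four assets); ``with_ignore_sid`` additionally runs each universe
# both with and without an extra sid that the asset finder does not know
# about.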


class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
    START_DATE = pd.Timestamp(0)
    END_DATE = pd.Timestamp('2015')

    @classmethod
    def init_class_fixtures(cls):
        super(BlazeToPipelineTestCase, cls).init_class_fixtures()
        cls.dates = dates = pd.date_range('2014-01-01', '2014-01-03')
        cls.asof_dates = asof_dates = dates - pd.Timedelta(days=1)
        cls.timestamps = timestamps = dates - pd.Timedelta(hours=1)

        cls.df = df = pd.DataFrame({
            'sid': cls.ASSET_FINDER_EQUITY_SIDS * 3,
            'value': (0., 1., 2., 1., 2., 3., 2., 3., 4.),
            'int_value': (0, 1, 2, 1, 2, 3, 2, 3, 4),
            'asof_date': asof_dates.repeat(3),
            'timestamp': timestamps.repeat(3),
        })
        cls.dshape = dshape("""
        var * {
            sid: ?int64,
            value: ?float64,
            int_value: ?int64,
            asof_date: datetime,
            timestamp: datetime
        }
        """)
        cls.macro_df = df[df.sid == 65].drop('sid', axis=1)
        dshape_ = OrderedDict(cls.dshape.measure.fields)
        del dshape_['sid']
        cls.macro_dshape = var * Record(dshape_)

        cls.garbage_loader = BlazeLoader()
        cls.missing_values = {'int_value': 0}

        cls.value_dshape = dshape("""var * {
            sid: ?int64,
            value: float64,
            asof_date: datetime,
            timestamp: datetime,
        }""")
    def create_domain(self,
                      sessions,
                      data_query_time=time(0, 0, tzinfo=pytz.utc),
                      data_query_date_offset=0):
        if sessions.tz is None:
            sessions = sessions.tz_localize('UTC')

        return EquitySessionDomain(
            sessions,
            country_code=self.ASSET_FINDER_COUNTRY_CODE,
            data_query_time=data_query_time,
            data_query_date_offset=data_query_date_offset,
        )

    def test_tabular(self):
        name = 'expr'
        expr = bz.data(self.df, name=name, dshape=self.dshape)
        ds = from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
            missing_values=self.missing_values,
        )
        self.assertEqual(ds.__name__, name)
        self.assertTrue(issubclass(ds, DataSet))

        self.assertIs(ds.value.dtype, float64_dtype)
        self.assertIs(ds.int_value.dtype, int64_dtype)

        self.assertTrue(np.isnan(ds.value.missing_value))
        self.assertEqual(ds.int_value.missing_value, 0)

        # test memoization
        self.assertIs(
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
            ),
            ds,
        )

    def test_column(self):
        exprname = 'expr'
        expr = bz.data(self.df, name=exprname, dshape=self.dshape)
        value = from_blaze(
            expr.value,
            loader=self.garbage_loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
            missing_values=self.missing_values,
        )
        self.assertEqual(value.name, 'value')
        self.assertIsInstance(value, BoundColumn)
        self.assertIs(value.dtype, float64_dtype)

        # test memoization
        self.assertIs(
            from_blaze(
                expr.value,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
            ),
            value,
        )
        self.assertIs(
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
            ).value,
            value,
        )

        # test the walk back up the tree
        self.assertIs(
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
            ),
            value.dataset,
        )
        self.assertEqual(value.dataset.__name__, exprname)

    def test_missing_asof(self):
        expr = bz.data(
            self.df.loc[:, ['sid', 'value', 'timestamp']],
            name='expr',
            dshape="""var * {
                sid: int64,
                value: float64,
                timestamp: datetime,
            }""",
        )

        with self.assertRaises(TypeError) as e:
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
            )
        self.assertIn("'asof_date'", str(e.exception))
        self.assertIn(repr(str(expr.dshape.measure)), str(e.exception))

    def test_missing_timestamp(self):
        expr = bz.data(
            self.df.loc[:, ['sid', 'value', 'asof_date']],
            name='expr',
            dshape="""var * {
                sid: int64,
                value: float64,
                asof_date: datetime,
            }""",
        )

        loader = BlazeLoader()

        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
        )

        self.assertEqual(len(loader), 2)  # added the two columns
        for column in ds.columns:
            exprdata = loader[column]

            assert_isidentical(
                exprdata.expr,
                bz.transform(expr, timestamp=expr.asof_date),
            )

    def test_from_blaze_no_resources_dataset_expr(self):
        expr = bz.symbol('expr', self.dshape)

        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
            )
        assert_equal(
            str(e.exception),
            'no resources provided to compute expr',
        )

    @parameter_space(metadata={'deltas', 'checkpoints'})
    def test_from_blaze_no_resources_metadata_expr(self, metadata):
        expr = bz.data(self.df, name='expr', dshape=self.dshape)
        metadata_expr = bz.symbol('metadata', self.dshape)

        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
                **{metadata: metadata_expr}
            )
        assert_equal(
            str(e.exception),
            'no resources provided to compute %s' % metadata,
        )

    def test_from_blaze_mixed_resources_dataset_expr(self):
        expr = bz.data(self.df, name='expr', dshape=self.dshape)

        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                resources={expr: self.df},
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
            )
        assert_equal(
            str(e.exception),
            'explicit and implicit resources provided to compute expr',
        )

    @parameter_space(metadata={'deltas', 'checkpoints'})
    def test_from_blaze_mixed_resources_metadata_expr(self, metadata):
        expr = bz.symbol('expr', self.dshape)
        metadata_expr = bz.data(self.df, name=metadata, dshape=self.dshape)

        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                resources={metadata_expr: self.df},
                loader=self.garbage_loader,
                no_deltas_rule='ignore',
                no_checkpoints_rule='ignore',
                missing_values=self.missing_values,
                **{metadata: metadata_expr}
            )
        assert_equal(
            str(e.exception),
            'explicit and implicit resources provided to compute %s' %
            metadata,
        )

    @parameter_space(deltas={True, False}, checkpoints={True, False})
    def test_auto_metadata(self, deltas, checkpoints):
        select_level = op.getitem(('ignore', 'raise'))
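        # ``select_level`` maps a bool to a rule name:
        # select_level(False) == 'ignore' and select_level(True) == 'raise'.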
        m = {'ds': self.df}
        if deltas:
            m['ds_deltas'] = pd.DataFrame(columns=self.df.columns)
        if checkpoints:
            m['ds_checkpoints'] = pd.DataFrame(columns=self.df.columns)
        expr = bz.data(
            m,
            dshape=var * Record((k, self.dshape.measure) for k in m),
        )
        loader = BlazeLoader()
        ds = from_blaze(
            expr.ds,
            loader=loader,
            missing_values=self.missing_values,
            no_deltas_rule=select_level(deltas),
            no_checkpoints_rule=select_level(checkpoints),
        )
        self.assertEqual(len(loader), 3)  # added the three columns
        for column in ds.columns:
            exprdata = loader[column]
            self.assertTrue(exprdata.expr.isidentical(expr.ds))
            if deltas:
                self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
            else:
                self.assertIsNone(exprdata.deltas)
            if checkpoints:
                self.assertTrue(
                    exprdata.checkpoints.isidentical(expr.ds_checkpoints),
                )
            else:
                self.assertIsNone(exprdata.checkpoints)

    @parameter_space(deltas={True, False}, checkpoints={True, False})
    def test_auto_metadata_fail_warn(self, deltas, checkpoints):
        select_level = op.getitem(('ignore', 'warn'))
        with warnings.catch_warnings(record=True) as ws:
            warnings.simplefilter('always')
            loader = BlazeLoader()
            expr = bz.data(self.df, dshape=self.dshape)
            from_blaze(
                expr,
                loader=loader,
                no_deltas_rule=select_level(deltas),
                no_checkpoints_rule=select_level(checkpoints),
                missing_values=self.missing_values,
            )
        self.assertEqual(len(ws), deltas + checkpoints)

        for w in ws:
            w = w.message
            self.assertIsInstance(w, NoMetaDataWarning)
            self.assertIn(str(expr), str(w))

    @parameter_space(deltas={True, False}, checkpoints={True, False})
    def test_auto_metadata_fail_raise(self, deltas, checkpoints):
        if not (deltas or checkpoints):
            # not a real case
            return
        select_level = op.getitem(('ignore', 'raise'))
        loader = BlazeLoader()
        expr = bz.data(self.df, dshape=self.dshape)
        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                loader=loader,
                no_deltas_rule=select_level(deltas),
                no_checkpoints_rule=select_level(checkpoints),
            )
        self.assertIn(str(expr), str(e.exception))

    def test_non_pipeline_field(self):
        expr = bz.data(
            [],
            dshape="""
            var * {
                a: complex,
                asof_date: datetime,
                timestamp: datetime,
            }""",
        )
        ds = from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
        )
        with self.assertRaises(AttributeError):
            ds.a
        self.assertIsInstance(
            object.__getattribute__(ds, 'a'),
            NonPipelineField,
        )
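
    # Fields whose dtype pipeline cannot represent (``complex`` above) are
    # bound as ``NonPipelineField`` placeholders; attribute access raises
    # instead of silently returning an unusable column.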

    @skipIf(new_pandas, skip_pipeline_new_pandas)
    def test_cols_with_all_missing_vals(self):
        """
        Tests that when there is no known data, we get output where the
        columns have the right dtypes and the right missing values filled in.

        input (df):
            Empty DataFrame
            Columns: [sid, float_value, str_value, int_value, bool_value,
                      dt_value, asof_date, timestamp]
            Index: []

        output (expected):
                                  str_value  float_value  int_value
        2014-01-01 Equity(65 [A])      None          NaN          0
                   Equity(66 [B])      None          NaN          0
                   Equity(67 [C])      None          NaN          0
        2014-01-02 Equity(65 [A])      None          NaN          0
                   Equity(66 [B])      None          NaN          0
                   Equity(67 [C])      None          NaN          0
        2014-01-03 Equity(65 [A])      None          NaN          0
                   Equity(66 [B])      None          NaN          0
                   Equity(67 [C])      None          NaN          0

                                  dt_value  bool_value
        2014-01-01 Equity(65 [A])      NaT       False
                   Equity(66 [B])      NaT       False
                   Equity(67 [C])      NaT       False
        2014-01-02 Equity(65 [A])      NaT       False
                   Equity(66 [B])      NaT       False
                   Equity(67 [C])      NaT       False
        2014-01-03 Equity(65 [A])      NaT       False
                   Equity(66 [B])      NaT       False
                   Equity(67 [C])      NaT       False
        """
        df = empty_dataframe(
            ('sid', 'int64'),
            ('float_value', 'float64'),
            ('str_value', 'object'),
            ('int_value', 'int64'),
            ('bool_value', 'bool'),
            ('dt_value', 'datetime64[ns]'),
            ('asof_date', 'datetime64[ns]'),
            ('timestamp', 'datetime64[ns]'),
        )

        expr = bz.data(
            df,
            dshape="""
            var * {
                sid: int64,
                float_value: float64,
                str_value: string,
                int_value: int64,
                bool_value: bool,
                dt_value: datetime,
                asof_date: datetime,
                timestamp: datetime,
            }""",
        )
        fields = OrderedDict(expr.dshape.measure.fields)

        expected = pd.DataFrame(
            {
                "str_value": np.array([None] * 9, dtype='object'),
                "float_value": np.array([np.NaN] * 9, dtype='float64'),
                "int_value": np.array([0] * 9, dtype='int64'),
                "bool_value": np.array([False] * 9, dtype='bool'),
                "dt_value": [pd.NaT] * 9,
            },
            columns=['str_value', 'float_value', 'int_value', 'bool_value',
                     'dt_value'],
            index=pd.MultiIndex.from_product((
                self.dates.tz_localize('UTC'),
                self.asset_finder.retrieve_all(self.ASSET_FINDER_EQUITY_SIDS),
            )),
        )

        self._test_id(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('float_value', 'str_value', 'int_value', 'bool_value',
             'dt_value'),
        )

    @skipIf(new_pandas, skip_pipeline_new_pandas)
    def test_cols_with_some_missing_vals(self):
        """
        Tests the following:
        1) Forward filling replaces missing values correctly for the data
           types supported in pipeline.
        2) We don't forward fill when the missing value is the actual value
           we got for a date in the case of int/bool columns.
        3) We get the correct type of missing value in the output.

        input (df):
           asof_date bool_value   dt_value  float_value  int_value  sid
        0 2014-01-01       True 2011-01-01            0          1   65
        1 2014-01-03       True 2011-01-02            1          2   66
        2 2014-01-01       True 2011-01-03            2          3   67
        3 2014-01-02      False        NaT          NaN          0   67

          str_value  timestamp
        0         a 2014-01-01
        1         b 2014-01-03
        2         c 2014-01-01
        3      None 2014-01-02

        output (expected):
                                  str_value  float_value  int_value bool_value
        2014-01-01 Equity(65 [A])         a            0          1       True
                   Equity(66 [B])      None          NaN          0      False
                   Equity(67 [C])         c            2          3       True
        2014-01-02 Equity(65 [A])         a            0          1       True
                   Equity(66 [B])      None          NaN          0      False
                   Equity(67 [C])         c            2          0      False
        2014-01-03 Equity(65 [A])         a            0          1       True
                   Equity(66 [B])         b            1          2       True
                   Equity(67 [C])         c            2          0      False

                                    dt_value
        2014-01-01 Equity(65 [A]) 2011-01-01
                   Equity(66 [B])        NaT
                   Equity(67 [C]) 2011-01-03
        2014-01-02 Equity(65 [A]) 2011-01-01
                   Equity(66 [B])        NaT
                   Equity(67 [C]) 2011-01-03
        2014-01-03 Equity(65 [A]) 2011-01-01
                   Equity(66 [B]) 2011-01-02
                   Equity(67 [C]) 2011-01-03
        """
        dates = pd.Index([
            self.dates[0],
            self.dates[-1],
            self.dates[0],
            self.dates[1],
        ])
        df = pd.DataFrame({
            'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1] +
            (self.ASSET_FINDER_EQUITY_SIDS[-1],) * 2,
            'float_value': (0., 1., 2., np.NaN),
            'str_value': ('a', 'b', 'c', None),
            'cat_value': pd.Categorical(
                values=['a', 'b', 'c', None],
                categories=['a', 'b', 'c', None],
            ),
            'int_value': (1, 2, 3, 0),
            'bool_value': (True, True, True, False),
            'dt_value': (pd.Timestamp('2011-01-01'),
                         pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-03'),
                         pd.NaT),
            'asof_date': dates - pd.Timedelta(days=2),
            'timestamp': dates - pd.Timedelta(days=1),
        })

        expr = bz.data(
            df,
            dshape="""
            var * {
                sid: int64,
                float_value: float64,
                str_value: string,
                cat_value: string,
                int_value: int64,
                bool_value: bool,
                dt_value: datetime,
                asof_date: datetime,
                timestamp: datetime,
            }""",
        )
        fields = OrderedDict(expr.dshape.measure.fields)

        expected = pd.DataFrame(
            {
                'str_value': np.array(
                    ['a', None, 'c', 'a', None, 'c', 'a', 'b', 'c'],
                    dtype='object',
                ),
                'cat_value': np.array(
                    ['a', None, 'c', 'a', None, 'c', 'a', 'b', 'c'],
                    dtype='object',
                ),
                'float_value': np.array(
                    [0, np.NaN, 2, 0, np.NaN, 2, 0, 1, 2],
                    dtype='float64',
                ),
                'int_value': np.array(
                    [1, 0, 3, 1, 0, 3, 1, 2, 3],
                    dtype='int64',
                ),
                'bool_value': np.array(
                    [True, False, True, True, False, False, True, True,
                     False],
                    dtype='bool',
                ),
                'dt_value': [
                    pd.Timestamp('2011-01-01'),
                    pd.NaT,
                    pd.Timestamp('2011-01-03'),
                    pd.Timestamp('2011-01-01'),
                    pd.NaT,
                    pd.Timestamp('2011-01-03'),
                    pd.Timestamp('2011-01-01'),
                    pd.Timestamp('2011-01-02'),
                    pd.Timestamp('2011-01-03'),
                ],
            },
            columns=[
                'str_value',
                'cat_value',
                'float_value',
                'int_value',
                'bool_value',
                'dt_value',
            ],
            index=pd.MultiIndex.from_product((
                self.dates.tz_localize('UTC'),
                self.asset_finder.retrieve_all(self.ASSET_FINDER_EQUITY_SIDS),
            )),
        )

        self._test_id(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            expected.columns,
        )

    def test_complex_expr(self):
        expr = bz.data(self.df, dshape=self.dshape, name='expr')
        # put an Add in the table
        expr_with_add = bz.transform(expr, value=expr.value + 1)

        # test that we can have complex expressions with no metadata
        from_blaze(
            expr_with_add,
            deltas=None,
            checkpoints=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
            no_checkpoints_rule='ignore',
        )

        with self.assertRaises(TypeError) as e:
            # test that we cannot create a single column from a non field
            from_blaze(
                expr.value + 1,  # put an Add in the column
                deltas=None,
                checkpoints=None,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
                no_checkpoints_rule='ignore',
            )
        assert_equal(
            str(e.exception),
            "expression 'expr.value + 1' was array-like but not a simple field"
            " of some larger table",
        )

        deltas = bz.data(
            pd.DataFrame(columns=self.df.columns),
            dshape=self.dshape,
            name='deltas',
        )
        checkpoints = bz.data(
            pd.DataFrame(columns=self.df.columns),
            dshape=self.dshape,
            name='checkpoints',
        )

        # test that we can have complex expressions with explicit metadata
        from_blaze(
            expr_with_add,
            deltas=deltas,
            checkpoints=checkpoints,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

        with self.assertRaises(TypeError) as e:
            # test that we cannot create a single column from a non field
            # even with explicit metadata
            from_blaze(
                expr.value + 1,
                deltas=deltas,
                checkpoints=checkpoints,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )
        assert_equal(
            str(e.exception),
            "expression 'expr.value + 1' was array-like but not a simple field"
            " of some larger table",
        )
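
    # ``_test_id`` and ``_test_id_macro`` drive the no-deltas/no-checkpoints
    # ("id") code path: they build a pipeline from the ``.latest`` of each
    # requested column (or a CustomFactor over the macro columns) and compare
    # the result to ``expected``.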

    def _test_id(self, df, dshape, expected, finder, add):
        expr = bz.data(df, name='expr', dshape=dshape)
        loader = BlazeLoader()
        domain = self.create_domain(self.dates)
        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
            missing_values=self.missing_values,
            domain=domain,
        )
        p = Pipeline(domain=domain)
        for a in add:
            p.add(getattr(ds, a).latest, a)
        dates = self.dates

        result = SimplePipelineEngine(
            loader, finder,
        ).run_pipeline(p, dates[0], dates[-1])
        assert_frame_equal(
            result.sort_index(axis=1),
            expected.sort_index(axis=1),
            check_dtype=False,
        )

    def _test_id_macro(self, df, dshape, expected, finder, add, dates=None):
        if dates is None:
            dates = self.dates
        expr = bz.data(df, name='expr', dshape=dshape)
        loader = BlazeLoader()
        domain = self.create_domain(dates)
        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
            missing_values=self.missing_values,
            domain=domain,
        )

        p = Pipeline(domain=domain)
        macro_inputs = []
        for column_name in add:
            column = getattr(ds, column_name)
            macro_inputs.append(column)
            with self.assertRaises(UnsupportedPipelineOutput):
                # Single column output terms cannot be added to a pipeline.
                p.add(column.latest, column_name)

        class UsesMacroInputs(CustomFactor):
            inputs = macro_inputs
            window_length = 1

            def compute(self, today, assets, out, *inputs):
                e = expected.loc[today]
                for i, input_ in enumerate(inputs):
                    # Each macro input should only have one column.
                    assert_equal(input_.shape, (self.window_length, 1))
                    assert_equal(input_[0, 0], e[i])

        # Run the pipeline with our custom factor. Assertions about the
        # expected macro data are made in the `compute` function of our custom
        # factor above.
        p.add(UsesMacroInputs(), 'uses_macro_inputs')
        engine = SimplePipelineEngine(loader, finder)
        engine.run_pipeline(p, dates[0], dates[-1])

    def test_custom_query_time_tz(self):
        """
        input (df):
           asof_date  int_value  sid           timestamp  value
        0 2013-12-31          0   65 2014-01-01 13:44:00    0.0
        1 2013-12-31          1   66 2014-01-01 13:44:00    1.0
        2 2013-12-31          2   67 2014-01-01 13:44:00    2.0
        3 2013-12-31          1   65 2014-01-01 13:45:00    1.0
        4 2013-12-31          2   66 2014-01-01 13:45:00    2.0
        5 2013-12-31          3   67 2014-01-01 13:45:00    3.0
        6 2014-01-02          2   65 2014-01-03 13:44:00    2.0
        7 2014-01-02          3   66 2014-01-03 13:44:00    3.0
        8 2014-01-02          4   67 2014-01-03 13:44:00    4.0

        output (expected):
                                                  int_value  value
        2014-01-01 00:00:00+00:00 Equity(65 [A])          0    0.0
                                  Equity(66 [B])          1    1.0
                                  Equity(67 [C])          2    2.0
        2014-01-02 00:00:00+00:00 Equity(65 [A])          1    1.0
                                  Equity(66 [B])          2    2.0
                                  Equity(67 [C])          3    3.0
        2014-01-03 00:00:00+00:00 Equity(65 [A])          2    2.0
                                  Equity(66 [B])          3    3.0
                                  Equity(67 [C])          4    4.0
        """
        df = self.df.copy()
        df['timestamp'] = (
            pd.DatetimeIndex(df['timestamp'], tz='EST') +
            timedelta(hours=8, minutes=44)
        ).tz_convert('utc').tz_localize(None)
        df.ix[3:5, 'timestamp'] = pd.Timestamp('2014-01-01 13:45')
        expr = bz.data(df, name='expr', dshape=self.dshape)
        loader = BlazeLoader()
        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule='ignore',
            no_checkpoints_rule='ignore',
            missing_values=self.missing_values,
            domain=self.create_domain(
                self.dates,
                data_query_time=time(8, 45, tzinfo=pytz.timezone('EST')),
            ),
        )
        p = Pipeline()
        p.add(ds.value.latest, 'value')
        p.add(ds.int_value.latest, 'int_value')

        result = SimplePipelineEngine(
            loader, self.asset_finder,
        ).run_pipeline(p, self.dates[0], self.dates[-1])

        expected = df.drop('asof_date', axis=1)
        expected['timestamp'] = expected['timestamp'].dt.normalize().astype(
            'datetime64[ns]',
        ).dt.tz_localize('utc')
        expected.ix[3:5, 'timestamp'] += timedelta(days=1)
        expected.set_index(['timestamp', 'sid'], inplace=True)
        expected.index = pd.MultiIndex.from_product((
            expected.index.levels[0],
            self.asset_finder.retrieve_all(expected.index.levels[1]),
        ))
        assert_frame_equal(result, expected, check_dtype=False)

    def test_id(self):
        """
        input (self.df):
           asof_date  sid  timestamp  int_value  value
        0 2014-01-01   65 2014-01-01          0      0
        1 2014-01-01   66 2014-01-01          1      1
        2 2014-01-01   67 2014-01-01          2      2
        3 2014-01-02   65 2014-01-02          1      1
        4 2014-01-02   66 2014-01-02          2      2
        5 2014-01-02   67 2014-01-02          3      3
        6 2014-01-03   65 2014-01-03          2      2
        7 2014-01-03   66 2014-01-03          3      3
        8 2014-01-03   67 2014-01-03          4      4

        output (expected):
                                  int_value  value
        2014-01-01 Equity(65 [A])         0      0
                   Equity(66 [B])         1      1
                   Equity(67 [C])         2      2
        2014-01-02 Equity(65 [A])         1      1
                   Equity(66 [B])         2      2
                   Equity(67 [C])         3      3
        2014-01-03 Equity(65 [A])         2      2
                   Equity(66 [B])         3      3
                   Equity(67 [C])         4      4
        """
        expected = self.df.drop(['timestamp', 'asof_date', 'sid'], axis=1)
        expected.index = pd.MultiIndex.from_product((
            self.dates.tz_localize('UTC'),
            self.asset_finder.retrieve_all(self.asset_finder.sids),
        ))
        self._test_id(
            self.df,
            self.dshape,
            expected,
            self.asset_finder,
            ('int_value', 'value',)
        )

    def test_id_with_asof_date(self):
        """
        input (self.df):
           asof_date  sid  timestamp  int_value  value
        0 2014-01-01   65 2014-01-01          0      0
        1 2014-01-01   66 2014-01-01          1      1
        2 2014-01-01   67 2014-01-01          2      2
        3 2014-01-02   65 2014-01-02          1      1
        4 2014-01-02   66 2014-01-02          2      2
        5 2014-01-02   67 2014-01-02          3      3
        6 2014-01-03   65 2014-01-03          2      2
        7 2014-01-03   66 2014-01-03          3      3
        8 2014-01-03   67 2014-01-03          4      4

        output (expected):
                                   asof_date
        2014-01-01 Equity(65 [A]) 2014-01-01
                   Equity(66 [B]) 2014-01-01
                   Equity(67 [C]) 2014-01-01
        2014-01-02 Equity(65 [A]) 2014-01-02
                   Equity(66 [B]) 2014-01-02
                   Equity(67 [C]) 2014-01-02
        2014-01-03 Equity(65 [A]) 2014-01-03
                   Equity(66 [B]) 2014-01-03
                   Equity(67 [C]) 2014-01-03
        """
        expected = self.df.drop(
            ['timestamp', 'sid', 'value', 'int_value'],
            axis=1,
        )
        expected.index = pd.MultiIndex.from_product((
            self.dates.tz_localize('UTC'),
            self.asset_finder.retrieve_all(self.asset_finder.sids),
        ))
        self._test_id(
            self.df,
            self.dshape,
            expected,
            self.asset_finder,
            ('asof_date',)
        )

    def test_id_ffill_out_of_window(self):
        """
        input (df):

           asof_date  timestamp  sid  other  value
        0 2013-12-22 2013-12-22   65      0      0
        1 2013-12-22 2013-12-22   66    NaN      1
        2 2013-12-22 2013-12-22   67      2    NaN
        3 2013-12-23 2013-12-23   65    NaN      1
        4 2013-12-23 2013-12-23   66      2    NaN
        5 2013-12-23 2013-12-23   67      3      3
        6 2013-12-24 2013-12-24   65      2    NaN
        7 2013-12-24 2013-12-24   66      3      3
        8 2013-12-24 2013-12-24   67    NaN      4

        output (expected):
                                  other  value
        2014-01-01 Equity(65 [A])     2      1
                   Equity(66 [B])     3      3
                   Equity(67 [C])     3      4
        2014-01-02 Equity(65 [A])     2      1
                   Equity(66 [B])     3      3
                   Equity(67 [C])     3      4
        2014-01-03 Equity(65 [A])     2      1
                   Equity(66 [B])     3      3
                   Equity(67 [C])     3      4
        """
        dates = self.dates.repeat(3) - timedelta(days=10)
        df = pd.DataFrame({
            'sid': self.ASSET_FINDER_EQUITY_SIDS * 3,
            'value': (0, 1, np.nan, 1, np.nan, 3, np.nan, 3, 4),
            'other': (0, np.nan, 2, np.nan, 2, 3, 2, 3, np.nan),
            'asof_date': dates,
            'timestamp': dates,
        })
        fields = OrderedDict(self.dshape.measure.fields)
        fields['other'] = fields['value']

        expected = pd.DataFrame(
            np.array([[2, 1],
                      [3, 3],
                      [3, 4],
                      [2, 1],
                      [3, 3],
                      [3, 4],
                      [2, 1],
                      [3, 3],
                      [3, 4]]),
            columns=['other', 'value'],
            index=pd.MultiIndex.from_product((
                self.dates.tz_localize('UTC'),
                self.asset_finder.retrieve_all(self.ASSET_FINDER_EQUITY_SIDS),
            )),
        )
        self._test_id(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('value', 'other'),
        )

    def test_id_multiple_columns(self):
        """
        input (df):
           asof_date  sid  timestamp  value  other
        0 2014-01-01   65 2014-01-01      0      1
        1 2014-01-01   66 2014-01-01      1      2
        2 2014-01-01   67 2014-01-01      2      3
        3 2014-01-02   65 2014-01-02      1      2
        4 2014-01-02   66 2014-01-02      2      3
        5 2014-01-02   67 2014-01-02      3      4
        6 2014-01-03   65 2014-01-03      2      3
        7 2014-01-03   66 2014-01-03      3      4
        8 2014-01-03   67 2014-01-03      4      5

        output (expected):
                                  value  other
        2014-01-01 Equity(65 [A])     0      1
                   Equity(66 [B])     1      2
                   Equity(67 [C])     2      3
        2014-01-02 Equity(65 [A])     1      2
                   Equity(66 [B])     2      3
                   Equity(67 [C])     3      4
        2014-01-03 Equity(65 [A])     2      3
                   Equity(66 [B])     3      4
                   Equity(67 [C])     4      5
        """
        df = self.df.copy()
        df['other'] = df.value + 1
        fields = OrderedDict(self.dshape.measure.fields)
        fields['other'] = fields['value']

        expected = df.drop(['timestamp', 'asof_date', 'sid'], axis=1)
        expected.index = pd.MultiIndex.from_product((
            self.dates.tz_localize('UTC'),
            self.asset_finder.retrieve_all(self.asset_finder.sids),
        ))
        self._test_id(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('value', 'int_value', 'other'),
        )

    def test_id_macro_dataset(self):
        """
        input (self.macro_df):
           asof_date  timestamp  value
        0 2014-01-01 2014-01-01      0
        3 2014-01-02 2014-01-02      1
        6 2014-01-03 2014-01-03      2

        output (expected):
                    value
        2014-01-01      0
        2014-01-02      1
        2014-01-03      2
        """
        expected = pd.DataFrame(
            data=[[0],
                  [1],
                  [2]],
            columns=['value'],
            index=self.dates,
        )
        self._test_id_macro(
            self.macro_df,
            self.macro_dshape,
            expected,
            self.asset_finder,
            ('value',),
        )

    def test_id_ffill_out_of_window_macro_dataset(self):
        """
        input (df):
           asof_date  timestamp  other  value
        0 2013-12-22 2013-12-22    NaN      0
        1 2013-12-23 2013-12-23      1    NaN
        2 2013-12-24 2013-12-24    NaN    NaN

        output (expected):
                    other  value
        2014-01-01      1      0
        2014-01-02      1      0
        2014-01-03      1      0
        """
        dates = self.dates - timedelta(days=10)
        df = pd.DataFrame({
            'value': (0, np.nan, np.nan),
            'other': (np.nan, 1, np.nan),
            'asof_date': dates,
            'timestamp': dates,
        })
        fields = OrderedDict(self.macro_dshape.measure.fields)
        fields['other'] = fields['value']

        expected = pd.DataFrame(
            data=[[0, 1],
                  [0, 1],
                  [0, 1]],
            columns=['other', 'value'],
            index=self.dates.tz_localize('UTC'),
        )
        self._test_id_macro(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('value', 'other'),
        )

    def test_id_macro_dataset_multiple_columns(self):
        """
        input (df):
           asof_date  timestamp  other  value
        0 2014-01-01 2014-01-01      1      0
        3 2014-01-02 2014-01-02      2      1
        6 2014-01-03 2014-01-03      3      2

        output (expected):
                    other  value
        2014-01-01      1      0
        2014-01-02      2      1
        2014-01-03      3      2
        """
        df = self.macro_df.copy()
        df['other'] = df.value + 1
        fields = OrderedDict(self.macro_dshape.measure.fields)
        fields['other'] = fields['value']

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            expected = pd.DataFrame(
                data=[[0, 1],
                      [1, 2],
                      [2, 3]],
                columns=['value', 'other'],
                index=self.dates,
                dtype=np.float64,
            )
            self._test_id_macro(
                df,
                var * Record(fields),
                expected,
                finder,
                ('value', 'other'),
            )

    def test_id_take_last_in_group(self):
        T = pd.Timestamp
        df = pd.DataFrame(
            columns=['asof_date', 'timestamp', 'sid', 'other', 'value'],
            data=[
                [T('2013-12-31'), T('2013-12-31 22'), 65, 0, 0],
                [T('2013-12-31'), T('2013-12-31 23'), 65, 1, np.nan],
                [T('2013-12-31'), T('2013-12-31 22'), 66, np.nan, np.nan],
                [T('2013-12-31'), T('2013-12-31 23'), 66, np.nan, 1],
                [T('2013-12-31'), T('2013-12-31 22'), 67, 2, np.nan],
                [T('2013-12-31'), T('2013-12-31 23'), 67, np.nan, np.nan],
                [T('2014-01-01'), T('2014-01-01 22'), 65, np.nan, np.nan],
                [T('2014-01-01'), T('2014-01-01 23'), 65, np.nan, 1],
                [T('2014-01-01'), T('2014-01-01 22'), 66, np.nan, np.nan],
                [T('2014-01-01'), T('2014-01-01 23'), 66, 2, np.nan],
                [T('2014-01-01'), T('2014-01-01 22'), 67, 3, 3],
                [T('2014-01-01'), T('2014-01-01 23'), 67, 3, 3],
                [T('2014-01-02'), T('2014-01-02 22'), 65, 2, np.nan],
                [T('2014-01-02'), T('2014-01-02 23'), 65, 2, np.nan],
                [T('2014-01-02'), T('2014-01-02 22'), 66, 3, 3],
                [T('2014-01-02'), T('2014-01-02 23'), 66, np.nan, np.nan],
                [T('2014-01-02'), T('2014-01-02 22'), 67, np.nan, np.nan],
                [T('2014-01-02'), T('2014-01-02 23'), 67, np.nan, 4],
            ],
        )
        fields = OrderedDict(self.dshape.measure.fields)
        fields['other'] = fields['value']

        expected = pd.DataFrame(
            columns=['other', 'value'],
            data=[
                [1, 0],       # 2014-01-01 Equity(65 [A])
                [np.nan, 1],  #            Equity(66 [B])
                [2, np.nan],  #            Equity(67 [C])
                [1, 1],       # 2014-01-02 Equity(65 [A])
                [2, 1],       #            Equity(66 [B])
                [3, 3],       #            Equity(67 [C])
                [2, 1],       # 2014-01-03 Equity(65 [A])
                [3, 3],       #            Equity(66 [B])
                [3, 4],       #            Equity(67 [C])
            ],
            index=pd.MultiIndex.from_product((
                self.dates.tz_localize('UTC'),
                self.asset_finder.retrieve_all(self.ASSET_FINDER_EQUITY_SIDS),
            )),
        )
        self._test_id(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('value', 'other'),
        )

    def test_id_take_last_in_group_macro(self):
        """
        output (expected):

                    other  value
        2014-01-01    NaN      2
        2014-01-02      1      3
        2014-01-03      2      3
        """
        T = pd.Timestamp
        df = pd.DataFrame(
            columns=['asof_date', 'timestamp', 'other', 'value'],
            data=[
                [T('2013-12-31'), T('2013-12-31 01'), np.nan, 1],
                [T('2013-12-31'), T('2013-12-31 02'), np.nan, 2],
                [T('2014-01-01'), T('2014-01-01 01'), 1, np.nan],
                [T('2014-01-01'), T('2014-01-01 02'), np.nan, 3],
                [T('2014-01-02'), T('2014-01-02 01'), 2, np.nan],
                [T('2014-01-02'), T('2014-01-02 02'), np.nan, np.nan],
            ],
        )
        fields = OrderedDict(self.macro_dshape.measure.fields)
        fields['other'] = fields['value']

        expected = pd.DataFrame(
            data=[[np.nan, 2],  # 2014-01-01
                  [1, 3],       # 2014-01-02
                  [2, 3]],      # 2014-01-03
            columns=['other', 'value'],
            index=self.dates,
        )
        self._test_id_macro(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('other', 'value'),
        )
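
    # ``_run_pipeline`` drives the deltas/checkpoints tests below.
    # ``expected_views`` maps each simulation day to the exact raw window that
    # ``TestFactor.compute`` should observe on that day; ``expected_output``,
    # when given, is compared against the pipeline result.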

    def _run_pipeline(self,
                      expr,
                      deltas,
                      checkpoints,
                      expected_views,
                      expected_output,
                      finder,
                      calendar,
                      start,
                      end,
                      window_length,
                      compute_fn=None):
        loader = BlazeLoader()
        ds = from_blaze(
            expr,
            deltas,
            checkpoints,
            loader=loader,
            no_deltas_rule='raise',
            no_checkpoints_rule='ignore',
            missing_values=self.missing_values,
            domain=self.create_domain(calendar),
        )
        p = Pipeline()

        # prevent unbound locals issue in the inner class
        window_length_ = window_length

        if compute_fn is None:
            self.assertIsNone(
                expected_output,
                'expected_output must be None if compute_fn is None',
            )

            def compute_fn(data):
                return data[0]

        class TestFactor(CustomFactor):
            inputs = ds.value,
            window_length = window_length_

            def compute(self, today, assets, out, data):
                assert_array_almost_equal(
                    data,
                    expected_views[today],
                    err_msg=str(today),
                )
                out[:] = compute_fn(data)

        p.add(TestFactor(), 'value')

        result = SimplePipelineEngine(
            loader, finder,
        ).run_pipeline(p, start, end)

        if expected_output is not None:
            assert_frame_equal(
                result,
                expected_output,
                check_dtype=False,
            )

    @with_ignore_sid()
    def test_deltas(self, asset_info, add_extra_sid):
        df = self.df.copy()
        if add_extra_sid:
            extra_sid_df = pd.DataFrame({
                'asof_date': self.asof_dates,
                'timestamp': self.timestamps,
                'sid': (ord('E'),) * 3,
                'value': (3., 4., 5.,),
                'int_value': (3, 4, 5),
            })
            df = df.append(extra_sid_df, ignore_index=True)
        expr = bz.data(df, name='expr', dshape=self.dshape)
        deltas = bz.data(df, dshape=self.dshape)
        deltas = bz.data(
            odo(
                bz.transform(
                    deltas,
                    value=deltas.value + 10,
                    timestamp=deltas.timestamp + timedelta(days=1),
                ),
                pd.DataFrame,
            ),
            name='delta',
            dshape=self.dshape,
        )

        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-02': np.array([[10.0, 11.0, 12.0],
                                    [1.0, 2.0, 3.0]]),
            '2014-01-03': np.array([[11.0, 12.0, 13.0],
                                    [2.0, 3.0, 4.0]]),
            '2014-01-04': np.array([[12.0, 13.0, 14.0],
                                    [12.0, 13.0, 14.0]]),
        })

        nassets = len(asset_info)
        if nassets == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan]],
                expected_views,
            )
        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([12] * nassets, [13] * nassets, [14] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            dates = self.dates
            dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )

    @with_ignore_sid()
    def test_deltas_before_index_0(self, asset_info, add_extra_sid):
        df = empty_dataframe(
            ('sid', 'int64'),
            ('value', 'float64'),
            ('asof_date', 'datetime64[ns]'),
            ('timestamp', 'datetime64[ns]'),
        )
        expr = bz.data(df, name='expr', dshape=self.dshape)

        T = pd.Timestamp
        # These data are interesting because we have four rows with an asof
        # date prior to the start of the query window. The first, second, and
        # fourth rows should become the best-known value on their timestamp.
        # The third row's asof date is less than the second row's asof date
        # so, due to forward filling rules, it is *not* the most recent value
        # on its timestamp. The value for row three should never be shown to
        # the user.
        deltas_df_single_sid = pd.DataFrame({
            'value': [0.0, 1.0, 2.0, 3.0],
            'asof_date': [
                T('2013-12-01'),
                T('2013-12-15'),
                T('2013-12-02'),  # not more recent than the previous day
                T('2013-12-16'),
            ],
            'timestamp': [
                T('2014-01-01 23:00'),
                T('2014-01-02 23:00'),
                T('2014-01-03 23:00'),
                T('2014-01-04 23:00'),
            ],
        })
        sids = asset_info.index
        if add_extra_sid:
            # add a sid to the dataset that the asset finder doesn't know
            # about
            sids = sids.insert(0, ord('Z'))

        deltas_df = pd.concat([
            deltas_df_single_sid.assign(
                sid=sid,
                value=deltas_df_single_sid.value + (100 * n),
            )
            for n, sid in enumerate(asset_info.index)
        ])
        deltas = bz.data(deltas_df, name='deltas', dshape=self.dshape)

        expected_views_single_sid = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-02': np.array([[0.0],
                                    [0.0]]),
            '2014-01-03': np.array([[1.0],
                                    [1.0]]),
            # The third row's value of 2.0 is *not* the best known value
            # because its asof date of 2013-12-02 is earlier than the
            # previous row's asof date of 2013-12-15. We continue to surface
            # the second row's value on this day.
            '2014-01-04': np.array([[1.0],
                                    [1.0]]),
            '2014-01-05': np.array([[3.0],
                                    [3.0]]),
        })

        column_constant = np.arange(len(asset_info)) * 100
        expected_views = {
            k: v + column_constant
            for k, v in expected_views_single_sid.items()
        }
        with tmp_asset_finder(equities=asset_info) as finder:
            dates = pd.date_range('2014-01-01', '2014-01-05')
            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                None,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
            )

    @with_ignore_sid()
    def test_deltas_on_same_ix_out_of_order(self, asset_info, add_extra_sid):
        df = empty_dataframe(
            ('sid', 'int64'),
            ('value', 'float64'),
            ('asof_date', 'datetime64[ns]'),
            ('timestamp', 'datetime64[ns]'),
        )
        expr = bz.data(df, name='expr', dshape=self.dshape)

        T = pd.Timestamp

        # These data are interesting because we have pairs of rows that come
        # on the same asof_date in index space. The catch is that the asof
        # dates are sometimes out of order relative to their timestamps. This
        # is used to test cases where we get novel rows for dates between
        # trading days (weekends and holidays) although we learn about them
        # out of order.
        #
        # The first two rows both map to index 0 in the output. The first row
        # has an earlier timestamp but later asof date so it should be
        # selected.
        #
        # The third and fourth rows both map to index 1 in the output. The
        # fourth row (second in the group) has both a later timestamp and
        # asof date so it should be selected.
        #
        # The fifth and sixth rows both map to index 2 in the output. The
        # fifth row (first in the group) has an earlier timestamp but later
        # asof date so it should be selected.
        deltas_df_single_sid = pd.DataFrame({
            'value': [
                0.0,  # selected
                1.0,  # ignored

                2.0,  # ignored
                3.0,  # selected

                4.0,  # selected
                5.0,  # ignored
            ],
            'asof_date': [
                # swapped order: second row is before the first
                T('2014-01-02'),
                T('2014-01-01'),

                # chronological order: second row is after the first
                T('2014-01-03'),
                T('2014-01-04'),

                # swapped order: second row is before the first
                T('2014-01-06'),
                T('2014-01-05'),
            ],
            'timestamp': [
                # we learn about all rows in monotonically increasing order
                T('2013-01-02 22:00'),
                T('2014-01-02 23:00'),
                T('2014-01-04 22:00'),
                T('2014-01-04 23:00'),
                T('2014-01-06 22:00'),
                T('2014-01-06 23:00'),
            ],
        })
        sids = asset_info.index
        if add_extra_sid:
            # add a sid to the dataset that the asset finder doesn't know
            # about
            sids = sids.insert(0, ord('Z'))

        deltas_df = pd.concat([
            deltas_df_single_sid.assign(
                sid=sid,
                value=deltas_df_single_sid.value + (100 * n),
            )
            for n, sid in enumerate(asset_info.index)
        ])
        deltas = bz.data(deltas_df, name='deltas', dshape=self.dshape)

        expected_views_single_sid = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-05': np.array([[0.0],
                                    [3.0]]),
            '2014-01-07': np.array([[3.0],
                                    [4.0]]),
        })

        column_constant = np.arange(len(asset_info)) * 100
        expected_views = {
            k: v + column_constant
            for k, v in expected_views_single_sid.items()
        }
        with tmp_asset_finder(equities=asset_info) as finder:
            # The dates queried are non-contiguous. We have two day groups to
            # capture the two day pairs in the input data.
            dates = pd.to_datetime(['2014-01-03', '2014-01-05', '2014-01-07'])
            self._run_pipeline(
                expr=expr,
                deltas=deltas,
                checkpoints=None,
                expected_views=expected_views,
                expected_output=None,
                finder=finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
            )

    @with_extra_sid()
    def test_deltas_only_one_delta_in_universe(self, asset_info):
        expr = bz.data(self.df, name='expr', dshape=self.dshape)
        deltas = pd.DataFrame({
            'sid': [65, 66],
            'asof_date': [self.asof_dates[1], self.asof_dates[0]],
            'timestamp': [self.timestamps[2], self.timestamps[1]],
            'value': [10, 11],
        })
        deltas = bz.data(deltas, name='deltas', dshape=self.dshape)

        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-02': np.array([[0.0, 11.0, 2.0],
                                    [1.0, 2.0, 3.0]]),
            '2014-01-03': np.array([[10.0, 2.0, 3.0],
                                    [2.0, 3.0, 4.0]]),
            '2014-01-04': np.array([[2.0, 3.0, 4.0],
                                    [2.0, 3.0, 4.0]]),
        })

        nassets = len(asset_info)
        if nassets == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan]],
                expected_views,
            )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                columns=['value'],
                data=np.array([11, 10, 4]).repeat(len(asset_info.index)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
            )
            dates = self.dates
            dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )

    def test_deltas_macro(self):
        expr = bz.data(self.macro_df, name='expr', dshape=self.macro_dshape)
        deltas = bz.data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        )
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(simple_asset_info)
        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-02': np.array([[10.0],
                                    [1.0]]),
            '2014-01-03': np.array([[11.0],
                                    [2.0]]),
        })

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(simple_asset_info.index),
                )),
                columns=('value',),
            )
            dates = self.dates
            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )

    def test_deltas_before_index_0_macro(self):
        df = empty_dataframe(
            ('value', 'float64'),
            ('asof_date', 'datetime64[ns]'),
            ('timestamp', 'datetime64[ns]'),
        )
        expr = bz.data(df, name='expr', dshape=self.macro_dshape)

        T = pd.Timestamp
        # These data are interesting because we have four rows with an asof
        # date prior to the start of the query window. The first, second, and
        # fourth rows should become the best-known value on their timestamp.
        # The third row's asof date is less than the second row's asof date
        # so, due to forward filling rules, it is *not* the most recent value
        # on its timestamp. The value for row three should never be shown to
        # the user.
        deltas_df = pd.DataFrame({
            'value': [0.0, 1.0, 2.0, 3.0],
            'asof_date': [
                T('2013-12-01'),
                T('2013-12-15'),
                T('2013-12-02'),  # not more recent than the previous day
                T('2013-12-16'),
            ],
            'timestamp': [
                T('2014-01-01 23:00'),
                T('2014-01-02 23:00'),
                T('2014-01-03 23:00'),
                T('2014-01-04 23:00'),
            ],
        })
        deltas = bz.data(deltas_df, name='deltas', dshape=self.macro_dshape)

        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-02': np.array([[0.0],
                                    [0.0]]),
            '2014-01-03': np.array([[1.0],
                                    [1.0]]),
            # The third row's value of 2.0 is *not* the best known value
            # because its asof date of 2013-12-02 is earlier than the
            # previous row's asof date of 2013-12-15. We continue to surface
            # the second row's value on this day.
            '2014-01-04': np.array([[1.0],
                                    [1.0]]),
            '2014-01-05': np.array([[3.0],
                                    [3.0]]),
        })

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            dates = pd.date_range('2014-01-01', '2014-01-05')
            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                None,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
            )

    def test_deltas_on_same_ix_out_of_order_macro(self):
        df = empty_dataframe(
            ('value', 'float64'),
            ('asof_date', 'datetime64[ns]'),
            ('timestamp', 'datetime64[ns]'),
        )
        expr = bz.data(df, name='expr', dshape=self.macro_dshape)

        T = pd.Timestamp

        # These data are interesting because we have pairs of rows that come
        # on the same asof_date in index space. The catch is that the asof
        # dates are sometimes out of order relative to their timestamps. This
        # is used to test cases where we get novel rows for dates between
        # trading days (weekends and holidays) although we learn about them
        # out of order.
        #
        # The first two rows both map to index 0 in the output. The first row
        # has an earlier timestamp but later asof date so it should be
        # selected.
        #
        # The third and fourth rows both map to index 1 in the output. The
        # fourth row (second in the group) has both a later timestamp and
        # asof date so it should be selected.
        #
        # The fifth and sixth rows both map to index 2 in the output. The
        # fifth row (first in the group) has an earlier timestamp but later
        # asof date so it should be selected.
        deltas_df = pd.DataFrame({
            'value': [
                0.0,  # selected
                1.0,  # ignored

                2.0,  # ignored
                3.0,  # selected

                4.0,  # selected
                5.0,  # ignored
            ],
            'asof_date': [
                # swapped order: second row is before the first
                T('2014-01-02'),
                T('2014-01-01'),

                # chronological order: second row is after the first
                T('2014-01-03'),
                T('2014-01-04'),

                # swapped order: second row is before the first
                T('2014-01-06'),
                T('2014-01-05'),
            ],
            'timestamp': [
                # we learn about all rows in monotonically increasing order
                T('2013-01-02 22:00'),
                T('2014-01-02 23:00'),
                T('2014-01-04 22:00'),
                T('2014-01-04 23:00'),
                T('2014-01-06 22:00'),
                T('2014-01-06 23:00'),
            ],
        })
        deltas = bz.data(deltas_df, name='deltas', dshape=self.macro_dshape)

        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-05': np.array([[0.0],
                                    [3.0]]),
            '2014-01-07': np.array([[3.0],
                                    [4.0]]),
        })

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            # The dates queried are non-contiguous. We have two day groups to
            # capture the two day pairs in the input data.
            dates = pd.to_datetime(['2014-01-03', '2014-01-05', '2014-01-07'])
            self._run_pipeline(
                expr=expr,
                deltas=deltas,
                checkpoints=None,
                expected_views=expected_views,
                expected_output=None,
                finder=finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
            )

    def test_stacked_deltas_macro(self):
        df = empty_dataframe(
            ('value', 'float64'),
            ('asof_date', 'datetime64[ns]'),
            ('timestamp', 'datetime64[ns]'),
        )
        expr = bz.data(df, name='expr', dshape=self.macro_dshape)

        T = pd.Timestamp

        # These data are interesting because they exercise the tie breaking
        # of adjustments. Here we have 5 rows which we learn about within a
        # single calendar index. The first row provides the most recently
        # known value for some day in the window. All of the following rows
        # are adjustments to the same (earlier) historical value. We expect
        # that the first row's value is the most recently known value, and
        # the lookback window will be filled with the *last* row's value.
        # This is because each adjustment gets applied in timestamp order,
        # and the last row was learned most recently.
        deltas_df = pd.DataFrame({
            'value': [
                0.0,  # selected
                1.0,  # ignored
                2.0,  # ignored
                3.0,  # ignored
                4.0,  # selected
            ],
            'asof_date': [
                # the first row is for current data
                T('2014-01-02'),

                # all other rows are restating the same historical value
                T('2013-12-01'),
                T('2013-12-01'),
                T('2013-12-01'),
                T('2013-12-01'),
            ],
            'timestamp': [
                # we learn about all rows within a single calendar index
                T('2014-01-02 23:00'),
                T('2014-01-02 23:01'),
                T('2014-01-02 23:02'),
                T('2014-01-02 23:03'),
                T('2014-01-02 23:04'),
            ],
        })
        deltas = bz.data(deltas_df, name='deltas', dshape=self.macro_dshape)

        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-03': np.array([[4.0],
                                    [4.0],
                                    [0.0]]),
        })

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            dates = pd.date_range('2014-01-01', '2014-01-03')
            self._run_pipeline(
                expr=expr,
                deltas=deltas,
                checkpoints=None,
                expected_views=expected_views,
                expected_output=None,
                finder=finder,
                calendar=dates,
                start=dates[-1],
                end=dates[-1],
                window_length=3,
            )

    @with_extra_sid()
    def test_novel_deltas(self, asset_info):
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2013-12-31'),
            pd.Timestamp('2014-01-03'),
        ])
        repeated_dates = base_dates.repeat(3)
        baseline = pd.DataFrame({
            'sid': self.ASSET_FINDER_EQUITY_SIDS * 2,
            'value': (0., 1., 2., 1., 2., 3.),
            'int_value': (0, 1, 2, 1, 2, 3),
            'asof_date': repeated_dates,
            'timestamp': repeated_dates + pd.Timedelta(hours=23),
        })
        expr = bz.data(baseline, name='expr', dshape=self.dshape)
        deltas = bz.data(
            odo(
                bz.transform(
                    expr,
                    value=expr.value + 10,
                    timestamp=expr.timestamp + timedelta(days=1),
                ),
                pd.DataFrame,
            ),
            name='delta',
            dshape=self.dshape,
        )
        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-03': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0]]),
            '2014-01-06': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [11.0, 12.0, 13.0]]),
        })

        if len(asset_info) == 4:
            def get_fourth_asset_view(expected_views, window_length):
                return valmap(
                    lambda view: np.c_[view, [np.nan] * window_length],
                    expected_views,
                )

            expected_views = get_fourth_asset_view(
                expected_views,
                window_length=3,
            )
            expected_output_buffer = [
                10, 11, 12, np.nan, 11, 12, 13, np.nan,
            ]
        else:
            expected_output_buffer = [10, 11, 12, 11, 12, 13]

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                expected_output_buffer,
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )

            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )

    def test_novel_deltas_macro(self):
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2013-12-31'),
            pd.Timestamp('2014-01-03'),
        ])
        baseline = pd.DataFrame({
            'value': (0., 1.),
            'asof_date': base_dates,
            'timestamp': base_dates + pd.Timedelta(days=1),
        })
        expr = bz.data(baseline, name='expr', dshape=self.macro_dshape)
        deltas = bz.data(baseline, name='deltas', dshape=self.macro_dshape)
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )
        nassets = len(simple_asset_info)
        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            '2014-01-03': np.array([[10.0],
                                    [10.0],
                                    [10.0]]),
            '2014-01-06': np.array([[10.0],
                                    [10.0],
                                    [11.0]]),
        })

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])

        def get_expected_output(expected_views, values, asset_info):
            return pd.DataFrame(
                list(concatv(*([value] * nassets for value in values))),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            expected_output = get_expected_output(
                expected_views,
                [10, 11],
                simple_asset_info,
            )
            self._run_pipeline(
                expr,
                deltas,
                None,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
|
|
|
|
test_checkpoints_dates = pd.date_range('2013-12-31', '2014-01-04')
|
|
test_checkpoints_expected_view_date = pd.Timestamp('2014-01-03')
|
|
|
|
def _test_checkpoints_macro(self, checkpoints, ffilled_value=-1.0):
|
|
"""Simple checkpoints test that accepts a checkpoints dataframe and
|
|
the expected value for 2014-01-03 for macro datasets.
|
|
|
|
The underlying data has value -1.0 on 2014-01-01 and 1.0 on 2014-01-04.
|
|
|
|
Parameters
|
|
----------
|
|
checkpoints : pd.DataFrame
|
|
The checkpoints data.
|
|
ffilled_value : float, optional
|
|
The value to be read on the third, if not provided, it will be the
|
|
value in the base data that will be naturally ffilled there.
|
|
"""
|
|
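        # Background for the assertions below: checkpoints give the loader
        # known snapshot values at particular points in time, so it can
        # start forward-filling from a snapshot instead of replaying the
        # entire baseline history before the query window.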
        dates = self.test_checkpoints_dates[[1, -1]]
        asof_dates = dates - pd.Timedelta(days=1)
        timestamps = asof_dates + pd.Timedelta(hours=23)
        baseline = pd.DataFrame({
            'value': [-1.0, 1.0],
            'asof_date': asof_dates,
            'timestamp': timestamps,
        })

        nassets = len(simple_asset_info)
        expected_views = keymap(lambda t: t.tz_localize('UTC'), {
            self.test_checkpoints_expected_view_date: (
                np.array([[ffilled_value]])
            ),
            self.test_checkpoints_dates[-1]: np.array([[1.0]]),
        })

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([ffilled_value] * nassets, [1.0] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(simple_asset_info.index),
                )),
                columns=('value',),
            )

            self._run_pipeline(
                bz.data(baseline, name='expr', dshape=self.macro_dshape),
                None,
                bz.data(
                    checkpoints,
                    name='expr_checkpoints',
                    dshape=self.macro_dshape,
                ),
                expected_views,
                expected_output,
                finder,
                calendar=pd.date_range('2014-01-01', '2014-01-04'),
                start=pd.Timestamp('2014-01-03'),
                end=dates[-1],
                window_length=1,
                compute_fn=op.itemgetter(-1),
            )

    @parameter_space(checkpoints_ts_fuzz_minutes=range(-5, 5))
    def test_checkpoints_macro(self, checkpoints_ts_fuzz_minutes):
        ffilled_value = 0.0

        checkpoints_ts = (
            self.test_checkpoints_expected_view_date -
            pd.Timedelta(days=1)
        )
        checkpoints = pd.DataFrame({
            'value': [ffilled_value],
            'asof_date': checkpoints_ts,
            'timestamp': (
                checkpoints_ts +
                # Fuzz the checkpoints timestamp a little so that it doesn't
                # align with the data query time. This should not affect the
                # correctness of the output.
                pd.Timedelta(minutes=checkpoints_ts_fuzz_minutes)
            ),
        })
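        # Worked example: checkpoints_ts is 2014-01-02 00:00, so a fuzz of
        # -5 minutes gives a timestamp of 2014-01-01 23:55 and a fuzz of +4
        # gives 2014-01-02 00:04. Either way the checkpoint is knowable
        # before the 2014-01-03 session, so the expected view is unchanged.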

        self._test_checkpoints_macro(checkpoints, ffilled_value)

    def test_empty_checkpoints_macro(self):
        empty_checkpoints = pd.DataFrame({
            'value': [],
            'asof_date': [],
            'timestamp': [],
        })

        self._test_checkpoints_macro(empty_checkpoints)

    def test_checkpoints_out_of_bounds_macro(self):
        # Provide two checkpoints, one before the data in the base table
        # and one after; neither should affect the value on 2014-01-03.
        asof_dates = self.test_checkpoints_dates[[0, -1]]
        out_of_bounds = pd.DataFrame({
            'value': [-2, 2],
            'asof_date': asof_dates,
            'timestamp': asof_dates + pd.Timedelta(hours=23),
        })

        # Add a single checkpoint on the query day with a timestamp of
        # exactly the data query time. This should not get pulled in to
        # overwrite the expected data on the 3rd.
        exact_query_time = pd.DataFrame({
            'value': [1],
            'asof_date': [
                self.test_checkpoints_expected_view_date -
                pd.Timedelta(days=1)
            ],
            'timestamp': [self.test_checkpoints_expected_view_date],
        })
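        # The expectation here is that a checkpoint stamped exactly at the
        # query cutoff is not yet observable on that day, so the baseline's
        # -1.0 is still the value forward-filled to 2014-01-03.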

        self._test_checkpoints_macro(
            pd.concat([out_of_bounds, exact_query_time]),
        )

    def _test_checkpoints(self, checkpoints, ffilled_values=None):
        """Simple checkpoints test that accepts a checkpoints dataframe and
        the expected values for 2014-01-03.

        The underlying data gives each sid the value sid + 1 on 2014-01-04;
        on 2014-01-01 those same values appear in reversed sid order.

        Parameters
        ----------
        checkpoints : pd.DataFrame
            The checkpoints data.
        ffilled_values : iterable of float, optional
            The values expected on 2014-01-03. If not provided, these are
            the values in the base data that would naturally be
            forward-filled there.
        """
        nassets = len(simple_asset_info)

        dates = self.test_checkpoints_dates[[1, -1]]

        asof_dates = dates - pd.Timedelta(days=1)
        asof_dates_repeated = np.tile(asof_dates, nassets)
        timestamps = asof_dates + pd.Timedelta(hours=23)
        timestamps_repeated = np.tile(timestamps, nassets)

        values = simple_asset_info.index.values + 1
        values = np.hstack((values[::-1], values))
        baseline = pd.DataFrame({
            'sid': np.tile(simple_asset_info.index, 2),
            'value': values,
            'asof_date': asof_dates_repeated,
            'timestamp': timestamps_repeated,
        })
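        # Concretely: ``simple_asset_info`` uses sids 65, 66, and 67, so
        # ``values`` is [68, 67, 66, 66, 67, 68]. On the earlier asof date
        # sid 65 reads 68, 66 reads 67, and 67 reads 66 (reversed), while
        # on the later asof date each sid reads sid + 1.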

        if ffilled_values is None:
            ffilled_values = baseline.value.iloc[:nassets]

        updated_values = baseline.value.iloc[nassets:]

        expected_views = keymap(partial(pd.Timestamp, tz='UTC'), {
            self.test_checkpoints_expected_view_date: [ffilled_values],
            self.test_checkpoints_dates[-1]: [updated_values],
        })

        with tmp_asset_finder(equities=simple_asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv(ffilled_values, updated_values)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(simple_asset_info.index),
                )),
                columns=('value',),
            )

            self._run_pipeline(
                bz.data(baseline, name='expr', dshape=self.value_dshape),
                None,
                bz.data(
                    checkpoints,
                    name='expr_checkpoints',
                    dshape=self.value_dshape,
                ),
                expected_views,
                expected_output,
                finder,
                calendar=pd.date_range('2014-01-01', '2014-01-04'),
                start=pd.Timestamp('2014-01-03'),
                end=dates[-1],
                window_length=1,
                compute_fn=op.itemgetter(-1),
            )

    @parameter_space(checkpoints_ts_fuzz_minutes=range(-5, 5))
    def test_checkpoints(self, checkpoints_ts_fuzz_minutes):
        nassets = len(simple_asset_info)
        ffilled_values = (np.arange(nassets, dtype=np.float64) + 1) * 10
        dates = pd.Index([pd.Timestamp('2014-01-01')] * nassets)
        checkpoints = pd.DataFrame({
            'sid': simple_asset_info.index,
            'value': ffilled_values,
            'asof_date': dates,
            'timestamp': (
                dates +
                # Fuzz the checkpoints timestamp a little so that it doesn't
                # align with the data query time. This should not affect the
                # correctness of the output.
                pd.Timedelta(days=1, minutes=checkpoints_ts_fuzz_minutes)
            ),
        })
        self._test_checkpoints(checkpoints, ffilled_values)

    def test_empty_checkpoints(self):
        checkpoints = pd.DataFrame({
            'sid': [],
            'value': [],
            'asof_date': [],
            'timestamp': [],
        })

        self._test_checkpoints(checkpoints)

    def test_checkpoints_out_of_bounds(self):
        nassets = len(simple_asset_info)
        # Provide two sets of checkpoints, one before the data in the base
        # table and one after; neither should affect the value on
        # 2014-01-03.
        asof_dates = self.test_checkpoints_dates[[0, -1]]
        asof_dates_repeated = np.tile(asof_dates, nassets)
        ffilled_values = (np.arange(nassets) + 2) * 10
        ffilled_values = np.hstack((ffilled_values[::-1], ffilled_values))
        out_of_bounds = pd.DataFrame({
            'sid': np.tile(simple_asset_info.index, 2),
            'value': ffilled_values,
            'asof_date': asof_dates_repeated,
            'timestamp': asof_dates_repeated + pd.Timedelta(hours=23),
        })

        # Add a single checkpoint on the query day with a timestamp of
        # exactly the data query time. This should not get pulled in to
        # overwrite the expected data on the 3rd.
        exact_query_time = pd.DataFrame({
            'sid': simple_asset_info.index,
            'value': simple_asset_info.index + 1,
            'asof_date': (
                self.test_checkpoints_expected_view_date -
                pd.Timedelta(days=1)
            ),
            'timestamp': self.test_checkpoints_expected_view_date,
        })

        self._test_checkpoints(pd.concat([out_of_bounds, exact_query_time]))

    def test_id_take_last_in_group_sorted(self):
        """
        input:
        asof_date   timestamp      other  value
        2014-01-03  2014-01-04 00      3      3
        2014-01-02  2014-01-04 00      2      2

        output (expected):

                    other  value
        2014-01-02    NaN    NaN
        2014-01-03    NaN    NaN
        2014-01-06      3      3
        """

        dates = pd.DatetimeIndex([
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            pd.Timestamp('2014-01-06'),
        ]).tz_localize('UTC')

        T = pd.Timestamp
        df = pd.DataFrame(
            columns=['asof_date', 'timestamp', 'other', 'value'],
            data=[
                # The asof dates are given in reverse order so that if we
                # don't sort on asof_date before taking the last row in
                # each group, we will get the wrong result.
                [T('2014-01-03'), T('2014-01-04 00'), 3, 3],
                [T('2014-01-02'), T('2014-01-04 00'), 2, 2],
            ],
        )
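        # Both rows share the 2014-01-04 00:00 timestamp, so they become
        # knowable on the same day and form a single group; only the row
        # with the greatest asof_date (2014-01-03, value 3) should survive,
        # and it first shows up on 2014-01-06, the next date in ``dates``
        # after the timestamp.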
        fields = OrderedDict(self.macro_dshape.measure.fields)
        fields['other'] = fields['value']
        expected = pd.DataFrame(
            data=[[np.nan, np.nan],  # 2014-01-02
                  [np.nan, np.nan],  # 2014-01-03
                  [3, 3]],           # 2014-01-06
            columns=['other', 'value'],
            index=dates,
        )
        self._test_id_macro(
            df,
            var * Record(fields),
            expected,
            self.asset_finder,
            ('other', 'value'),
            dates=dates,
        )


class MiscTestCase(ZiplineTestCase):
    def test_exprdata_repr(self):
        strd = set()

        class BadRepr(object):
            """A class which cannot be repr'd.
            """
            def __init__(self, name):
                self._name = name

            def __repr__(self):  # pragma: no cover
                raise AssertionError('ayy')

            def __str__(self):
                strd.add(self)
                return self._name

        assert_equal(
            repr(ExprData(
                expr=BadRepr('expr'),
                deltas=BadRepr('deltas'),
                checkpoints=BadRepr('checkpoints'),
                odo_kwargs={'a': 'b'},
            )),
            "ExprData(expr=expr, deltas=deltas,"
            " checkpoints=checkpoints, odo_kwargs={'a': 'b'})",
        )

    def test_exprdata_eq(self):
        dshape = 'var * {sid: int64, asof_date: datetime, value: float64}'
        base_expr = bz.symbol('base', dshape)
        checkpoints_expr = bz.symbol('checkpoints', dshape)

        # use a nested dict to emulate real call sites
        odo_kwargs = {'a': {'c': 1, 'd': 2}, 'b': {'e': 3, 'f': 4}}

        actual = ExprData(
            expr=base_expr,
            deltas=None,
            checkpoints=checkpoints_expr,
            odo_kwargs=odo_kwargs,
        )
        same = ExprData(
            expr=base_expr,
            deltas=None,
            checkpoints=checkpoints_expr,
            odo_kwargs=odo_kwargs,
        )
        self.assertEqual(actual, same)
        self.assertEqual(hash(actual), hash(same))

        different_obs = [
            actual.replace(expr=bz.symbol('not base', dshape)),
            actual.replace(deltas=bz.symbol('not deltas', dshape)),
            actual.replace(checkpoints=bz.symbol('not checkpoints', dshape)),
            actual.replace(checkpoints=None),
            actual.replace(odo_kwargs={
                # invert the leaf values
                ok: {ik: ~iv for ik, iv in ov.items()}
                for ok, ov in odo_kwargs.items()
            }),
        ]
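        # Each entry above differs from ``actual`` in exactly one field.
        # The last one relies on bitwise NOT mapping each int leaf to a new
        # value (e.g. ~1 == -2), so the odo_kwargs variant is guaranteed to
        # compare unequal.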

        for different in different_obs:
            self.assertNotEqual(actual, different)

        actual_with_none_odo_kwargs = actual.replace(odo_kwargs=None)
        same_with_none_odo_kwargs = same.replace(odo_kwargs=None)

        self.assertEqual(
            actual_with_none_odo_kwargs,
            same_with_none_odo_kwargs,
        )
        self.assertEqual(
            hash(actual_with_none_odo_kwargs),
            hash(same_with_none_odo_kwargs),
        )

    def test_blaze_loader_lookup_failure(self):
        class D(DataSet):
            c = Column(dtype='float64')

        with self.assertRaises(KeyError) as e:
            BlazeLoader()(D.c)
        assert_equal(str(e.exception), 'D.c::float64')