zipline/tests/pipeline/test_classifier.py

from functools import reduce
import operator as op

import numpy as np
import pandas as pd

from zipline.lib.labelarray import LabelArray
from zipline.pipeline import Classifier
from zipline.pipeline.data.testing import TestingDataSet
from zipline.pipeline.expression import methods_to_ops
from zipline.testing import parameter_space
from zipline.testing.fixtures import ZiplineTestCase
from zipline.testing.predicates import assert_equal
from zipline.utils.numpy_utils import (
    categorical_dtype,
    int64_dtype,
)

from .base import BaseUSEquityPipelineTestCase


bytes_dtype = np.dtype('S3')
unicode_dtype = np.dtype('U3')


class ClassifierTestCase(BaseUSEquityPipelineTestCase):

    @parameter_space(mv=[-1, 0, 1, 999])
    def test_integral_isnull(self, mv):

        class C(Classifier):
            dtype = int64_dtype
            missing_value = mv
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of missing and non-missing values.
        data = np.array([[-1,  1,  0, 2],
                         [3,   0,  1, 0],
                         [-5,  0, -1, 0],
                         [-3,  1,  2, 2]], dtype=int64_dtype)

        self.check_terms(
            terms={
                'isnull': c.isnull(),
                'notnull': c.notnull()
            },
            expected={
                'isnull': data == mv,
                'notnull': data != mv,
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(mv=['0', None])
    def test_string_isnull(self, mv):

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = mv
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of missing and non-missing values.
        raw = np.asarray(
            [['',    'a',  'ab', 'ba'],
             ['z',  'ab',   'a', 'ab'],
             ['aa', 'ab',    '', 'ab'],
             ['aa',  'a',  'ba', 'ba']],
            dtype=categorical_dtype,
        )
        data = LabelArray(raw, missing_value=mv)

        self.check_terms(
            terms={
                'isnull': c.isnull(),
                'notnull': c.notnull()
            },
            expected={
                'isnull': np.equal(raw, mv),
                'notnull': np.not_equal(raw, mv),
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(compval=[0, 1, 999])
    def test_eq(self, compval):

        class C(Classifier):
            dtype = int64_dtype
            missing_value = -1
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = np.array([[-1,  1,  0, 2],
                         [3,   0,  1, 0],
                         [-5,  0, -1, 0],
                         [-3,  1,  2, 2]], dtype=int64_dtype)

        self.check_terms(
            terms={
                'eq': c.eq(compval),
            },
            expected={
                'eq': (data == compval),
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(
        __fail_fast=True,
        compval=['a', 'ab', 'not in the array'],
        labelarray_dtype=(bytes_dtype, categorical_dtype, unicode_dtype),
    )
    def test_string_eq(self, compval, labelarray_dtype):

        compval = labelarray_dtype.type(compval)

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = ''
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['',    'a',  'ab', 'ba'],
                 ['z',  'ab',   'a', 'ab'],
                 ['aa', 'ab',    '', 'ab'],
                 ['aa',  'a',  'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value='',
        )

        self.check_terms(
            terms={
                'eq': c.eq(compval),
            },
            expected={
                'eq': (data == compval),
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(
        missing=[-1, 0, 1],
        dtype_=[int64_dtype, categorical_dtype],
    )
    def test_disallow_comparison_to_missing_value(self, missing, dtype_):
        if dtype_ == categorical_dtype:
            missing = str(missing)

        class C(Classifier):
            dtype = dtype_
            missing_value = missing
            inputs = ()
            window_length = 0

        with self.assertRaises(ValueError) as e:
            C().eq(missing)
        errmsg = str(e.exception)
        self.assertEqual(
            errmsg,
            "Comparison against self.missing_value ({v!r}) in C.eq().\n"
            "Missing values have NaN semantics, so the requested comparison"
            " would always produce False.\n"
            "Use the isnull() method to check for missing values.".format(
                v=missing,
            ),
        )

    @parameter_space(compval=[0, 1, 999], missing=[-1, 0, 999])
    def test_not_equal(self, compval, missing):

        class C(Classifier):
            dtype = int64_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = np.array([[-1,  1,  0, 2],
                         [3,   0,  1, 0],
                         [-5,  0, -1, 0],
                         [-3,  1,  2, 2]], dtype=int64_dtype)

        self.check_terms(
            terms={
                'ne': c != compval,
            },
            expected={
                'ne': (data != compval) & (data != C.missing_value),
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(
        __fail_fast=True,
        compval=['a', 'ab', '', 'not in the array'],
        missing=['a', 'ab', '', 'not in the array'],
        labelarray_dtype=(bytes_dtype, unicode_dtype, categorical_dtype),
    )
    def test_string_not_equal(self, compval, missing, labelarray_dtype):

        compval = labelarray_dtype.type(compval)

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['',    'a',  'ab', 'ba'],
                 ['z',  'ab',   'a', 'ab'],
                 ['aa', 'ab',    '', 'ab'],
                 ['aa',  'a',  'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        expected = (
            (data.as_int_array() != data.reverse_categories.get(compval, -1)) &
            (data.as_int_array() != data.reverse_categories[C.missing_value])
        )

        self.check_terms(
            terms={
                'ne': c != compval,
            },
            expected={
                'ne': expected,
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(
        __fail_fast=True,
        compval=[u'a', u'b', u'ab', u'not in the array'],
        missing=[u'a', u'ab', u'', u'not in the array'],
        labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
    )
    def test_string_elementwise_predicates(self,
                                           compval,
                                           missing,
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode('utf-8')
            missing = missing.encode('utf-8')

            startswith_re = b'^' + compval + b'.*'
            endswith_re = b'.*' + compval + b'$'
            substring_re = b'.*' + compval + b'.*'
        else:
            startswith_re = '^' + compval + '.*'
            endswith_re = '.*' + compval + '$'
            substring_re = '.*' + compval + '.*'

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['',    'a',  'ab', 'ba'],
                 ['z',  'ab',   'a', 'ab'],
                 ['aa', 'ab',    '', 'ab'],
                 ['aa',  'a',  'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        terms = {
            'startswith': c.startswith(compval),
            'endswith': c.endswith(compval),
            'has_substring': c.has_substring(compval),
            # Equivalent filters using regex matching.
            'startswith_re': c.matches(startswith_re),
            'endswith_re': c.matches(endswith_re),
            'has_substring_re': c.matches(substring_re),
        }

        expected = {
            'startswith': (data.startswith(compval) & (data != missing)),
            'endswith': (data.endswith(compval) & (data != missing)),
            'has_substring': (data.has_substring(compval) & (data != missing)),
        }
        for key in list(expected):
            expected[key + '_re'] = expected[key]

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(
        __fail_fast=True,
        container_type=(set, list, tuple, frozenset),
        labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
    )
    def test_element_of_strings(self, container_type, labelarray_dtype):

        missing = labelarray_dtype.type("not in the array")

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        raw = np.asarray(
            [['',    'a',  'ab', 'ba'],
             ['z',  'ab',   'a', 'ab'],
             ['aa', 'ab',    '', 'ab'],
             ['aa',  'a',  'ba', 'ba']],
            dtype=labelarray_dtype,
        )
        data = LabelArray(raw, missing_value=missing)

        choices = [
            container_type(choices) for choices in [
                [],
                ['a', ''],
                ['a', 'a', 'a', 'ab', 'a'],
                set(data.reverse_categories) - {missing},
                ['random value', 'ab'],
                ['_' * i for i in range(30)],
            ]
        ]

        def make_expected(choice_set):
            return np.vectorize(choice_set.__contains__, otypes=[bool])(raw)

        terms = {str(i): c.element_of(s) for i, s in enumerate(choices)}
        expected = {str(i): make_expected(s) for i, s in enumerate(choices)}

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    def test_element_of_integral(self):
        """
        Element of is well-defined for integral classifiers.
        """
        class C(Classifier):
            dtype = int64_dtype
            missing_value = -1
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of missing and non-missing values.
        data = np.array([[-1,  1,  0, 2],
                         [3,   0,  1, 0],
                         [-5,  0, -1, 0],
                         [-3,  1,  2, 2]], dtype=int64_dtype)

        terms = {}
        expected = {}
        for choices in [(0,), (0, 1), (0, 1, 2)]:
            terms[str(choices)] = c.element_of(choices)
            expected[str(choices)] = reduce(
                op.or_,
                (data == elem for elem in choices),
                np.zeros_like(data, dtype=bool),
            )

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    def test_element_of_rejects_missing_value(self):
        """
        Test that element_of raises a useful error if we attempt to pass it an
        array of choices that include the classifier's missing_value.
        """
        missing = "not in the array"

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        for bad_elems in ([missing], [missing, 'random other value']):
            with self.assertRaises(ValueError) as e:
                c.element_of(bad_elems)
            errmsg = str(e.exception)
            expected = (
                "Found self.missing_value ('not in the array') in choices"
                " supplied to C.element_of().\n"
                "Missing values have NaN semantics, so the requested"
                " comparison would always produce False.\n"
                "Use the isnull() method to check for missing values.\n"
                "Received choices were {}.".format(bad_elems)
            )
            self.assertEqual(errmsg, expected)

    @parameter_space(dtype_=Classifier.ALLOWED_DTYPES)
    def test_element_of_rejects_unhashable_type(self, dtype_):

        class C(Classifier):
            dtype = dtype_
            missing_value = dtype.type('1')
            inputs = ()
            window_length = 0

        c = C()

        with self.assertRaises(TypeError) as e:
            c.element_of([{'a': 1}])

        errmsg = str(e.exception)
        expected = (
            "Expected `choices` to be an iterable of hashable values,"
            " but got [{'a': 1}] instead.\n"
            "This caused the following error: "
            "TypeError(\"unhashable type: 'dict'\",)."
        )
        self.assertEqual(errmsg, expected)

    @parameter_space(
        __fail_fast=True,
        labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
        relabel_func=[
            lambda s: str(s[0]),
            lambda s: str(len(s)),
            lambda s: str(len([c for c in s if c == 'a'])),
            lambda s: None,
        ]
    )
    def test_relabel_strings(self, relabel_func, labelarray_dtype):

        class C(Classifier):
            inputs = ()
            dtype = categorical_dtype
            missing_value = None
            window_length = 0

        c = C()

        raw = np.asarray(
            [['a', 'aa', 'aaa', 'abab'],
             ['bab', 'aba', 'aa', 'bb'],
             ['a', 'aba', 'abaa', 'abaab'],
             ['a', 'aa', 'aaa', 'aaaa']],
            dtype=labelarray_dtype,
        )
        raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw)

        data = LabelArray(raw, missing_value=None)

        terms = {
            'relabeled': c.relabel(relabel_func),
        }
        expected_results = {
            'relabeled': LabelArray(raw_relabeled, missing_value=None),
        }

        self.check_terms(
            terms,
            expected_results,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    @parameter_space(
        __fail_fast=True,
        missing_value=[None, 'M'],
    )
    def test_relabel_missing_value_interactions(self, missing_value):

        mv = missing_value

        class C(Classifier):
            inputs = ()
            dtype = categorical_dtype
            missing_value = mv
            window_length = 0

        c = C()

        def relabel_func(s):
            if s == 'B':
                return mv
            return ''.join([s, s])

        raw = np.asarray(
            [['A', 'B', 'C', mv],
             [mv, 'A', 'B', 'C'],
             ['C', mv, 'A', 'B'],
             ['B', 'C', mv, 'A']],
            dtype=categorical_dtype,
        )
        data = LabelArray(raw, missing_value=mv)

        expected_relabeled_raw = np.asarray(
            [['AA', mv, 'CC', mv],
             [mv, 'AA', mv, 'CC'],
             ['CC', mv, 'AA', mv],
             [mv, 'CC', mv, 'AA']],
            dtype=categorical_dtype,
        )

        terms = {
            'relabeled': c.relabel(relabel_func),
        }
        expected_results = {
            'relabeled': LabelArray(expected_relabeled_raw, missing_value=mv),
        }

        self.check_terms(
            terms,
            expected_results,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )

    def test_relabel_int_classifier_not_yet_supported(self):
        class C(Classifier):
            inputs = ()
            dtype = int64_dtype
            missing_value = -1
            window_length = 0

        c = C()

        with self.assertRaises(TypeError) as e:
            c.relabel(lambda x: 0 / 0)  # Function should never be called.

        result = str(e.exception)
        expected = (
            "relabel() is only defined on Classifiers producing strings "
            "but it was called on a Classifier of dtype int64."
        )
        self.assertEqual(result, expected)

    @parameter_space(
        compare_op=[op.gt, op.ge, op.le, op.lt],
        dtype_and_missing=[(int64_dtype, 0), (categorical_dtype, '')],
    )
    def test_bad_compare(self, compare_op, dtype_and_missing):
        class C(Classifier):
            inputs = ()
            window_length = 0
            dtype = dtype_and_missing[0]
            missing_value = dtype_and_missing[1]

        with self.assertRaises(TypeError) as e:
            compare_op(C(), object())

        self.assertEqual(
            str(e.exception),
            'cannot compare classifiers with %s' % (
                methods_to_ops['__%s__' % compare_op.__name__],
            ),
        )

    @parameter_space(
        dtype_and_missing=[(int64_dtype, -1), (categorical_dtype, None)],
        use_mask=[True, False],
    )
    def test_peer_count(self, dtype_and_missing, use_mask):
        class C(Classifier):
            dtype = dtype_and_missing[0]
            missing_value = dtype_and_missing[1]
            inputs = ()
            window_length = 0

        c = C()

        if dtype_and_missing[0] == int64_dtype:
            data = np.array(
                [[1, 1, -1, 2, 1, -1],
                 [2, 1, 3, 2, 2, 2],
                 [-1, 1, 10, 10, 10, -1],
                 [3, 3, 3, 3, 3, 3]],
                dtype=int64_dtype,
            )
        else:
            data = LabelArray(
                [['a', 'a', None, 'b', 'a', None],
                 ['b', 'a', 'c', 'b', 'b', 'b'],
                 [None, 'a', 'aa', 'aa', 'aa', None],
                 ['c', 'c', 'c', 'c', 'c', 'c']],
                missing_value=None,
            )

        if not use_mask:
            mask = self.build_mask(self.ones_mask(shape=data.shape))
            expected = np.array(
                [[3, 3, np.nan, 1, 3, np.nan],
                 [4, 1, 1, 4, 4, 4],
                 [np.nan, 1, 3, 3, 3, np.nan],
                 [6, 6, 6, 6, 6, 6]],
            )
        else:
            # Punch a couple holes in the mask to check that we handle the mask
            # correctly.
            mask = self.build_mask(
                np.array([[1, 1, 1, 1, 0, 1],
                          [1, 1, 1, 1, 1, 0],
                          [1, 1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1, 1]], dtype='bool')
            )
            expected = np.array(
                [[2, 2, np.nan, 1, np.nan, np.nan],
                 [3, 1, 1, 3, 3, np.nan],
                 [np.nan, 1, 3, 3, 3, np.nan],
                 [4, 4, np.nan, np.nan, 4, 4]],
            )

        terms = {
            'peer_counts': c.peer_count(),
        }
        expected_results = {
            'peer_counts': expected,
        }

        self.check_terms(
            terms=terms,
            expected=expected_results,
            initial_workspace={c: data},
            mask=mask,
        )


class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
    def test_reversability_categorical(self):
        class F(Classifier):
            inputs = ()
            window_length = 0
            dtype = categorical_dtype
            missing_value = '<missing>'

        f = F()
        column_data = LabelArray(
            np.array(
                [['a', f.missing_value],
                 ['b', f.missing_value],
                 ['c', 'd']],
            ),
            missing_value=f.missing_value,
        )

        assert_equal(
            f.postprocess(column_data.ravel()),
            pd.Categorical(
                ['a', f.missing_value, 'b', f.missing_value, 'c', 'd'],
            ),
        )

        # only include the non-missing data
        pipeline_output = pd.Series(
            data=['a', 'b', 'c', 'd'],
            index=pd.MultiIndex.from_arrays([
                [pd.Timestamp('2014-01-01'),
                 pd.Timestamp('2014-01-02'),
                 pd.Timestamp('2014-01-03'),
                 pd.Timestamp('2014-01-03')],
                [0, 0, 0, 1],
            ]),
            dtype='category',
        )

        assert_equal(
            f.to_workspace_value(pipeline_output, pd.Index([0, 1])),
            column_data,
        )

    def test_reversability_int64(self):
        class F(Classifier):
            inputs = ()
            window_length = 0
            dtype = int64_dtype
            missing_value = -1

        f = F()
        column_data = np.array(
            [[0, f.missing_value],
             [1, f.missing_value],
             [2, 3]],
        )

        assert_equal(f.postprocess(column_data.ravel()), column_data.ravel())

        # only include the non-missing data
        pipeline_output = pd.Series(
            data=[0, 1, 2, 3],
            index=pd.MultiIndex.from_arrays([
                [pd.Timestamp('2014-01-01'),
                 pd.Timestamp('2014-01-02'),
                 pd.Timestamp('2014-01-03'),
                 pd.Timestamp('2014-01-03')],
                [0, 0, 0, 1],
            ]),
            dtype=int64_dtype,
        )

        assert_equal(
            f.to_workspace_value(pipeline_output, pd.Index([0, 1])),
            column_data,
        )


class ReprTestCase(ZiplineTestCase):

    def test_quantiles_graph_repr(self):
        quantiles = TestingDataSet.float_col.latest.quantiles(5)
        self.assertEqual(quantiles.graph_repr(), "Quantiles(5)")