Add support for MinMax scaling (#2470)

This commit is contained in:
Yasir Ekinci 2023-09-15 00:51:16 +02:00 committed by GitHub
parent 2ac9e8fa76
commit 415bb6ef97
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 187 additions and 18 deletions

View file

@ -94,7 +94,8 @@ class Prophet(object):
mcmc_samples=0,
interval_width=0.80,
uncertainty_samples=1000,
stan_backend=None
stan_backend=None,
scaling: str = 'absmax',
):
self.growth = growth
@ -121,9 +122,13 @@ class Prophet(object):
self.mcmc_samples = mcmc_samples
self.interval_width = interval_width
self.uncertainty_samples = uncertainty_samples
if scaling not in ("absmax", "minmax"):
raise ValueError("scaling must be one of 'absmax' or 'minmax'")
self.scaling = scaling
# Set during fitting or by other methods
self.start = None
self.y_min = None
self.y_scale = None
self.logistic_floor = False
self.t_scale = None
@ -313,7 +318,10 @@ class Prophet(object):
if 'floor' not in df:
raise ValueError('Expected column "floor".')
else:
df['floor'] = 0
if self.scaling == "absmax":
df['floor'] = 0.
elif self.scaling == "minmax":
df['floor'] = self.y_min
if self.growth == 'logistic':
if 'cap' not in df:
raise ValueError(
@ -346,14 +354,25 @@ class Prophet(object):
"""
if not initialize_scales:
return
if self.growth == 'logistic' and 'floor' in df:
self.logistic_floor = True
floor = df['floor']
if self.scaling == "absmax":
self.y_min = float((df['y'] - df['floor']).abs().min())
self.y_scale = float((df['y'] - df['floor']).abs().max())
elif self.scaling == "minmax":
self.y_min = df['floor'].min()
self.y_scale = float(df['cap'].max() - self.y_min)
else:
floor = 0.
self.y_scale = float((df['y'] - floor).abs().max())
if self.scaling == "absmax":
self.y_min = 0.
self.y_scale = float((df['y']).abs().max())
elif self.scaling == "minmax":
self.y_min = df['y'].min()
self.y_scale = float(df['y'].max() - self.y_min)
if self.y_scale == 0:
self.y_scale = 1.0
self.start = df['ds'].min()
self.t_scale = df['ds'].max() - self.start
for name, props in self.extra_regressors.items():

View file

@ -27,7 +27,7 @@ SIMPLE_ATTRIBUTES = [
'yearly_seasonality', 'weekly_seasonality', 'daily_seasonality',
'seasonality_mode', 'seasonality_prior_scale', 'changepoint_prior_scale',
'holidays_prior_scale', 'mcmc_samples', 'interval_width', 'uncertainty_samples',
'y_scale', 'logistic_floor', 'country_holidays', 'component_modes'
'y_scale', 'y_min', 'scaling', 'logistic_floor', 'country_holidays', 'component_modes'
]
PD_SERIES = ['changepoints', 'history_dates', 'train_holiday_names']
@ -129,6 +129,13 @@ def model_to_json(model):
return json.dumps(model_json)
def _handle_simple_attributes_backwards_compat(model_dict):
"""Handle backwards compatibility for SIMPLE_ATTRIBUTES."""
# prophet<=1.1.4: handle scaling parameters introduced in #2470
if 'scaling' not in model_dict:
model_dict['scaling'] = 'absmax'
model_dict['y_min'] = 0.
def model_from_dict(model_dict):
"""Recreate a Prophet model from a dictionary.
@ -144,6 +151,7 @@ def model_from_dict(model_dict):
"""
model = Prophet() # We will overwrite all attributes set in init anyway
# Simple types
_handle_simple_attributes_backwards_compat(model_dict)
for attribute in SIMPLE_ATTRIBUTES:
setattr(model, attribute, model_dict[attribute])
for attribute in PD_SERIES:

View file

@ -16,6 +16,12 @@ def subdaily_univariate_ts() -> pd.DataFrame:
return pd.read_csv(Path(__file__).parent / "data2.csv", parse_dates=["ds"])
@pytest.fixture(scope="package")
def large_numbers_ts() -> pd.DataFrame:
    """Univariate time series with large values to test scaling"""
    # data3.csv sits next to this file; "ds" is parsed as dates on load.
    csv_path = Path(__file__).parent / "data3.csv"
    return pd.read_csv(csv_path, parse_dates=["ds"])
def pytest_configure(config):
config.addinivalue_line("markers", "slow: mark tests as slow (include in run with --test-slow)")

View file

@ -0,0 +1,71 @@
ds,y
2023-03-02,623031970.0
2023-03-06,623032040.0
2023-03-07,623032054.0
2023-03-08,623032091.0
2023-03-09,623032123.0
2023-03-10,623032152.0
2023-03-11,623032177.0
2023-03-12,623032184.0
2023-03-13,623032193.0
2023-03-16,623032296.0
2023-03-17,623032316.0
2023-03-18,623032328.0
2023-03-19,623032339.0
2023-03-20,623032352.0
2023-03-21,623032385.0
2023-03-22,623032410.0
2023-03-23,623032427.0
2023-03-25,623032479.0
2023-03-26,623032496.0
2023-03-27,623032506.0
2023-03-28,623032533.0
2023-03-29,623032598.0
2023-03-30,623032643.0
2023-03-31,623032681.0
2023-04-01,623032727.0
2023-04-02,623032756.0
2023-04-03,623032767.0
2023-04-04,623032799.0
2023-04-05,623032843.0
2023-04-06,623032890.0
2023-04-07,623032934.0
2023-04-08,623032954.0
2023-04-09,623032959.0
2023-04-10,623032964.0
2023-04-11,623032997.0
2023-04-12,623033041.0
2023-04-13,623033062.0
2023-04-14,623033095.0
2023-04-15,623033122.0
2023-04-16,623033163.0
2023-04-17,623033190.0
2023-04-18,623033227.0
2023-04-19,623033258.0
2023-04-20,623033294.0
2023-04-21,623033329.0
2023-04-22,623033361.0
2023-04-23,623033385.0
2023-04-24,623033397.0
2023-04-25,623033419.0
2023-04-26,623033440.0
2023-04-27,623033482.0
2023-04-28,623033535.0
2023-04-29,623033575.0
2023-04-30,623033600.0
2023-05-01,623033610.0
2023-05-02,623033632.0
2023-05-03,623033666.0
2023-05-04,623033704.0
2023-05-05,623033714.0
2023-05-06,623033752.0
2023-05-07,623033760.0
2023-05-08,623033769.0
2023-05-09,623033784.0
2023-05-10,623033823.0
2023-05-11,623033853.0
2023-05-12,623034010.0
2023-05-13,623034041.0
2023-05-14,623034060.0
2023-05-15,623034068.0
2023-05-16,623034084.0
1 ds y
2 2023-03-02 623031970.0
3 2023-03-06 623032040.0
4 2023-03-07 623032054.0
5 2023-03-08 623032091.0
6 2023-03-09 623032123.0
7 2023-03-10 623032152.0
8 2023-03-11 623032177.0
9 2023-03-12 623032184.0
10 2023-03-13 623032193.0
11 2023-03-16 623032296.0
12 2023-03-17 623032316.0
13 2023-03-18 623032328.0
14 2023-03-19 623032339.0
15 2023-03-20 623032352.0
16 2023-03-21 623032385.0
17 2023-03-22 623032410.0
18 2023-03-23 623032427.0
19 2023-03-25 623032479.0
20 2023-03-26 623032496.0
21 2023-03-27 623032506.0
22 2023-03-28 623032533.0
23 2023-03-29 623032598.0
24 2023-03-30 623032643.0
25 2023-03-31 623032681.0
26 2023-04-01 623032727.0
27 2023-04-02 623032756.0
28 2023-04-03 623032767.0
29 2023-04-04 623032799.0
30 2023-04-05 623032843.0
31 2023-04-06 623032890.0
32 2023-04-07 623032934.0
33 2023-04-08 623032954.0
34 2023-04-09 623032959.0
35 2023-04-10 623032964.0
36 2023-04-11 623032997.0
37 2023-04-12 623033041.0
38 2023-04-13 623033062.0
39 2023-04-14 623033095.0
40 2023-04-15 623033122.0
41 2023-04-16 623033163.0
42 2023-04-17 623033190.0
43 2023-04-18 623033227.0
44 2023-04-19 623033258.0
45 2023-04-20 623033294.0
46 2023-04-21 623033329.0
47 2023-04-22 623033361.0
48 2023-04-23 623033385.0
49 2023-04-24 623033397.0
50 2023-04-25 623033419.0
51 2023-04-26 623033440.0
52 2023-04-27 623033482.0
53 2023-04-28 623033535.0
54 2023-04-29 623033575.0
55 2023-04-30 623033600.0
56 2023-05-01 623033610.0
57 2023-05-02 623033632.0
58 2023-05-03 623033666.0
59 2023-05-04 623033704.0
60 2023-05-05 623033714.0
61 2023-05-06 623033752.0
62 2023-05-07 623033760.0
63 2023-05-08 623033769.0
64 2023-05-09 623033784.0
65 2023-05-10 623033823.0
66 2023-05-11 623033853.0
67 2023-05-12 623034010.0
68 2023-05-13 623034041.0
69 2023-05-14 623034060.0
70 2023-05-15 623034068.0
71 2023-05-16 623034084.0

View file

@ -22,29 +22,53 @@ def rmse(predictions, targets) -> float:
class TestProphetFitPredictDefault:
def test_fit_predict(self, daily_univariate_ts, backend):
@pytest.mark.parametrize(
"scaling,expected",
[("absmax", 10.64), ("minmax", 11.13)],
ids=["absmax", "minmax"]
)
def test_fit_predict(self, daily_univariate_ts, backend, scaling, expected):
test_days = 30
train, test = train_test_split(daily_univariate_ts, test_days)
forecaster = Prophet(stan_backend=backend)
forecaster = Prophet(stan_backend=backend, scaling=scaling)
forecaster.fit(train, seed=1237861298)
np.random.seed(876543987)
future = forecaster.make_future_dataframe(test_days, include_history=False)
future = forecaster.predict(future)
res = rmse(future["yhat"], test["y"])
# this gives ~ 10.64
assert 15 > res > 5, "backend: {}".format(forecaster.stan_backend)
assert res == pytest.approx(expected, 0.02), "backend: {}".format(forecaster.stan_backend)
def test_fit_predict_newton(self, daily_univariate_ts, backend):
@pytest.mark.parametrize(
"scaling,expected",
[("absmax", 23.44), ("minmax", 11.29)],
ids=["absmax", "minmax"]
)
def test_fit_predict_newton(self, daily_univariate_ts, backend, scaling, expected):
test_days = 30
train, test = train_test_split(daily_univariate_ts, test_days)
forecaster = Prophet(stan_backend=backend)
forecaster = Prophet(stan_backend=backend, scaling=scaling)
forecaster.fit(train, algorithm="Newton", seed=1237861298)
np.random.seed(876543987)
future = forecaster.make_future_dataframe(test_days, include_history=False)
future = forecaster.predict(future)
# expected RMSE comes from the parametrization (~23.44 for absmax, ~11.29 for minmax)
res = rmse(future["yhat"], test["y"])
assert res == pytest.approx(23.44, 0.01), "backend: {}".format(forecaster.stan_backend)
assert res == pytest.approx(expected, 0.01), "backend: {}".format(forecaster.stan_backend)
@pytest.mark.parametrize(
    "scaling,expected",
    [("absmax", 127.01), ("minmax", 93.45)],
    ids=["absmax", "minmax"]
)
def test_fit_predict_large_numbers(self, large_numbers_ts, backend, scaling, expected):
    """Fit on the large-valued series and check the RMSE of the forecast.

    The series is split with ``train_test_split`` using 30 test rows. The
    RMSE of ``yhat`` against the test ``y`` must match the value
    parametrized for the scaling mode, with a relative tolerance of 0.01.
    """
    horizon = 30
    history, holdout = train_test_split(large_numbers_ts, horizon)
    model = Prophet(stan_backend=backend, scaling=scaling)
    model.fit(history, seed=1237861298)
    # NumPy is seeded after fitting and before the future frame is built.
    np.random.seed(876543987)
    forecast = model.predict(
        model.make_future_dataframe(horizon, include_history=False)
    )
    error = rmse(forecast["yhat"], holdout["y"])
    assert error == pytest.approx(expected, 0.01), "backend: {}".format(model.stan_backend)
@pytest.mark.slow
def test_fit_predict_sampling(self, daily_univariate_ts, backend):
@ -185,6 +209,26 @@ class TestProphetDataPrep:
m2.fit(train)
assert m2.history["y_scaled"][0] == pytest.approx(1.0, 0.01)
def test_logistic_floor_minmax(self, daily_univariate_ts, backend):
    """Test the scaling of y with logistic growth and a floor/cap.

    With minmax scaling, ``y_scaled`` must lie strictly between 0 and 1.
    Shifting ``y``, ``floor`` and ``cap`` by the same constant must leave
    the mean of ``y_scaled`` unchanged, up to floating-point rounding.
    """
    train, _ = train_test_split(daily_univariate_ts, daily_univariate_ts.shape[0] // 2)
    train["floor"] = 10.0
    train["cap"] = 80.0
    m = Prophet(growth="logistic", stan_backend=backend, scaling="minmax")
    m.fit(train)
    assert m.logistic_floor
    assert "floor" in m.history
    assert m.history["y_scaled"].min() > 0.0
    assert m.history["y_scaled"].max() < 1.0

    # Shift all three columns by the same offset and refit a fresh model.
    for col in ["y", "floor", "cap"]:
        train[col] += 10.0
    m2 = Prophet(growth="logistic", stan_backend=backend, scaling="minmax")
    m2.fit(train)
    assert m2.history["y_scaled"].min() > 0.0
    assert m2.history["y_scaled"].max() < 1.0
    # Check that the scaling is the same. Compare approximately: adding 10.0
    # to the inputs can change the last bits of the floating-point results,
    # so exact equality of the two means is not guaranteed.
    assert m2.history["y_scaled"].mean() == pytest.approx(m.history["y_scaled"].mean())
def test_make_future_dataframe(self, daily_univariate_ts, backend):
train = daily_univariate_ts.head(468 // 2)
forecaster = Prophet(stan_backend=backend)
@ -225,8 +269,28 @@ class TestProphetTrendComponent:
assert k == 0
assert m == pytest.approx(0.49335657, abs=1e-4)
def test_flat_growth(self, backend):
m = Prophet(growth="flat", stan_backend=backend)
def test_growth_init_minmax(self, daily_univariate_ts, backend):
    """Check the initial (k, m) of each growth initializer under minmax scaling."""
    forecaster = Prophet(growth="logistic", stan_backend=backend, scaling="minmax")
    frame = daily_univariate_ts.iloc[:468].copy()
    frame["cap"] = frame["y"].max()
    scaled = forecaster.setup_dataframe(frame, initialize_scales=True)

    # Linear initialization.
    rate, offset = forecaster.linear_growth_init(scaled)
    assert rate == pytest.approx(0.4053406)
    assert offset == pytest.approx(0.3775322)

    # Logistic initialization.
    rate, offset = forecaster.logistic_growth_init(scaled)
    assert rate == pytest.approx(1.782523, abs=1e-4)
    assert offset == pytest.approx(0.280521, abs=1e-4)

    # Flat initialization: the rate is exactly zero.
    rate, offset = forecaster.flat_growth_init(scaled)
    assert rate == 0
    assert offset == pytest.approx(0.32792770, abs=1e-4)
@pytest.mark.parametrize("scaling",["absmax","minmax"])
def test_flat_growth(self, backend, scaling):
m = Prophet(growth="flat", stan_backend=backend, scaling=scaling)
x = np.linspace(0, 2 * np.pi, 8 * 7)
history = pd.DataFrame(
{
@ -240,8 +304,8 @@ class TestProphetTrendComponent:
m_ = m.params["m"][0, 0]
k = m.params["k"][0, 0]
assert k == pytest.approx(0.0)
assert fcst["trend"].unique()[0] == pytest.approx(m_ * m.y_scale)
assert np.round(m_ * m.y_scale) == 30.0
assert fcst["trend"].unique()[0] == pytest.approx((m_ * m.y_scale) + m.y_min)
assert np.round((m_ * m.y_scale) + m.y_min) == 30.0
def test_piecewise_linear(self, backend):
model = Prophet(stan_backend=backend)
@ -791,6 +855,7 @@ class TestProphetHolidays:
assert sum(fcst["special_day"] == 0) == 575
class TestProphetRegressors:
def test_added_regressors(self, daily_univariate_ts, backend):
m = Prophet(stan_backend=backend)