Add support for MinMax scaling (#2470)

2026-05-14 20:48:08 +00:00 · 2023-09-15 00:51:16 +02:00 · 2023-09-15 00:51:16 +02:00 · 415bb6ef97
commit 415bb6ef97
parent 2ac9e8fa76
5 changed files with 187 additions and 18 deletions
--- a/python/prophet/forecaster.py
+++ b/python/prophet/forecaster.py
@ -94,7 +94,8 @@ class Prophet(object):
            mcmc_samples=0,
            interval_width=0.80,
            uncertainty_samples=1000,
-            stan_backend=None
+            stan_backend=None,
+            scaling: str = 'absmax',
    ):
        self.growth = growth

@ -121,9 +122,13 @@ class Prophet(object):
        self.mcmc_samples = mcmc_samples
        self.interval_width = interval_width
        self.uncertainty_samples = uncertainty_samples
+        if scaling not in ("absmax", "minmax"):
+            raise ValueError("scaling must be one of 'absmax' or 'minmax'")
+        self.scaling = scaling

        # Set during fitting or by other methods
        self.start = None
+        self.y_min = None
        self.y_scale = None
        self.logistic_floor = False
        self.t_scale = None
@ -313,7 +318,10 @@ class Prophet(object):
            if 'floor' not in df:
                raise ValueError('Expected column "floor".')
        else:
-            df['floor'] = 0
+            if self.scaling == "absmax":
+                df['floor'] = 0.
+            elif self.scaling == "minmax":
+                df['floor'] = self.y_min
        if self.growth == 'logistic':
            if 'cap' not in df:
                raise ValueError(
@ -346,14 +354,25 @@ class Prophet(object):
        """
        if not initialize_scales:
            return
+
        if self.growth == 'logistic' and 'floor' in df:
            self.logistic_floor = True
-            floor = df['floor']
+            if self.scaling == "absmax":
+                self.y_min = float((df['y'] - df['floor']).abs().min())
+                self.y_scale = float((df['y'] - df['floor']).abs().max())
+            elif self.scaling == "minmax":
+                self.y_min = df['floor'].min()
+                self.y_scale = float(df['cap'].max() - self.y_min)
        else:
-            floor = 0.
-        self.y_scale = float((df['y'] - floor).abs().max())
+            if self.scaling == "absmax":
+                self.y_min = 0.
+                self.y_scale = float((df['y']).abs().max())
+            elif self.scaling == "minmax":
+                self.y_min = df['y'].min()
+                self.y_scale =  float(df['y'].max() - self.y_min)
        if self.y_scale == 0:
            self.y_scale = 1.0
+
        self.start = df['ds'].min()
        self.t_scale = df['ds'].max() - self.start
        for name, props in self.extra_regressors.items():
--- a/python/prophet/serialize.py
+++ b/python/prophet/serialize.py
@ -27,7 +27,7 @@ SIMPLE_ATTRIBUTES = [
    'yearly_seasonality', 'weekly_seasonality', 'daily_seasonality',
    'seasonality_mode', 'seasonality_prior_scale', 'changepoint_prior_scale',
    'holidays_prior_scale', 'mcmc_samples', 'interval_width', 'uncertainty_samples',
-    'y_scale', 'logistic_floor', 'country_holidays', 'component_modes'
+    'y_scale', 'y_min', 'scaling', 'logistic_floor', 'country_holidays', 'component_modes'
 ]

 PD_SERIES = ['changepoints', 'history_dates', 'train_holiday_names']
@ -129,6 +129,13 @@ def model_to_json(model):
    return json.dumps(model_json)


+def _handle_simple_attributes_backwards_compat(model_dict):
+    """Fill in, in place, the SIMPLE_ATTRIBUTES keys that older serialized models lack."""
+    if 'scaling' in model_dict:
+        return
+    # prophet<=1.1.4 predates the scaling parameters introduced in #2470
+    model_dict.update(scaling='absmax', y_min=0.)
+
 def model_from_dict(model_dict):
    """Recreate a Prophet model from a dictionary.

@ -144,6 +151,7 @@ def model_from_dict(model_dict):
    """
    model = Prophet()  # We will overwrite all attributes set in init anyway
    # Simple types
+    _handle_simple_attributes_backwards_compat(model_dict)
    for attribute in SIMPLE_ATTRIBUTES:
        setattr(model, attribute, model_dict[attribute])
    for attribute in PD_SERIES:
--- a/python/prophet/tests/conftest.py
+++ b/python/prophet/tests/conftest.py
@ -16,6 +16,12 @@ def subdaily_univariate_ts() -> pd.DataFrame:
    return pd.read_csv(Path(__file__).parent / "data2.csv", parse_dates=["ds"])


+@pytest.fixture(scope="package")
+def large_numbers_ts() -> pd.DataFrame:
+    """Series from data3.csv whose y values are all near 6.2e8, used to exercise scaling."""
+    return pd.read_csv(Path(__file__).with_name("data3.csv"), parse_dates=["ds"])
+
+
 def pytest_configure(config):
    config.addinivalue_line("markers", "slow: mark tests as slow (include in run with --test-slow)")

--- a/python/prophet/tests/data3.csv
+++ b/python/prophet/tests/data3.csv
@ -0,0 +1,71 @@
+ds,y
+2023-03-02,623031970.0
+2023-03-06,623032040.0
+2023-03-07,623032054.0
+2023-03-08,623032091.0
+2023-03-09,623032123.0
+2023-03-10,623032152.0
+2023-03-11,623032177.0
+2023-03-12,623032184.0
+2023-03-13,623032193.0
+2023-03-16,623032296.0
+2023-03-17,623032316.0
+2023-03-18,623032328.0
+2023-03-19,623032339.0
+2023-03-20,623032352.0
+2023-03-21,623032385.0
+2023-03-22,623032410.0
+2023-03-23,623032427.0
+2023-03-25,623032479.0
+2023-03-26,623032496.0
+2023-03-27,623032506.0
+2023-03-28,623032533.0
+2023-03-29,623032598.0
+2023-03-30,623032643.0
+2023-03-31,623032681.0
+2023-04-01,623032727.0
+2023-04-02,623032756.0
+2023-04-03,623032767.0
+2023-04-04,623032799.0
+2023-04-05,623032843.0
+2023-04-06,623032890.0
+2023-04-07,623032934.0
+2023-04-08,623032954.0
+2023-04-09,623032959.0
+2023-04-10,623032964.0
+2023-04-11,623032997.0
+2023-04-12,623033041.0
+2023-04-13,623033062.0
+2023-04-14,623033095.0
+2023-04-15,623033122.0
+2023-04-16,623033163.0
+2023-04-17,623033190.0
+2023-04-18,623033227.0
+2023-04-19,623033258.0
+2023-04-20,623033294.0
+2023-04-21,623033329.0
+2023-04-22,623033361.0
+2023-04-23,623033385.0
+2023-04-24,623033397.0
+2023-04-25,623033419.0
+2023-04-26,623033440.0
+2023-04-27,623033482.0
+2023-04-28,623033535.0
+2023-04-29,623033575.0
+2023-04-30,623033600.0
+2023-05-01,623033610.0
+2023-05-02,623033632.0
+2023-05-03,623033666.0
+2023-05-04,623033704.0
+2023-05-05,623033714.0
+2023-05-06,623033752.0
+2023-05-07,623033760.0
+2023-05-08,623033769.0
+2023-05-09,623033784.0
+2023-05-10,623033823.0
+2023-05-11,623033853.0
+2023-05-12,623034010.0
+2023-05-13,623034041.0
+2023-05-14,623034060.0
+2023-05-15,623034068.0
+2023-05-16,623034084.0
--- a/python/prophet/tests/test_prophet.py
+++ b/python/prophet/tests/test_prophet.py
@ -22,29 +22,53 @@ def rmse(predictions, targets) -> float:


 class TestProphetFitPredictDefault:
-    def test_fit_predict(self, daily_univariate_ts, backend):
+    @pytest.mark.parametrize(
+        "scaling,expected",
+        [("absmax", 10.64), ("minmax", 11.13)],
+        ids=["absmax", "minmax"]
+    )
+    def test_fit_predict(self, daily_univariate_ts, backend, scaling, expected):
        test_days = 30
        train, test = train_test_split(daily_univariate_ts, test_days)
-        forecaster = Prophet(stan_backend=backend)
+        forecaster = Prophet(stan_backend=backend, scaling=scaling)
        forecaster.fit(train, seed=1237861298)
        np.random.seed(876543987)
        future = forecaster.make_future_dataframe(test_days, include_history=False)
        future = forecaster.predict(future)
        res = rmse(future["yhat"], test["y"])
-        # this gives ~ 10.64
-        assert 15 > res > 5, "backend: {}".format(forecaster.stan_backend)
+        assert res == pytest.approx(expected, 0.02), "backend: {}".format(forecaster.stan_backend)

-    def test_fit_predict_newton(self, daily_univariate_ts, backend):
+    @pytest.mark.parametrize(
+        "scaling,expected",
+        [("absmax", 23.44), ("minmax", 11.29)],
+        ids=["absmax", "minmax"]
+    )
+    def test_fit_predict_newton(self, daily_univariate_ts, backend, scaling, expected):
        test_days = 30
        train, test = train_test_split(daily_univariate_ts, test_days)
-        forecaster = Prophet(stan_backend=backend)
+        forecaster = Prophet(stan_backend=backend, scaling=scaling)
        forecaster.fit(train, algorithm="Newton", seed=1237861298)
        np.random.seed(876543987)
        future = forecaster.make_future_dataframe(test_days, include_history=False)
        future = forecaster.predict(future)
-        # this gives ~ 10.64
        res = rmse(future["yhat"], test["y"])
-        assert res == pytest.approx(23.44, 0.01), "backend: {}".format(forecaster.stan_backend)
+        assert res == pytest.approx(expected, 0.01), "backend: {}".format(forecaster.stan_backend)
+
+    @pytest.mark.parametrize(
+        "scaling,expected",
+        [("absmax", 127.01), ("minmax", 93.45)],
+        ids=["absmax", "minmax"]
+    )
+    def test_fit_predict_large_numbers(self, large_numbers_ts, backend, scaling, expected):
+        """Check the hold-out RMSE on the large-valued series for each scaling mode."""
+        horizon = 30
+        train, test = train_test_split(large_numbers_ts, horizon)
+        forecaster = Prophet(stan_backend=backend, scaling=scaling)
+        forecaster.fit(train, seed=1237861298)
+        np.random.seed(876543987)  # fix NumPy's global RNG before predicting
+        forecast = forecaster.predict(forecaster.make_future_dataframe(horizon, include_history=False))
+        error = rmse(forecast["yhat"], test["y"])
+        assert error == pytest.approx(expected, rel=0.01), "backend: {}".format(forecaster.stan_backend)

    @pytest.mark.slow
    def test_fit_predict_sampling(self, daily_univariate_ts, backend):
@ -185,6 +209,26 @@ class TestProphetDataPrep:
        m2.fit(train)
        assert m2.history["y_scaled"][0] == pytest.approx(1.0, 0.01)

+    def test_logistic_floor_minmax(self, daily_univariate_ts, backend):
+        """Test minmax scaling of y under logistic growth with an explicit floor and cap."""
+        train, _ = train_test_split(daily_univariate_ts, daily_univariate_ts.shape[0] // 2)
+        train["floor"] = 10.0
+        train["cap"] = 80.0
+        m = Prophet(growth="logistic", stan_backend=backend, scaling="minmax")
+        m.fit(train)
+        assert m.logistic_floor
+        assert "floor" in m.history
+        assert m.history["y_scaled"].min() > 0.0
+        assert m.history["y_scaled"].max() < 1.0
+        for col in ["y", "floor", "cap"]:
+            train[col] += 10.0
+        m2 = Prophet(growth="logistic", stan_backend=backend, scaling="minmax")
+        m2.fit(train)
+        assert m2.history["y_scaled"].min() > 0.0
+        assert m2.history["y_scaled"].max() < 1.0
+        # Shifting y, floor and cap together must leave y_scaled unchanged up to float rounding
+        assert m2.history["y_scaled"].mean() == pytest.approx(m.history["y_scaled"].mean())
+
    def test_make_future_dataframe(self, daily_univariate_ts, backend):
        train = daily_univariate_ts.head(468 // 2)
        forecaster = Prophet(stan_backend=backend)
@ -225,8 +269,28 @@ class TestProphetTrendComponent:
        assert k == 0
        assert m == pytest.approx(0.49335657, abs=1e-4)

-    def test_flat_growth(self, backend):
-        m = Prophet(growth="flat", stan_backend=backend)
+    def test_growth_init_minmax(self, daily_univariate_ts, backend):
+        """Check the initial (k, m) from each growth initializer on minmax-scaled history."""
+        train = daily_univariate_ts.iloc[:468].copy()
+        train["cap"] = train["y"].max()
+        model = Prophet(growth="logistic", stan_backend=backend, scaling="minmax")
+        scaled = model.setup_dataframe(train, initialize_scales=True)
+
+        k_init, m_init = model.linear_growth_init(scaled)
+        assert k_init == pytest.approx(0.4053406)
+        assert m_init == pytest.approx(0.3775322)
+
+        k_init, m_init = model.logistic_growth_init(scaled)
+        assert k_init == pytest.approx(1.782523, abs=1e-4)
+        assert m_init == pytest.approx(0.280521, abs=1e-4)
+
+        k_init, m_init = model.flat_growth_init(scaled)
+        assert k_init == 0
+        assert m_init == pytest.approx(0.32792770, abs=1e-4)
+
+    @pytest.mark.parametrize("scaling",["absmax","minmax"])
+    def test_flat_growth(self, backend, scaling):
+        m = Prophet(growth="flat", stan_backend=backend, scaling=scaling)
        x = np.linspace(0, 2 * np.pi, 8 * 7)
        history = pd.DataFrame(
            {
@ -240,8 +304,8 @@ class TestProphetTrendComponent:
        m_ = m.params["m"][0, 0]
        k = m.params["k"][0, 0]
        assert k == pytest.approx(0.0)
-        assert fcst["trend"].unique()[0] == pytest.approx(m_ * m.y_scale)
-        assert np.round(m_ * m.y_scale) == 30.0
+        assert fcst["trend"].unique()[0] == pytest.approx((m_ * m.y_scale) + m.y_min)
+        assert np.round((m_ * m.y_scale) + m.y_min) == 30.0

    def test_piecewise_linear(self, backend):
        model = Prophet(stan_backend=backend)
@ -791,6 +855,7 @@ class TestProphetHolidays:
        assert sum(fcst["special_day"] == 0) == 575


+
 class TestProphetRegressors:
    def test_added_regressors(self, daily_univariate_ts, backend):
        m = Prophet(stan_backend=backend)