From 218283f157cca0f2f442208aca38900d2b8b9da1 Mon Sep 17 00:00:00 2001 From: Ben Letham Date: Wed, 30 May 2018 12:09:54 -0700 Subject: [PATCH] Simplify logic/interfaces for cross_validation, to better handle irregularly spaced data and actually respect initial window --- R/NAMESPACE | 1 - R/R/diagnostics.R | 146 +++++++++------------ R/man/cross_validation.Rd | 5 +- R/man/dyplot.prophet.Rd | 5 +- R/man/generate_cutoffs.Rd | 4 +- R/man/prophet.Rd | 19 ++- R/man/simulated_historical_forecasts.Rd | 27 ---- R/tests/testthat/test_diagnostics.R | 124 +++++++---------- python/fbprophet/diagnostics.py | 105 ++++++--------- python/fbprophet/tests/test_diagnostics.py | 128 +++++++----------- 10 files changed, 218 insertions(+), 346 deletions(-) delete mode 100644 R/man/simulated_historical_forecasts.Rd diff --git a/R/NAMESPACE b/R/NAMESPACE index 86ace30..45c5d6b 100644 --- a/R/NAMESPACE +++ b/R/NAMESPACE @@ -15,6 +15,5 @@ export(plot_forecast_component) export(predictive_samples) export(prophet) export(prophet_plot_components) -export(simulated_historical_forecasts) import(Rcpp) importFrom(dplyr,"%>%") diff --git a/R/R/diagnostics.R b/R/R/diagnostics.R index c03f4cf..632fcd2 100644 --- a/R/R/diagnostics.R +++ b/R/R/diagnostics.R @@ -11,102 +11,50 @@ globalVariables(c( #' Generate cutoff dates #' -#' @param df Dataframe with historical data -#' @param horizon timediff forecast horizon -#' @param k integer number of forecast points +#' @param df Dataframe with historical data. +#' @param horizon timediff forecast horizon. +#' @param initial timediff initial window. #' @param period timediff Simulated forecasts are done with this period. #' -#' @return Array of datetimes +#' @return Array of datetimes. #' #' @keywords internal -generate_cutoffs <- function(df, horizon, k, period) { +generate_cutoffs <- function(df, horizon, initial, period) { # Last cutoff is (latest date in data) - (horizon). cutoff <- max(df$ds) - horizon - if (cutoff < min(df$ds)) { - stop('Less data than horizon.') - } tzone <- attr(cutoff, "tzone") # Timezone is wiped by putting in array - result <- cutoff - if (k > 1) { - for (i in 2:k) { - cutoff <- cutoff - period - # If data does not exist in data range (cutoff, cutoff + horizon] - if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) { + result <- c(cutoff) + while (result[length(result)] >= min(df$ds) + initial) { + cutoff <- cutoff - period + # If data does not exist in data range (cutoff, cutoff + horizon] + if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) { # Next cutoff point is 'closest date before cutoff in data - horizon' closest.date <- max(df$ds[df$ds <= cutoff]) cutoff <- closest.date - horizon - } - if (cutoff < min(df$ds)) { - warning('Not enough data for requested number of cutoffs! Using ', i) - break - } - result <- c(result, cutoff) } + result <- c(result, cutoff) + } + result <- head(result, -1) + if (length(result) == 0) { + stop(paste( + 'Less data than horizon after initial window.', + 'Make horizon or initial shorter.' + )) } # Reset timezones attr(result, "tzone") <- tzone + message(paste( + 'Making', length(result), 'forecasts with cutoffs between', + result[length(result)], 'and', result[1] + )) return(rev(result)) } -#' Simulated historical forecasts. -#' -#' Make forecasts from k historical cutoff points, working backwards from -#' (end - horizon) with a spacing of period between each cutoff. -#' -#' @param model Fitted Prophet model. -#' @param horizon Integer size of the horizon -#' @param units String unit of the horizon, e.g., "days", "secs". -#' @param k integer number of forecast points -#' @param period Integer amount of time between cutoff dates. Same units as -#' horizon. If not provided, will use 0.5 * horizon. -#' -#' @return A dataframe with the forecast, actual value, and cutoff date. -#' -#' @export -simulated_historical_forecasts <- function(model, horizon, units, k, - period = NULL) { - df <- model$history - horizon <- as.difftime(horizon, units = units) - if (is.null(period)) { - period <- horizon / 2 - } else { - period <- as.difftime(period, units = units) - } - cutoffs <- generate_cutoffs(df, horizon, k, period) - predicts <- data.frame() - for (i in seq_along(cutoffs)) { - cutoff <- cutoffs[i] - # Copy the model - m <- prophet_copy(model, cutoff) - # Train model - history.c <- dplyr::filter(df, ds <= cutoff) - m <- fit.prophet(m, history.c) - # Calculate yhat - df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon) - # Get the columns for the future dataframe - columns <- 'ds' - if (m$growth == 'logistic') { - columns <- c(columns, 'cap') - if (m$logistic.floor) { - columns <- c(columns, 'floor') - } - } - columns <- c(columns, names(m$extra_regressors)) - future <- df[columns] - yhat <- stats::predict(m, future) - # Merge yhat, y, and cutoff. - df.c <- dplyr::inner_join(df.predict, yhat, by = "ds") - df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, yhat_upper) - df.c$cutoff <- cutoff - predicts <- rbind(predicts, df.c) - } - return(predicts) -} - #' Cross-validation for time series. #' -#' Computes forecasts from historical cutoff points. Beginning from initial, -#' makes cutoffs with a spacing of period up to (end - horizon). +#' Computes forecasts from historical cutoff points. Beginning from +#' (end - horizon), works backwards making cutoffs with a spacing of period +#' until initial is reached. #' #' When period is equal to the time interval of the data, this is the #' technique described in https://robjhyndman.com/hyndsight/tscv/ . @@ -124,8 +72,9 @@ simulated_historical_forecasts <- function(model, horizon, units, k, #' @export cross_validation <- function( model, horizon, units, period = NULL, initial = NULL) { - te <- max(model$history$ds) - ts <- min(model$history$ds) + df <- model$history + te <- max(df$ds) + ts <- min(df$ds) if (is.null(period)) { period <- 0.5 * horizon } @@ -135,14 +84,39 @@ cross_validation <- function( horizon.dt <- as.difftime(horizon, units = units) initial.dt <- as.difftime(initial, units = units) period.dt <- as.difftime(period, units = units) - k <- ceiling( - as.double((te - horizon.dt) - (ts + initial.dt), units='secs') / - as.double(period.dt, units = 'secs') - ) - if (k < 1) { - stop('Not enough data for specified horizon, period, and initial.') + + cutoffs <- generate_cutoffs(df, horizon.dt, initial.dt, period.dt) + predicts <- data.frame() + for (i in seq_along(cutoffs)) { + cutoff <- cutoffs[i] + # Copy the model + m <- prophet_copy(model, cutoff) + # Train model + history.c <- dplyr::filter(df, ds <= cutoff) + if (nrow(history.c) < 2) { + stop('Less than two datapoints before cutoff. Increase initial window.') + } + m <- fit.prophet(m, history.c) + # Calculate yhat + df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon.dt) + # Get the columns for the future dataframe + columns <- 'ds' + if (m$growth == 'logistic') { + columns <- c(columns, 'cap') + if (m$logistic.floor) { + columns <- c(columns, 'floor') + } + } + columns <- c(columns, names(m$extra_regressors)) + future <- df.predict[columns] + yhat <- stats::predict(m, future) + # Merge yhat, y, and cutoff. + df.c <- dplyr::inner_join(df.predict, yhat, by = "ds") + df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, yhat_upper) + df.c$cutoff <- cutoff + predicts <- rbind(predicts, df.c) } - return(simulated_historical_forecasts(model, horizon, units, k, period)) + return(predicts) } #' Copy Prophet object. diff --git a/R/man/cross_validation.Rd b/R/man/cross_validation.Rd index 470c1b3..9cc3f99 100644 --- a/R/man/cross_validation.Rd +++ b/R/man/cross_validation.Rd @@ -23,8 +23,9 @@ horizon. If not provided, 0.5 * horizon is used.} A dataframe with the forecast, actual value, and cutoff date. } \description{ -Computes forecasts from historical cutoff points. Beginning from initial, -makes cutoffs with a spacing of period up to (end - horizon). +Computes forecasts from historical cutoff points. Beginning from +(end - horizon), works backwards making cutoffs with a spacing of period +until initial is reached. } \details{ When period is equal to the time interval of the data, this is the diff --git a/R/man/dyplot.prophet.Rd b/R/man/dyplot.prophet.Rd index 002a7db..c3ec4f0 100644 --- a/R/man/dyplot.prophet.Rd +++ b/R/man/dyplot.prophet.Rd @@ -24,8 +24,9 @@ Plot the prophet forecast. } \examples{ \dontrun{ -history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'), - y = sin(1:366/200) + rnorm(366)/10) +history <- data.frame( + ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'), + y = sin(1:366/200) + rnorm(366)/10) m <- prophet(history) future <- make_future_dataframe(m, periods = 365) forecast <- predict(m, future) diff --git a/R/man/generate_cutoffs.Rd b/R/man/generate_cutoffs.Rd index f95aeae..de60fb1 100644 --- a/R/man/generate_cutoffs.Rd +++ b/R/man/generate_cutoffs.Rd @@ -4,14 +4,14 @@ \alias{generate_cutoffs} \title{Generate cutoff dates} \usage{ -generate_cutoffs(df, horizon, k, period) +generate_cutoffs(df, horizon, initial, period) } \arguments{ \item{df}{Dataframe with historical data} \item{horizon}{timediff forecast horizon} -\item{k}{integer number of forecast points} +\item{initial}{timediff initial window} \item{period}{timediff Simulated forecasts are done with this period.} } diff --git a/R/man/prophet.Rd b/R/man/prophet.Rd index 1f085d5..15807a7 100644 --- a/R/man/prophet.Rd +++ b/R/man/prophet.Rd @@ -5,12 +5,13 @@ \title{Prophet forecaster.} \usage{ prophet(df = NULL, growth = "linear", changepoints = NULL, - n.changepoints = 25, yearly.seasonality = "auto", - weekly.seasonality = "auto", daily.seasonality = "auto", - holidays = NULL, seasonality.mode = "additive", - seasonality.prior.scale = 10, holidays.prior.scale = 10, - changepoint.prior.scale = 0.05, mcmc.samples = 0, interval.width = 0.8, - uncertainty.samples = 1000, fit = TRUE, ...) + n.changepoints = 25, changepoint.range = 0.8, + yearly.seasonality = "auto", weekly.seasonality = "auto", + daily.seasonality = "auto", holidays = NULL, + seasonality.mode = "additive", seasonality.prior.scale = 10, + holidays.prior.scale = 10, changepoint.prior.scale = 0.05, + mcmc.samples = 0, interval.width = 0.8, uncertainty.samples = 1000, + fit = TRUE, ...) } \arguments{ \item{df}{(optional) Dataframe containing the history. Must have columns ds @@ -29,7 +30,11 @@ automatically.} \item{n.changepoints}{Number of potential changepoints to include. Not used if input `changepoints` is supplied. If `changepoints` is not supplied, then n.changepoints potential changepoints are selected uniformly from the -first 80 percent of df$ds.} +first `changepoint.range` proportion of df$ds.} + +\item{changepoint.range}{Proportion of history in which trend changepoints +will be estimated. Defaults to 0.8 for the first 80%. Not used if +`changepoints` is specified.} \item{yearly.seasonality}{Fit yearly seasonality. Can be 'auto', TRUE, FALSE, or a number of Fourier terms to generate.} diff --git a/R/man/simulated_historical_forecasts.Rd b/R/man/simulated_historical_forecasts.Rd deleted file mode 100644 index e65ff5d..0000000 --- a/R/man/simulated_historical_forecasts.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/diagnostics.R -\name{simulated_historical_forecasts} -\alias{simulated_historical_forecasts} -\title{Simulated historical forecasts.} -\usage{ -simulated_historical_forecasts(model, horizon, units, k, period = NULL) -} -\arguments{ -\item{model}{Fitted Prophet model.} - -\item{horizon}{Integer size of the horizon} - -\item{units}{String unit of the horizon, e.g., "days", "secs".} - -\item{k}{integer number of forecast points} - -\item{period}{Integer amount of time between cutoff dates. Same units as -horizon. If not provided, will use 0.5 * horizon.} -} -\value{ -A dataframe with the forecast, actual value, and cutoff date. -} -\description{ -Make forecasts from k historical cutoff points, working backwards from -(end - horizon) with a spacing of period between each cutoff. -} diff --git a/R/tests/testthat/test_diagnostics.R b/R/tests/testthat/test_diagnostics.R index 6319a42..6a10bc2 100644 --- a/R/tests/testthat/test_diagnostics.R +++ b/R/tests/testthat/test_diagnostics.R @@ -8,75 +8,6 @@ DATA_all <- read.csv('data.csv') DATA_all$ds <- as.Date(DATA_all$ds) DATA <- head(DATA_all, 100) -test_that("simulated_historical_forecasts", { - skip_if_not(Sys.getenv('R_ARCH') != '/i386') - m <- prophet(DATA) - k <- 2 - for (p in c(1, 10)) { - for (h in c(1, 3)) { - df.shf <- simulated_historical_forecasts( - m, horizon = h, units = 'days', k = k, period = p) - # All cutoff dates should be less than ds dates - expect_true(all(df.shf$cutoff < df.shf$ds)) - # The unique size of output cutoff should be equal to 'k' - expect_equal(length(unique(df.shf$cutoff)), k) - expect_equal(max(df.shf$ds - df.shf$cutoff), - as.difftime(h, units = 'days')) - dc <- diff(df.shf$cutoff) - dc <- min(dc[dc > 0]) - expect_true(dc >= as.difftime(p, units = 'days')) - # Each y in df_shf and DATA with same ds should be equal - df.merged <- dplyr::left_join(df.shf, m$history, by="ds") - expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0) - } - } -}) - -test_that("simulated_historical_forecasts_logistic", { - skip_if_not(Sys.getenv('R_ARCH') != '/i386') - df <- DATA - df$cap <- 40 - m <- prophet(df, growth='logistic') - df.shf <- simulated_historical_forecasts( - m, horizon = 3, units = 'days', k = 2, period = 3) - # All cutoff dates should be less than ds dates - expect_true(all(df.shf$cutoff < df.shf$ds)) - # The unique size of output cutoff should be equal to 'k' - expect_equal(length(unique(df.shf$cutoff)), 2) - # Each y in df_shf and DATA with same ds should be equal - df.merged <- dplyr::left_join(df.shf, m$history, by="ds") - expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0) -}) - -test_that("simulated_historical_forecasts_extra_regressors", { - skip_if_not(Sys.getenv('R_ARCH') != '/i386') - df <- DATA - df$extra <- seq(0, nrow(df) - 1) - m <- prophet() - m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5) - m <- add_regressor(m, 'extra') - m <- fit.prophet(m, df) - df.shf <- simulated_historical_forecasts( - m, horizon = 3, units = 'days', k = 2, period = 3) - # All cutoff dates should be less than ds dates - expect_true(all(df.shf$cutoff < df.shf$ds)) - # The unique size of output cutoff should be equal to 'k' - expect_equal(length(unique(df.shf$cutoff)), 2) - # Each y in df_shf and DATA with same ds should be equal - df.merged <- dplyr::left_join(df.shf, m$history, by="ds") - expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0) -}) - -test_that("simulated_historical_forecasts_default_value_check", { - skip_if_not(Sys.getenv('R_ARCH') != '/i386') - m <- prophet(DATA) - df.shf1 <- simulated_historical_forecasts( - m, horizon = 10, units = 'days', k = 1) - df.shf2 <- simulated_historical_forecasts( - m, horizon = 10, units = 'days', k = 1, period = 5) - expect_equal(sum(dplyr::select(df.shf1 - df.shf2, y, yhat)), 0) -}) - test_that("cross_validation", { skip_if_not(Sys.getenv('R_ARCH') != '/i386') m <- prophet(DATA) @@ -85,14 +16,59 @@ test_that("cross_validation", { ts <- min(DATA$ds) horizon <- as.difftime(4, units = "days") period <- as.difftime(10, units = "days") - k <- 5 + initial <- as.difftime(115, units = "days") df.cv <- cross_validation( - m, horizon = 4, units = "days", period = 10, initial = 90) - expect_equal(length(unique(df.cv$cutoff)), k) + m, horizon = 4, units = "days", period = 10, initial = 115) + expect_equal(length(unique(df.cv$cutoff)), 3) expect_equal(max(df.cv$ds - df.cv$cutoff), horizon) + expect_true(min(df.cv$cutoff) >= ts + initial) dc <- diff(df.cv$cutoff) dc <- min(dc[dc > 0]) expect_true(dc >= period) + expect_true(all(df.cv$cutoff < df.cv$ds)) + # Each y in df.cv and DATA with same ds should be equal + df.merged <- dplyr::left_join(df.cv, m$history, by="ds") + expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0) + df.cv <- cross_validation( + m, horizon = 4, units = "days", period = 10, initial = 135) + expect_equal(length(unique(df.cv$cutoff)), 1) + expect_error( + cross_validation( + m, horizon = 10, units = "days", period = 10, initial = 140) + ) +}) + +test_that("cross_validation_logistic", { + skip_if_not(Sys.getenv('R_ARCH') != '/i386') + df <- DATA + df$cap <- 40 + m <- prophet(df, growth = 'logistic') + df.cv <- cross_validation( + m, horizon = 1, units = "days", period = 1, initial = 140) + expect_equal(length(unique(df.cv$cutoff)), 2) + expect_true(all(df.cv$cutoff < df.cv$ds)) + df.merged <- dplyr::left_join(df.cv, m$history, by="ds") + expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0) +}) + +test_that("cross_validation_extra_regressors", { + skip_if_not(Sys.getenv('R_ARCH') != '/i386') + df <- DATA + df$extra <- seq(0, nrow(df) - 1) + m <- prophet() + m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5) + m <- add_regressor(m, 'extra') + m <- fit.prophet(m, df) + df.cv <- cross_validation( + m, horizon = 4, units = "days", period = 4, initial = 135) + expect_equal(length(unique(df.cv$cutoff)), 2) + period <- as.difftime(4, units = "days") + dc <- diff(df.cv$cutoff) + dc <- min(dc[dc > 0]) + expect_true(dc >= period) + expect_true(all(df.cv$cutoff < df.cv$ds)) + df.merged <- dplyr::left_join(df.cv, m$history, by="ds") + expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0) }) test_that("cross_validation_default_value_check", { @@ -116,11 +92,11 @@ test_that("performance_metrics", { sort(colnames(df_none)) == sort(c('horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse')) )) - expect_equal(nrow(df_none), 14) + expect_equal(nrow(df_none), 16) # Aggregation level 0.2 df_horizon <- performance_metrics(df_cv, rolling_window = 0.2) expect_equal(length(unique(df_horizon$horizon)), 4) - expect_equal(nrow(df_horizon), 13) + expect_equal(nrow(df_horizon), 14) # Aggregation level all df_all <- performance_metrics(df_cv, rolling_window = 1) expect_equal(nrow(df_all), 1) diff --git a/python/fbprophet/diagnostics.py b/python/fbprophet/diagnostics.py index 1a7eabf..6245bca 100644 --- a/python/fbprophet/diagnostics.py +++ b/python/fbprophet/diagnostics.py @@ -21,18 +21,15 @@ import pandas as pd logger = logging.getLogger(__name__) -def _cutoffs(df, horizon, k, period): +def generate_cutoffs(df, horizon, initial, period): """Generate cutoff dates Parameters ---------- - df: pd.DataFrame with historical data - horizon: pd.Timedelta. - Forecast horizon - k: Int number. - The number of forecasts point. - period: pd.Timedelta. - Simulated Forecast will be done at every this period. + df: pd.DataFrame with historical data. + horizon: pd.Timedelta forecast horizon. + initial: pd.Timedelta window of the initial forecast period. + period: pd.Timedelta simulated forecasts are done with this period. Returns ------- @@ -43,56 +40,70 @@ def _cutoffs(df, horizon, k, period): if cutoff < df['ds'].min(): raise ValueError('Less data than horizon.') result = [cutoff] - - for i in range(1, k): + while result[-1] >= min(df['ds']) + initial: cutoff -= period # If data does not exist in data range (cutoff, cutoff + horizon] if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()): # Next cutoff point is 'last date before cutoff in data - horizon' closest_date = df[df['ds'] <= cutoff].max()['ds'] cutoff = closest_date - horizon - if cutoff < df['ds'].min(): - logger.warning( - 'Not enough data for requested number of cutoffs! ' - 'Using {}.'.format(i)) - break result.append(cutoff) - - # Sort lines in ascending order + result = result[:-1] + if len(result) == 0: + raise ValueError( + 'Less data than horizon after initial window. ' + 'Make horizon or initial shorter.' + ) + logger.info('Making {} forecasts with cutoffs between {} and {}'.format( + len(result), result[-1], result[0] + )) return reversed(result) -def simulated_historical_forecasts(model, horizon, k, period=None): - """Simulated Historical Forecasts. +def cross_validation(model, horizon, period=None, initial=None): + """Cross-Validation for time series. - Make forecasts from k historical cutoff points, working backwards from - (end - horizon) with a spacing of period between each cutoff. + Computes forecasts from historical cutoff points. Beginning from + (end - horizon), works backwards making cutoffs with a spacing of period + until initial is reached. + + When period is equal to the time interval of the data, this is the + technique described in https://robjhyndman.com/hyndsight/tscv/ . Parameters ---------- - model: Prophet class object. - Fitted Prophet model + model: Prophet class object. Fitted Prophet model horizon: string with pd.Timedelta compatible style, e.g., '5 days', '3 hours', '10 seconds'. - k: Int number of forecasts point. - period: Optional string with pd.Timedelta compatible style. Simulated - forecast will be done at every this period. If not provided, - 0.5 * horizon is used. + period: string with pd.Timedelta compatible style. Simulated forecast will + be done at every this period. If not provided, 0.5 * horizon is used. + initial: string with pd.Timedelta compatible style. The first training + period will begin here. If not provided, 3 * horizon is used. Returns ------- A pd.DataFrame with the forecast, actual value and cutoff. """ df = model.history.copy().reset_index(drop=True) + te = df['ds'].max() + ts = df['ds'].min() horizon = pd.Timedelta(horizon) period = 0.5 * horizon if period is None else pd.Timedelta(period) - cutoffs = _cutoffs(df, horizon, k, period) + initial = 3 * horizon if initial is None else pd.Timedelta(initial) + + cutoffs = generate_cutoffs(df, horizon, initial, period) predicts = [] for cutoff in cutoffs: # Generate new object with copying fitting options m = prophet_copy(model, cutoff) # Train model - m.fit(df[df['ds'] <= cutoff]) + history_c = df[df['ds'] <= cutoff] + if history_c.shape[0] < 2: + raise Exception( + 'Less than two datapoints before cutoff. ' + 'Increase initial window.' + ) + m.fit(history_c) # Calculate yhat index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon) # Get the columns for the future dataframe @@ -113,42 +124,6 @@ def simulated_historical_forecasts(model, horizon, k, period=None): # Combine all predicted pd.DataFrame into one pd.DataFrame return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True) - -def cross_validation(model, horizon, period=None, initial=None): - """Cross-Validation for time series. - - Computes forecasts from historical cutoff points. Beginning from initial, - makes cutoffs with a spacing of period up to (end - horizon). - - When period is equal to the time interval of the data, this is the - technique described in https://robjhyndman.com/hyndsight/tscv/ . - - Parameters - ---------- - model: Prophet class object. Fitted Prophet model - horizon: string with pd.Timedelta compatible style, e.g., '5 days', - '3 hours', '10 seconds'. - period: string with pd.Timedelta compatible style. Simulated forecast will - be done at every this period. If not provided, 0.5 * horizon is used. - initial: string with pd.Timedelta compatible style. The first training - period will begin here. If not provided, 3 * horizon is used. - - Returns - ------- - A pd.DataFrame with the forecast, actual value and cutoff. - """ - te = model.history['ds'].max() - ts = model.history['ds'].min() - horizon = pd.Timedelta(horizon) - period = 0.5 * horizon if period is None else pd.Timedelta(period) - initial = 3 * horizon if initial is None else pd.Timedelta(initial) - k = int(np.ceil(((te - horizon) - (ts + initial)) / period)) - if k < 1: - raise ValueError( - 'Not enough data for specified horizon, period, and initial.') - return simulated_historical_forecasts(model, horizon, k, period) - - def prophet_copy(m, cutoff=None): """Copy Prophet object diff --git a/python/fbprophet/tests/test_diagnostics.py b/python/fbprophet/tests/test_diagnostics.py index 4941b1e..0f04faa 100644 --- a/python/fbprophet/tests/test_diagnostics.py +++ b/python/fbprophet/tests/test_diagnostics.py @@ -36,95 +36,63 @@ class TestDiagnostics(TestCase): # Use first 100 record in data.csv self.__df = DATA - def test_simulated_historical_forecasts(self): - m = Prophet() - m.fit(self.__df) - k = 2 - for p in [1, 10]: - for h in [1, 3]: - period = '{} days'.format(p) - horizon = '{} days'.format(h) - df_shf = diagnostics.simulated_historical_forecasts( - m, horizon=horizon, k=k, period=period) - # All cutoff dates should be less than ds dates - self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all()) - # The unique size of output cutoff should be equal to 'k' - self.assertEqual(len(np.unique(df_shf['cutoff'])), k) - self.assertEqual( - max(df_shf['ds'] - df_shf['cutoff']), - pd.Timedelta(horizon), - ) - dc = df_shf['cutoff'].diff() - dc = dc[dc > pd.Timedelta(0)].min() - self.assertTrue(dc >= pd.Timedelta(period)) - # Each y in df_shf and self.__df with same ds should be equal - df_merged = pd.merge(df_shf, self.__df, 'left', on='ds') - self.assertAlmostEqual( - np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0) - - def test_simulated_historical_forecasts_logistic(self): - m = Prophet(growth='logistic') - df = self.__df.copy() - df['cap'] = 40 - m.fit(df) - df_shf = diagnostics.simulated_historical_forecasts( - m, horizon='3 days', k=2, period='3 days') - # All cutoff dates should be less than ds dates - self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all()) - # The unique size of output cutoff should be equal to 'k' - self.assertEqual(len(np.unique(df_shf['cutoff'])), 2) - # Each y in df_shf and self.__df with same ds should be equal - df_merged = pd.merge(df_shf, df, 'left', on='ds') - self.assertAlmostEqual( - np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0) - - def test_simulated_historical_forecasts_extra_regressors(self): - m = Prophet() - m.add_seasonality(name='monthly', period=30.5, fourier_order=5) - m.add_regressor('extra') - df = self.__df.copy() - df['cap'] = 40 - df['extra'] = range(df.shape[0]) - m.fit(df) - df_shf = diagnostics.simulated_historical_forecasts( - m, horizon='3 days', k=2, period='3 days') - # All cutoff dates should be less than ds dates - self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all()) - # The unique size of output cutoff should be equal to 'k' - self.assertEqual(len(np.unique(df_shf['cutoff'])), 2) - # Each y in df_shf and self.__df with same ds should be equal - df_merged = pd.merge(df_shf, df, 'left', on='ds') - self.assertAlmostEqual( - np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0) - - def test_simulated_historical_forecasts_default_value_check(self): - m = Prophet() - m.fit(self.__df) - # Default value of period should be equal to 0.5 * horizon - df_shf1 = diagnostics.simulated_historical_forecasts( - m, horizon='10 days', k=1) - df_shf2 = diagnostics.simulated_historical_forecasts( - m, horizon='10 days', k=1, period='5 days') - self.assertAlmostEqual( - ((df_shf1['y'] - df_shf2['y']) ** 2).sum(), 0.0) - self.assertAlmostEqual( - ((df_shf1['yhat'] - df_shf2['yhat']) ** 2).sum(), 0.0) - def test_cross_validation(self): m = Prophet() m.fit(self.__df) # Calculate the number of cutoff points(k) horizon = pd.Timedelta('4 days') period = pd.Timedelta('10 days') - k = 5 + initial = pd.Timedelta('115 days') df_cv = diagnostics.cross_validation( - m, horizon='4 days', period='10 days', initial='90 days') - # The unique size of output cutoff should be equal to 'k' - self.assertEqual(len(np.unique(df_cv['cutoff'])), k) + m, horizon='4 days', period='10 days', initial='115 days') + self.assertEqual(len(np.unique(df_cv['cutoff'])), 3) self.assertEqual(max(df_cv['ds'] - df_cv['cutoff']), horizon) + self.assertTrue(min(df_cv['cutoff']) >= min(self.__df['ds']) + initial) dc = df_cv['cutoff'].diff() dc = dc[dc > pd.Timedelta(0)].min() self.assertTrue(dc >= period) + self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all()) + # Each y in df_cv and self.__df with same ds should be equal + df_merged = pd.merge(df_cv, self.__df, 'left', on='ds') + self.assertAlmostEqual( + np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0) + df_cv = diagnostics.cross_validation( + m, horizon='4 days', period='10 days', initial='135 days') + self.assertEqual(len(np.unique(df_cv['cutoff'])), 1) + with self.assertRaises(ValueError): + diagnostics.cross_validation( + m, horizon='10 days', period='10 days', initial='140 days') + + def test_cross_validation_logistic(self): + df = self.__df.copy() + df['cap'] = 40 + m = Prophet(growth='logistic').fit(df) + df_cv = diagnostics.cross_validation( + m, horizon='1 days', period='1 days', initial='140 days') + self.assertEqual(len(np.unique(df_cv['cutoff'])), 2) + self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all()) + df_merged = pd.merge(df_cv, self.__df, 'left', on='ds') + self.assertAlmostEqual( + np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0) + + def test_cross_validation_extra_regressors(self): + df = self.__df.copy() + df['extra'] = range(df.shape[0]) + m = Prophet() + m.add_seasonality(name='monthly', period=30.5, fourier_order=5) + m.add_regressor('extra') + m.fit(df) + df_cv = diagnostics.cross_validation( + m, horizon='4 days', period='4 days', initial='135 days') + self.assertEqual(len(np.unique(df_cv['cutoff'])), 2) + period = pd.Timedelta('4 days') + dc = df_cv['cutoff'].diff() + dc = dc[dc > pd.Timedelta(0)].min() + self.assertTrue(dc >= period) + self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all()) + df_merged = pd.merge(df_cv, self.__df, 'left', on='ds') + self.assertAlmostEqual( + np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0) def test_cross_validation_default_value_check(self): m = Prophet() @@ -150,11 +118,11 @@ class TestDiagnostics(TestCase): set(df_none.columns), {'horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'}, ) - self.assertEqual(df_none.shape[0], 14) + self.assertEqual(df_none.shape[0], 16) # Aggregation level 0.2 df_horizon = diagnostics.performance_metrics(df_cv, rolling_window=0.2) self.assertEqual(len(df_horizon['horizon'].unique()), 4) - self.assertEqual(df_horizon.shape[0], 13) + self.assertEqual(df_horizon.shape[0], 14) # Aggregation level all df_all = diagnostics.performance_metrics(df_cv, rolling_window=1) self.assertEqual(df_all.shape[0], 1)