Simplify logic/interfaces for cross_validation, to better handle irregularly spaced data and actually respect initial window

This commit is contained in:
Ben Letham 2018-05-30 12:09:54 -07:00
parent 3571a90558
commit 218283f157
10 changed files with 218 additions and 346 deletions

View file

@ -15,6 +15,5 @@ export(plot_forecast_component)
export(predictive_samples)
export(prophet)
export(prophet_plot_components)
export(simulated_historical_forecasts)
import(Rcpp)
importFrom(dplyr,"%>%")

View file

@ -11,102 +11,50 @@ globalVariables(c(
#' Generate cutoff dates
#'
#' @param df Dataframe with historical data
#' @param horizon timediff forecast horizon
#' @param k integer number of forecast points
#' @param df Dataframe with historical data.
#' @param horizon timediff forecast horizon.
#' @param initial timediff initial window.
#' @param period timediff Simulated forecasts are done with this period.
#'
#' @return Array of datetimes
#' @return Array of datetimes.
#'
#' @keywords internal
generate_cutoffs <- function(df, horizon, k, period) {
generate_cutoffs <- function(df, horizon, initial, period) {
# Last cutoff is (latest date in data) - (horizon).
cutoff <- max(df$ds) - horizon
if (cutoff < min(df$ds)) {
stop('Less data than horizon.')
}
tzone <- attr(cutoff, "tzone") # Timezone is wiped by putting in array
result <- cutoff
if (k > 1) {
for (i in 2:k) {
cutoff <- cutoff - period
# If data does not exist in data range (cutoff, cutoff + horizon]
if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) {
result <- c(cutoff)
while (result[length(result)] >= min(df$ds) + initial) {
cutoff <- cutoff - period
# If data does not exist in data range (cutoff, cutoff + horizon]
if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) {
# Next cutoff point is 'closest date before cutoff in data - horizon'
closest.date <- max(df$ds[df$ds <= cutoff])
cutoff <- closest.date - horizon
}
if (cutoff < min(df$ds)) {
warning('Not enough data for requested number of cutoffs! Using ', i)
break
}
result <- c(result, cutoff)
}
result <- c(result, cutoff)
}
result <- head(result, -1)
if (length(result) == 0) {
stop(paste(
'Less data than horizon after initial window.',
'Make horizon or initial shorter.'
))
}
# Reset timezones
attr(result, "tzone") <- tzone
message(paste(
'Making', length(result), 'forecasts with cutoffs between',
result[length(result)], 'and', result[1]
))
return(rev(result))
}
#' Simulated historical forecasts.
#'
#' Make forecasts from k historical cutoff points, working backwards from
#' (end - horizon) with a spacing of period between each cutoff.
#'
#' @param model Fitted Prophet model.
#' @param horizon Integer size of the horizon
#' @param units String unit of the horizon, e.g., "days", "secs".
#' @param k integer number of forecast points
#' @param period Integer amount of time between cutoff dates. Same units as
#' horizon. If not provided, will use 0.5 * horizon.
#'
#' @return A dataframe with the forecast, actual value, and cutoff date.
#'
#' @export
simulated_historical_forecasts <- function(model, horizon, units, k,
period = NULL) {
df <- model$history
horizon <- as.difftime(horizon, units = units)
if (is.null(period)) {
period <- horizon / 2
} else {
period <- as.difftime(period, units = units)
}
cutoffs <- generate_cutoffs(df, horizon, k, period)
predicts <- data.frame()
for (i in seq_along(cutoffs)) {
cutoff <- cutoffs[i]
# Copy the model
m <- prophet_copy(model, cutoff)
# Train model
history.c <- dplyr::filter(df, ds <= cutoff)
m <- fit.prophet(m, history.c)
# Calculate yhat
df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon)
# Get the columns for the future dataframe
columns <- 'ds'
if (m$growth == 'logistic') {
columns <- c(columns, 'cap')
if (m$logistic.floor) {
columns <- c(columns, 'floor')
}
}
columns <- c(columns, names(m$extra_regressors))
future <- df[columns]
yhat <- stats::predict(m, future)
# Merge yhat, y, and cutoff.
df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, yhat_upper)
df.c$cutoff <- cutoff
predicts <- rbind(predicts, df.c)
}
return(predicts)
}
#' Cross-validation for time series.
#'
#' Computes forecasts from historical cutoff points. Beginning from initial,
#' makes cutoffs with a spacing of period up to (end - horizon).
#' Computes forecasts from historical cutoff points. Beginning from
#' (end - horizon), works backwards making cutoffs with a spacing of period
#' until initial is reached.
#'
#' When period is equal to the time interval of the data, this is the
#' technique described in https://robjhyndman.com/hyndsight/tscv/ .
@ -124,8 +72,9 @@ simulated_historical_forecasts <- function(model, horizon, units, k,
#' @export
cross_validation <- function(
model, horizon, units, period = NULL, initial = NULL) {
te <- max(model$history$ds)
ts <- min(model$history$ds)
df <- model$history
te <- max(df$ds)
ts <- min(df$ds)
if (is.null(period)) {
period <- 0.5 * horizon
}
@ -135,14 +84,39 @@ cross_validation <- function(
horizon.dt <- as.difftime(horizon, units = units)
initial.dt <- as.difftime(initial, units = units)
period.dt <- as.difftime(period, units = units)
k <- ceiling(
as.double((te - horizon.dt) - (ts + initial.dt), units='secs') /
as.double(period.dt, units = 'secs')
)
if (k < 1) {
stop('Not enough data for specified horizon, period, and initial.')
cutoffs <- generate_cutoffs(df, horizon.dt, initial.dt, period.dt)
predicts <- data.frame()
for (i in seq_along(cutoffs)) {
cutoff <- cutoffs[i]
# Copy the model
m <- prophet_copy(model, cutoff)
# Train model
history.c <- dplyr::filter(df, ds <= cutoff)
if (nrow(history.c) < 2) {
stop('Less than two datapoints before cutoff. Increase initial window.')
}
m <- fit.prophet(m, history.c)
# Calculate yhat
df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon.dt)
# Get the columns for the future dataframe
columns <- 'ds'
if (m$growth == 'logistic') {
columns <- c(columns, 'cap')
if (m$logistic.floor) {
columns <- c(columns, 'floor')
}
}
columns <- c(columns, names(m$extra_regressors))
future <- df.predict[columns]
yhat <- stats::predict(m, future)
# Merge yhat, y, and cutoff.
df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, yhat_upper)
df.c$cutoff <- cutoff
predicts <- rbind(predicts, df.c)
}
return(simulated_historical_forecasts(model, horizon, units, k, period))
return(predicts)
}
#' Copy Prophet object.

View file

@ -23,8 +23,9 @@ horizon. If not provided, 0.5 * horizon is used.}
A dataframe with the forecast, actual value, and cutoff date.
}
\description{
Computes forecasts from historical cutoff points. Beginning from initial,
makes cutoffs with a spacing of period up to (end - horizon).
Computes forecasts from historical cutoff points. Beginning from
(end - horizon), works backwards making cutoffs with a spacing of period
until initial is reached.
}
\details{
When period is equal to the time interval of the data, this is the

View file

@ -24,8 +24,9 @@ Plot the prophet forecast.
}
\examples{
\dontrun{
history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
y = sin(1:366/200) + rnorm(366)/10)
history <- data.frame(
ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
y = sin(1:366/200) + rnorm(366)/10)
m <- prophet(history)
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)

View file

@ -4,14 +4,14 @@
\alias{generate_cutoffs}
\title{Generate cutoff dates}
\usage{
generate_cutoffs(df, horizon, k, period)
generate_cutoffs(df, horizon, initial, period)
}
\arguments{
\item{df}{Dataframe with historical data}
\item{horizon}{timediff forecast horizon}
\item{k}{integer number of forecast points}
\item{initial}{timediff initial window}
\item{period}{timediff Simulated forecasts are done with this period.}
}

View file

@ -5,12 +5,13 @@
\title{Prophet forecaster.}
\usage{
prophet(df = NULL, growth = "linear", changepoints = NULL,
n.changepoints = 25, yearly.seasonality = "auto",
weekly.seasonality = "auto", daily.seasonality = "auto",
holidays = NULL, seasonality.mode = "additive",
seasonality.prior.scale = 10, holidays.prior.scale = 10,
changepoint.prior.scale = 0.05, mcmc.samples = 0, interval.width = 0.8,
uncertainty.samples = 1000, fit = TRUE, ...)
n.changepoints = 25, changepoint.range = 0.8,
yearly.seasonality = "auto", weekly.seasonality = "auto",
daily.seasonality = "auto", holidays = NULL,
seasonality.mode = "additive", seasonality.prior.scale = 10,
holidays.prior.scale = 10, changepoint.prior.scale = 0.05,
mcmc.samples = 0, interval.width = 0.8, uncertainty.samples = 1000,
fit = TRUE, ...)
}
\arguments{
\item{df}{(optional) Dataframe containing the history. Must have columns ds
@ -29,7 +30,11 @@ automatically.}
\item{n.changepoints}{Number of potential changepoints to include. Not used
if input `changepoints` is supplied. If `changepoints` is not supplied,
then n.changepoints potential changepoints are selected uniformly from the
first 80 percent of df$ds.}
first `changepoint.range` proportion of df$ds.}
\item{changepoint.range}{Proportion of history in which trend changepoints
will be estimated. Defaults to 0.8 for the first 80%. Not used if
`changepoints` is specified.}
\item{yearly.seasonality}{Fit yearly seasonality. Can be 'auto', TRUE,
FALSE, or a number of Fourier terms to generate.}

View file

@ -1,27 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/diagnostics.R
\name{simulated_historical_forecasts}
\alias{simulated_historical_forecasts}
\title{Simulated historical forecasts.}
\usage{
simulated_historical_forecasts(model, horizon, units, k, period = NULL)
}
\arguments{
\item{model}{Fitted Prophet model.}
\item{horizon}{Integer size of the horizon}
\item{units}{String unit of the horizon, e.g., "days", "secs".}
\item{k}{integer number of forecast points}
\item{period}{Integer amount of time between cutoff dates. Same units as
horizon. If not provided, will use 0.5 * horizon.}
}
\value{
A dataframe with the forecast, actual value, and cutoff date.
}
\description{
Make forecasts from k historical cutoff points, working backwards from
(end - horizon) with a spacing of period between each cutoff.
}

View file

@ -8,75 +8,6 @@ DATA_all <- read.csv('data.csv')
DATA_all$ds <- as.Date(DATA_all$ds)
DATA <- head(DATA_all, 100)
test_that("simulated_historical_forecasts", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
m <- prophet(DATA)
k <- 2
for (p in c(1, 10)) {
for (h in c(1, 3)) {
df.shf <- simulated_historical_forecasts(
m, horizon = h, units = 'days', k = k, period = p)
# All cutoff dates should be less than ds dates
expect_true(all(df.shf$cutoff < df.shf$ds))
# The unique size of output cutoff should be equal to 'k'
expect_equal(length(unique(df.shf$cutoff)), k)
expect_equal(max(df.shf$ds - df.shf$cutoff),
as.difftime(h, units = 'days'))
dc <- diff(df.shf$cutoff)
dc <- min(dc[dc > 0])
expect_true(dc >= as.difftime(p, units = 'days'))
# Each y in df_shf and DATA with same ds should be equal
df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
}
}
})
test_that("simulated_historical_forecasts_logistic", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
df <- DATA
df$cap <- 40
m <- prophet(df, growth='logistic')
df.shf <- simulated_historical_forecasts(
m, horizon = 3, units = 'days', k = 2, period = 3)
# All cutoff dates should be less than ds dates
expect_true(all(df.shf$cutoff < df.shf$ds))
# The unique size of output cutoff should be equal to 'k'
expect_equal(length(unique(df.shf$cutoff)), 2)
# Each y in df_shf and DATA with same ds should be equal
df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
})
test_that("simulated_historical_forecasts_extra_regressors", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
df <- DATA
df$extra <- seq(0, nrow(df) - 1)
m <- prophet()
m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5)
m <- add_regressor(m, 'extra')
m <- fit.prophet(m, df)
df.shf <- simulated_historical_forecasts(
m, horizon = 3, units = 'days', k = 2, period = 3)
# All cutoff dates should be less than ds dates
expect_true(all(df.shf$cutoff < df.shf$ds))
# The unique size of output cutoff should be equal to 'k'
expect_equal(length(unique(df.shf$cutoff)), 2)
# Each y in df_shf and DATA with same ds should be equal
df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
})
test_that("simulated_historical_forecasts_default_value_check", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
m <- prophet(DATA)
df.shf1 <- simulated_historical_forecasts(
m, horizon = 10, units = 'days', k = 1)
df.shf2 <- simulated_historical_forecasts(
m, horizon = 10, units = 'days', k = 1, period = 5)
expect_equal(sum(dplyr::select(df.shf1 - df.shf2, y, yhat)), 0)
})
test_that("cross_validation", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
m <- prophet(DATA)
@ -85,14 +16,59 @@ test_that("cross_validation", {
ts <- min(DATA$ds)
horizon <- as.difftime(4, units = "days")
period <- as.difftime(10, units = "days")
k <- 5
initial <- as.difftime(115, units = "days")
df.cv <- cross_validation(
m, horizon = 4, units = "days", period = 10, initial = 90)
expect_equal(length(unique(df.cv$cutoff)), k)
m, horizon = 4, units = "days", period = 10, initial = 115)
expect_equal(length(unique(df.cv$cutoff)), 3)
expect_equal(max(df.cv$ds - df.cv$cutoff), horizon)
expect_true(min(df.cv$cutoff) >= ts + initial)
dc <- diff(df.cv$cutoff)
dc <- min(dc[dc > 0])
expect_true(dc >= period)
expect_true(all(df.cv$cutoff < df.cv$ds))
# Each y in df.cv and DATA with same ds should be equal
df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
df.cv <- cross_validation(
m, horizon = 4, units = "days", period = 10, initial = 135)
expect_equal(length(unique(df.cv$cutoff)), 1)
expect_error(
cross_validation(
m, horizon = 10, units = "days", period = 10, initial = 140)
)
})
test_that("cross_validation_logistic", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
df <- DATA
df$cap <- 40
m <- prophet(df, growth = 'logistic')
df.cv <- cross_validation(
m, horizon = 1, units = "days", period = 1, initial = 140)
expect_equal(length(unique(df.cv$cutoff)), 2)
expect_true(all(df.cv$cutoff < df.cv$ds))
df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
})
test_that("cross_validation_extra_regressors", {
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
df <- DATA
df$extra <- seq(0, nrow(df) - 1)
m <- prophet()
m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5)
m <- add_regressor(m, 'extra')
m <- fit.prophet(m, df)
df.cv <- cross_validation(
m, horizon = 4, units = "days", period = 4, initial = 135)
expect_equal(length(unique(df.cv$cutoff)), 2)
period <- as.difftime(4, units = "days")
dc <- diff(df.cv$cutoff)
dc <- min(dc[dc > 0])
expect_true(dc >= period)
expect_true(all(df.cv$cutoff < df.cv$ds))
df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
})
test_that("cross_validation_default_value_check", {
@ -116,11 +92,11 @@ test_that("performance_metrics", {
sort(colnames(df_none))
== sort(c('horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'))
))
expect_equal(nrow(df_none), 14)
expect_equal(nrow(df_none), 16)
# Aggregation level 0.2
df_horizon <- performance_metrics(df_cv, rolling_window = 0.2)
expect_equal(length(unique(df_horizon$horizon)), 4)
expect_equal(nrow(df_horizon), 13)
expect_equal(nrow(df_horizon), 14)
# Aggregation level all
df_all <- performance_metrics(df_cv, rolling_window = 1)
expect_equal(nrow(df_all), 1)

View file

@ -21,18 +21,15 @@ import pandas as pd
logger = logging.getLogger(__name__)
def _cutoffs(df, horizon, k, period):
def generate_cutoffs(df, horizon, initial, period):
"""Generate cutoff dates
Parameters
----------
df: pd.DataFrame with historical data
horizon: pd.Timedelta.
Forecast horizon
k: Int number.
The number of forecasts point.
period: pd.Timedelta.
Simulated Forecast will be done at every this period.
df: pd.DataFrame with historical data.
horizon: pd.Timedelta forecast horizon.
initial: pd.Timedelta window of the initial forecast period.
period: pd.Timedelta simulated forecasts are done with this period.
Returns
-------
@ -43,56 +40,70 @@ def _cutoffs(df, horizon, k, period):
if cutoff < df['ds'].min():
raise ValueError('Less data than horizon.')
result = [cutoff]
for i in range(1, k):
while result[-1] >= min(df['ds']) + initial:
cutoff -= period
# If data does not exist in data range (cutoff, cutoff + horizon]
if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
# Next cutoff point is 'last date before cutoff in data - horizon'
closest_date = df[df['ds'] <= cutoff].max()['ds']
cutoff = closest_date - horizon
if cutoff < df['ds'].min():
logger.warning(
'Not enough data for requested number of cutoffs! '
'Using {}.'.format(i))
break
result.append(cutoff)
# Sort lines in ascending order
result = result[:-1]
if len(result) == 0:
raise ValueError(
'Less data than horizon after initial window. '
'Make horizon or initial shorter.'
)
logger.info('Making {} forecasts with cutoffs between {} and {}'.format(
len(result), result[-1], result[0]
))
return reversed(result)
def simulated_historical_forecasts(model, horizon, k, period=None):
"""Simulated Historical Forecasts.
def cross_validation(model, horizon, period=None, initial=None):
"""Cross-Validation for time series.
Make forecasts from k historical cutoff points, working backwards from
(end - horizon) with a spacing of period between each cutoff.
Computes forecasts from historical cutoff points. Beginning from
(end - horizon), works backwards making cutoffs with a spacing of period
until initial is reached.
When period is equal to the time interval of the data, this is the
technique described in https://robjhyndman.com/hyndsight/tscv/ .
Parameters
----------
model: Prophet class object.
Fitted Prophet model
model: Prophet class object. Fitted Prophet model
horizon: string with pd.Timedelta compatible style, e.g., '5 days',
'3 hours', '10 seconds'.
k: Int number of forecasts point.
period: Optional string with pd.Timedelta compatible style. Simulated
forecast will be done at every this period. If not provided,
0.5 * horizon is used.
period: string with pd.Timedelta compatible style. Simulated forecast will
be done at every this period. If not provided, 0.5 * horizon is used.
initial: string with pd.Timedelta compatible style. The first training
period will begin here. If not provided, 3 * horizon is used.
Returns
-------
A pd.DataFrame with the forecast, actual value and cutoff.
"""
df = model.history.copy().reset_index(drop=True)
te = df['ds'].max()
ts = df['ds'].min()
horizon = pd.Timedelta(horizon)
period = 0.5 * horizon if period is None else pd.Timedelta(period)
cutoffs = _cutoffs(df, horizon, k, period)
initial = 3 * horizon if initial is None else pd.Timedelta(initial)
cutoffs = generate_cutoffs(df, horizon, initial, period)
predicts = []
for cutoff in cutoffs:
# Generate new object with copying fitting options
m = prophet_copy(model, cutoff)
# Train model
m.fit(df[df['ds'] <= cutoff])
history_c = df[df['ds'] <= cutoff]
if history_c.shape[0] < 2:
raise Exception(
'Less than two datapoints before cutoff. '
'Increase initial window.'
)
m.fit(history_c)
# Calculate yhat
index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
# Get the columns for the future dataframe
@ -113,42 +124,6 @@ def simulated_historical_forecasts(model, horizon, k, period=None):
# Combine all predicted pd.DataFrame into one pd.DataFrame
return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
def cross_validation(model, horizon, period=None, initial=None):
"""Cross-Validation for time series.
Computes forecasts from historical cutoff points. Beginning from initial,
makes cutoffs with a spacing of period up to (end - horizon).
When period is equal to the time interval of the data, this is the
technique described in https://robjhyndman.com/hyndsight/tscv/ .
Parameters
----------
model: Prophet class object. Fitted Prophet model
horizon: string with pd.Timedelta compatible style, e.g., '5 days',
'3 hours', '10 seconds'.
period: string with pd.Timedelta compatible style. Simulated forecast will
be done at every this period. If not provided, 0.5 * horizon is used.
initial: string with pd.Timedelta compatible style. The first training
period will begin here. If not provided, 3 * horizon is used.
Returns
-------
A pd.DataFrame with the forecast, actual value and cutoff.
"""
te = model.history['ds'].max()
ts = model.history['ds'].min()
horizon = pd.Timedelta(horizon)
period = 0.5 * horizon if period is None else pd.Timedelta(period)
initial = 3 * horizon if initial is None else pd.Timedelta(initial)
k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
if k < 1:
raise ValueError(
'Not enough data for specified horizon, period, and initial.')
return simulated_historical_forecasts(model, horizon, k, period)
def prophet_copy(m, cutoff=None):
"""Copy Prophet object

View file

@ -36,95 +36,63 @@ class TestDiagnostics(TestCase):
# Use first 100 record in data.csv
self.__df = DATA
def test_simulated_historical_forecasts(self):
m = Prophet()
m.fit(self.__df)
k = 2
for p in [1, 10]:
for h in [1, 3]:
period = '{} days'.format(p)
horizon = '{} days'.format(h)
df_shf = diagnostics.simulated_historical_forecasts(
m, horizon=horizon, k=k, period=period)
# All cutoff dates should be less than ds dates
self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
# The unique size of output cutoff should be equal to 'k'
self.assertEqual(len(np.unique(df_shf['cutoff'])), k)
self.assertEqual(
max(df_shf['ds'] - df_shf['cutoff']),
pd.Timedelta(horizon),
)
dc = df_shf['cutoff'].diff()
dc = dc[dc > pd.Timedelta(0)].min()
self.assertTrue(dc >= pd.Timedelta(period))
# Each y in df_shf and self.__df with same ds should be equal
df_merged = pd.merge(df_shf, self.__df, 'left', on='ds')
self.assertAlmostEqual(
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
def test_simulated_historical_forecasts_logistic(self):
m = Prophet(growth='logistic')
df = self.__df.copy()
df['cap'] = 40
m.fit(df)
df_shf = diagnostics.simulated_historical_forecasts(
m, horizon='3 days', k=2, period='3 days')
# All cutoff dates should be less than ds dates
self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
# The unique size of output cutoff should be equal to 'k'
self.assertEqual(len(np.unique(df_shf['cutoff'])), 2)
# Each y in df_shf and self.__df with same ds should be equal
df_merged = pd.merge(df_shf, df, 'left', on='ds')
self.assertAlmostEqual(
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
def test_simulated_historical_forecasts_extra_regressors(self):
m = Prophet()
m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
m.add_regressor('extra')
df = self.__df.copy()
df['cap'] = 40
df['extra'] = range(df.shape[0])
m.fit(df)
df_shf = diagnostics.simulated_historical_forecasts(
m, horizon='3 days', k=2, period='3 days')
# All cutoff dates should be less than ds dates
self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
# The unique size of output cutoff should be equal to 'k'
self.assertEqual(len(np.unique(df_shf['cutoff'])), 2)
# Each y in df_shf and self.__df with same ds should be equal
df_merged = pd.merge(df_shf, df, 'left', on='ds')
self.assertAlmostEqual(
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
def test_simulated_historical_forecasts_default_value_check(self):
m = Prophet()
m.fit(self.__df)
# Default value of period should be equal to 0.5 * horizon
df_shf1 = diagnostics.simulated_historical_forecasts(
m, horizon='10 days', k=1)
df_shf2 = diagnostics.simulated_historical_forecasts(
m, horizon='10 days', k=1, period='5 days')
self.assertAlmostEqual(
((df_shf1['y'] - df_shf2['y']) ** 2).sum(), 0.0)
self.assertAlmostEqual(
((df_shf1['yhat'] - df_shf2['yhat']) ** 2).sum(), 0.0)
def test_cross_validation(self):
m = Prophet()
m.fit(self.__df)
# Calculate the number of cutoff points(k)
horizon = pd.Timedelta('4 days')
period = pd.Timedelta('10 days')
k = 5
initial = pd.Timedelta('115 days')
df_cv = diagnostics.cross_validation(
m, horizon='4 days', period='10 days', initial='90 days')
# The unique size of output cutoff should be equal to 'k'
self.assertEqual(len(np.unique(df_cv['cutoff'])), k)
m, horizon='4 days', period='10 days', initial='115 days')
self.assertEqual(len(np.unique(df_cv['cutoff'])), 3)
self.assertEqual(max(df_cv['ds'] - df_cv['cutoff']), horizon)
self.assertTrue(min(df_cv['cutoff']) >= min(self.__df['ds']) + initial)
dc = df_cv['cutoff'].diff()
dc = dc[dc > pd.Timedelta(0)].min()
self.assertTrue(dc >= period)
self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
# Each y in df_cv and self.__df with same ds should be equal
df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
self.assertAlmostEqual(
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
df_cv = diagnostics.cross_validation(
m, horizon='4 days', period='10 days', initial='135 days')
self.assertEqual(len(np.unique(df_cv['cutoff'])), 1)
with self.assertRaises(ValueError):
diagnostics.cross_validation(
m, horizon='10 days', period='10 days', initial='140 days')
def test_cross_validation_logistic(self):
df = self.__df.copy()
df['cap'] = 40
m = Prophet(growth='logistic').fit(df)
df_cv = diagnostics.cross_validation(
m, horizon='1 days', period='1 days', initial='140 days')
self.assertEqual(len(np.unique(df_cv['cutoff'])), 2)
self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
self.assertAlmostEqual(
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
def test_cross_validation_extra_regressors(self):
df = self.__df.copy()
df['extra'] = range(df.shape[0])
m = Prophet()
m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
m.add_regressor('extra')
m.fit(df)
df_cv = diagnostics.cross_validation(
m, horizon='4 days', period='4 days', initial='135 days')
self.assertEqual(len(np.unique(df_cv['cutoff'])), 2)
period = pd.Timedelta('4 days')
dc = df_cv['cutoff'].diff()
dc = dc[dc > pd.Timedelta(0)].min()
self.assertTrue(dc >= period)
self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
self.assertAlmostEqual(
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
def test_cross_validation_default_value_check(self):
m = Prophet()
@ -150,11 +118,11 @@ class TestDiagnostics(TestCase):
set(df_none.columns),
{'horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'},
)
self.assertEqual(df_none.shape[0], 14)
self.assertEqual(df_none.shape[0], 16)
# Aggregation level 0.2
df_horizon = diagnostics.performance_metrics(df_cv, rolling_window=0.2)
self.assertEqual(len(df_horizon['horizon'].unique()), 4)
self.assertEqual(df_horizon.shape[0], 13)
self.assertEqual(df_horizon.shape[0], 14)
# Aggregation level all
df_all = diagnostics.performance_metrics(df_cv, rolling_window=1)
self.assertEqual(df_all.shape[0], 1)