mirror of
https://github.com/saymrwulf/prophet.git
synced 2026-07-04 04:07:25 +00:00
Simplify logic/interfaces for cross_validation, to better handle irregularly spaced data and actually respect initial window
This commit is contained in:
parent
3571a90558
commit
218283f157
10 changed files with 218 additions and 346 deletions
|
|
@ -15,6 +15,5 @@ export(plot_forecast_component)
|
|||
export(predictive_samples)
|
||||
export(prophet)
|
||||
export(prophet_plot_components)
|
||||
export(simulated_historical_forecasts)
|
||||
import(Rcpp)
|
||||
importFrom(dplyr,"%>%")
|
||||
|
|
|
|||
|
|
@ -11,102 +11,50 @@ globalVariables(c(
|
|||
|
||||
#' Generate cutoff dates
|
||||
#'
|
||||
#' @param df Dataframe with historical data
|
||||
#' @param horizon timediff forecast horizon
|
||||
#' @param k integer number of forecast points
|
||||
#' @param df Dataframe with historical data.
|
||||
#' @param horizon timediff forecast horizon.
|
||||
#' @param initial timediff initial window.
|
||||
#' @param period timediff Simulated forecasts are done with this period.
|
||||
#'
|
||||
#' @return Array of datetimes
|
||||
#' @return Array of datetimes.
|
||||
#'
|
||||
#' @keywords internal
|
||||
generate_cutoffs <- function(df, horizon, k, period) {
|
||||
generate_cutoffs <- function(df, horizon, initial, period) {
|
||||
# Last cutoff is (latest date in data) - (horizon).
|
||||
cutoff <- max(df$ds) - horizon
|
||||
if (cutoff < min(df$ds)) {
|
||||
stop('Less data than horizon.')
|
||||
}
|
||||
tzone <- attr(cutoff, "tzone") # Timezone is wiped by putting in array
|
||||
result <- cutoff
|
||||
if (k > 1) {
|
||||
for (i in 2:k) {
|
||||
cutoff <- cutoff - period
|
||||
# If data does not exist in data range (cutoff, cutoff + horizon]
|
||||
if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) {
|
||||
result <- c(cutoff)
|
||||
while (result[length(result)] >= min(df$ds) + initial) {
|
||||
cutoff <- cutoff - period
|
||||
# If data does not exist in data range (cutoff, cutoff + horizon]
|
||||
if (!any((df$ds > cutoff) & (df$ds <= cutoff + horizon))) {
|
||||
# Next cutoff point is 'closest date before cutoff in data - horizon'
|
||||
closest.date <- max(df$ds[df$ds <= cutoff])
|
||||
cutoff <- closest.date - horizon
|
||||
}
|
||||
if (cutoff < min(df$ds)) {
|
||||
warning('Not enough data for requested number of cutoffs! Using ', i)
|
||||
break
|
||||
}
|
||||
result <- c(result, cutoff)
|
||||
}
|
||||
result <- c(result, cutoff)
|
||||
}
|
||||
result <- head(result, -1)
|
||||
if (length(result) == 0) {
|
||||
stop(paste(
|
||||
'Less data than horizon after initial window.',
|
||||
'Make horizon or initial shorter.'
|
||||
))
|
||||
}
|
||||
# Reset timezones
|
||||
attr(result, "tzone") <- tzone
|
||||
message(paste(
|
||||
'Making', length(result), 'forecasts with cutoffs between',
|
||||
result[length(result)], 'and', result[1]
|
||||
))
|
||||
return(rev(result))
|
||||
}
|
||||
|
||||
#' Simulated historical forecasts.
|
||||
#'
|
||||
#' Make forecasts from k historical cutoff points, working backwards from
|
||||
#' (end - horizon) with a spacing of period between each cutoff.
|
||||
#'
|
||||
#' @param model Fitted Prophet model.
|
||||
#' @param horizon Integer size of the horizon
|
||||
#' @param units String unit of the horizon, e.g., "days", "secs".
|
||||
#' @param k integer number of forecast points
|
||||
#' @param period Integer amount of time between cutoff dates. Same units as
|
||||
#' horizon. If not provided, will use 0.5 * horizon.
|
||||
#'
|
||||
#' @return A dataframe with the forecast, actual value, and cutoff date.
|
||||
#'
|
||||
#' @export
|
||||
simulated_historical_forecasts <- function(model, horizon, units, k,
|
||||
period = NULL) {
|
||||
df <- model$history
|
||||
horizon <- as.difftime(horizon, units = units)
|
||||
if (is.null(period)) {
|
||||
period <- horizon / 2
|
||||
} else {
|
||||
period <- as.difftime(period, units = units)
|
||||
}
|
||||
cutoffs <- generate_cutoffs(df, horizon, k, period)
|
||||
predicts <- data.frame()
|
||||
for (i in seq_along(cutoffs)) {
|
||||
cutoff <- cutoffs[i]
|
||||
# Copy the model
|
||||
m <- prophet_copy(model, cutoff)
|
||||
# Train model
|
||||
history.c <- dplyr::filter(df, ds <= cutoff)
|
||||
m <- fit.prophet(m, history.c)
|
||||
# Calculate yhat
|
||||
df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon)
|
||||
# Get the columns for the future dataframe
|
||||
columns <- 'ds'
|
||||
if (m$growth == 'logistic') {
|
||||
columns <- c(columns, 'cap')
|
||||
if (m$logistic.floor) {
|
||||
columns <- c(columns, 'floor')
|
||||
}
|
||||
}
|
||||
columns <- c(columns, names(m$extra_regressors))
|
||||
future <- df[columns]
|
||||
yhat <- stats::predict(m, future)
|
||||
# Merge yhat, y, and cutoff.
|
||||
df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
|
||||
df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, yhat_upper)
|
||||
df.c$cutoff <- cutoff
|
||||
predicts <- rbind(predicts, df.c)
|
||||
}
|
||||
return(predicts)
|
||||
}
|
||||
|
||||
#' Cross-validation for time series.
|
||||
#'
|
||||
#' Computes forecasts from historical cutoff points. Beginning from initial,
|
||||
#' makes cutoffs with a spacing of period up to (end - horizon).
|
||||
#' Computes forecasts from historical cutoff points. Beginning from
|
||||
#' (end - horizon), works backwards making cutoffs with a spacing of period
|
||||
#' until initial is reached.
|
||||
#'
|
||||
#' When period is equal to the time interval of the data, this is the
|
||||
#' technique described in https://robjhyndman.com/hyndsight/tscv/ .
|
||||
|
|
@ -124,8 +72,9 @@ simulated_historical_forecasts <- function(model, horizon, units, k,
|
|||
#' @export
|
||||
cross_validation <- function(
|
||||
model, horizon, units, period = NULL, initial = NULL) {
|
||||
te <- max(model$history$ds)
|
||||
ts <- min(model$history$ds)
|
||||
df <- model$history
|
||||
te <- max(df$ds)
|
||||
ts <- min(df$ds)
|
||||
if (is.null(period)) {
|
||||
period <- 0.5 * horizon
|
||||
}
|
||||
|
|
@ -135,14 +84,39 @@ cross_validation <- function(
|
|||
horizon.dt <- as.difftime(horizon, units = units)
|
||||
initial.dt <- as.difftime(initial, units = units)
|
||||
period.dt <- as.difftime(period, units = units)
|
||||
k <- ceiling(
|
||||
as.double((te - horizon.dt) - (ts + initial.dt), units='secs') /
|
||||
as.double(period.dt, units = 'secs')
|
||||
)
|
||||
if (k < 1) {
|
||||
stop('Not enough data for specified horizon, period, and initial.')
|
||||
|
||||
cutoffs <- generate_cutoffs(df, horizon.dt, initial.dt, period.dt)
|
||||
predicts <- data.frame()
|
||||
for (i in seq_along(cutoffs)) {
|
||||
cutoff <- cutoffs[i]
|
||||
# Copy the model
|
||||
m <- prophet_copy(model, cutoff)
|
||||
# Train model
|
||||
history.c <- dplyr::filter(df, ds <= cutoff)
|
||||
if (nrow(history.c) < 2) {
|
||||
stop('Less than two datapoints before cutoff. Increase initial window.')
|
||||
}
|
||||
m <- fit.prophet(m, history.c)
|
||||
# Calculate yhat
|
||||
df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + horizon.dt)
|
||||
# Get the columns for the future dataframe
|
||||
columns <- 'ds'
|
||||
if (m$growth == 'logistic') {
|
||||
columns <- c(columns, 'cap')
|
||||
if (m$logistic.floor) {
|
||||
columns <- c(columns, 'floor')
|
||||
}
|
||||
}
|
||||
columns <- c(columns, names(m$extra_regressors))
|
||||
future <- df.predict[columns]
|
||||
yhat <- stats::predict(m, future)
|
||||
# Merge yhat, y, and cutoff.
|
||||
df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
|
||||
df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, yhat_upper)
|
||||
df.c$cutoff <- cutoff
|
||||
predicts <- rbind(predicts, df.c)
|
||||
}
|
||||
return(simulated_historical_forecasts(model, horizon, units, k, period))
|
||||
return(predicts)
|
||||
}
|
||||
|
||||
#' Copy Prophet object.
|
||||
|
|
|
|||
|
|
@ -23,8 +23,9 @@ horizon. If not provided, 0.5 * horizon is used.}
|
|||
A dataframe with the forecast, actual value, and cutoff date.
|
||||
}
|
||||
\description{
|
||||
Computes forecasts from historical cutoff points. Beginning from initial,
|
||||
makes cutoffs with a spacing of period up to (end - horizon).
|
||||
Computes forecasts from historical cutoff points. Beginning from
|
||||
(end - horizon), works backwards making cutoffs with a spacing of period
|
||||
until initial is reached.
|
||||
}
|
||||
\details{
|
||||
When period is equal to the time interval of the data, this is the
|
||||
|
|
|
|||
|
|
@ -24,8 +24,9 @@ Plot the prophet forecast.
|
|||
}
|
||||
\examples{
|
||||
\dontrun{
|
||||
history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
|
||||
y = sin(1:366/200) + rnorm(366)/10)
|
||||
history <- data.frame(
|
||||
ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
|
||||
y = sin(1:366/200) + rnorm(366)/10)
|
||||
m <- prophet(history)
|
||||
future <- make_future_dataframe(m, periods = 365)
|
||||
forecast <- predict(m, future)
|
||||
|
|
|
|||
|
|
@ -4,14 +4,14 @@
|
|||
\alias{generate_cutoffs}
|
||||
\title{Generate cutoff dates}
|
||||
\usage{
|
||||
generate_cutoffs(df, horizon, k, period)
|
||||
generate_cutoffs(df, horizon, initial, period)
|
||||
}
|
||||
\arguments{
|
||||
\item{df}{Dataframe with historical data}
|
||||
|
||||
\item{horizon}{timediff forecast horizon}
|
||||
|
||||
\item{k}{integer number of forecast points}
|
||||
\item{initial}{timediff initial window}
|
||||
|
||||
\item{period}{timediff Simulated forecasts are done with this period.}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,12 +5,13 @@
|
|||
\title{Prophet forecaster.}
|
||||
\usage{
|
||||
prophet(df = NULL, growth = "linear", changepoints = NULL,
|
||||
n.changepoints = 25, yearly.seasonality = "auto",
|
||||
weekly.seasonality = "auto", daily.seasonality = "auto",
|
||||
holidays = NULL, seasonality.mode = "additive",
|
||||
seasonality.prior.scale = 10, holidays.prior.scale = 10,
|
||||
changepoint.prior.scale = 0.05, mcmc.samples = 0, interval.width = 0.8,
|
||||
uncertainty.samples = 1000, fit = TRUE, ...)
|
||||
n.changepoints = 25, changepoint.range = 0.8,
|
||||
yearly.seasonality = "auto", weekly.seasonality = "auto",
|
||||
daily.seasonality = "auto", holidays = NULL,
|
||||
seasonality.mode = "additive", seasonality.prior.scale = 10,
|
||||
holidays.prior.scale = 10, changepoint.prior.scale = 0.05,
|
||||
mcmc.samples = 0, interval.width = 0.8, uncertainty.samples = 1000,
|
||||
fit = TRUE, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{df}{(optional) Dataframe containing the history. Must have columns ds
|
||||
|
|
@ -29,7 +30,11 @@ automatically.}
|
|||
\item{n.changepoints}{Number of potential changepoints to include. Not used
|
||||
if input `changepoints` is supplied. If `changepoints` is not supplied,
|
||||
then n.changepoints potential changepoints are selected uniformly from the
|
||||
first 80 percent of df$ds.}
|
||||
first `changepoint.range` proportion of df$ds.}
|
||||
|
||||
\item{changepoint.range}{Proportion of history in which trend changepoints
|
||||
will be estimated. Defaults to 0.8 for the first 80%. Not used if
|
||||
`changepoints` is specified.}
|
||||
|
||||
\item{yearly.seasonality}{Fit yearly seasonality. Can be 'auto', TRUE,
|
||||
FALSE, or a number of Fourier terms to generate.}
|
||||
|
|
|
|||
|
|
@ -1,27 +0,0 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/diagnostics.R
|
||||
\name{simulated_historical_forecasts}
|
||||
\alias{simulated_historical_forecasts}
|
||||
\title{Simulated historical forecasts.}
|
||||
\usage{
|
||||
simulated_historical_forecasts(model, horizon, units, k, period = NULL)
|
||||
}
|
||||
\arguments{
|
||||
\item{model}{Fitted Prophet model.}
|
||||
|
||||
\item{horizon}{Integer size of the horizon}
|
||||
|
||||
\item{units}{String unit of the horizon, e.g., "days", "secs".}
|
||||
|
||||
\item{k}{integer number of forecast points}
|
||||
|
||||
\item{period}{Integer amount of time between cutoff dates. Same units as
|
||||
horizon. If not provided, will use 0.5 * horizon.}
|
||||
}
|
||||
\value{
|
||||
A dataframe with the forecast, actual value, and cutoff date.
|
||||
}
|
||||
\description{
|
||||
Make forecasts from k historical cutoff points, working backwards from
|
||||
(end - horizon) with a spacing of period between each cutoff.
|
||||
}
|
||||
|
|
@ -8,75 +8,6 @@ DATA_all <- read.csv('data.csv')
|
|||
DATA_all$ds <- as.Date(DATA_all$ds)
|
||||
DATA <- head(DATA_all, 100)
|
||||
|
||||
test_that("simulated_historical_forecasts", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
m <- prophet(DATA)
|
||||
k <- 2
|
||||
for (p in c(1, 10)) {
|
||||
for (h in c(1, 3)) {
|
||||
df.shf <- simulated_historical_forecasts(
|
||||
m, horizon = h, units = 'days', k = k, period = p)
|
||||
# All cutoff dates should be less than ds dates
|
||||
expect_true(all(df.shf$cutoff < df.shf$ds))
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
expect_equal(length(unique(df.shf$cutoff)), k)
|
||||
expect_equal(max(df.shf$ds - df.shf$cutoff),
|
||||
as.difftime(h, units = 'days'))
|
||||
dc <- diff(df.shf$cutoff)
|
||||
dc <- min(dc[dc > 0])
|
||||
expect_true(dc >= as.difftime(p, units = 'days'))
|
||||
# Each y in df_shf and DATA with same ds should be equal
|
||||
df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
|
||||
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
test_that("simulated_historical_forecasts_logistic", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
df <- DATA
|
||||
df$cap <- 40
|
||||
m <- prophet(df, growth='logistic')
|
||||
df.shf <- simulated_historical_forecasts(
|
||||
m, horizon = 3, units = 'days', k = 2, period = 3)
|
||||
# All cutoff dates should be less than ds dates
|
||||
expect_true(all(df.shf$cutoff < df.shf$ds))
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
expect_equal(length(unique(df.shf$cutoff)), 2)
|
||||
# Each y in df_shf and DATA with same ds should be equal
|
||||
df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
|
||||
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
|
||||
})
|
||||
|
||||
test_that("simulated_historical_forecasts_extra_regressors", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
df <- DATA
|
||||
df$extra <- seq(0, nrow(df) - 1)
|
||||
m <- prophet()
|
||||
m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5)
|
||||
m <- add_regressor(m, 'extra')
|
||||
m <- fit.prophet(m, df)
|
||||
df.shf <- simulated_historical_forecasts(
|
||||
m, horizon = 3, units = 'days', k = 2, period = 3)
|
||||
# All cutoff dates should be less than ds dates
|
||||
expect_true(all(df.shf$cutoff < df.shf$ds))
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
expect_equal(length(unique(df.shf$cutoff)), 2)
|
||||
# Each y in df_shf and DATA with same ds should be equal
|
||||
df.merged <- dplyr::left_join(df.shf, m$history, by="ds")
|
||||
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
|
||||
})
|
||||
|
||||
test_that("simulated_historical_forecasts_default_value_check", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
m <- prophet(DATA)
|
||||
df.shf1 <- simulated_historical_forecasts(
|
||||
m, horizon = 10, units = 'days', k = 1)
|
||||
df.shf2 <- simulated_historical_forecasts(
|
||||
m, horizon = 10, units = 'days', k = 1, period = 5)
|
||||
expect_equal(sum(dplyr::select(df.shf1 - df.shf2, y, yhat)), 0)
|
||||
})
|
||||
|
||||
test_that("cross_validation", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
m <- prophet(DATA)
|
||||
|
|
@ -85,14 +16,59 @@ test_that("cross_validation", {
|
|||
ts <- min(DATA$ds)
|
||||
horizon <- as.difftime(4, units = "days")
|
||||
period <- as.difftime(10, units = "days")
|
||||
k <- 5
|
||||
initial <- as.difftime(115, units = "days")
|
||||
df.cv <- cross_validation(
|
||||
m, horizon = 4, units = "days", period = 10, initial = 90)
|
||||
expect_equal(length(unique(df.cv$cutoff)), k)
|
||||
m, horizon = 4, units = "days", period = 10, initial = 115)
|
||||
expect_equal(length(unique(df.cv$cutoff)), 3)
|
||||
expect_equal(max(df.cv$ds - df.cv$cutoff), horizon)
|
||||
expect_true(min(df.cv$cutoff) >= ts + initial)
|
||||
dc <- diff(df.cv$cutoff)
|
||||
dc <- min(dc[dc > 0])
|
||||
expect_true(dc >= period)
|
||||
expect_true(all(df.cv$cutoff < df.cv$ds))
|
||||
# Each y in df.cv and DATA with same ds should be equal
|
||||
df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
|
||||
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
|
||||
df.cv <- cross_validation(
|
||||
m, horizon = 4, units = "days", period = 10, initial = 135)
|
||||
expect_equal(length(unique(df.cv$cutoff)), 1)
|
||||
expect_error(
|
||||
cross_validation(
|
||||
m, horizon = 10, units = "days", period = 10, initial = 140)
|
||||
)
|
||||
})
|
||||
|
||||
test_that("cross_validation_logistic", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
df <- DATA
|
||||
df$cap <- 40
|
||||
m <- prophet(df, growth = 'logistic')
|
||||
df.cv <- cross_validation(
|
||||
m, horizon = 1, units = "days", period = 1, initial = 140)
|
||||
expect_equal(length(unique(df.cv$cutoff)), 2)
|
||||
expect_true(all(df.cv$cutoff < df.cv$ds))
|
||||
df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
|
||||
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
|
||||
})
|
||||
|
||||
test_that("cross_validation_extra_regressors", {
|
||||
skip_if_not(Sys.getenv('R_ARCH') != '/i386')
|
||||
df <- DATA
|
||||
df$extra <- seq(0, nrow(df) - 1)
|
||||
m <- prophet()
|
||||
m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5)
|
||||
m <- add_regressor(m, 'extra')
|
||||
m <- fit.prophet(m, df)
|
||||
df.cv <- cross_validation(
|
||||
m, horizon = 4, units = "days", period = 4, initial = 135)
|
||||
expect_equal(length(unique(df.cv$cutoff)), 2)
|
||||
period <- as.difftime(4, units = "days")
|
||||
dc <- diff(df.cv$cutoff)
|
||||
dc <- min(dc[dc > 0])
|
||||
expect_true(dc >= period)
|
||||
expect_true(all(df.cv$cutoff < df.cv$ds))
|
||||
df.merged <- dplyr::left_join(df.cv, m$history, by="ds")
|
||||
expect_equal(sum((df.merged$y.x - df.merged$y.y) ** 2), 0)
|
||||
})
|
||||
|
||||
test_that("cross_validation_default_value_check", {
|
||||
|
|
@ -116,11 +92,11 @@ test_that("performance_metrics", {
|
|||
sort(colnames(df_none))
|
||||
== sort(c('horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'))
|
||||
))
|
||||
expect_equal(nrow(df_none), 14)
|
||||
expect_equal(nrow(df_none), 16)
|
||||
# Aggregation level 0.2
|
||||
df_horizon <- performance_metrics(df_cv, rolling_window = 0.2)
|
||||
expect_equal(length(unique(df_horizon$horizon)), 4)
|
||||
expect_equal(nrow(df_horizon), 13)
|
||||
expect_equal(nrow(df_horizon), 14)
|
||||
# Aggregation level all
|
||||
df_all <- performance_metrics(df_cv, rolling_window = 1)
|
||||
expect_equal(nrow(df_all), 1)
|
||||
|
|
|
|||
|
|
@ -21,18 +21,15 @@ import pandas as pd
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cutoffs(df, horizon, k, period):
|
||||
def generate_cutoffs(df, horizon, initial, period):
|
||||
"""Generate cutoff dates
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df: pd.DataFrame with historical data
|
||||
horizon: pd.Timedelta.
|
||||
Forecast horizon
|
||||
k: Int number.
|
||||
The number of forecasts point.
|
||||
period: pd.Timedelta.
|
||||
Simulated Forecast will be done at every this period.
|
||||
df: pd.DataFrame with historical data.
|
||||
horizon: pd.Timedelta forecast horizon.
|
||||
initial: pd.Timedelta window of the initial forecast period.
|
||||
period: pd.Timedelta simulated forecasts are done with this period.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -43,56 +40,70 @@ def _cutoffs(df, horizon, k, period):
|
|||
if cutoff < df['ds'].min():
|
||||
raise ValueError('Less data than horizon.')
|
||||
result = [cutoff]
|
||||
|
||||
for i in range(1, k):
|
||||
while result[-1] >= min(df['ds']) + initial:
|
||||
cutoff -= period
|
||||
# If data does not exist in data range (cutoff, cutoff + horizon]
|
||||
if not (((df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)).any()):
|
||||
# Next cutoff point is 'last date before cutoff in data - horizon'
|
||||
closest_date = df[df['ds'] <= cutoff].max()['ds']
|
||||
cutoff = closest_date - horizon
|
||||
if cutoff < df['ds'].min():
|
||||
logger.warning(
|
||||
'Not enough data for requested number of cutoffs! '
|
||||
'Using {}.'.format(i))
|
||||
break
|
||||
result.append(cutoff)
|
||||
|
||||
# Sort lines in ascending order
|
||||
result = result[:-1]
|
||||
if len(result) == 0:
|
||||
raise ValueError(
|
||||
'Less data than horizon after initial window. '
|
||||
'Make horizon or initial shorter.'
|
||||
)
|
||||
logger.info('Making {} forecasts with cutoffs between {} and {}'.format(
|
||||
len(result), result[-1], result[0]
|
||||
))
|
||||
return reversed(result)
|
||||
|
||||
|
||||
def simulated_historical_forecasts(model, horizon, k, period=None):
|
||||
"""Simulated Historical Forecasts.
|
||||
def cross_validation(model, horizon, period=None, initial=None):
|
||||
"""Cross-Validation for time series.
|
||||
|
||||
Make forecasts from k historical cutoff points, working backwards from
|
||||
(end - horizon) with a spacing of period between each cutoff.
|
||||
Computes forecasts from historical cutoff points. Beginning from
|
||||
(end - horizon), works backwards making cutoffs with a spacing of period
|
||||
until initial is reached.
|
||||
|
||||
When period is equal to the time interval of the data, this is the
|
||||
technique described in https://robjhyndman.com/hyndsight/tscv/ .
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model: Prophet class object.
|
||||
Fitted Prophet model
|
||||
model: Prophet class object. Fitted Prophet model
|
||||
horizon: string with pd.Timedelta compatible style, e.g., '5 days',
|
||||
'3 hours', '10 seconds'.
|
||||
k: Int number of forecasts point.
|
||||
period: Optional string with pd.Timedelta compatible style. Simulated
|
||||
forecast will be done at every this period. If not provided,
|
||||
0.5 * horizon is used.
|
||||
period: string with pd.Timedelta compatible style. Simulated forecast will
|
||||
be done at every this period. If not provided, 0.5 * horizon is used.
|
||||
initial: string with pd.Timedelta compatible style. The first training
|
||||
period will begin here. If not provided, 3 * horizon is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A pd.DataFrame with the forecast, actual value and cutoff.
|
||||
"""
|
||||
df = model.history.copy().reset_index(drop=True)
|
||||
te = df['ds'].max()
|
||||
ts = df['ds'].min()
|
||||
horizon = pd.Timedelta(horizon)
|
||||
period = 0.5 * horizon if period is None else pd.Timedelta(period)
|
||||
cutoffs = _cutoffs(df, horizon, k, period)
|
||||
initial = 3 * horizon if initial is None else pd.Timedelta(initial)
|
||||
|
||||
cutoffs = generate_cutoffs(df, horizon, initial, period)
|
||||
predicts = []
|
||||
for cutoff in cutoffs:
|
||||
# Generate new object with copying fitting options
|
||||
m = prophet_copy(model, cutoff)
|
||||
# Train model
|
||||
m.fit(df[df['ds'] <= cutoff])
|
||||
history_c = df[df['ds'] <= cutoff]
|
||||
if history_c.shape[0] < 2:
|
||||
raise Exception(
|
||||
'Less than two datapoints before cutoff. '
|
||||
'Increase initial window.'
|
||||
)
|
||||
m.fit(history_c)
|
||||
# Calculate yhat
|
||||
index_predicted = (df['ds'] > cutoff) & (df['ds'] <= cutoff + horizon)
|
||||
# Get the columns for the future dataframe
|
||||
|
|
@ -113,42 +124,6 @@ def simulated_historical_forecasts(model, horizon, k, period=None):
|
|||
# Combine all predicted pd.DataFrame into one pd.DataFrame
|
||||
return reduce(lambda x, y: x.append(y), predicts).reset_index(drop=True)
|
||||
|
||||
|
||||
def cross_validation(model, horizon, period=None, initial=None):
|
||||
"""Cross-Validation for time series.
|
||||
|
||||
Computes forecasts from historical cutoff points. Beginning from initial,
|
||||
makes cutoffs with a spacing of period up to (end - horizon).
|
||||
|
||||
When period is equal to the time interval of the data, this is the
|
||||
technique described in https://robjhyndman.com/hyndsight/tscv/ .
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model: Prophet class object. Fitted Prophet model
|
||||
horizon: string with pd.Timedelta compatible style, e.g., '5 days',
|
||||
'3 hours', '10 seconds'.
|
||||
period: string with pd.Timedelta compatible style. Simulated forecast will
|
||||
be done at every this period. If not provided, 0.5 * horizon is used.
|
||||
initial: string with pd.Timedelta compatible style. The first training
|
||||
period will begin here. If not provided, 3 * horizon is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A pd.DataFrame with the forecast, actual value and cutoff.
|
||||
"""
|
||||
te = model.history['ds'].max()
|
||||
ts = model.history['ds'].min()
|
||||
horizon = pd.Timedelta(horizon)
|
||||
period = 0.5 * horizon if period is None else pd.Timedelta(period)
|
||||
initial = 3 * horizon if initial is None else pd.Timedelta(initial)
|
||||
k = int(np.ceil(((te - horizon) - (ts + initial)) / period))
|
||||
if k < 1:
|
||||
raise ValueError(
|
||||
'Not enough data for specified horizon, period, and initial.')
|
||||
return simulated_historical_forecasts(model, horizon, k, period)
|
||||
|
||||
|
||||
def prophet_copy(m, cutoff=None):
|
||||
"""Copy Prophet object
|
||||
|
||||
|
|
|
|||
|
|
@ -36,95 +36,63 @@ class TestDiagnostics(TestCase):
|
|||
# Use first 100 record in data.csv
|
||||
self.__df = DATA
|
||||
|
||||
def test_simulated_historical_forecasts(self):
|
||||
m = Prophet()
|
||||
m.fit(self.__df)
|
||||
k = 2
|
||||
for p in [1, 10]:
|
||||
for h in [1, 3]:
|
||||
period = '{} days'.format(p)
|
||||
horizon = '{} days'.format(h)
|
||||
df_shf = diagnostics.simulated_historical_forecasts(
|
||||
m, horizon=horizon, k=k, period=period)
|
||||
# All cutoff dates should be less than ds dates
|
||||
self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
self.assertEqual(len(np.unique(df_shf['cutoff'])), k)
|
||||
self.assertEqual(
|
||||
max(df_shf['ds'] - df_shf['cutoff']),
|
||||
pd.Timedelta(horizon),
|
||||
)
|
||||
dc = df_shf['cutoff'].diff()
|
||||
dc = dc[dc > pd.Timedelta(0)].min()
|
||||
self.assertTrue(dc >= pd.Timedelta(period))
|
||||
# Each y in df_shf and self.__df with same ds should be equal
|
||||
df_merged = pd.merge(df_shf, self.__df, 'left', on='ds')
|
||||
self.assertAlmostEqual(
|
||||
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
|
||||
|
||||
def test_simulated_historical_forecasts_logistic(self):
|
||||
m = Prophet(growth='logistic')
|
||||
df = self.__df.copy()
|
||||
df['cap'] = 40
|
||||
m.fit(df)
|
||||
df_shf = diagnostics.simulated_historical_forecasts(
|
||||
m, horizon='3 days', k=2, period='3 days')
|
||||
# All cutoff dates should be less than ds dates
|
||||
self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
self.assertEqual(len(np.unique(df_shf['cutoff'])), 2)
|
||||
# Each y in df_shf and self.__df with same ds should be equal
|
||||
df_merged = pd.merge(df_shf, df, 'left', on='ds')
|
||||
self.assertAlmostEqual(
|
||||
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
|
||||
|
||||
def test_simulated_historical_forecasts_extra_regressors(self):
|
||||
m = Prophet()
|
||||
m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
|
||||
m.add_regressor('extra')
|
||||
df = self.__df.copy()
|
||||
df['cap'] = 40
|
||||
df['extra'] = range(df.shape[0])
|
||||
m.fit(df)
|
||||
df_shf = diagnostics.simulated_historical_forecasts(
|
||||
m, horizon='3 days', k=2, period='3 days')
|
||||
# All cutoff dates should be less than ds dates
|
||||
self.assertTrue((df_shf['cutoff'] < df_shf['ds']).all())
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
self.assertEqual(len(np.unique(df_shf['cutoff'])), 2)
|
||||
# Each y in df_shf and self.__df with same ds should be equal
|
||||
df_merged = pd.merge(df_shf, df, 'left', on='ds')
|
||||
self.assertAlmostEqual(
|
||||
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
|
||||
|
||||
def test_simulated_historical_forecasts_default_value_check(self):
|
||||
m = Prophet()
|
||||
m.fit(self.__df)
|
||||
# Default value of period should be equal to 0.5 * horizon
|
||||
df_shf1 = diagnostics.simulated_historical_forecasts(
|
||||
m, horizon='10 days', k=1)
|
||||
df_shf2 = diagnostics.simulated_historical_forecasts(
|
||||
m, horizon='10 days', k=1, period='5 days')
|
||||
self.assertAlmostEqual(
|
||||
((df_shf1['y'] - df_shf2['y']) ** 2).sum(), 0.0)
|
||||
self.assertAlmostEqual(
|
||||
((df_shf1['yhat'] - df_shf2['yhat']) ** 2).sum(), 0.0)
|
||||
|
||||
def test_cross_validation(self):
|
||||
m = Prophet()
|
||||
m.fit(self.__df)
|
||||
# Calculate the number of cutoff points(k)
|
||||
horizon = pd.Timedelta('4 days')
|
||||
period = pd.Timedelta('10 days')
|
||||
k = 5
|
||||
initial = pd.Timedelta('115 days')
|
||||
df_cv = diagnostics.cross_validation(
|
||||
m, horizon='4 days', period='10 days', initial='90 days')
|
||||
# The unique size of output cutoff should be equal to 'k'
|
||||
self.assertEqual(len(np.unique(df_cv['cutoff'])), k)
|
||||
m, horizon='4 days', period='10 days', initial='115 days')
|
||||
self.assertEqual(len(np.unique(df_cv['cutoff'])), 3)
|
||||
self.assertEqual(max(df_cv['ds'] - df_cv['cutoff']), horizon)
|
||||
self.assertTrue(min(df_cv['cutoff']) >= min(self.__df['ds']) + initial)
|
||||
dc = df_cv['cutoff'].diff()
|
||||
dc = dc[dc > pd.Timedelta(0)].min()
|
||||
self.assertTrue(dc >= period)
|
||||
self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
|
||||
# Each y in df_cv and self.__df with same ds should be equal
|
||||
df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
|
||||
self.assertAlmostEqual(
|
||||
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
|
||||
df_cv = diagnostics.cross_validation(
|
||||
m, horizon='4 days', period='10 days', initial='135 days')
|
||||
self.assertEqual(len(np.unique(df_cv['cutoff'])), 1)
|
||||
with self.assertRaises(ValueError):
|
||||
diagnostics.cross_validation(
|
||||
m, horizon='10 days', period='10 days', initial='140 days')
|
||||
|
||||
def test_cross_validation_logistic(self):
|
||||
df = self.__df.copy()
|
||||
df['cap'] = 40
|
||||
m = Prophet(growth='logistic').fit(df)
|
||||
df_cv = diagnostics.cross_validation(
|
||||
m, horizon='1 days', period='1 days', initial='140 days')
|
||||
self.assertEqual(len(np.unique(df_cv['cutoff'])), 2)
|
||||
self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
|
||||
df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
|
||||
self.assertAlmostEqual(
|
||||
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
|
||||
|
||||
def test_cross_validation_extra_regressors(self):
|
||||
df = self.__df.copy()
|
||||
df['extra'] = range(df.shape[0])
|
||||
m = Prophet()
|
||||
m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
|
||||
m.add_regressor('extra')
|
||||
m.fit(df)
|
||||
df_cv = diagnostics.cross_validation(
|
||||
m, horizon='4 days', period='4 days', initial='135 days')
|
||||
self.assertEqual(len(np.unique(df_cv['cutoff'])), 2)
|
||||
period = pd.Timedelta('4 days')
|
||||
dc = df_cv['cutoff'].diff()
|
||||
dc = dc[dc > pd.Timedelta(0)].min()
|
||||
self.assertTrue(dc >= period)
|
||||
self.assertTrue((df_cv['cutoff'] < df_cv['ds']).all())
|
||||
df_merged = pd.merge(df_cv, self.__df, 'left', on='ds')
|
||||
self.assertAlmostEqual(
|
||||
np.sum((df_merged['y_x'] - df_merged['y_y']) ** 2), 0.0)
|
||||
|
||||
def test_cross_validation_default_value_check(self):
|
||||
m = Prophet()
|
||||
|
|
@ -150,11 +118,11 @@ class TestDiagnostics(TestCase):
|
|||
set(df_none.columns),
|
||||
{'horizon', 'coverage', 'mae', 'mape', 'mse', 'rmse'},
|
||||
)
|
||||
self.assertEqual(df_none.shape[0], 14)
|
||||
self.assertEqual(df_none.shape[0], 16)
|
||||
# Aggregation level 0.2
|
||||
df_horizon = diagnostics.performance_metrics(df_cv, rolling_window=0.2)
|
||||
self.assertEqual(len(df_horizon['horizon'].unique()), 4)
|
||||
self.assertEqual(df_horizon.shape[0], 13)
|
||||
self.assertEqual(df_horizon.shape[0], 14)
|
||||
# Aggregation level all
|
||||
df_all = diagnostics.performance_metrics(df_cv, rolling_window=1)
|
||||
self.assertEqual(df_all.shape[0], 1)
|
||||
|
|
|
|||
Loading…
Reference in a new issue