From 994db64942e4d7ed910c49bdc69d4ec91b7b7074 Mon Sep 17 00:00:00 2001
From: Ben Letham
Date: Thu, 3 May 2018 17:39:31 -0700
Subject: [PATCH] Refactor diagnostic metrics to allow/include grouping by
 horizon (Py)

---
 python/fbprophet/diagnostics.py            | 151 ++++++++++++++-------
 python/fbprophet/tests/test_diagnostics.py |  61 +++++++++
 2 files changed, 160 insertions(+), 52 deletions(-)

diff --git a/python/fbprophet/diagnostics.py b/python/fbprophet/diagnostics.py
index e0dc470..bd97466 100644
--- a/python/fbprophet/diagnostics.py
+++ b/python/fbprophet/diagnostics.py
@@ -196,67 +196,114 @@ def prophet_copy(m, cutoff=None):
     return m2
 
 
-def me(df):
-    return((df['yhat'] - df['y']).sum()/len(df['yhat']))
-def mse(df):
-    return((df['yhat'] - df['y']).pow(2).sum()/len(df))
-def rmse(df):
-    return(np.sqrt((df['yhat'] - df['y']).pow(2).sum()/len(df)))
-def mae(df):
-    return((df['yhat'] - df['y']).abs().sum()/len(df))
-def mpe(df):
-    return((df['yhat'] - df['y']).div(df['y']).sum()*(1/len(df)))
-def mape(df):
-    return((df['yhat'] - df['y']).div(df['y']).abs().sum()*(1/len(df)))
+def performance_metrics(df, metrics=None, aggregation='horizon'):
+    """Compute performance metrics from cross-validation results.
 
-def all_metrics(model, df_cv = None):
-    """Compute model fit metrics for time series.
+    Computes a suite of performance metrics on the output of cross-validation.
+    By default the following metrics are included:
+    'mse': mean squared error
+    'mae': mean absolute error
+    'mape': mean absolute percent error
+    'coverage': coverage of the upper and lower intervals
 
-    Computes the following metrics about each time series that has been through
-    Cross Validation;
+    A subset of these can be specified by passing a list of names as the
+    `metrics` argument.
 
-    Mean Error (ME)
-    Mean Squared Error (MSE)
-    Root Mean Square Error (RMSE)
-    Mean Absolute Error (MAE)
-    Mean Percentage Error (MPE)
-    Mean Absolute Percentage Error (MAPE)
+    By default, metrics will be computed for each horizon (ds - cutoff).
+    Alternatively, metrics can be computed at the level of individual ds/cutoff
+    pairs (aggregation='none'), or aggregated over all ds/cutoffs
+    (aggregation='all').
+
+    The output is a dataframe containing the columns corresponding to the level
+    of aggregation ('horizon'; 'ds' and 'cutoff'; or none) along with columns
+    for each of the metrics computed.
 
     Parameters
     ----------
-    df: A pandas dataframe. Contains y and yhat produced by cross-validation
+    df: The dataframe returned by cross_validation.
+    metrics: A list of performance metrics to compute. If not provided, will
+        use ['mse', 'mae', 'mape', 'coverage'].
+    aggregation: Level of aggregation for computing performance statistics.
+        Must be 'horizon', 'none', or 'all'.
 
     Returns
     -------
-    A dictionary where the key = the error type, and value is the value of the error
+    Dataframe with a column for each metric, and a combination of columns 'ds',
+    'cutoff', and 'horizon', depending on the aggregation level.
""" - - - - df = [] - - if df_cv is not None: - df = df_cv - else: - # run a forecast on your own data with period = 0 so that it is in-sample data onlyl - #df = model.predict(model.make_future_dataframe(periods=0))[['y', 'yhat']] - df = (model - .history[['ds', 'y']] - .merge( - model.predict(model.make_future_dataframe(periods=0))[['ds', 'yhat']], - how='inner', on='ds' - ) - ) - - if 'yhat' not in df.columns: + # Input validation + valid_aggregations = ['horizon', 'all', 'none'] + if aggregation not in valid_aggregations: raise ValueError( - 'Please run Cross-Validation first before computing quality metrics.') + 'Aggregation {} is not valid; must be one of {}'.format( + aggregation, valid_agggregations + ) + ) + valid_metrics = ['mse', 'mae', 'mape', 'coverage'] + if metrics is None: + metrics = valid_metrics + if len(set(metrics)) != len(metrics): + raise ValueError('Input metrics must be a list of unique values') + if not set(metrics).issubset(set(valid_metrics)): + raise ValueError( + 'Valid values for metrics are: {}'.format(valid_metrics) + ) + # Get function for the metrics we want + metric_fns = {m: eval(m) for m in metrics} + def all_metrics(df_g): + return pd.Series({name: fn(df_g) for name, fn in metric_fns.items()}) + # Apply functions to groupby + if aggregation == 'all': + return all_metrics(df) + # else, + df_m = df.copy() + df_m['horizon'] = df_m['ds'] - df_m['cutoff'] + if aggregation == 'horizon': + return df_m.groupby('horizon').apply(all_metrics).reset_index() + # else, + for name, fn in metric_fns.items(): + df_m[name] = fn(df_m, agg=False) + return df_m - return { - 'ME':me(df), - 'MSE':mse(df), - 'RMSE': rmse(df), - 'MAE': mae(df), - 'MPE': mpe(df), - 'MAPE': mape(df) - } + +# The functions below specify performance metrics for cross-validation results. +# Each takes as input the output of cross_validation, and has two modes of +# return: if agg=True, returns a float that is the metric aggregated over the +# input. If agg=False, returns results without aggregation (for +# aggregation='none' in performance_metrics). 
+
+
+def mse(df, agg=True):
+    """Mean squared error
+    """
+    se = (df['y'] - df['yhat']) ** 2
+    if agg:
+        return np.mean(se)
+    return se
+
+
+def mae(df, agg=True):
+    """Mean absolute error
+    """
+    ae = np.abs(df['y'] - df['yhat'])
+    if agg:
+        return np.mean(ae)
+    return ae
+
+
+def mape(df, agg=True):
+    """Mean absolute percent error
+    """
+    ape = np.abs((df['y'] - df['yhat']) / df['y'])
+    if agg:
+        return np.mean(ape)
+    return ape
+
+
+def coverage(df, agg=True):
+    """Coverage
+    """
+    is_covered = (df['y'] >= df['yhat_lower']) & (df['y'] <= df['yhat_upper'])
+    if agg:
+        return np.mean(is_covered)
+    return is_covered
diff --git a/python/fbprophet/tests/test_diagnostics.py b/python/fbprophet/tests/test_diagnostics.py
index 02e480c..996d36e 100644
--- a/python/fbprophet/tests/test_diagnostics.py
+++ b/python/fbprophet/tests/test_diagnostics.py
@@ -135,3 +135,64 @@ class TestDiagnostics(TestCase):
             ((df_cv1['y'] - df_cv2['y']) ** 2).sum(), 0.0)
         self.assertAlmostEqual(
             ((df_cv1['yhat'] - df_cv2['yhat']) ** 2).sum(), 0.0)
+
+    def test_performance_metrics(self):
+        m = Prophet()
+        m.fit(self.__df)
+        df_cv = diagnostics.cross_validation(
+            m, horizon='4 days', period='10 days', initial='90 days')
+        # Aggregation level none
+        df_none = diagnostics.performance_metrics(df_cv, aggregation='none')
+        self.assertEqual(
+            set(df_none.columns),
+            {
+                'y', 'yhat', 'yhat_lower', 'yhat_upper', 'ds', 'cutoff',
+                'horizon', 'coverage', 'mae', 'mape', 'mse',
+            },
+        )
+        # Check each metric
+        self.assertEqual(
+            np.abs(df_cv['yhat'][0] - df_cv['y'][0]),
+            df_none['mae'][0],
+        )
+        self.assertEqual(
+            np.abs((df_cv['yhat'][0] - df_cv['y'][0]) / df_cv['y'][0]),
+            df_none['mape'][0],
+        )
+        self.assertEqual(
+            (df_cv['yhat'][0] - df_cv['y'][0]) ** 2,
+            df_none['mse'][0],
+        )
+        self.assertEqual(
+            (
+                (df_cv['y'][0] >= df_cv['yhat_lower'][0])
+                and (df_cv['y'][0] <= df_cv['yhat_upper'][0])
+            ),
+            df_none['coverage'][0],
+        )
+        # Aggregation level horizon (default)
+        df_horizon = diagnostics.performance_metrics(df_cv)
+        self.assertEqual(len(df_horizon['horizon'].unique()), 4)
+        self.assertEqual(
+            set(df_horizon.columns),
+            {'coverage', 'mse', 'mape', 'mae', 'horizon'},
+        )
+        self.assertEqual(df_horizon.shape[0], 4)
+        # Check aggregation
+        agg = df_none.groupby('horizon', as_index=False).agg('mean')
+        for metric in ['mse', 'mape', 'mae', 'horizon']:
+            self.assertTrue((agg[metric] == df_horizon[metric]).all())
+        # Aggregation level all
+        df_all = diagnostics.performance_metrics(df_cv, aggregation='all')
+        self.assertEqual(df_all.shape, (4,))
+        self.assertEqual(set(df_all.index), {'coverage', 'mse', 'mae', 'mape'})
+        for metric in ['mse', 'mape', 'mae', 'coverage']:
+            self.assertEqual(df_all[metric], df_none[metric].mean())
+        # Custom list of metrics
+        df_horizon = diagnostics.performance_metrics(
+            df_cv, metrics=['coverage', 'mse'],
+        )
+        self.assertEqual(
+            set(df_horizon.columns),
+            {'coverage', 'mse', 'horizon'},
+        )
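
For reviewers, a minimal usage sketch of the API this patch introduces. The
file name example_data.csv is hypothetical and stands in for any dataset with
the 'ds' and 'y' columns Prophet expects; the cross_validation arguments are
the same ones used in test_performance_metrics above.

    import pandas as pd
    from fbprophet import Prophet
    from fbprophet.diagnostics import cross_validation, performance_metrics

    # Fit a model on a hypothetical dataset with columns 'ds' and 'y'.
    df = pd.read_csv('example_data.csv')
    m = Prophet()
    m.fit(df)

    # Simulated historical forecasts: cutoffs every 10 days after a 90-day
    # initial training window, each forecasting 4 days past its cutoff.
    df_cv = cross_validation(
        m, horizon='4 days', period='10 days', initial='90 days')

    # Default: one row of metrics per horizon (ds - cutoff).
    print(performance_metrics(df_cv))

    # Un-aggregated metrics for each ds/cutoff pair, or a single Series
    # aggregated over all of them.
    print(performance_metrics(df_cv, aggregation='none'))
    print(performance_metrics(df_cv, aggregation='all'))

    # Compute only a subset of the supported metrics.
    print(performance_metrics(df_cv, metrics=['mse', 'coverage']))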