prophet/python/scripts/generate_holidays_file.py

# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import re
import unicodedata

import pandas as pd
import numpy as np

from holidays import list_supported_countries
from prophet.make_holidays import make_holidays_df


def utf8_to_ascii(text: str) -> str:
    """Return a lossy ASCII rendering of ``text``.

    Holiday names often contain non-ASCII characters, which are not allowed in
    R package data (they generate a NOTE). The text is NFD-normalized and every
    character that still cannot be encoded as ASCII is discarded.
    TODO: revisit whether we want to do this lossy conversion.

    Returns the string ``"FAILED_TO_PARSE"`` when the result contains nothing
    besides spaces, parentheses and commas.
    """
    decomposed = unicodedata.normalize("NFD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    # A bracketed part whose characters were all discarded leaves a dangling
    # "()" at the end; remove it, then trim surrounding whitespace.
    cleaned = re.sub(r"\(\)$", "", ascii_only).strip()

    # If only filler characters remain, no real character survived.
    filler = (" ", "(", ")", ",")
    if all(ch in filler for ch in cleaned):
        return "FAILED_TO_PARSE"
    return cleaned


def generate_holidays_df() -> pd.DataFrame:
    """Build one table of holidays for all supported countries, years 1995-2044.

    ``make_holidays_df`` is called for every country code returned by
    ``list_supported_countries`` plus the extra code ``"TU"``. The per-country
    frames are concatenated, a ``country`` column and a ``year`` column
    (taken from ``ds``) are added, and the rows are sorted by country, date
    and holiday name. Holiday names are then converted to ASCII with
    ``utf8_to_ascii``.

    Returns:
        The combined holidays DataFrame.

    Raises:
        AssertionError: If any holiday name could not be converted to ASCII.
            The affected countries are printed before raising.
    """
    country_codes = set(list_supported_countries().keys())

    # For compatibility with Turkey as 'TU' cases.
    country_codes.add("TU")

    all_holidays = []
    for country_code in country_codes:
        df = make_holidays_df(
            year_list=np.arange(1995, 2045, 1).tolist(),
            country=country_code,
        )
        df["country"] = country_code
        all_holidays.append(df)

    generated_holidays = pd.concat(all_holidays, axis=0, ignore_index=True)
    generated_holidays["year"] = generated_holidays.ds.dt.year
    generated_holidays.sort_values(["country", "ds", "holiday"], inplace=True)

    # Convert to ASCII, and fail loudly if any holiday name fails to convert.
    generated_holidays["holiday"] = generated_holidays["holiday"].apply(utf8_to_ascii)
    failed_countries = generated_holidays.loc[
        generated_holidays["holiday"] == "FAILED_TO_PARSE", "country"
    ].unique()
    if len(failed_countries) > 0:
        print("Failed to convert UTF-8 holidays for:")
        print("\n".join(failed_countries))
        # Raise explicitly rather than via `assert`: assert statements are
        # removed under `python -O`, which would let placeholder rows through.
        # AssertionError is kept so the exception type is unchanged.
        raise AssertionError(
            "Holiday names failed ASCII conversion for: "
            + ", ".join(failed_countries)
        )
    return generated_holidays


if __name__ == "__main__":
    import argparse
    import pathlib

    # The default output path below is relative to the working directory,
    # so the script insists on being launched from the "python" directory.
    if pathlib.Path.cwd().stem != "python":
        raise RuntimeError("Run script from prophet/python directory")

    OUT_CSV_PATH = pathlib.Path(".", "..", "R/data-raw/generated_holidays.csv")

    # Optional -o/--outfile overrides where the CSV is written.
    cli = argparse.ArgumentParser()
    cli.add_argument("-o", "--outfile", default=OUT_CSV_PATH)
    outfile = cli.parse_args().outfile

    # Generate the holidays table and write it without the index column.
    generate_holidays_df().to_csv(outfile, index=False)
Change to MIT license 2019-05-21 18:40:04 +00:00			`# Copyright (c) Facebook, Inc. and its affiliates.`

			`# This source code is licensed under the MIT license found in the`
			`# LICENSE file in the root directory of this source tree.`
Fix bug about holiday year different from input year and others (#670) * Allow both both hoidays and append holidays Match holidays in predict and fit Add test for append_holiday features; minor fixes Add column name validation for append_holidays names; allow only one country Fix bug about holiday year different from input year and also other bugs Change function description Add append holiday feature for R * Add test for R/Pyrhon; fix bugs 2018-10-19 00:40:17 +00:00
			`from __future__ import absolute_import`
			`from __future__ import division`
			`from __future__ import print_function`
			`from __future__ import unicode_literals`

Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`import re`
Clean non-ASCII characters out of generated_holidays 2018-12-04 22:49:44 +00:00			`import unicodedata`

Fix bug about holiday year different from input year and others (#670) * Allow both both hoidays and append holidays Match holidays in predict and fit Add test for append_holiday features; minor fixes Add column name validation for append_holidays names; allow only one country Fix bug about holiday year different from input year and also other bugs Change function description Add append holiday feature for R * Add test for R/Pyrhon; fix bugs 2018-10-19 00:40:17 +00:00			`import pandas as pd`
			`import numpy as np`
Clean non-ASCII characters out of generated_holidays 2018-12-04 22:49:44 +00:00
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`from holidays import list_supported_countries`
Pakage rename (#1844) * Rename package from fbprophet to prophet, and add shim * Untrack files that should have been ignored * Update github actions build commands 2021-03-21 21:13:50 +00:00			`from prophet.make_holidays import make_holidays_df`
Fix bug about holiday year different from input year and others (#670) * Allow both both hoidays and append holidays Match holidays in predict and fit Add test for append_holiday features; minor fixes Add column name validation for append_holidays names; allow only one country Fix bug about holiday year different from input year and also other bugs Change function description Add append holiday feature for R * Add test for R/Pyrhon; fix bugs 2018-10-19 00:40:17 +00:00

Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`def utf8_to_ascii(text: str) -> str:`
			`"""Holidays often have utf-8 characters. These are not allowed in R package data (they generate a NOTE).`
Update holidays csv for R, and fix bad utf-8 parsing 2020-02-04 23:12:28 +00:00			`TODO: revisit whether we want to do this lossy conversion.`
			`"""`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`ascii_text = unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii")`
			`# Remove trailing empty brackets and spaces.`
			`ascii_text = re.sub(r"\(\)$", "", ascii_text).strip()`

Update holidays csv for R, and fix bad utf-8 parsing 2020-02-04 23:12:28 +00:00			`# Check if anything converted`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`if sum(1 for x in ascii_text if x not in [" ", "(", ")", ","]) == 0:`
			`return "FAILED_TO_PARSE"`
Update holidays csv for R, and fix bad utf-8 parsing 2020-02-04 23:12:28 +00:00			`else:`
			`return ascii_text`


Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`def generate_holidays_df() -> pd.DataFrame:`
			`"""Generate csv file of all possible holiday names, ds, and countries, year combination."""`
			`country_codes = set(list_supported_countries().keys())`

			`# For compatibility with Turkey as 'TU' cases.`
			`country_codes.add("TU")`

Fix bug about holiday year different from input year and others (#670) * Allow both both hoidays and append holidays Match holidays in predict and fit Add test for append_holiday features; minor fixes Add column name validation for append_holidays names; allow only one country Fix bug about holiday year different from input year and also other bugs Change function description Add append holiday feature for R * Add test for R/Pyrhon; fix bugs 2018-10-19 00:40:17 +00:00			`all_holidays = []`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`for country_code in country_codes:`
			`df = make_holidays_df(`
			`year_list=np.arange(1995, 2045, 1).tolist(),`
			`country=country_code,`
			`)`
			`df["country"] = country_code`
Clean up script for R holidays, and regenerate 2020-08-28 01:36:18 +00:00			`all_holidays.append(df)`
Fix bug about holiday year different from input year and others (#670) * Allow both both hoidays and append holidays Match holidays in predict and fit Add test for append_holiday features; minor fixes Add column name validation for append_holidays names; allow only one country Fix bug about holiday year different from input year and also other bugs Change function description Add append holiday feature for R * Add test for R/Pyrhon; fix bugs 2018-10-19 00:40:17 +00:00
			`generated_holidays = pd.concat(all_holidays, axis=0, ignore_index=True)`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`generated_holidays["year"] = generated_holidays.ds.dt.year`
			`generated_holidays.sort_values(["country", "ds", "holiday"], inplace=True)`
Clean non-ASCII characters out of generated_holidays 2018-12-04 22:49:44 +00:00
Update holidays csv for R, and fix bad utf-8 parsing 2020-02-04 23:12:28 +00:00			`# Convert to ASCII, and drop holidays that fail to convert`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`generated_holidays["holiday"] = generated_holidays["holiday"].apply(utf8_to_ascii)`
			`failed_countries = generated_holidays.loc[`
			`generated_holidays["holiday"] == "FAILED_TO_PARSE", "country"`
			`].unique()`
Bump R version and update holidays file for release (#2349) 2023-01-20 10:52:45 +00:00			`if len(failed_countries) > 0:`
Update holidays data (#2179) 2022-05-24 14:10:46 +00:00			`print("Failed to convert UTF-8 holidays for:")`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`print("\n".join(failed_countries))`
			`assert "FAILED_TO_PARSE" not in generated_holidays["holiday"].unique()`
			`return generated_holidays`
Fix bug about holiday year different from input year and others (#670) * Allow both both hoidays and append holidays Match holidays in predict and fit Add test for append_holiday features; minor fixes Add column name validation for append_holidays names; allow only one country Fix bug about holiday year different from input year and also other bugs Change function description Add append holiday feature for R * Add test for R/Pyrhon; fix bugs 2018-10-19 00:40:17 +00:00

			`if __name__ == "__main__":`
Update python-holidays integration (#2379) 2023-05-28 07:31:59 +00:00			`import argparse`
			`import pathlib`

			`if not pathlib.Path.cwd().stem == "python":`
			`raise RuntimeError("Run script from prophet/python directory")`
			`OUT_CSV_PATH = pathlib.Path(".") / ".." / "R/data-raw/generated_holidays.csv"`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("-o", "--outfile", default=OUT_CSV_PATH)`
			`args = parser.parse_args()`
			`df = generate_holidays_df()`
			`df.to_csv(args.outfile, index=False)`