# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import
|
|
|
|
|
from __future__ import division
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
2023-05-28 07:31:59 +00:00
|
|
|
import re
|
2018-12-04 22:49:44 +00:00
|
|
|
import unicodedata
|
|
|
|
|
|
2018-10-19 00:40:17 +00:00
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
2018-12-04 22:49:44 +00:00
|
|
|
|
2023-05-28 07:31:59 +00:00
|
|
|
from holidays import list_supported_countries
|
2021-03-21 21:13:50 +00:00
|
|
|
from prophet.make_holidays import make_holidays_df
|
2018-10-19 00:40:17 +00:00
|
|
|
|
|
|
|
|
|
2023-05-28 07:31:59 +00:00
|
|
|
def utf8_to_ascii(text: str) -> str:
    """Holidays often have utf-8 characters. These are not allowed in R package data (they generate a NOTE).

    Lossy conversion: decompose accented characters (NFD) and keep only the
    ASCII code points, then tidy up the remainder. Returns the sentinel
    string "FAILED_TO_PARSE" when nothing meaningful survives the conversion
    (e.g. a fully non-Latin holiday name).

    TODO: revisit whether we want to do this lossy conversion.
    """
    # Split accents off their base letters, then drop every non-ASCII code point.
    decomposed = unicodedata.normalize("NFD", text)
    converted = decomposed.encode("ascii", "ignore").decode("ascii")

    # Remove trailing empty brackets and spaces.
    converted = re.sub(r"\(\)$", "", converted).strip()

    # Check if anything converted: a result made up solely of spaces,
    # parentheses and commas (or nothing at all) means the name was lost.
    filler = {" ", "(", ")", ","}
    if all(ch in filler for ch in converted):
        return "FAILED_TO_PARSE"
    return converted
|
|
|
|
|
|
|
|
|
|
|
2023-05-28 07:31:59 +00:00
|
|
|
def generate_holidays_df() -> pd.DataFrame:
    """Generate csv file of all possible holiday names, ds, and countries, year combination.

    Returns:
        DataFrame with columns ``ds``, ``holiday``, ``country`` and ``year``,
        covering years 1995-2044 for every supported country, sorted by
        country, date and holiday name, with holiday names converted to ASCII.

    Raises:
        RuntimeError: if any country's holiday names could not be converted
            to ASCII (the offending countries are printed first).
    """
    country_codes = set(list_supported_countries().keys())

    # For compatibility with Turkey as 'TU' cases.
    country_codes.add("TU")

    all_holidays = []
    for country_code in country_codes:
        df = make_holidays_df(
            # Same values as np.arange(1995, 2045, 1).tolist(), without the detour.
            year_list=list(range(1995, 2045)),
            country=country_code,
        )
        df["country"] = country_code
        all_holidays.append(df)

    generated_holidays = pd.concat(all_holidays, axis=0, ignore_index=True)
    generated_holidays["year"] = generated_holidays.ds.dt.year
    generated_holidays.sort_values(["country", "ds", "holiday"], inplace=True)

    # Convert to ASCII; abort (not drop) if any holiday name fails to convert,
    # so the sentinel never leaks into the generated CSV.
    generated_holidays["holiday"] = generated_holidays["holiday"].apply(utf8_to_ascii)
    failed_countries = generated_holidays.loc[
        generated_holidays["holiday"] == "FAILED_TO_PARSE", "country"
    ].unique()
    if len(failed_countries) > 0:
        print("Failed to convert UTF-8 holidays for:")
        print("\n".join(failed_countries))
        # Previously a bare `assert`, which is stripped under `python -O`;
        # raise explicitly so the failure can never be silently skipped.
        raise RuntimeError(
            "Failed to convert UTF-8 holidays for: " + ", ".join(failed_countries)
        )
    return generated_holidays
|
2018-10-19 00:40:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse
    import pathlib

    # Must be launched from the python/ subdirectory so the relative
    # default output path below resolves into the R package's data-raw dir.
    cwd = pathlib.Path.cwd()
    if cwd.stem != "python":
        raise RuntimeError("Run script from prophet/python directory")

    OUT_CSV_PATH = pathlib.Path(".") / ".." / "R/data-raw/generated_holidays.csv"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-o", "--outfile", default=OUT_CSV_PATH)
    cli_args = arg_parser.parse_args()

    holidays_df = generate_holidays_df()
    holidays_df.to_csv(cli_args.outfile, index=False)
|