diff --git a/.circleci/config.yml b/.circleci/config.yml index 63c6162fc..66678d0d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -43,6 +43,12 @@ jobs: else touch test_preparation/test_list.txt fi + - run: | + if [ -f doctest_list.txt ]; then + cp doctest_list.txt test_preparation/doctest_list.txt + else + touch test_preparation/doctest_list.txt + fi - run: | if [ -f test_repo_utils.txt ]; then mv test_repo_utils.txt test_preparation/test_repo_utils.txt @@ -71,6 +77,8 @@ jobs: fi - store_artifacts: path: test_preparation/test_list.txt + - store_artifacts: + path: test_preparation/doctest_list.txt - store_artifacts: path: ~/transformers/test_preparation/filtered_test_list.txt - store_artifacts: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index ef100bdbb..4bc5ce17d 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -483,7 +483,6 @@ REGULAR_TESTS = [ hub_job, onnx_job, exotic_models_job, - doc_test_job ] EXAMPLES_TESTS = [ examples_torch_job, @@ -495,6 +494,8 @@ PIPELINE_TESTS = [ pipelines_tf_job, ] REPO_UTIL_TESTS = [repo_utils_job] +DOC_TESTS = [doc_test_job] + def create_circleci_config(folder=None): if folder is None: @@ -552,6 +553,15 @@ def create_circleci_config(folder=None): if os.path.exists(example_file) and os.path.getsize(example_file) > 0: jobs.extend(EXAMPLES_TESTS) + doctest_file = os.path.join(folder, "doctest_list.txt") + if os.path.exists(doctest_file): + with open(doctest_file) as f: + doctest_list = f.read() + else: + doctest_list = [] + if len(doctest_list) > 0: + jobs.extend(DOC_TESTS) + repo_util_file = os.path.join(folder, "test_repo_utils.txt") if os.path.exists(repo_util_file) and os.path.getsize(repo_util_file) > 0: jobs.extend(REPO_UTIL_TESTS) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 8aa1015f3..05009e975 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -116,6 +116,26 @@ def 
def keep_doc_examples_only(content):
    """
    Reduce `content` to the text of its fenced doc examples.

    Only the text found between pairs of triple backticks is kept. In that text, everything following a `#` sign
    is stripped and lines that end up empty or whitespace-only are dropped, so that comment-only or blank-line-only
    edits inside an example do not change the result.

    Args:
        content (`str`): The full text to filter.

    Returns:
        `str`: The kept lines joined with newlines. When `content` holds no fence, this is the six-backtick string
        made of the leading and trailing fences only.
    """
    # Splitting on the fence leaves the text found *inside* fences at the odd indices.
    examples = content.split("```")[1::2]
    # Add leading and trailing fences so the navigation is easier when compared to the original input `content`.
    fenced = "```" + "```".join(examples) + "```"

    lines_to_keep = []
    for line in fenced.split("\n"):
        # Remove anything that is after a # sign.
        line = re.sub(r"#.*$", "", line)
        if not line or line.isspace():
            continue
        lines_to_keep.append(line)
    return "\n".join(lines_to_keep)


def diff_contains_doc_examples(repo, branching_point, filename):
    """
    Check whether the doc examples of `filename` differ between `branching_point` and the current checkout.

    Both versions of the file are filtered with `keep_doc_examples_only` before being compared, so only changes
    that survive that filtering are detected.

    Args:
        repo: Repository object. Its `working_dir` is the folder `filename` is resolved against, and it is handed
            to `checkout_commit`.
        branching_point: Commit handed to `checkout_commit` to read the old version of the file.
        filename (`str`): Path of the file, relative to `repo.working_dir`.

    Returns:
        `bool`: `True` if the filtered old and new contents differ, `False` otherwise.
    """
    folder = Path(repo.working_dir)
    # Old version: read while `branching_point` is checked out.
    with checkout_commit(repo, branching_point):
        with open(folder / filename, "r", encoding="utf-8") as f:
            old_content = f.read()

    # New version: read once the context manager above has exited.
    with open(folder / filename, "r", encoding="utf-8") as f:
        new_content = f.read()

    return keep_doc_examples_only(old_content) != keep_doc_examples_only(new_content)
def get_diff_for_doctesting(repo, base_commit, commits):
    """
    Get the python/mdx files whose doc example(s) may have changed between one or several commits and
    `base_commit`.

    Args:
        repo: Repository object, only forwarded to `diff_contains_doc_examples`.
        base_commit: Commit each entry of `commits` is diffed against (`commit.diff(base_commit)`).
        commits: Iterable of commits to diff against `base_commit`.

    Returns:
        `List[str]`: Paths collected from the diffs, in the order they were met:
        - added `.py`/`.mdx` files are always kept,
        - renamed `.py`/`.mdx` files contribute both their old and their new path,
        - modified `.py`/`.mdx` files are kept only if `diff_contains_doc_examples` reports a change.
        Entries with any other change type (e.g. deletions) or any other extension are ignored.
    """
    print("\n### DIFF ###\n")
    code_diff = []
    for commit in commits:
        for diff_obj in commit.diff(base_commit):
            # Check the extension once, up front. The former condition `M/R and .py or .mdx` was parsed as
            # `(M/R and .py) or .mdx`, which sent every non-added mdx entry (deleted ones included) to the
            # "modified" branch below.
            if not diff_obj.b_path.endswith((".py", ".mdx")):
                continue

            # We always add new python/mdx files
            if diff_obj.change_type == "A":
                code_diff.append(diff_obj.b_path)
            # Now for modified files
            elif diff_obj.change_type in ("M", "R"):
                # In case of renames, we'll look at the tests using both the old and new name.
                if diff_obj.a_path != diff_obj.b_path:
                    code_diff.extend([diff_obj.a_path, diff_obj.b_path])
                # Otherwise, we check modifications contain some doc example(s).
                elif diff_contains_doc_examples(repo, commit, diff_obj.b_path):
                    code_diff.append(diff_obj.a_path)
                else:
                    print(f"Ignoring diff in {diff_obj.b_path} as it doesn't contain any doc example.")

    return code_diff
""" repo = Repo(PATH_TO_REPO) + test_files_to_run = [] # noqa if not diff_with_last_commit: print(f"main is at {repo.refs.main.commit}") print(f"Current head is at {repo.head.commit}") @@ -249,23 +301,14 @@ def get_modified_python_and_mdx_files(diff_with_last_commit=False): branching_commits = repo.merge_base(repo.refs.main, repo.head) for commit in branching_commits: print(f"Branching commit: {commit}") - return get_diff_for_py_and_mdx_files(repo, repo.head.commit, branching_commits) + test_files_to_run = get_diff_for_doctesting(repo, repo.head.commit, branching_commits) else: print(f"main is at {repo.head.commit}") parent_commits = repo.head.commit.parents for commit in parent_commits: print(f"Parent commit: {commit}") - return get_diff_for_py_and_mdx_files(repo, repo.head.commit, parent_commits) + test_files_to_run = get_diff_for_doctesting(repo, repo.head.commit, parent_commits) - -def get_doctest_files(diff_with_last_commit=False): - """ - Return a list of python and mdx files that have been modified between: - - - the current head and the main branch if `diff_with_last_commit=False` (default) - - the current head and its parent commit otherwise. - """ - test_files_to_run = get_modified_python_and_mdx_files(diff_with_last_commit) with open("utils/documentation_tests.txt") as fp: documentation_tests = set(fp.read().strip().split("\n")) # So far we don't have 100% coverage for doctest. This line will be removed once we achieve 100%. @@ -647,6 +690,14 @@ def infer_tests_to_run( create_json_map(test_files_to_run, json_output_file) + doctest_list = get_doctest_files() + + print(f"\n### DOCTEST TO RUN ###\n{_print_list(doctest_list)}") + if len(doctest_list) > 0: + doctest_file = Path(output_file).parent / "doctest_list.txt" + with open(doctest_file, "w", encoding="utf-8") as f: + f.write(" ".join(doctest_list)) + def filter_tests(output_file, filters): """