diff --git a/docs/source/en/tasks/language_modeling.mdx b/docs/source/en/tasks/language_modeling.mdx index b3b6dd753..4aa368fe0 100644 --- a/docs/source/en/tasks/language_modeling.mdx +++ b/docs/source/en/tasks/language_modeling.mdx @@ -141,6 +141,7 @@ Now you need a second preprocessing function to capture text truncated from any >>> def group_texts(examples): ... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} ... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... total_length = (total_length // block_size) * block_size ... result = { ... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] ... for k, t in concatenated_examples.items() diff --git a/docs/source/es/tasks/language_modeling.mdx b/docs/source/es/tasks/language_modeling.mdx index 33962a498..565185072 100644 --- a/docs/source/es/tasks/language_modeling.mdx +++ b/docs/source/es/tasks/language_modeling.mdx @@ -141,6 +141,7 @@ Ahora necesitas una segunda función de preprocesamiento para capturar el texto >>> def group_texts(examples): ... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} ... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... total_length = (total_length // block_size) * block_size ... result = { ... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] ... for k, t in concatenated_examples.items()