diff --git a/docs/source/custom_datasets.rst b/docs/source/custom_datasets.rst index d28373c55..06a85010d 100644 --- a/docs/source/custom_datasets.rst +++ b/docs/source/custom_datasets.rst @@ -558,15 +558,13 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method end_positions = [] for i in range(len(answers)): start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'])) - end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'])) + end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1)) # if start position is None, the answer passage has been truncated if start_positions[-1] is None: start_positions[-1] = tokenizer.model_max_length + end_positions[-1] = tokenizer.model_max_length - # if end position is None, the 'char_to_token' function points to the space before the correct token - > add + 1 - if end_positions[-1] is None: - end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1) encodings.update({'start_positions': start_positions, 'end_positions': end_positions}) add_token_positions(train_encodings, train_answers)