From 9ccea7acb1a75dc18d47906dc9baed883ccfeb19 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 3 Nov 2022 14:18:45 +0100 Subject: [PATCH] Fix some doctests after PR 15775 (#20036) * Add skip_special_tokens=True in some doctest * For T5 * Fix for speech_to_text.mdx Co-authored-by: ydshieh --- docs/source/en/model_doc/speech_to_text.mdx | 6 +++--- docs/source/en/model_doc/t5.mdx | 2 +- .../models/speech_to_text/modeling_speech_to_text.py | 2 +- src/transformers/utils/doc.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx index 9d855fceb..95efc5504 100644 --- a/docs/source/en/model_doc/speech_to_text.mdx +++ b/docs/source/en/model_doc/speech_to_text.mdx @@ -57,7 +57,7 @@ be installed as follows: `apt install libsndfile1-dev` >>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") >>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"]) ->>> transcription = processor.batch_decode(generated_ids) +>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True) >>> transcription ['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'] ``` @@ -87,9 +87,9 @@ be installed as follows: `apt install libsndfile1-dev` ... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], ... ) ->>> translation = processor.batch_decode(generated_ids) +>>> translation = processor.batch_decode(generated_ids, skip_special_tokens=True) >>> translation -[" (Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] +["(Vidéo) Si M. 
Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] ``` See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. diff --git a/docs/source/en/model_doc/t5.mdx b/docs/source/en/model_doc/t5.mdx index 966d7ebd3..a03d02d9e 100644 --- a/docs/source/en/model_doc/t5.mdx +++ b/docs/source/en/model_doc/t5.mdx @@ -285,7 +285,7 @@ The predicted tokens will then be placed between the sentinel tokens. >>> sequence_ids = model.generate(input_ids) >>> sequences = tokenizer.batch_decode(sequence_ids) >>> sequences -['<pad><extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>'] +['<pad> <extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>'] ``` diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 730e130b2..0444d7ddb 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1334,7 +1334,7 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel): >>> generated_ids = model.generate(inputs=input_features) - >>> transcription = processor.batch_decode(generated_ids)[0] + >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] >>> transcription 'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel' ```""" diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py index 9e3c7fce7..a5610a32b 100644 --- a/src/transformers/utils/doc.py +++ b/src/transformers/utils/doc.py @@ -201,7 +201,7 @@ PT_QUESTION_ANSWERING_SAMPLE = r""" >>> answer_end_index = outputs.end_logits.argmax() >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] - >>> tokenizer.decode(predict_answer_tokens) + >>> tokenizer.decode(predict_answer_tokens, skip_special_tokens=True) {expected_output} ```