From ed0b4303e3bbfe4e452af2adc917ef5c17a5839f Mon Sep 17 00:00:00 2001
From: Matt
Date: Thu, 5 Dec 2024 18:31:08 +0000
Subject: [PATCH] Fix the structure of images output by the processor

---
 src/transformers/models/pixtral/processing_pixtral.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py
index 2e6d50f9e..53a31abf1 100644
--- a/src/transformers/models/pixtral/processing_pixtral.py
+++ b/src/transformers/models/pixtral/processing_pixtral.py
@@ -206,7 +206,11 @@ class PixtralProcessor(ProcessorMixin):
             if is_image_or_image_url(images):
                 images = [[images]]
             elif isinstance(images, list) and is_image_or_image_url(images[0]):
-                images = [images]
+                if isinstance(text, str) or isinstance(text, list) and len(text) == 1:
+                    # If there's a single sample, all images must belong to it
+                    images = [images]
+                else:
+                    raise ValueError("You have supplied multiple text samples, but only a flat list of images. When processing multiple samples, `images` should be a list of lists of images, one list per sample.")
             elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]):
                 pass
             else: