diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index e9307ee37..349a2253a 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -18,6 +18,8 @@ Processor class for LLaVa-NeXT-Video.
 
 from typing import TYPE_CHECKING, List, Optional, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils import select_best_resolution
 from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
@@ -193,7 +195,11 @@ class LlavaNextVideoProcessor(ProcessorMixin):
 
         # videos are easier, simply get frames and multiply
         if videos_inputs:
-            one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
+            one_video = videos_inputs.get("pixel_values_videos")[0]
+            if isinstance(one_video, (list, tuple)):
+                one_video = np.array(one_video)
+            else:
+                one_video = to_numpy_array(one_video)
             height, width = get_image_size(one_video[0])
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
 
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index 81760fb40..eded6084e 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -20,6 +20,8 @@ import math
 import os
 from typing import Iterable, List, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils import select_best_resolution
 from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
@@ -164,7 +166,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
 
         if videos is not None:
             video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
-            one_video = to_numpy_array(video_inputs.get("pixel_values_videos")[0])
+            one_video = video_inputs.get("pixel_values_videos")[0]
+            if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
+                one_video = np.array(one_video)
+            else:
+                one_video = to_numpy_array(one_video)
             height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
             patches_height_width = int(math.sqrt(self.num_image_tokens))
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index 9d11f4268..19c10f8b4 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -18,6 +18,8 @@ Processor class for VideoLlava.
 
 from typing import List, Optional, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
 from ...processing_utils import ProcessorMixin
@@ -165,7 +167,11 @@ class VideoLlavaProcessor(ProcessorMixin):
 
             num_frames = 1
             if "pixel_values_videos" in encoded_images.keys():
-                one_video = to_numpy_array(encoded_images.get("pixel_values_videos")[0])
+                one_video = encoded_images.get("pixel_values_videos")[0]
+                if isinstance(encoded_images.get("pixel_values_videos")[0], (list, tuple)):
+                    one_video = np.array(one_video)
+                else:
+                    one_video = to_numpy_array(one_video)
                 height, width = get_image_size(one_video[0])
                 num_frames = one_video.shape[0]  # frame dim is always after batch dim