diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 228e4b5c2..d97e582c3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -680,6 +680,7 @@ if is_torch_available():
             "MODEL_FOR_OBJECT_DETECTION_MAPPING",
             "MODEL_FOR_PRETRAINING_MAPPING",
             "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+            "MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
             "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
             "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
             "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
@@ -2850,6 +2851,7 @@ if TYPE_CHECKING:
         MODEL_FOR_OBJECT_DETECTION_MAPPING,
         MODEL_FOR_PRETRAINING_MAPPING,
         MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
         MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 4e20e7124..c43627e3a 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -103,6 +103,7 @@ if is_torch_available():
         AutoModelForMaskedLM,
         AutoModelForObjectDetection,
         AutoModelForQuestionAnswering,
+        AutoModelForSemanticSegmentation,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
         AutoModelForSpeechSeq2Seq,
@@ -264,7 +265,7 @@ SUPPORTED_TASKS = {
     "image-segmentation": {
         "impl": ImageSegmentationPipeline,
         "tf": (),
-        "pt": (AutoModelForImageSegmentation,) if is_torch_available() else (),
+        "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
         "default": {"model": {"pt": "facebook/detr-resnet-50-panoptic"}},
         "type": "image",
     },
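With AutoModelForSemanticSegmentation registered for the task above, semantic checkpoints resolve through the same "image-segmentation" entry point as panoptic ones. A minimal sketch of the intended call pattern (the default checkpoint facebook/detr-resnet-50-panoptic comes straight from the SUPPORTED_TASKS entry; "cats.png" is a placeholder path):

    from transformers import pipeline

    # With no model argument, the task default "facebook/detr-resnet-50-panoptic" is loaded.
    segmenter = pipeline("image-segmentation")

    # Local paths, URLs and PIL images are all accepted as inputs.
    for result in segmenter("cats.png"):
        # "mask" is a single-channel PIL image; "score" may be None for
        # semantic models that cannot estimate a per-mask confidence.
        print(result["label"], result["score"], result["mask"].size)
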
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 4306f035e..a1eef2e3f 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -1,5 +1,3 @@
-import base64
-import io
 from typing import Any, Dict, List, Union
 
 import numpy as np
@@ -16,8 +14,13 @@ if is_vision_available():
 
 if is_torch_available():
     import torch
+    from torch import nn
+
+    from ..models.auto.modeling_auto import (
+        MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
+    )
 
-    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
 
 logger = logging.get_logger(__name__)
@@ -46,7 +49,9 @@ class ImageSegmentationPipeline(Pipeline):
             raise ValueError(f"The {self.__class__} is only available in PyTorch.")
 
         requires_backends(self, "vision")
-        self.check_model_type(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING)
+        self.check_model_type(
+            dict(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items())
+        )
 
     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
@@ -77,16 +82,16 @@ class ImageSegmentationPipeline(Pipeline):
 
         Return:
            A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
-            dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
-            each image.
+            list of dictionaries, if the input is a list of several images, will return a list of lists of dictionaries
+            corresponding to each image.
 
             The dictionaries contain the following keys:
 
             - **label** (`str`) -- The class label identified by the model.
-            - **score** (`float`) -- The score attributed by the model for that label.
-            - **mask** (`str`) -- base64 string of a grayscale (single-channel) PNG image that contain masks
-              information. The PNG image has size (heigth, width) of the original image. Pixel values in the image are
-              either 0 or 255 (i.e. mask is absent VS mask is present).
+            - **mask** (`PIL.Image`) -- A PIL image of size (height, width) of the original image. Pixel values in the
+              image are in the range 0-255: 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
+            - **score** (*optional* `float`) -- The score attributed by the model to the "object" described by the
+              label and the mask, when the model is able to estimate one; `None` otherwise.
         """
         return super().__call__(*args, **kwargs)
@@ -104,40 +109,55 @@ class ImageSegmentationPipeline(Pipeline):
         model_outputs["target_size"] = target_size
         return model_outputs
 
-    def postprocess(self, model_outputs, threshold=0.9, mask_threshold=0.5):
-        raw_annotations = self.feature_extractor.post_process_segmentation(
-            model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
-        )
-        raw_annotation = raw_annotations[0]
+    def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
+        if hasattr(self.feature_extractor, "post_process_segmentation"):
+            # Panoptic
+            raw_annotations = self.feature_extractor.post_process_segmentation(
+                model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=mask_threshold
+            )
+            raw_annotation = raw_annotations[0]
+            raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
+            raw_annotation["scores"] = raw_annotation["scores"].tolist()
+            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
+            raw_annotation["masks"] = [
+                Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
+            ]
+            # {"scores": [...], ...} --> [{"score":x, ...}, ...]
+            keys = ["score", "label", "mask"]
+            annotation = [
+                dict(zip(keys, vals))
+                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
+            ]
+        else:
+            # Default logits
+            logits = model_outputs.logits
+            logits = logits.softmax(dim=1)
+            if len(logits.shape) != 4:
+                raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
+            batch_size, num_labels, height, width = logits.shape
+            expected_num_labels = len(self.model.config.id2label)
+            if num_labels != expected_num_labels:
+                raise ValueError(
+                    f"Logits don't have expected dimensions, expected [1, {expected_num_labels}, H, W], got {logits.shape}"
+                )
+            size = model_outputs["target_size"].squeeze(0).tolist()
+            logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
+            classes = logits_reshaped.argmax(dim=1)[0]
+            annotation = []
 
-        raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
-
-        raw_annotation["scores"] = raw_annotation["scores"].tolist()
-        raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
-        raw_annotation["masks"] = [self._get_mask_str(mask) for mask in raw_annotation["masks"].cpu().numpy()]
-
-        # {"scores": [...], ...} --> [{"score":x, ...}, ...]
-        keys = ["score", "label", "mask"]
-        annotation = [
-            dict(zip(keys, vals))
-            for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
-        ]
+            for label_id in range(num_labels):
+                label = self.model.config.id2label[label_id]
+                mask = classes == label_id
+                mask_sum = mask.sum()
+                # Remove empty masks.
+                if mask_sum == 0:
+                    continue
+                mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
+                # Semantic segmentation does not output a global score for the mask
+                # so we don't attempt to compute one.
+                # XXX: We could send a mask with values between 0 and 255 instead
+                # of a pure mask to enable users to get the probabilities that
+                # are really outputted by the logits.
+                annotation.append({"score": None, "label": label, "mask": mask})
 
         return annotation
-
-    def _get_mask_str(self, mask: np.array) -> str:
-        """
-        Turns mask numpy array into mask base64 str.
-
-        Args:
-            mask (`np.array`): Numpy array (with shape (heigth, width) of the original image) containing masks
-                information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
-
-        Returns:
-            A base64 string of a single-channel PNG image that contain masks information.
-        """
-        img = Image.fromarray(mask.astype(np.int8), mode="L")
-        with io.BytesIO() as out:
-            img.save(out, format="PNG")
-            png_string = out.getvalue()
-        return base64.b64encode(png_string).decode("utf-8")
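To make the new semantic branch of postprocess easier to follow, here is a self-contained sketch of the same steps on a stand-in logits tensor (the shapes and the id2label map below are invented for illustration): softmax over the class dimension, bilinear upsampling to the original image size, argmax to a per-pixel class map, then one binary PIL mask per non-empty label.

    import numpy as np
    import torch
    from torch import nn
    from PIL import Image

    logits = torch.randn(1, 3, 32, 32)  # stand-in for model logits: [1, num_labels, H, W]
    id2label = {0: "background", 1: "cat", 2: "dog"}  # hypothetical label map
    target_size = (480, 640)  # (height, width) of the original image

    probs = logits.softmax(dim=1)
    # Upsample the class probabilities back to the original resolution.
    probs = nn.functional.interpolate(probs, size=target_size, mode="bilinear", align_corners=False)
    classes = probs.argmax(dim=1)[0]  # per-pixel label ids, shape (480, 640)

    annotation = []
    for label_id, label in id2label.items():
        mask = classes == label_id
        if mask.sum() == 0:  # skip labels absent from the prediction
            continue
        pil_mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
        annotation.append({"score": None, "label": label, "mask": pil_mask})
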
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 4f8049e23..e7462ced9 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -383,6 +383,9 @@ MODEL_FOR_PRETRAINING_MAPPING = None
 MODEL_FOR_QUESTION_ANSWERING_MAPPING = None
 
 
+MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = None
+
+
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
diff --git a/tests/test_modeling_segformer.py b/tests/test_modeling_segformer.py
index 3bb293065..f359e2378 100644
--- a/tests/test_modeling_segformer.py
+++ b/tests/test_modeling_segformer.py
@@ -101,7 +101,11 @@ class SegformerModelTester:
         if self.use_labels:
             labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
 
-        config = SegformerConfig(
+        config = self.get_config()
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return SegformerConfig(
             image_size=self.image_size,
             num_channels=self.num_channels,
             num_encoder_blocks=self.num_encoder_blocks,
@@ -114,8 +118,6 @@ class SegformerModelTester:
             initializer_range=self.initializer_range,
         )
 
-        return config, pixel_values, labels
-
     def create_and_check_model(self, config, pixel_values, labels):
         model = SegformerModel(config=config)
         model.to(torch_device)
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index 05fa383ce..9dbb3d2bc 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -126,14 +126,14 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
 
 
 class ANY:
-    def __init__(self, _type):
-        self._type = _type
+    def __init__(self, *_types):
+        self._types = _types
 
     def __eq__(self, other):
-        return isinstance(other, self._type)
+        return isinstance(other, self._types)
 
     def __repr__(self):
-        return f"ANY({self._type.__name__})"
+        return f"ANY({', '.join(_type.__name__ for _type in self._types)})"
 
 
 class PipelineTestCaseMeta(type):
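The widened ANY helper matters here because a segmentation entry's "score" can now be either a float (panoptic models) or None (semantic models). A standalone illustration of the behaviour, reusing the class as written above with hypothetical assertions:

    class ANY:
        def __init__(self, *_types):
            self._types = _types

        def __eq__(self, other):
            # isinstance accepts a tuple of types, so several types "or" together.
            return isinstance(other, self._types)

        def __repr__(self):
            return f"ANY({', '.join(_type.__name__ for _type in self._types)})"

    # Both a float score and a missing (None) score match the same expectation.
    assert {"score": 0.5, "label": "cat"} == {"score": ANY(float, type(None)), "label": ANY(str)}
    assert {"score": None, "label": "cat"} == {"score": ANY(float, type(None)), "label": ANY(str)}

Note that the segmentation tests keep ANY on the expected (left-hand) side: PIL.Image defines its own __eq__, so putting the real outputs first would never fall back to ANY.__eq__.
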
diff --git a/tests/test_pipelines_image_segmentation.py b/tests/test_pipelines_image_segmentation.py
index 99fab7db3..afb360f36 100644
--- a/tests/test_pipelines_image_segmentation.py
+++ b/tests/test_pipelines_image_segmentation.py
@@ -15,10 +15,14 @@
 import hashlib
 import unittest
 
+import datasets
+
 from transformers import (
     MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     AutoFeatureExtractor,
     AutoModelForImageSegmentation,
+    DetrForSegmentation,
     ImageSegmentationPipeline,
     is_vision_available,
     pipeline,
@@ -46,12 +50,23 @@ else:
 
     pass
 
 
+def hashimage(image: Image) -> str:
+    m = hashlib.md5(image.tobytes())
+    return m.hexdigest()
+
+
 @require_vision
 @require_timm
 @require_torch
 @is_pipeline_test
 class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
-    model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
+    model_mapping = {
+        k: v
+        for k, v in (
+            list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
+        )
+        + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+    }
 
     def get_test_pipeline(self, model, tokenizer, feature_extractor):
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
@@ -62,34 +77,59 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     def run_pipeline_test(self, image_segmenter, examples):
         outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
-        self.assertEqual(outputs, [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12)
-
-        import datasets
+        self.assertIsInstance(outputs, list)
+        n = len(outputs)
+        self.assertGreater(n, 1)
+        # XXX: PIL.Image implements __eq__ which bypasses ANY, so we invert the comparison
+        # to make it work
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)
 
         dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
 
-        batch = [
-            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
-            # RGBA
-            dataset[0]["file"],
-            # LA
-            dataset[1]["file"],
-            # L
-            dataset[2]["file"],
-        ]
-        outputs = image_segmenter(batch, threshold=0.0)
+        # RGBA
+        outputs = image_segmenter(dataset[0]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # LA
+        outputs = image_segmenter(dataset[1]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # L
+        outputs = image_segmenter(dataset[2]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+
+        if isinstance(image_segmenter.model, DetrForSegmentation):
+            # We need to test batch_size with images of the same size.
+            # Detr doesn't normalize the size of the images, meaning we can have
+            # 800x800 or 800x1200, so we cannot batch simply.
+            # We simply bail on this for now.
+            batch_size = 1
+        else:
+            batch_size = 2
+
+        # 5 times the same image so the output shape is predictable
+        batch = [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)
         self.assertEqual(len(batch), len(outputs))
+        self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
+        self.assertEqual(len(outputs[0]), n)
         self.assertEqual(
-            outputs,
             [
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
             ],
+            outputs,
+            f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
         )
 
     @require_tf
@@ -108,7 +148,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
         for o in outputs:
             # shortening by hashing
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -116,12 +156,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             [
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
             ],
         )
@@ -135,7 +175,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         )
         for output in outputs:
             for o in output:
-                o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+                o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -144,29 +184,54 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             [
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
             ],
         )
 
+    @require_torch
+    def test_small_model_pt_semantic(self):
+        model_id = "hf-internal-testing/tiny-random-beit-pipeline"
+        image_segmenter = pipeline(model=model_id)
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        for o in outputs:
+            # shortening by hashing
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "LABEL_0",
+                    "mask": "01245d8ad25d03f09493ca97965788ae",
+                },
+                {
+                    "score": None,
+                    "label": "LABEL_1",
+                    "mask": "f741516de8d5196a2c830739b9ac1c8c",
+                },
+            ],
+        )
+
     @require_torch
     @slow
     def test_integration_torch_image_segmentation(self):
@@ -176,7 +241,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
 
         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -234,7 +299,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
 
         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
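
Taken together, the new semantic path can be exercised end to end much like test_small_model_pt_semantic above; a sketch using the same tiny checkpoint (a randomly initialized test model, so the actual masks carry no meaning):

    from transformers import pipeline

    segmenter = pipeline(model="hf-internal-testing/tiny-random-beit-pipeline")
    outputs = segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
    for o in outputs:
        # Semantic models report no per-mask confidence, so "score" is None.
        print(o["label"], o["score"], o["mask"].mode, o["mask"].size)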