Mirror of https://github.com/saymrwulf/transformers.git (synced 2026-05-14 20:58:08 +00:00)
Enable image-segmentation on AutoModelForSemanticSegmentation (#15647)
* Enabling Beit and SegFormer in `image-segmentation`.
* Fixing the score.
* Fix import?
* Missing in type hint.
* Multiple test fixes:
  - Add `raw_image` support. It should be the default IMHO, since in the Python world it doesn't make any sense to base64-encode the image (sorry @mishig, didn't catch that in my review). I really think we should consider breaking BC here.
  - Add support for the Segformer tiny test (needed `SegformerModelTester.get_config` to enable TinyConfig, @NielsRogge).
  - Add the check that `batch_size` works correctly on that pipeline. Uncovered that it doesn't for Detr, which IMO is OK since images after `feature_extractor` don't have the same size. A comment should explain this.
* Type hint as a string.
* Make fixup + update black.
* torch+vision protections.
* Don't use torchvision, use F.interpolate instead (no new dep).
* Last fixes for Segformer.
* Update test to reflect new image (which was broken).
* Update tests.
* Major BC modification:
  - Removed the base64-compressed PNG string; that's a job for users, `transformers` stays in Python land.
  - Removed the `score` for semantic segmentation. It hardly has a meaning on its own in this context.
  - Don't include the grayscale mask with logits for now (which could let users get a sense of confidence). Might be done later.
  - Don't include the surface of the mask (could be used by users for sorting or filtering out small masks). It's already calculable, and it's easier to add later than to add now and break later if we need to.
* `make fixup`.
* Small changes.
* Rebase + doc fixup.
This commit is contained in:
parent 1b23979736
commit 9e71d46455
7 changed files with 176 additions and 83 deletions
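With this change the `image-segmentation` task accepts semantic-segmentation models (Beit, SegFormer, via `AutoModelForSemanticSegmentation`) alongside the existing panoptic DETR path. A minimal usage sketch, assuming a SegFormer checkpoint such as `nvidia/segformer-b0-finetuned-ade-512-512` (the checkpoint choice is illustrative, not part of this diff):

```python
from transformers import pipeline

# Any AutoModelForSemanticSegmentation-compatible checkpoint should work here.
segmenter = pipeline("image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512")

results = segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
for result in results:
    # Each result now carries a PIL.Image mask instead of a base64 PNG string;
    # semantic models report score=None since no per-mask confidence is computed.
    print(result["label"], result["score"], result["mask"].size)
```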
@@ -680,6 +680,7 @@ if is_torch_available():
             "MODEL_FOR_OBJECT_DETECTION_MAPPING",
             "MODEL_FOR_PRETRAINING_MAPPING",
             "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+            "MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
             "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
             "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
             "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
@@ -2850,6 +2851,7 @@ if TYPE_CHECKING:
         MODEL_FOR_OBJECT_DETECTION_MAPPING,
         MODEL_FOR_PRETRAINING_MAPPING,
         MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
         MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
@@ -103,6 +103,7 @@ if is_torch_available():
         AutoModelForMaskedLM,
         AutoModelForObjectDetection,
         AutoModelForQuestionAnswering,
+        AutoModelForSemanticSegmentation,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
         AutoModelForSpeechSeq2Seq,
@@ -264,7 +265,7 @@ SUPPORTED_TASKS = {
     "image-segmentation": {
         "impl": ImageSegmentationPipeline,
         "tf": (),
-        "pt": (AutoModelForImageSegmentation,) if is_torch_available() else (),
+        "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
         "default": {"model": {"pt": "facebook/detr-resnet-50-panoptic"}},
         "type": "image",
     },
@@ -1,5 +1,3 @@
-import base64
-import io
 from typing import Any, Dict, List, Union

 import numpy as np
@@ -16,8 +14,13 @@ if is_vision_available():

 if is_torch_available():
     import torch
+    from torch import nn

-    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
+    from ..models.auto.modeling_auto import (
+        MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
+    )

 logger = logging.get_logger(__name__)
@@ -46,7 +49,9 @@ class ImageSegmentationPipeline(Pipeline):
             raise ValueError(f"The {self.__class__} is only available in PyTorch.")

         requires_backends(self, "vision")
-        self.check_model_type(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING)
+        self.check_model_type(
+            dict(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items())
+        )

     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
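`check_model_type` now receives the two auto-mappings merged into a single dict, so both the panoptic/instance models (DETR) and the semantic models (Beit, SegFormer) pass the check. The auto-mappings' `items()` return list-like sequences here, which is why the `+` concatenation above works; with ordinary dicts the same merge would be spelled differently, as in this illustration (placeholder keys and values, not code from this diff):

```python
# Plain-dict illustration only: dict.items() views do not support "+",
# so an equivalent merge of two ordinary dicts would use unpacking instead.
panoptic_like = {"detr-config": "DetrForSegmentation"}
semantic_like = {"segformer-config": "SegformerForSemanticSegmentation"}

merged = {**panoptic_like, **semantic_like}
```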
@@ -77,16 +82,16 @@ class ImageSegmentationPipeline(Pipeline):

         Return:
             A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
-            dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
-            each image.
+            list of dictionaries, if the input is a list of several images, will return a list of lists of dictionaries
+            corresponding to each image.

             The dictionaries contain the following keys:

             - **label** (`str`) -- The class label identified by the model.
-            - **score** (`float`) -- The score attributed by the model for that label.
-            - **mask** (`str`) -- base64 string of a grayscale (single-channel) PNG image that contains mask
-              information. The PNG image has the size (height, width) of the original image. Pixel values in the image
-              are either 0 or 255 (i.e. mask is absent VS mask is present).
+            - **mask** (`PIL.Image`) -- PIL Image with the size (height, width) of the original image. Pixel values in
+              the image are in the range 0-255: 0 means the pixel is *not* part of the *label*, 255 means it definitely
+              is.
+            - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the
+              "object" described by the label and the mask.
         """

         return super().__call__(*args, **kwargs)
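Because masks are now `PIL.Image` objects rather than base64 strings, callers work with them directly. A minimal consumer sketch under the new output contract (the output file naming is illustrative, not part of this diff):

```python
from transformers import pipeline

segmenter = pipeline("image-segmentation")  # default checkpoint: facebook/detr-resnet-50-panoptic
outputs = segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")

for idx, out in enumerate(outputs):
    mask = out["mask"]    # PIL.Image in mode "L", same size as the input image
    score = out["score"]  # float for the panoptic path, None for semantic models
    mask.save(f"mask_{idx}_{out['label']}.png")
```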
@@ -104,40 +109,55 @@ class ImageSegmentationPipeline(Pipeline):
         model_outputs["target_size"] = target_size
         return model_outputs

-    def postprocess(self, model_outputs, threshold=0.9, mask_threshold=0.5):
-        raw_annotations = self.feature_extractor.post_process_segmentation(
-            model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
-        )
-        raw_annotation = raw_annotations[0]
-
-        raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
-
-        raw_annotation["scores"] = raw_annotation["scores"].tolist()
-        raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
-        raw_annotation["masks"] = [self._get_mask_str(mask) for mask in raw_annotation["masks"].cpu().numpy()]
-
-        # {"scores": [...], ...} --> [{"score":x, ...}, ...]
-        keys = ["score", "label", "mask"]
-        annotation = [
-            dict(zip(keys, vals))
-            for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
-        ]
-
+    def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
+        if hasattr(self.feature_extractor, "post_process_segmentation"):
+            # Panoptic
+            raw_annotations = self.feature_extractor.post_process_segmentation(
+                model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
+            )
+            raw_annotation = raw_annotations[0]
+            raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
+            raw_annotation["scores"] = raw_annotation["scores"].tolist()
+            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
+            raw_annotation["masks"] = [
+                Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
+            ]
+            # {"scores": [...], ...} --> [{"score":x, ...}, ...]
+            keys = ["score", "label", "mask"]
+            annotation = [
+                dict(zip(keys, vals))
+                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
+            ]
+        else:
+            # Default logits
+            logits = model_outputs.logits
+            logits = logits.softmax(dim=1)
+            if len(logits.shape) != 4:
+                raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
+            batch_size, num_labels, height, width = logits.shape
+            expected_num_labels = len(self.model.config.id2label)
+            if num_labels != expected_num_labels:
+                raise ValueError(
+                    f"Logits don't have expected dimensions, expected [1, {num_labels}, H, W], got {logits.shape}"
+                )
+            size = model_outputs["target_size"].squeeze(0).tolist()
+            logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
+            classes = logits_reshaped.argmax(dim=1)[0]
+            annotation = []
+
+            for label_id in range(num_labels):
+                label = self.model.config.id2label[label_id]
+                mask = classes == label_id
+                mask_sum = mask.sum()
+
+                # Remove empty masks.
+                if mask_sum == 0:
+                    continue
+                mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
+                # Semantic segmentation does not output a global score for the mask
+                # so we don't attempt to compute one.
+                # XXX: We could send a mask with values between 0 and 255 instead
+                # of a pure mask to enable users to get the probabilities that
+                # are really outputted by the logits.
+                annotation.append({"score": None, "label": label, "mask": mask})
         return annotation
-
-    def _get_mask_str(self, mask: np.array) -> str:
-        """
-        Turns mask numpy array into mask base64 str.
-
-        Args:
-            mask (`np.array`): Numpy array (with shape (heigth, width) of the original image) containing masks
-                information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
-
-        Returns:
-            A base64 string of a single-channel PNG image that contain masks information.
-        """
-        img = Image.fromarray(mask.astype(np.int8), mode="L")
-        with io.BytesIO() as out:
-            img.save(out, format="PNG")
-            png_string = out.getvalue()
-            return base64.b64encode(png_string).decode("utf-8")
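The new semantic branch above amounts to softmaxing the logits, upsampling them back to the original resolution with `nn.functional.interpolate` (avoiding a torchvision dependency), and taking a per-pixel argmax. A standalone sketch of that idea with made-up shapes and random tensors, not code from this diff:

```python
import numpy as np
import torch
from torch import nn
from PIL import Image

num_labels, height, width = 3, 16, 16
logits = torch.randn(1, num_labels, 4, 4)  # stand-in for low-resolution model logits
probs = logits.softmax(dim=1)
upsampled = nn.functional.interpolate(probs, size=(height, width), mode="bilinear", align_corners=False)
classes = upsampled.argmax(dim=1)[0]       # (height, width) map of label ids

masks = {
    label_id: Image.fromarray(((classes == label_id) * 255).numpy().astype(np.uint8), mode="L")
    for label_id in range(num_labels)
    if bool((classes == label_id).any())   # skip empty masks, as the pipeline does
}
```

And since `_get_mask_str` is removed and the pipeline no longer base64-encodes masks, users who still need the old string format can rebuild it from the returned `PIL.Image` (the helper name is illustrative):

```python
import base64
import io

def mask_to_base64_png(mask):
    # Re-encode a single-channel PIL mask as a base64 PNG string, like the old pipeline output.
    with io.BytesIO() as buf:
        mask.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")
```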
@@ -383,6 +383,9 @@ MODEL_FOR_PRETRAINING_MAPPING = None
 MODEL_FOR_QUESTION_ANSWERING_MAPPING = None


+MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = None
+
+
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
@@ -101,7 +101,11 @@ class SegformerModelTester:
         if self.use_labels:
             labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)

-        config = SegformerConfig(
+        config = self.get_config()
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return SegformerConfig(
             image_size=self.image_size,
             num_channels=self.num_channels,
             num_encoder_blocks=self.num_encoder_blocks,
@@ -114,8 +118,6 @@ class SegformerModelTester:
             initializer_range=self.initializer_range,
         )

-        return config, pixel_values, labels
-
     def create_and_check_model(self, config, pixel_values, labels):
         model = SegformerModel(config=config)
         model.to(torch_device)
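Moving the config construction into `get_config()` is what lets the pipeline test machinery build a tiny, randomly initialized SegFormer without running `prepare_config_and_inputs()`. Roughly, as a sketch (not code from this diff; `SegformerModelTester` lives in the test module, and its defaults already describe a tiny model):

```python
from transformers import SegformerForSemanticSegmentation

# Sketch: get_config() alone is enough to build a fast, randomly initialized model for tests.
tester = SegformerModelTester(parent=None)
tiny_model = SegformerForSemanticSegmentation(tester.get_config())
```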
@@ -126,14 +126,14 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_


 class ANY:
-    def __init__(self, _type):
-        self._type = _type
+    def __init__(self, *_types):
+        self._types = _types

     def __eq__(self, other):
-        return isinstance(other, self._type)
+        return isinstance(other, self._types)

     def __repr__(self):
-        return f"ANY({self._type.__name__})"
+        return f"ANY({', '.join(_type.__name__ for _type in self._types)})"


 class PipelineTestCaseMeta(type):
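`ANY` now accepts several types, which the segmentation tests rely on because `score` may be a `float` or `None` and masks are `PIL.Image` objects. A quick illustration of the comparison semantics, assuming the `ANY` helper defined above:

```python
any_score = ANY(float, type(None))

assert any_score == 0.87           # a float matches
assert any_score == None           # so does None, since __eq__ is isinstance-based
assert not (any_score == "cat")    # other types do not
assert repr(any_score) == "ANY(float, NoneType)"
```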
@@ -15,10 +15,14 @@
 import hashlib
 import unittest

+import datasets
+
 from transformers import (
     MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     AutoFeatureExtractor,
     AutoModelForImageSegmentation,
+    DetrForSegmentation,
     ImageSegmentationPipeline,
     is_vision_available,
     pipeline,
@@ -46,12 +50,23 @@ else:
             pass


+def hashimage(image: Image) -> str:
+    m = hashlib.md5(image.tobytes())
+    return m.hexdigest()
+
+
 @require_vision
 @require_timm
 @require_torch
 @is_pipeline_test
 class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
-    model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
+    model_mapping = {
+        k: v
+        for k, v in (
+            list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
+        )
+        + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+    }

     def get_test_pipeline(self, model, tokenizer, feature_extractor):
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
@@ -62,34 +77,59 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa

     def run_pipeline_test(self, image_segmenter, examples):
         outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
-        self.assertEqual(outputs, [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12)

-        import datasets
+        self.assertIsInstance(outputs, list)
+        n = len(outputs)
+        self.assertGreater(n, 1)
+        # XXX: PIL.Image implements __eq__ which bypasses ANY, so we inverse the comparison
+        # to make it work
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)

         dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")

-        batch = [
-            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
-            # RGBA
-            dataset[0]["file"],
-            # LA
-            dataset[1]["file"],
-            # L
-            dataset[2]["file"],
-        ]
-        outputs = image_segmenter(batch, threshold=0.0)
+        # RGBA
+        outputs = image_segmenter(dataset[0]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # LA
+        outputs = image_segmenter(dataset[1]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # L
+        outputs = image_segmenter(dataset[2]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+
+        if isinstance(image_segmenter.model, DetrForSegmentation):
+            # We need to test batch_size with images with the same size.
+            # Detr doesn't normalize the size of the images, meaning we can have
+            # 800x800 or 800x1200, meaning we cannot batch simply.
+            # We simply bail on this
+            batch_size = 1
+        else:
+            batch_size = 2
+
+        # 5 times the same image so the output shape is predictable
+        batch = [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)

         self.assertEqual(len(batch), len(outputs))
+        self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
+        self.assertEqual(len(outputs[0]), n)
         self.assertEqual(
-            outputs,
             [
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
             ],
+            outputs,
+            f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
         )

     @require_tf
@@ -108,7 +148,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
         for o in outputs:
             # shortening by hashing
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])

         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -116,12 +156,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
             ],
         )
@@ -135,7 +175,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         )
         for output in outputs:
             for o in output:
-                o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+                o["mask"] = hashimage(o["mask"])

         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -144,29 +184,54 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
             ],
         )

+    @require_torch
+    def test_small_model_pt_semantic(self):
+        model_id = "hf-internal-testing/tiny-random-beit-pipeline"
+        image_segmenter = pipeline(model=model_id)
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        for o in outputs:
+            # shortening by hashing
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "LABEL_0",
+                    "mask": "01245d8ad25d03f09493ca97965788ae",
+                },
+                {
+                    "score": None,
+                    "label": "LABEL_1",
+                    "mask": "f741516de8d5196a2c830739b9ac1c8c",
+                },
+            ],
+        )
+
     @require_torch
     @slow
     def test_integration_torch_image_segmentation(self):
@@ -176,7 +241,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa

         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])

         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -234,7 +299,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)

         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])

         self.assertEqual(
             nested_simplify(outputs, decimals=4),