From 9e71d4645526911f2ea9743aa4cf8e9d479fc840 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 23 Feb 2022 17:20:26 +0100
Subject: [PATCH] Enable `image-segmentation` on
 `AutoModelForSemanticSegmentation` (#15647)

* Enabling Beit and SegFormer for `image-segmentation`.

* Fixing the score.

* Fix import?

* Missing in type hint.

* Multiple test fixes:

- Add `raw_image` support. It should be the default IMHO, since in the
  Python world it doesn't make any sense to base64-encode the image
  (Sorry @mishig, didn't catch that in my review). I really think we
  should consider breaking BC here.
- Add support for the Segformer tiny test (needed
  `SegformerModelTester.get_config` to enable TinyConfig, @NielsRogge).
- Add a check that `batch_size` works correctly on that pipeline.
  Uncovered that it doesn't for Detr, which IMO is OK since images after
  `feature_extractor` don't have the same size. The comment in the test
  explains why.

* Type hint as a string.

* Make fixup + update black.

* torch+vision protections.

* Don't use torchvision, use F.interpolate instead (no new dep).

* Last fixes for Segformer.

* Update test to reflect new image (which was broken).

* Update tests.

* Major BC modification:

- Removed the base64-encoded PNG string; encoding images is a job for
  users, `transformers` stays in Python land.
- Removed the `score` for semantic segmentation. It hardly has any
  meaning on its own in this context.
- Don't include a grayscale mask built from the logits for now (which
  could give users a sense of confidence). Might be added later.
- Don't include the surface of the mask (could be used by users for
  sorting, to filter out small masks). It's already computable, and it's
  easier to add later than to add now and break BC later if we need to.

* `make fixup`.

* Small changes.

* Rebase + doc fixup.
---
 src/transformers/__init__.py               |   2 +
 src/transformers/pipelines/__init__.py     |   3 +-
 .../pipelines/image_segmentation.py        | 108 +++++++++------
 src/transformers/utils/dummy_pt_objects.py |   3 +
 tests/test_modeling_segformer.py           |   8 +-
 tests/test_pipelines_common.py             |   8 +-
 tests/test_pipelines_image_segmentation.py | 127 +++++++++++++-----
 7 files changed, 176 insertions(+), 83 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 228e4b5c2..d97e582c3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -680,6 +680,7 @@ if is_torch_available():
         "MODEL_FOR_OBJECT_DETECTION_MAPPING",
         "MODEL_FOR_PRETRAINING_MAPPING",
         "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+        "MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
         "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
         "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
         "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
@@ -2850,6 +2851,7 @@ if TYPE_CHECKING:
         MODEL_FOR_OBJECT_DETECTION_MAPPING,
         MODEL_FOR_PRETRAINING_MAPPING,
         MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
         MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 4e20e7124..c43627e3a 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -103,6 +103,7 @@ if is_torch_available():
         AutoModelForMaskedLM,
         AutoModelForObjectDetection,
         AutoModelForQuestionAnswering,
+        AutoModelForSemanticSegmentation,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
         AutoModelForSpeechSeq2Seq,
@@ -264,7 +265,7 @@ SUPPORTED_TASKS = {
     "image-segmentation": {
         "impl": ImageSegmentationPipeline,
         "tf": (),
-        "pt": (AutoModelForImageSegmentation,) if is_torch_available() else (),
+        "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
         "default": {"model": {"pt": "facebook/detr-resnet-50-panoptic"}},
         "type": "image",
     },
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 4306f035e..a1eef2e3f 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -1,5 +1,3 @@
-import base64
-import io
 from typing import Any, Dict, List, Union
 
 import numpy as np
@@ -16,8 +14,13 @@ if is_vision_available():
 
 if is_torch_available():
     import torch
+    from torch import nn
+
+    from ..models.auto.modeling_auto import (
+        MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
+    )
 
-    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
 
 logger = logging.get_logger(__name__)
@@ -46,7 +49,9 @@ class ImageSegmentationPipeline(Pipeline):
             raise ValueError(f"The {self.__class__} is only available in PyTorch.")
 
         requires_backends(self, "vision")
-        self.check_model_type(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING)
+        self.check_model_type(
+            dict(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items())
+        )
 
     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
@@ -77,16 +82,16 @@ class ImageSegmentationPipeline(Pipeline):
 
         Return:
             A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
-            dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
-            each image.
+            list of dictionaries; if the input is a list of several images, will return a list of lists of dictionaries
+            corresponding to each image.
 
             The dictionaries contain the following keys:
 
             - **label** (`str`) -- The class label identified by the model.
-            - **score** (`float`) -- The score attributed by the model for that label.
-            - **mask** (`str`) -- base64 string of a grayscale (single-channel) PNG image that contain masks
-              information. The PNG image has size (heigth, width) of the original image. Pixel values in the image are
-              either 0 or 255 (i.e. mask is absent VS mask is present).
+            - **mask** (`PIL.Image`) -- PIL Image with size (height, width) of the original image. Pixel values in the
+              image are in the range 0-255: 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
+            - **score** (*optional* `float`) -- Only present when the model is capable of estimating a confidence of
+              the "object" described by the label and the mask.
""" return super().__call__(*args, **kwargs) @@ -104,40 +109,55 @@ class ImageSegmentationPipeline(Pipeline): model_outputs["target_size"] = target_size return model_outputs - def postprocess(self, model_outputs, threshold=0.9, mask_threshold=0.5): - raw_annotations = self.feature_extractor.post_process_segmentation( - model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5 - ) - raw_annotation = raw_annotations[0] + def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5): + if hasattr(self.feature_extractor, "post_process_segmentation"): + # Panoptic + raw_annotations = self.feature_extractor.post_process_segmentation( + model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5 + ) + raw_annotation = raw_annotations[0] + raw_annotation["masks"] *= 255 # [0,1] -> [0,255] black and white pixels + raw_annotation["scores"] = raw_annotation["scores"].tolist() + raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]] + raw_annotation["masks"] = [ + Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"] + ] + # {"scores": [...], ...} --> [{"score":x, ...}, ...] + keys = ["score", "label", "mask"] + annotation = [ + dict(zip(keys, vals)) + for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"]) + ] + else: + # Default logits + logits = model_outputs.logits + logits = logits.softmax(dim=1) + if len(logits.shape) != 4: + raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}") + batch_size, num_labels, height, width = logits.shape + expected_num_labels = len(self.model.config.id2label) + if num_labels != expected_num_labels: + raise ValueError( + f"Logits don't have expected dimensions, expected [1, {num_labels}, H, W], got {logits.shape}" + ) + size = model_outputs["target_size"].squeeze(0).tolist() + logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False) + classes = logits_reshaped.argmax(dim=1)[0] + annotation = [] - raw_annotation["masks"] *= 255 # [0,1] -> [0,255] black and white pixels - - raw_annotation["scores"] = raw_annotation["scores"].tolist() - raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]] - raw_annotation["masks"] = [self._get_mask_str(mask) for mask in raw_annotation["masks"].cpu().numpy()] - - # {"scores": [...], ...} --> [{"score":x, ...}, ...] - keys = ["score", "label", "mask"] - annotation = [ - dict(zip(keys, vals)) - for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"]) - ] + for label_id in range(num_labels): + label = self.model.config.id2label[label_id] + mask = classes == label_id + mask_sum = mask.sum() + # Remove empty masks. + if mask_sum == 0: + continue + mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L") + # Semantic segmentation does not output a global score for the mask + # so we don't attempt to compute one. + # XXX: We could send a mask with values between 0 and 255 instead + # of a pure mask to enable users to get the probabilities that + # are really outputted by the logits. + annotation.append({"score": None, "label": label, "mask": mask}) return annotation - - def _get_mask_str(self, mask: np.array) -> str: - """ - Turns mask numpy array into mask base64 str. 
-
-        Args:
-            mask (`np.array`): Numpy array (with shape (heigth, width) of the original image) containing masks
-                information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
-
-        Returns:
-            A base64 string of a single-channel PNG image that contain masks information.
-        """
-        img = Image.fromarray(mask.astype(np.int8), mode="L")
-        with io.BytesIO() as out:
-            img.save(out, format="PNG")
-            png_string = out.getvalue()
-        return base64.b64encode(png_string).decode("utf-8")
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 4f8049e23..e7462ced9 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -383,6 +383,9 @@ MODEL_FOR_PRETRAINING_MAPPING = None
 MODEL_FOR_QUESTION_ANSWERING_MAPPING = None
 
 
+MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = None
+
+
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
diff --git a/tests/test_modeling_segformer.py b/tests/test_modeling_segformer.py
index 3bb293065..f359e2378 100644
--- a/tests/test_modeling_segformer.py
+++ b/tests/test_modeling_segformer.py
@@ -101,7 +101,11 @@ class SegformerModelTester:
         if self.use_labels:
             labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
 
-        config = SegformerConfig(
+        config = self.get_config()
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return SegformerConfig(
             image_size=self.image_size,
             num_channels=self.num_channels,
             num_encoder_blocks=self.num_encoder_blocks,
@@ -114,8 +118,6 @@ class SegformerModelTester:
             initializer_range=self.initializer_range,
         )
 
-        return config, pixel_values, labels
-
     def create_and_check_model(self, config, pixel_values, labels):
         model = SegformerModel(config=config)
         model.to(torch_device)
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index 05fa383ce..9dbb3d2bc 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -126,14 +126,14 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
 
 
 class ANY:
-    def __init__(self, _type):
-        self._type = _type
+    def __init__(self, *_types):
+        self._types = _types
 
     def __eq__(self, other):
-        return isinstance(other, self._type)
+        return isinstance(other, self._types)
 
     def __repr__(self):
-        return f"ANY({self._type.__name__})"
+        return f"ANY({', '.join(_type.__name__ for _type in self._types)})"
 
 
 class PipelineTestCaseMeta(type):
diff --git a/tests/test_pipelines_image_segmentation.py b/tests/test_pipelines_image_segmentation.py
index 99fab7db3..afb360f36 100644
--- a/tests/test_pipelines_image_segmentation.py
+++ b/tests/test_pipelines_image_segmentation.py
@@ -15,10 +15,14 @@
 import hashlib
 import unittest
 
+import datasets
+
 from transformers import (
     MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     AutoFeatureExtractor,
     AutoModelForImageSegmentation,
+    DetrForSegmentation,
     ImageSegmentationPipeline,
     is_vision_available,
     pipeline,
@@ -46,12 +50,23 @@ else:
         pass
 
 
+def hashimage(image: Image.Image) -> str:
+    m = hashlib.md5(image.tobytes())
+    return m.hexdigest()
+
+
 @require_vision
 @require_timm
 @require_torch
 @is_pipeline_test
 class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
-    model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
+    model_mapping = {
+        k: v
+        for k, v in (
+            list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
+        )
+        + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+    }
 
     def get_test_pipeline(self, model, tokenizer, feature_extractor):
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
@@ -62,34 +77,59 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
 
     def run_pipeline_test(self, image_segmenter, examples):
         outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
-        self.assertEqual(outputs, [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12)
-
-        import datasets
+        self.assertIsInstance(outputs, list)
+        n = len(outputs)
+        self.assertGreater(n, 1)
+        # XXX: PIL.Image implements __eq__ which bypasses ANY, so we invert the comparison
+        # to make it work
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)
 
         dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
 
-        batch = [
-            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
-            # RGBA
-            dataset[0]["file"],
-            # LA
-            dataset[1]["file"],
-            # L
-            dataset[2]["file"],
-        ]
-        outputs = image_segmenter(batch, threshold=0.0)
+        # RGBA
+        outputs = image_segmenter(dataset[0]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # LA
+        outputs = image_segmenter(dataset[1]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # L
+        outputs = image_segmenter(dataset[2]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
 
+        if isinstance(image_segmenter.model, DetrForSegmentation):
+            # We need to test batch_size with images of the same size.
+            # Detr doesn't normalize the size of the images, meaning we can have
+            # 800x800 or 800x1200 in the same batch, so we cannot batch naively.
+            # We simply bail on this
+            batch_size = 1
+        else:
+            batch_size = 2
+
+        # 5 times the same image so the output shape is predictable
+        batch = [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)
         self.assertEqual(len(batch), len(outputs))
+        self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
+        self.assertEqual(len(outputs[0]), n)
         self.assertEqual(
-            outputs,
             [
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
             ],
+            outputs,
+            f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
         )
 
     @require_tf
@@ -108,7 +148,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
         for o in outputs:
             # shortening by hashing
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
             ],
         )
@@ -135,7 +175,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         )
         for output in outputs:
             for o in output:
-                o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+                o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -144,29 +184,54 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             [
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
             ],
         )
 
+    @require_torch
+    def test_small_model_pt_semantic(self):
+        model_id = "hf-internal-testing/tiny-random-beit-pipeline"
+        image_segmenter = pipeline(model=model_id)
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        for o in outputs:
+            # shortening by hashing
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "LABEL_0",
+                    "mask": "01245d8ad25d03f09493ca97965788ae",
+                },
+                {
+                    "score": None,
+                    "label": "LABEL_1",
+                    "mask": "f741516de8d5196a2c830739b9ac1c8c",
+                },
+            ],
+        )
+
     @require_torch
     @slow
     def test_integration_torch_image_segmentation(self):
@@ -176,7 +241,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
 
         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -234,7 +299,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
 
        for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
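---

Usage sketch (illustrative, not part of the patch). After this change the
pipeline returns `PIL.Image` masks instead of base64-encoded PNG strings, and
`score` may be `None` when the model cannot estimate one (semantic
segmentation). The checkpoint below is the task default named in the diff:

    from transformers import pipeline

    # Default "image-segmentation" checkpoint per SUPPORTED_TASKS above.
    segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
    outputs = segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
    for o in outputs:
        # o["mask"] is a grayscale PIL.Image: 0 = pixel not part of the label,
        # 255 = pixel definitely is. No base64 decoding is needed anymore.
        print(o["label"], o["score"])
        o["mask"].save(f"{o['label']}.png")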
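For reference, a minimal standalone sketch of the semantic branch added to
`postprocess` above, assuming `logits` of shape [1, num_labels, h, w] and a
(height, width) target size. The helper name `masks_from_logits` is
hypothetical, not part of the patch:

    import torch
    from torch import nn

    def masks_from_logits(logits, target_size, id2label):
        # Hypothetical helper mirroring the semantic branch of `postprocess`:
        # upsample per-class probabilities back to the original image size.
        probs = nn.functional.interpolate(
            logits.softmax(dim=1), size=target_size, mode="bilinear", align_corners=False
        )
        classes = probs.argmax(dim=1)[0]  # per-pixel label ids, shape (height, width)
        annotation = []
        for label_id, label in id2label.items():
            mask = classes == label_id
            if mask.sum() == 0:
                continue  # drop labels absent from the image, as the pipeline does
            # 0/255 grayscale values, matching the pipeline's mask convention.
            annotation.append({"score": None, "label": label, "mask": mask * 255})
        return annotation

    # e.g. 3 labels on a 64x48 logits map, upsampled to a 480x640 image:
    print(len(masks_from_logits(torch.randn(1, 3, 64, 48), (480, 640), {0: "a", 1: "b", 2: "c"})))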