diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 228e4b5c2..d97e582c3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -680,6 +680,7 @@ if is_torch_available():
             "MODEL_FOR_OBJECT_DETECTION_MAPPING",
             "MODEL_FOR_PRETRAINING_MAPPING",
             "MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+            "MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
             "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
             "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
             "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
@@ -2850,6 +2851,7 @@ if TYPE_CHECKING:
         MODEL_FOR_OBJECT_DETECTION_MAPPING,
         MODEL_FOR_PRETRAINING_MAPPING,
         MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
         MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 4e20e7124..c43627e3a 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -103,6 +103,7 @@ if is_torch_available():
         AutoModelForMaskedLM,
         AutoModelForObjectDetection,
         AutoModelForQuestionAnswering,
+        AutoModelForSemanticSegmentation,
         AutoModelForSeq2SeqLM,
         AutoModelForSequenceClassification,
         AutoModelForSpeechSeq2Seq,
@@ -264,7 +265,7 @@ SUPPORTED_TASKS = {
     "image-segmentation": {
         "impl": ImageSegmentationPipeline,
         "tf": (),
-        "pt": (AutoModelForImageSegmentation,) if is_torch_available() else (),
+        "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
         "default": {"model": {"pt": "facebook/detr-resnet-50-panoptic"}},
         "type": "image",
     },
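With AutoModelForSemanticSegmentation registered for the task above, semantic checkpoints resolve through the same "image-segmentation" entry point as panoptic ones. A minimal sketch of the intended call pattern (the default checkpoint facebook/detr-resnet-50-panoptic comes straight from the SUPPORTED_TASKS entry; "cats.png" is a placeholder path):

    from transformers import pipeline

    # With no model argument, the task default "facebook/detr-resnet-50-panoptic" is loaded.
    segmenter = pipeline("image-segmentation")

    # Local paths, URLs and PIL images are all accepted as inputs.
    for result in segmenter("cats.png"):
        # "mask" is a single-channel PIL image; "score" may be None for
        # semantic models that cannot estimate a per-mask confidence.
        print(result["label"], result["score"], result["mask"].size)
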
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 4306f035e..a1eef2e3f 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -1,5 +1,3 @@
-import base64
-import io
 from typing import Any, Dict, List, Union
 
 import numpy as np
@@ -16,8 +14,13 @@ if is_vision_available():
 
 if is_torch_available():
     import torch
+    from torch import nn
+
+    from ..models.auto.modeling_auto import (
+        MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
+    )
 
-    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
 
 logger = logging.get_logger(__name__)
@@ -46,7 +49,9 @@ class ImageSegmentationPipeline(Pipeline):
             raise ValueError(f"The {self.__class__} is only available in PyTorch.")
 
         requires_backends(self, "vision")
-        self.check_model_type(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING)
+        self.check_model_type(
+            dict(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items())
+        )
 
     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
@@ -77,16 +82,16 @@ class ImageSegmentationPipeline(Pipeline):
 
         Return:
            A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
-            dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
-            each image.
+            list of dictionaries, if the input is a list of several images, will return a list of lists of dictionaries
+            corresponding to each image.
 
             The dictionaries contain the following keys:
 
             - **label** (`str`) -- The class label identified by the model.
-            - **score** (`float`) -- The score attributed by the model for that label.
-            - **mask** (`str`) -- base64 string of a grayscale (single-channel) PNG image that contain masks
-              information. The PNG image has size (heigth, width) of the original image. Pixel values in the image are
-              either 0 or 255 (i.e. mask is absent VS mask is present).
+            - **mask** (`PIL.Image`) -- A PIL image of size (height, width) of the original image. Pixel values in the
+              image are in the range 0-255: 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
+            - **score** (*optional* `float`) -- The score attributed by the model to the "object" described by the
+              label and the mask, when the model is able to estimate one; `None` otherwise.
         """
         return super().__call__(*args, **kwargs)
@@ -104,40 +109,55 @@ class ImageSegmentationPipeline(Pipeline):
         model_outputs["target_size"] = target_size
         return model_outputs
 
-    def postprocess(self, model_outputs, threshold=0.9, mask_threshold=0.5):
-        raw_annotations = self.feature_extractor.post_process_segmentation(
-            model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
-        )
-        raw_annotation = raw_annotations[0]
+    def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
+        if hasattr(self.feature_extractor, "post_process_segmentation"):
+            # Panoptic
+            raw_annotations = self.feature_extractor.post_process_segmentation(
+                model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=mask_threshold
+            )
+            raw_annotation = raw_annotations[0]
+            raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
+            raw_annotation["scores"] = raw_annotation["scores"].tolist()
+            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
+            raw_annotation["masks"] = [
+                Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
+            ]
+            # {"scores": [...], ...} --> [{"score":x, ...}, ...]
+            keys = ["score", "label", "mask"]
+            annotation = [
+                dict(zip(keys, vals))
+                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
+            ]
+        else:
+            # Default logits
+            logits = model_outputs.logits
+            logits = logits.softmax(dim=1)
+            if len(logits.shape) != 4:
+                raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
+            batch_size, num_labels, height, width = logits.shape
+            expected_num_labels = len(self.model.config.id2label)
+            if num_labels != expected_num_labels:
+                raise ValueError(
+                    f"Logits don't have expected dimensions, expected [1, {expected_num_labels}, H, W], got {logits.shape}"
+                )
+            size = model_outputs["target_size"].squeeze(0).tolist()
+            logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
+            classes = logits_reshaped.argmax(dim=1)[0]
+            annotation = []
 
-        raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
-
-        raw_annotation["scores"] = raw_annotation["scores"].tolist()
-        raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
-        raw_annotation["masks"] = [self._get_mask_str(mask) for mask in raw_annotation["masks"].cpu().numpy()]
-
-        # {"scores": [...], ...} --> [{"score":x, ...}, ...]
-        keys = ["score", "label", "mask"]
-        annotation = [
-            dict(zip(keys, vals))
-            for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
-        ]
+            for label_id in range(num_labels):
+                label = self.model.config.id2label[label_id]
+                mask = classes == label_id
+                mask_sum = mask.sum()
+                # Remove empty masks.
+                if mask_sum == 0:
+                    continue
+                mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
+                # Semantic segmentation does not output a global score for the mask
+                # so we don't attempt to compute one.
+                # XXX: We could send a mask with values between 0 and 255 instead
+                # of a pure mask to enable users to get the probabilities that
+                # are really outputted by the logits.
+                annotation.append({"score": None, "label": label, "mask": mask})
 
         return annotation
-
-    def _get_mask_str(self, mask: np.array) -> str:
-        """
-        Turns mask numpy array into mask base64 str.
-
-        Args:
-            mask (`np.array`): Numpy array (with shape (heigth, width) of the original image) containing masks
-                information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
-
-        Returns:
-            A base64 string of a single-channel PNG image that contain masks information.
-        """
-        img = Image.fromarray(mask.astype(np.int8), mode="L")
-        with io.BytesIO() as out:
-            img.save(out, format="PNG")
-            png_string = out.getvalue()
-        return base64.b64encode(png_string).decode("utf-8")
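To make the new semantic branch of postprocess easier to follow, here is a self-contained sketch of the same steps on a stand-in logits tensor (the shapes and the id2label map below are invented for illustration): softmax over the class dimension, bilinear upsampling to the original image size, argmax to a per-pixel class map, then one binary PIL mask per non-empty label.

    import numpy as np
    import torch
    from torch import nn
    from PIL import Image

    logits = torch.randn(1, 3, 32, 32)  # stand-in for model logits: [1, num_labels, H, W]
    id2label = {0: "background", 1: "cat", 2: "dog"}  # hypothetical label map
    target_size = (480, 640)  # (height, width) of the original image

    probs = logits.softmax(dim=1)
    # Upsample the class probabilities back to the original resolution.
    probs = nn.functional.interpolate(probs, size=target_size, mode="bilinear", align_corners=False)
    classes = probs.argmax(dim=1)[0]  # per-pixel label ids, shape (480, 640)

    annotation = []
    for label_id, label in id2label.items():
        mask = classes == label_id
        if mask.sum() == 0:  # skip labels absent from the prediction
            continue
        pil_mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
        annotation.append({"score": None, "label": label, "mask": pil_mask})
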
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 4f8049e23..e7462ced9 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -383,6 +383,9 @@ MODEL_FOR_PRETRAINING_MAPPING = None
 MODEL_FOR_QUESTION_ANSWERING_MAPPING = None
 
 
+MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = None
+
+
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
diff --git a/tests/test_modeling_segformer.py b/tests/test_modeling_segformer.py
index 3bb293065..f359e2378 100644
--- a/tests/test_modeling_segformer.py
+++ b/tests/test_modeling_segformer.py
@@ -101,7 +101,11 @@ class SegformerModelTester:
         if self.use_labels:
             labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
 
-        config = SegformerConfig(
+        config = self.get_config()
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return SegformerConfig(
             image_size=self.image_size,
             num_channels=self.num_channels,
             num_encoder_blocks=self.num_encoder_blocks,
@@ -114,8 +118,6 @@ class SegformerModelTester:
             initializer_range=self.initializer_range,
         )
 
-        return config, pixel_values, labels
-
     def create_and_check_model(self, config, pixel_values, labels):
         model = SegformerModel(config=config)
         model.to(torch_device)
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index 05fa383ce..9dbb3d2bc 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -126,14 +126,14 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
 
 
 class ANY:
-    def __init__(self, _type):
-        self._type = _type
+    def __init__(self, *_types):
+        self._types = _types
 
     def __eq__(self, other):
-        return isinstance(other, self._type)
+        return isinstance(other, self._types)
 
     def __repr__(self):
-        return f"ANY({self._type.__name__})"
+        return f"ANY({', '.join(_type.__name__ for _type in self._types)})"
 
 
 class PipelineTestCaseMeta(type):
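The widened ANY helper matters here because a segmentation entry's "score" can now be either a float (panoptic models) or None (semantic models). A standalone illustration of the behaviour, reusing the class as written above with hypothetical assertions:

    class ANY:
        def __init__(self, *_types):
            self._types = _types

        def __eq__(self, other):
            # isinstance accepts a tuple of types, so several types "or" together.
            return isinstance(other, self._types)

        def __repr__(self):
            return f"ANY({', '.join(_type.__name__ for _type in self._types)})"

    # Both a float score and a missing (None) score match the same expectation.
    assert {"score": 0.5, "label": "cat"} == {"score": ANY(float, type(None)), "label": ANY(str)}
    assert {"score": None, "label": "cat"} == {"score": ANY(float, type(None)), "label": ANY(str)}

Note that the segmentation tests keep ANY on the expected (left-hand) side: PIL.Image defines its own __eq__, so putting the real outputs first would never fall back to ANY.__eq__.
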
diff --git a/tests/test_pipelines_image_segmentation.py b/tests/test_pipelines_image_segmentation.py
index 99fab7db3..afb360f36 100644
--- a/tests/test_pipelines_image_segmentation.py
+++ b/tests/test_pipelines_image_segmentation.py
@@ -15,10 +15,14 @@
 import hashlib
 import unittest
 
+import datasets
+
 from transformers import (
     MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
     AutoFeatureExtractor,
     AutoModelForImageSegmentation,
+    DetrForSegmentation,
     ImageSegmentationPipeline,
     is_vision_available,
     pipeline,
@@ -46,12 +50,23 @@ else:
 
     pass
 
 
+def hashimage(image: Image) -> str:
+    m = hashlib.md5(image.tobytes())
+    return m.hexdigest()
+
+
 @require_vision
 @require_timm
 @require_torch
 @is_pipeline_test
 class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
-    model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
+    model_mapping = {
+        k: v
+        for k, v in (
+            list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
+        )
+        + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+    }
 
     def get_test_pipeline(self, model, tokenizer, feature_extractor):
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
@@ -62,34 +77,59 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
     def run_pipeline_test(self, image_segmenter, examples):
         outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
-        self.assertEqual(outputs, [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12)
-
-        import datasets
+        self.assertIsInstance(outputs, list)
+        n = len(outputs)
+        self.assertGreater(n, 1)
+        # XXX: PIL.Image implements __eq__ which bypasses ANY, so we invert the comparison
+        # to make it work
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)
 
         dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
 
-        batch = [
-            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
-            # RGBA
-            dataset[0]["file"],
-            # LA
-            dataset[1]["file"],
-            # L
-            dataset[2]["file"],
-        ]
-        outputs = image_segmenter(batch, threshold=0.0)
+        # RGBA
+        outputs = image_segmenter(dataset[0]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # LA
+        outputs = image_segmenter(dataset[1]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # L
+        outputs = image_segmenter(dataset[2]["file"])
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+
+        if isinstance(image_segmenter.model, DetrForSegmentation):
+            # We need to test batch_size with images of the same size.
+            # Detr doesn't normalize the size of the images, meaning we can have
+            # 800x800 or 800x1200, so we cannot batch simply.
+            # We simply bail on this for now.
+            batch_size = 1
+        else:
+            batch_size = 2
+
+        # 5 times the same image so the output shape is predictable
+        batch = [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)
         self.assertEqual(len(batch), len(outputs))
+        self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
+        self.assertEqual(len(outputs[0]), n)
         self.assertEqual(
-            outputs,
             [
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
-                [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
             ],
+            outputs,
+            f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
         )
 
     @require_tf
@@ -108,7 +148,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
         for o in outputs:
             # shortening by hashing
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -116,12 +156,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             [
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
                 {
                     "score": 0.004,
                     "label": "LABEL_0",
-                    "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                    "mask": "34eecd16bbfb0f476083ef947d81bf66",
                 },
             ],
         )
@@ -135,7 +175,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         )
         for output in outputs:
             for o in output:
-                o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+                o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -144,29 +184,54 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
             [
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
                 [
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                     {
                         "score": 0.004,
                         "label": "LABEL_0",
-                        "mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
+                        "mask": "34eecd16bbfb0f476083ef947d81bf66",
                     },
                 ],
             ],
         )
 
+    @require_torch
+    def test_small_model_pt_semantic(self):
+        model_id = "hf-internal-testing/tiny-random-beit-pipeline"
+        image_segmenter = pipeline(model=model_id)
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        for o in outputs:
+            # shortening by hashing
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "LABEL_0",
+                    "mask": "01245d8ad25d03f09493ca97965788ae",
+                },
+                {
+                    "score": None,
+                    "label": "LABEL_1",
+                    "mask": "f741516de8d5196a2c830739b9ac1c8c",
+                },
+            ],
+        )
+
     @require_torch
     @slow
     def test_integration_torch_image_segmentation(self):
@@ -176,7 +241,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
 
         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
@@ -234,7 +299,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
         outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
 
         for o in outputs:
-            o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
+            o["mask"] = hashimage(o["mask"])
 
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
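
Taken together, the new semantic path can be exercised end to end much like test_small_model_pt_semantic above; a sketch using the same tiny checkpoint (a randomly initialized test model, so the actual masks carry no meaning):

    from transformers import pipeline

    segmenter = pipeline(model="hf-internal-testing/tiny-random-beit-pipeline")
    outputs = segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
    for o in outputs:
        # Semantic models report no per-mask confidence, so "score" is None.
        print(o["label"], o["score"], o["mask"].mode, o["mask"].size)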