From 824fcbfd9dda0e07f8e0f4f4529a4b2f6bd4866e Mon Sep 17 00:00:00 2001 From: Xiang Zhang Date: Mon, 24 Aug 2020 13:13:50 -0700 Subject: [PATCH] support Normalized_0_1 and Normalized_1_1 (#4800) * support Normalized_0_1 and Normalized_1_1 * add tests for Normalized_1_1 * fix build error * fix imagetests failure * support detensorization and add more tests * fix build * remove added models * disable gpu tests for CPU pipeline * refactor based on comments and moved two added models * merge normalizer and Denormalizer into NominalRangeConverter * add comments * little change --- cmake/winml.cmake | 2 + winml/lib/Api.Image/CpuDetensorizer.h | 74 ++++++++------- winml/lib/Api.Image/CpuTensorizer.h | 93 ++++++++++--------- winml/lib/Api.Image/NominalRangeConverter.cpp | 59 ++++++++++++ .../Api.Image/TensorToVideoFrameConverter.cpp | 20 +++- .../Api.Image/VideoFrameToTensorConverter.cpp | 18 +++- .../lib/Api.Image/inc/ImageConversionTypes.h | 8 ++ .../lib/Api.Image/inc/NominalRangeConverter.h | 31 +++++++ .../OnnxruntimeDescriptorConverter.cpp | 4 +- winml/lib/Api/ImageFeatureValue.cpp | 26 +++++- ...Net1920WithImageMetadataBgr8_SRGB_0_1.onnx | 12 +-- ...Net1920WithImageMetadataBgr8_SRGB_1_1.onnx | 12 +-- winml/test/image/imagetests.cpp | 4 +- .../cppwinrt/scenariotestscppwinrt.cpp | 47 ++++++++++ .../scenario/cppwinrt/scenariotestscppwinrt.h | 2 + 15 files changed, 317 insertions(+), 95 deletions(-) create mode 100644 winml/lib/Api.Image/NominalRangeConverter.cpp create mode 100644 winml/lib/Api.Image/inc/NominalRangeConverter.h diff --git a/cmake/winml.cmake b/cmake/winml.cmake index 83c1f3031d..76f5c40d91 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -323,6 +323,7 @@ add_library(winml_lib_image STATIC ${winml_lib_api_image_dir}/inc/ImageConverter.h ${winml_lib_api_image_dir}/inc/TensorToVideoFrameConverter.h ${winml_lib_api_image_dir}/inc/VideoFrameToTensorConverter.h + ${winml_lib_api_image_dir}/inc/NominalRangeConverter.h
${winml_lib_api_image_dir}/CpuDetensorizer.h ${winml_lib_api_image_dir}/CpuTensorizer.h ${winml_lib_api_image_dir}/pch.h @@ -333,6 +334,7 @@ add_library(winml_lib_image STATIC ${winml_lib_api_image_dir}/ImageConverter.cpp ${winml_lib_api_image_dir}/TensorToVideoFrameConverter.cpp ${winml_lib_api_image_dir}/VideoFrameToTensorConverter.cpp + ${winml_lib_api_image_dir}/NominalRangeConverter.cpp ) # Compiler options diff --git a/winml/lib/Api.Image/CpuDetensorizer.h b/winml/lib/Api.Image/CpuDetensorizer.h index 264e100d36..d51d582683 100644 --- a/winml/lib/Api.Image/CpuDetensorizer.h +++ b/winml/lib/Api.Image/CpuDetensorizer.h @@ -4,6 +4,7 @@ #pragma once #include "inc/ImageConversionTypes.h" +#include "inc/NominalRangeConverter.h" namespace _winml { @@ -13,7 +14,8 @@ class CpuDetensorizer { static HRESULT Detensorize( _In_ ImageTensorChannelType formatFrom, _In_ ImageTensorChannelType formatTo, - _In_ const T* pCPUTensor, + _In_ ImageNominalPixelRange pixelRange, + _In_ T* pCPUTensor, _In_ uint32_t bufferWidth, _In_ uint32_t tensorHeight, _In_ uint32_t tensorWidth, @@ -30,6 +32,8 @@ class CpuDetensorizer { uint32_t end = bufferWidth * tensorHeight; size_t tensorPlaneSize = tensorWidth * tensorHeight; + auto nominalRangeConverter = NominalRangeConverter(pixelRange); + if (formatFrom == formatTo && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { for (uint32_t i = 0; i < tensorHeight; i++) { BYTE* pPixel = pData; @@ -40,7 +44,8 @@ class CpuDetensorizer { pCPUTensor + tensorPlaneSize * 2 + i * tensorWidth, tensorWidth, pPixel, - bytesPerPixel); + bytesPerPixel, + nominalRangeConverter); pData += bufferWidth; } @@ -54,7 +59,8 @@ class CpuDetensorizer { pCPUTensor + i * tensorWidth, tensorWidth, pPixel, - bytesPerPixel); + bytesPerPixel, + nominalRangeConverter); pData += bufferWidth; } @@ -62,7 +68,7 @@ class CpuDetensorizer { // just replicate the gray data across each channel for (uint32_t i = 0; i < end; i += bufferWidth) { 
for (uint32_t j = i; j < i + bytesPerRow; j += 4) { - BYTE bGray = DetensorizeValue(pCPUTensor); + BYTE bGray = DetensorizeValue(pCPUTensor, nominalRangeConverter); pData[j] = bGray; pData[j + 1] = bGray; pData[j + 2] = bGray; @@ -73,7 +79,7 @@ class CpuDetensorizer { } else if (formatFrom == kImageTensorChannelTypeGRAY8 && formatTo == kImageTensorChannelTypeGRAY8) { for (uint32_t i = 0; i < end; i += bufferWidth) { for (uint32_t j = i; j < i + bytesPerRow; j += 1) { - BYTE bGray = DetensorizeValue(pCPUTensor); + BYTE bGray = DetensorizeValue(pCPUTensor, nominalRangeConverter); pData[j] = bGray; pCPUTensor++; } @@ -83,9 +89,9 @@ class CpuDetensorizer { for (uint32_t j = i; j < i + bytesPerRow; j += 1) { BYTE red, green, blue; - blue = DetensorizeValue(pCPUTensor); - green = DetensorizeValue(pCPUTensor + tensorPlaneSize); - red = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2); + blue = DetensorizeValue(pCPUTensor, nominalRangeConverter); + green = DetensorizeValue(pCPUTensor + tensorPlaneSize, nominalRangeConverter); + red = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2, nominalRangeConverter); pData[j] = static_cast(0.2126f * red + 0.7152f * green + 0.0722f * blue); pCPUTensor++; @@ -96,9 +102,9 @@ class CpuDetensorizer { for (uint32_t j = i; j < i + bytesPerRow; j += 1) { BYTE red, green, blue; - red = DetensorizeValue(pCPUTensor); - green = DetensorizeValue(pCPUTensor + tensorPlaneSize); - blue = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2); + red = DetensorizeValue(pCPUTensor, nominalRangeConverter); + green = DetensorizeValue(pCPUTensor + tensorPlaneSize, nominalRangeConverter); + blue = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2, nominalRangeConverter); pData[j] = static_cast(0.2126f * red + 0.7152f * green + 0.0722f * blue); pCPUTensor++; @@ -114,18 +120,21 @@ class CpuDetensorizer { private: template - static float ReadTensor(const T* pCPUTensor) { - return *pCPUTensor; + static float ReadTensor(const T* pCPUTensor, const 
NominalRangeConverter& nominalRangeConverter) { + return nominalRangeConverter.Denormalize(*pCPUTensor); } template <> - static float ReadTensor(const DirectX::PackedVector::HALF* pCPUTensor) { - return DirectX::PackedVector::XMConvertHalfToFloat(*pCPUTensor); + static float ReadTensor( + const DirectX::PackedVector::HALF* pCPUTensor, + const NominalRangeConverter& nominalRangeConverter) { + return nominalRangeConverter.Denormalize( + DirectX::PackedVector::XMConvertHalfToFloat(*pCPUTensor)); } template - static BYTE DetensorizeValue(const T* pCPUTensor) { - return static_cast(std::max(0.0f, std::min(255.0f, ReadTensor(pCPUTensor) + 0.5f))); + static BYTE DetensorizeValue(const T* pCPUTensor, const NominalRangeConverter& nominalRangeConverter) { + return static_cast(std::max(0.0f, std::min(255.0f, ReadTensor(pCPUTensor, nominalRangeConverter) + 0.5f))); } template @@ -135,14 +144,15 @@ class CpuDetensorizer { const T* zChannel, uint32_t tensorWidth, BYTE* pData, - uint32_t bytesPerPixel) { + uint32_t bytesPerPixel, + const NominalRangeConverter& nominalRangeConverter) { BYTE* pPixel = pData; uint32_t tensorWidthRemaining = tensorWidth; while (tensorWidthRemaining > 0) { - pPixel[0] = DetensorizeValue(xChannel); - pPixel[1] = DetensorizeValue(yChannel); - pPixel[2] = DetensorizeValue(zChannel); + pPixel[0] = DetensorizeValue(xChannel, nominalRangeConverter); + pPixel[1] = DetensorizeValue(yChannel, nominalRangeConverter); + pPixel[2] = DetensorizeValue(zChannel, nominalRangeConverter); pPixel[3] = 255; pPixel += 4; @@ -161,7 +171,9 @@ class CpuDetensorizer { const float* zChannel, uint32_t tensorWidth, BYTE* pData, - uint32_t bytesPerPixel) { + uint32_t bytesPerPixel, + const NominalRangeConverter& nominalRangeConverter + ) { BYTE* pPixel = pData; uint32_t tensorWidthRemaining = tensorWidth; @@ -175,22 +187,22 @@ class CpuDetensorizer { while (tensorWidthRemaining >= 8) { // Load, saturate, and convert to ints, 8 - 32 bit floats from X channel - __m128i vXIntsLo = 
_mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(xChannel), maxv)); - __m128i vXIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(xChannel + 4), maxv)); + __m128i vXIntsLo = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(xChannel)), maxv)); + __m128i vXIntsHi = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(xChannel + 4)), maxv)); // Pack 32 bit ints into 16 bit ints __m128i vXWords = _mm_packs_epi32(vXIntsLo, vXIntsHi); // Load, saturate, and convert to ints, 8 - 32 bit floats from Y channel - __m128i vYIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(yChannel), maxv)); - __m128i vYIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(yChannel + 4), maxv)); + __m128i vYIntsLo = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(yChannel)), maxv)); + __m128i vYIntsHi = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(yChannel + 4)), maxv)); // Pack 32 bit ints into 16 bit ints __m128i vYWords = _mm_packs_epi32(vYIntsLo, vYIntsHi); // Load, saturate, and convert to ints, 8 - 32 bit floats from Z channel - __m128i vZIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(zChannel), maxv)); - __m128i vZIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(zChannel + 4), maxv)); + __m128i vZIntsLo = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(zChannel)), maxv)); + __m128i vZIntsHi = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(zChannel + 4)), maxv)); // Pack 32 bit ints into 16 bit ints __m128i vZWords = _mm_packs_epi32(vZIntsLo, vZIntsHi); @@ -221,9 +233,9 @@ class CpuDetensorizer { // Anything remaining deal with it one at a time while (tensorWidthRemaining > 0) { - pPixel[0] = DetensorizeValue(xChannel); - pPixel[1] = DetensorizeValue(yChannel); - pPixel[2] = DetensorizeValue(zChannel); + pPixel[0] = DetensorizeValue(xChannel, nominalRangeConverter); + pPixel[1] = DetensorizeValue(yChannel, nominalRangeConverter); + pPixel[2] = 
DetensorizeValue(zChannel, nominalRangeConverter); pPixel[3] = 255; pPixel += bytesPerPixel; diff --git a/winml/lib/Api.Image/CpuTensorizer.h b/winml/lib/Api.Image/CpuTensorizer.h index a653abf8f2..728f3973ae 100644 --- a/winml/lib/Api.Image/CpuTensorizer.h +++ b/winml/lib/Api.Image/CpuTensorizer.h @@ -4,6 +4,7 @@ #pragma once #include "inc/ImageConversionTypes.h" +#include "inc/NominalRangeConverter.h" namespace _winml { @@ -13,6 +14,7 @@ class CpuTensorizer { static HRESULT TensorizeData( _In_ ImageTensorChannelType formatFrom, _In_ ImageTensorChannelType formatTo, + _In_ ImageNominalPixelRange pixelRange, _In_ BYTE* pBuffer, _In_ UINT32 bufferWidth, _In_ const wgi::BitmapBounds& inputBounds, @@ -33,6 +35,8 @@ class CpuTensorizer { uint32_t xElements = inputBounds.Width - inputBounds.X; uint32_t yElements = inputBounds.Height - inputBounds.Y; + auto nominalRangeConverter = NominalRangeConverter(pixelRange); + if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) { // Convert BGR8 -> BGR8 or RGB8 -> RGB8 for (uint32_t y = 0; y < yElements; y++) { @@ -42,7 +46,8 @@ class CpuTensorizer { pCPUTensor + (inputBounds.Height * inputBounds.Width) + y * inputBounds.Width, pCPUTensor + (inputBounds.Height * inputBounds.Width) * 2 + y * inputBounds.Width, xElements, - bytesPerPixel); + bytesPerPixel, + nominalRangeConverter); } } else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) { // Convert RGB8 -> BGR8 or BGR8 -> RGB8 @@ -53,7 +58,8 @@ class CpuTensorizer { pCPUTensor + (inputBounds.Height * inputBounds.Width) + y * inputBounds.Width, pCPUTensor + y * inputBounds.Width, xElements, - bytesPerPixel); + bytesPerPixel, + nominalRangeConverter); } } else if (formatTo == kImageTensorChannelTypeGRAY8 && (formatFrom == 
kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { // Convert BGR8 -> GRAY8 or RGB8 -> GRAY8 @@ -66,7 +72,7 @@ class CpuTensorizer { float green = float(pBuffer[j + 1]); float blue = float(pBuffer[j + blueIncrement]); float gray = 0.2126f * red + 0.7152f * green + 0.0722f * blue; - pCPUTensor[pixelInd] = ConvertByteToFloat(static_cast(gray)); + pCPUTensor[pixelInd] = ConvertByteToFloat(static_cast(gray), nominalRangeConverter); pixelInd++; } } @@ -74,9 +80,9 @@ class CpuTensorizer { // Convert GRAY8 -> BGR8 or GRAY8 -> RGB8 for (UINT32 i = start; i < end; i += bufferWidth) { for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) { - pCPUTensor[pixelInd] = ConvertByteToFloat(pBuffer[j]); - pCPUTensor[(inputBounds.Height * inputBounds.Width) + pixelInd] = ConvertByteToFloat(pBuffer[j]); - pCPUTensor[(inputBounds.Height * inputBounds.Width * 2) + pixelInd] = ConvertByteToFloat(pBuffer[j]); + pCPUTensor[pixelInd] = ConvertByteToFloat(pBuffer[j], nominalRangeConverter); + pCPUTensor[(inputBounds.Height * inputBounds.Width) + pixelInd] = ConvertByteToFloat(pBuffer[j], nominalRangeConverter); + pCPUTensor[(inputBounds.Height * inputBounds.Width * 2) + pixelInd] = ConvertByteToFloat(pBuffer[j], nominalRangeConverter); pixelInd++; } } @@ -84,7 +90,7 @@ class CpuTensorizer { // Convert GRAY8 -> GRAY8 for (UINT32 i = start; i < end; i += bufferWidth) { for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) { - pCPUTensor[pixelInd] = ConvertByteToFloat(pBuffer[j]); + pCPUTensor[pixelInd] = ConvertByteToFloat(pBuffer[j], nominalRangeConverter); pixelInd++; } } @@ -97,16 +103,17 @@ class CpuTensorizer { } private: + template - static T ConvertByteToFloat(const BYTE& input); + static T ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter); template <> - static float ConvertByteToFloat(const BYTE& input) { - return static_cast(input); + static float ConvertByteToFloat(const BYTE& input, const 
NominalRangeConverter& nominalRangeConverter) { + return nominalRangeConverter.Normalize(static_cast(input)); } template <> - static DirectX::PackedVector::HALF ConvertByteToFloat(const BYTE& input) { - return DirectX::PackedVector::XMConvertFloatToHalf(input); + static DirectX::PackedVector::HALF ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter) { + return nominalRangeConverter.Normalize(DirectX::PackedVector::XMConvertFloatToHalf(input)); } template @@ -116,29 +123,30 @@ class CpuTensorizer { _Inout_ T* yChannel, _Inout_ T* zChannel, uint32_t pixelElements, - uint32_t bytesPerPixel) { + uint32_t bytesPerPixel, + const NominalRangeConverter& nominalRangeConverter) { UINT32 j; for (j = 0; j < (pixelElements & 0xFFFFFFFC); j += 4) { - xChannel[j] = ConvertByteToFloat(pBuffer[0]); - yChannel[j] = ConvertByteToFloat(pBuffer[1]); - zChannel[j] = ConvertByteToFloat(pBuffer[2]); - xChannel[j + 1] = ConvertByteToFloat(pBuffer[4]); - yChannel[j + 1] = ConvertByteToFloat(pBuffer[5]); - zChannel[j + 1] = ConvertByteToFloat(pBuffer[6]); - xChannel[j + 2] = ConvertByteToFloat(pBuffer[8]); - yChannel[j + 2] = ConvertByteToFloat(pBuffer[9]); - zChannel[j + 2] = ConvertByteToFloat(pBuffer[10]); - xChannel[j + 3] = ConvertByteToFloat(pBuffer[12]); - yChannel[j + 3] = ConvertByteToFloat(pBuffer[13]); - zChannel[j + 3] = ConvertByteToFloat(pBuffer[14]); + xChannel[j] = ConvertByteToFloat(pBuffer[0], nominalRangeConverter); + yChannel[j] = ConvertByteToFloat(pBuffer[1], nominalRangeConverter); + zChannel[j] = ConvertByteToFloat(pBuffer[2], nominalRangeConverter); + xChannel[j + 1] = ConvertByteToFloat(pBuffer[4], nominalRangeConverter); + yChannel[j + 1] = ConvertByteToFloat(pBuffer[5], nominalRangeConverter); + zChannel[j + 1] = ConvertByteToFloat(pBuffer[6], nominalRangeConverter); + xChannel[j + 2] = ConvertByteToFloat(pBuffer[8], nominalRangeConverter); + yChannel[j + 2] = ConvertByteToFloat(pBuffer[9], nominalRangeConverter); + zChannel[j 
+ 2] = ConvertByteToFloat(pBuffer[10], nominalRangeConverter); + xChannel[j + 3] = ConvertByteToFloat(pBuffer[12], nominalRangeConverter); + yChannel[j + 3] = ConvertByteToFloat(pBuffer[13], nominalRangeConverter); + zChannel[j + 3] = ConvertByteToFloat(pBuffer[14], nominalRangeConverter); pBuffer += bytesPerPixel * 4; } for (; j < pixelElements; j++) { - xChannel[j] = ConvertByteToFloat(pBuffer[0]); - yChannel[j] = ConvertByteToFloat(pBuffer[1]); - zChannel[j] = ConvertByteToFloat(pBuffer[2]); + xChannel[j] = ConvertByteToFloat(pBuffer[0], nominalRangeConverter); + yChannel[j] = ConvertByteToFloat(pBuffer[1], nominalRangeConverter); + zChannel[j] = ConvertByteToFloat(pBuffer[2], nominalRangeConverter); pBuffer += bytesPerPixel; } } @@ -151,7 +159,8 @@ class CpuTensorizer { _Inout_ float* yChannel, _Inout_ float* zChannel, uint32_t pixelElements, - uint32_t bytesPerPixel) { + uint32_t bytesPerPixel, + const NominalRangeConverter& nominalRangeConverter) { assert(bytesPerPixel == 4); __m128i ZeroVector = _mm_setzero_si128(); @@ -189,8 +198,8 @@ class CpuTensorizer { __m128i vXIntsHi = _mm_unpackhi_epi16(vXWords, ZeroVector); // store 256 bits of X channel Floats - _mm_storeu_ps(xChannel, _mm_cvtepi32_ps(vXIntsLo)); - _mm_storeu_ps(xChannel + 4, _mm_cvtepi32_ps(vXIntsHi)); + _mm_storeu_ps(xChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vXIntsLo))); + _mm_storeu_ps(xChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vXIntsHi))); xChannel += 8; // unpack again for Y @@ -199,8 +208,8 @@ class CpuTensorizer { __m128i vYIntsLo = _mm_unpacklo_epi16(vYWords, ZeroVector); __m128i vYIntsHi = _mm_unpackhi_epi16(vYWords, ZeroVector); - _mm_storeu_ps(yChannel, _mm_cvtepi32_ps(vYIntsLo)); - _mm_storeu_ps(yChannel + 4, _mm_cvtepi32_ps(vYIntsHi)); + _mm_storeu_ps(yChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vYIntsLo))); + _mm_storeu_ps(yChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vYIntsHi))); yChannel += 8; // unpack again for Z @@ 
-209,8 +218,8 @@ class CpuTensorizer { __m128i vZIntsLo = _mm_unpacklo_epi16(vZWords, ZeroVector); __m128i vZIntsHi = _mm_unpackhi_epi16(vZWords, ZeroVector); - _mm_storeu_ps(zChannel, _mm_cvtepi32_ps(vZIntsLo)); - _mm_storeu_ps(zChannel + 4, _mm_cvtepi32_ps(vZIntsHi)); + _mm_storeu_ps(zChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vZIntsLo))); + _mm_storeu_ps(zChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vZIntsHi))); zChannel += 8; pBuffer += 32; @@ -230,7 +239,7 @@ class CpuTensorizer { __m128i vInts2 = _mm_unpacklo_epi16(vWords1, ZeroVector); __m128i vInts3 = _mm_unpackhi_epi16(vWords1, ZeroVector); - // Convert to floats + // Normalize to floats __m128 vFloats0 = _mm_cvtepi32_ps(vInts0); __m128 vFloats1 = _mm_cvtepi32_ps(vInts1); __m128 vFloats2 = _mm_cvtepi32_ps(vInts2); @@ -240,9 +249,9 @@ class CpuTensorizer { _MM_TRANSPOSE4_PS(vFloats0, vFloats1, vFloats2, vFloats3); // Drop alpha channel transposed to vFloats3 write out rest - _mm_storeu_ps(xChannel, vFloats0); - _mm_storeu_ps(yChannel, vFloats1); - _mm_storeu_ps(zChannel, vFloats2); + _mm_storeu_ps(xChannel, nominalRangeConverter.Normalize(vFloats0)); + _mm_storeu_ps(yChannel, nominalRangeConverter.Normalize(vFloats1)); + _mm_storeu_ps(zChannel, nominalRangeConverter.Normalize(vFloats2)); xChannel += 4; yChannel += 4; @@ -253,9 +262,9 @@ class CpuTensorizer { // Any remainder just do one at a time for (uint32_t j = 0; j < pixelElements; j++) { - xChannel[j] = static_cast(pBuffer[0]); - yChannel[j] = static_cast(pBuffer[1]); - zChannel[j] = static_cast(pBuffer[2]); + xChannel[j] = nominalRangeConverter.Normalize(static_cast(pBuffer[0])); + yChannel[j] = nominalRangeConverter.Normalize(static_cast(pBuffer[1])); + zChannel[j] = nominalRangeConverter.Normalize(static_cast(pBuffer[2])); pBuffer += bytesPerPixel; } } diff --git a/winml/lib/Api.Image/NominalRangeConverter.cpp b/winml/lib/Api.Image/NominalRangeConverter.cpp new file mode 100644 index 0000000000..60fcd0c51e --- /dev/null 
+++ b/winml/lib/Api.Image/NominalRangeConverter.cpp
@@ -0,0 +1,59 @@
+#include "pch.h"
+#include "inc/NominalRangeConverter.h"
+
+namespace _winml {
+  // Selects the scale/shift pair for the requested nominal pixel range:
+  //   Normalize:   value / scale - shift
+  //   Denormalize: (value + shift) * scale
+  NominalRangeConverter::NominalRangeConverter(ImageNominalPixelRange pixelRange) {
+    // Start from the identity mapping (kNominalRange_0_255) so both members are
+    // initialized even when an unexpected enum value is passed in.
+    scale = 1.f;
+    shift = 0;
+    if (pixelRange == ImageNominalPixelRange::kNormalized_0_1) {
+      scale = 255.f;
+    } else if (pixelRange == ImageNominalPixelRange::kNormalized_1_1) {
+      scale = (255.f / 2.f);
+      shift = 1;
+    }
+  }
+
+  // [0, 255] --> [0, 255]
+  // [0, 255] / 255 --> [0, 1]
+  // [0, 255] * 2 / 255 - 1 --> [-1, 1]
+  float NominalRangeConverter::Normalize(float val) const {
+    return val / scale - shift;
+  }
+
+  // HALF carries the encoded bits of a 16-bit float, so decode to float before
+  // doing the arithmetic and re-encode the result.
+  DirectX::PackedVector::HALF NominalRangeConverter::Normalize(DirectX::PackedVector::HALF val) const {
+    return DirectX::PackedVector::XMConvertFloatToHalf(Normalize(DirectX::PackedVector::XMConvertHalfToFloat(val)));
+  }
+
+  // Four packed floats at a time; same formula as the scalar overload.
+  __m128 NominalRangeConverter::Normalize(__m128 sse_data) const {
+    __m128 sse_shift = _mm_set1_ps(static_cast<float>(shift));
+    __m128 sse_scale = _mm_set1_ps(scale);
+    return _mm_sub_ps(_mm_div_ps(sse_data, sse_scale), sse_shift);
+  }
+
+  // [0, 255] --> [0, 255]
+  // ([0, 1] + 0) * 255 --> [0, 255]
+  // ([-1, 1] + 1) * 255 / 2 --> [0, 255]
+  float NominalRangeConverter::Denormalize(float val) const {
+    return scale * (val + shift);
+  }
+
+  // Same decode / compute / re-encode round trip as the HALF Normalize overload.
+  DirectX::PackedVector::HALF NominalRangeConverter::Denormalize(DirectX::PackedVector::HALF val) const {
+    return DirectX::PackedVector::XMConvertFloatToHalf(Denormalize(DirectX::PackedVector::XMConvertHalfToFloat(val)));
+  }
+
+  // Four packed floats at a time; same formula as the scalar overload.
+  __m128 NominalRangeConverter::Denormalize(__m128 sse_data) const {
+    __m128 sse_shift = _mm_set1_ps(static_cast<float>(shift));
+    __m128 sse_scale = _mm_set1_ps(scale);
+    return _mm_mul_ps(_mm_add_ps(sse_data, sse_shift), sse_scale);
+  }
+} // namespace _winml
\ No newline at end of file
diff --git
a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp index d09802e2f7..fe691ea009 100644 --- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp +++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp @@ -613,8 +613,24 @@ void TensorToVideoFrameConverter::ConvertCPUTensorToSoftwareBitmap( ImageTensorChannelType targetChannelType = _winmli::GetChannelTypeFromSoftwareBitmap(softwareBitmap); if (tensorDesc.dataType == kImageTensorDataTypeFloat32) { - WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize(tensorDesc.channelType, targetChannelType, static_cast(pCPUTensor), bufferWidth, height, width, pData)); + WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize( + tensorDesc.channelType, + targetChannelType, + tensorDesc.pixelRange, + static_cast(pCPUTensor), + bufferWidth, + height, + width, + pData)); } else if (tensorDesc.dataType == kImageTensorDataTypeFloat16) { - WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize(tensorDesc.channelType, targetChannelType, static_cast(pCPUTensor), bufferWidth, height, width, pData)); + WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize( + tensorDesc.channelType, + targetChannelType, + tensorDesc.pixelRange, + static_cast(pCPUTensor), + bufferWidth, + height, + width, + pData)); } } \ No newline at end of file diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 19ebde646a..385d1ffcde 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -558,8 +558,22 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor( ImageTensorChannelType channelType = _winmli::GetChannelTypeFromSoftwareBitmap(softwareBitmap); if (tensorDesc.dataType == _winml::kImageTensorDataTypeFloat32) { - WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData(channelType, tensorDesc.channelType, pData, bufferWidth, inputBounds, reinterpret_cast(pCPUTensor))); + 
WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData( + channelType, + tensorDesc.channelType, + tensorDesc.pixelRange, + pData, + bufferWidth, + inputBounds, + reinterpret_cast(pCPUTensor))); } else if (tensorDesc.dataType == _winml::kImageTensorDataTypeFloat16) { - WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData(channelType, tensorDesc.channelType, pData, bufferWidth, inputBounds, reinterpret_cast(pCPUTensor))); + WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData( + channelType, + tensorDesc.channelType, + tensorDesc.pixelRange, + pData, + bufferWidth, + inputBounds, + reinterpret_cast(pCPUTensor))); } } \ No newline at end of file diff --git a/winml/lib/Api.Image/inc/ImageConversionTypes.h b/winml/lib/Api.Image/inc/ImageConversionTypes.h index 489c00c01f..553d56b27d 100644 --- a/winml/lib/Api.Image/inc/ImageConversionTypes.h +++ b/winml/lib/Api.Image/inc/ImageConversionTypes.h @@ -25,9 +25,17 @@ enum ImageTensorChannelType { ImageTensorChannelType_COUNT }; +enum ImageNominalPixelRange { + kNominalRange_0_255, + kNormalized_0_1, + kNormalized_1_1, + ImageNominalPixelRange_COUNT +}; + struct ImageTensorDescription { ImageTensorDataType dataType; ImageTensorChannelType channelType; + ImageNominalPixelRange pixelRange; int64_t sizes[kImageTensorDimensionCountMax]; }; } // namespace _winml \ No newline at end of file diff --git a/winml/lib/Api.Image/inc/NominalRangeConverter.h b/winml/lib/Api.Image/inc/NominalRangeConverter.h new file mode 100644 index 0000000000..10e4a3da33 --- /dev/null +++ b/winml/lib/Api.Image/inc/NominalRangeConverter.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+#pragma once
+
+#include "inc/ImageConversionTypes.h"
+
+namespace _winml {
+// Converts pixel values between [0, 255] and a tensor's nominal pixel range.
+class NominalRangeConverter {
+ public:
+  NominalRangeConverter() = delete;
+  NominalRangeConverter(ImageNominalPixelRange pixelRange);
+  // Normalize: maps [0, 255] pixel values into the nominal range.
+  float Normalize(float val) const;
+
+  DirectX::PackedVector::HALF Normalize(DirectX::PackedVector::HALF val) const;
+
+  __m128 Normalize(__m128 sse_data) const;
+  // Denormalize: maps nominal-range values back to [0, 255].
+  float Denormalize(float val) const;
+
+  DirectX::PackedVector::HALF Denormalize(DirectX::PackedVector::HALF val) const;
+
+  __m128 Denormalize(__m128 sse_data) const;
+
+ private:
+  float scale = 1.f;  // divisor in Normalize, multiplier in Denormalize
+  int32_t shift = 0;  // subtracted in Normalize, added back in Denormalize
+};
+} // namespace _winml
\ No newline at end of file
diff --git a/winml/lib/Api.Ort/OnnxruntimeDescriptorConverter.cpp b/winml/lib/Api.Ort/OnnxruntimeDescriptorConverter.cpp index 3e1ae7de3a..90d02609b7 100644 --- a/winml/lib/Api.Ort/OnnxruntimeDescriptorConverter.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeDescriptorConverter.cpp @@ -40,7 +40,9 @@ static const char* c_unsupported_color_spaces[] = static const char* c_nominal_range_key = "Image.NominalPixelRange"; static const char* c_supported_nominal_ranges[] = { - "NominalRange_0_255"}; + "NominalRange_0_255", + "Normalized_0_1", + "Normalized_1_1"}; namespace _winml { diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp index 206b890961..0a277dc541 100644 --- a/winml/lib/Api/ImageFeatureValue.cpp +++ b/winml/lib/Api/ImageFeatureValue.cpp @@ -174,8 +174,10 @@ static unsigned GetSizeFromTensorDataType(_winml::ImageTensorDataType type) { FAIL_FAST_HR(E_INVALIDARG); } -static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorKind tensorKind, wgi::BitmapPixelFormat pixelFormat, - uint32_t batchSize, uint32_t width, uint32_t height) { +static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorKind tensorKind, + wgi::BitmapPixelFormat pixelFormat, + ImageNominalPixelRange pixelRange, + uint32_t batchSize, uint32_t width, uint32_t height) {
_winml::ImageTensorDescription tensorDescription = {}; tensorDescription.dataType = GetTensorDataTypeFromTensorKind(tensorKind); tensorDescription.sizes[0] = batchSize; @@ -192,6 +194,17 @@ static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorK } else { THROW_HR(E_NOTIMPL); } + + if (pixelRange == ImageNominalPixelRange::ImageNominalPixelRange_NominalRange_0_255) { + tensorDescription.pixelRange = _winml::ImageNominalPixelRange::kNominalRange_0_255; + } else if (pixelRange == ImageNominalPixelRange::ImageNominalPixelRange_Normalized_0_1) { + tensorDescription.pixelRange = _winml::ImageNominalPixelRange::kNormalized_0_1; + } else if (pixelRange == ImageNominalPixelRange::ImageNominalPixelRange_Normalized_1_1) { + tensorDescription.pixelRange = _winml::ImageNominalPixelRange::kNormalized_1_1; + } else { + THROW_HR(E_NOTIMPL); + } + tensorDescription.sizes[2] = height; tensorDescription.sizes[3] = width; @@ -375,8 +388,15 @@ std::optional ImageFeatureValue::GetIn THROW_HR(WINML_ERR_SIZE_MISMATCH); } } + + // Set up ImageNominalPixelRange + ImageNominalPixelRange pixelRange = ImageNominalPixelRange::ImageNominalPixelRange_NominalRange_0_255; //default; + if (spImageDescriptor) { + pixelRange = spImageDescriptor->GetNominalPixelRange(); + } + //NCHW layout - auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), m_batchSize, descriptorWidth, descriptorHeight); + auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), pixelRange, m_batchSize, descriptorWidth, descriptorHeight); return ImageResourceMetadata{bounds, imageTensorDescriptor}; } diff --git a/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx b/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx index 88fb241f32..4ea1d3db44 100644 --- a/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx +++ 
b/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx @@ -1,20 +1,20 @@  OnnxMLTools -0.1.0.0000"onnxml:í +0.1.0.0000"onnxml:û % input_39 -input_40add_3Add"Addkeras_Add_ImageNet_smallZ$ -input_39 +input_40add_3Add"Addkeras_Add_ImageNet_smallZ+ +input_39    ¸ -€Z$ -input_40 +€2IMAGEZ+ +input_40    ¸ -€b^ +€2IMAGEb^ add_3U LH  diff --git a/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx b/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx index c3faef843f..3589f0f8b6 100644 --- a/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx +++ b/winml/test/collateral/models/Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx @@ -1,20 +1,20 @@  OnnxMLTools -0.1.0.0000"onnxml:í +0.1.0.0000"onnxml:û % input_39 -input_40add_3Add"Addkeras_Add_ImageNet_smallZ$ -input_39 +input_40add_3Add"Addkeras_Add_ImageNet_smallZ+ +input_39    ¸ -€Z$ -input_40 +€2IMAGEZ+ +input_40    ¸ -€b^ +€2IMAGEb^ add_3U LH  diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index c69f90401d..062a8eb5ad 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -616,12 +616,12 @@ TEST_F(ImageTests, ImageMetaDataTest) { // supported image metadata ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_255.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Bgra8, true); ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataRgb8_SRGB_0_255.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Rgba8, true); + ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Bgra8, true); + ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Bgra8, true); // unsupported image metadata 
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgra8_SRGB_0_255.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false); ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataRgba8_SRGB_0_255.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Rgba8, false); - ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false); - ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false); ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_16_235.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false); ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_LINEAR_0_255.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false); } diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index 9becd9aecd..7dc0d3d2a3 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -1028,6 +1028,50 @@ static void Scenario22ImageBindingAsGPUTensor() { encoder.FlushAsync().get(); } +static void Scenario23NominalPixelRange() { + std::wstring modulePath = FileHelpers::GetModulePath(); + std::wstring inputImagePath = modulePath + L"1080.jpg"; + + // The following models have single op "add", with different metadata + std::vector modelPaths = { + // Normalized_0_1 and image output + modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", + // Normalized_1_1 and image output + modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx" + }; + + for (uint32_t model_i = 0; model_i < modelPaths.size(); model_i++) { + // load model and create session + auto model = LearningModel::LoadFromFilePath(modelPaths[model_i]); + auto session = LearningModelSession(model, 
LearningModelDevice(LearningModelDeviceKind::DirectX)); + auto binding = LearningModelBinding(session); + + SoftwareBitmap softwareBitmap = FileHelpers::GetSoftwareBitmapFromFile(inputImagePath); + auto videoFrame = VideoFrame::CreateWithSoftwareBitmap(softwareBitmap); + auto imageValue = ImageFeatureValue::CreateFromVideoFrame(videoFrame); + + // Create Zero tensor + auto inputShape = std::vector{ 1, 3, 1080, 1920 }; + auto inputData = std::vector(3 * 1080 * 1920, 0); + auto zeroValue = + TensorFloat::CreateFromIterable( + inputShape, + winrt::single_threaded_vector(std::move(inputData)).GetView()); + // bind inputs + binding.Bind(L"input_39", imageValue); + binding.Bind(L"input_40", zeroValue); + + VideoFrame outputimage(BitmapPixelFormat::Bgra8, 1920, 1080); + ImageFeatureValue outputIfv = ImageFeatureValue::CreateFromVideoFrame(outputimage); + binding.Bind(L"add_3", outputIfv); + + winrt::hstring correlationId; + session.EvaluateAsync(binding, correlationId).get(); + + WINML_EXPECT_TRUE(VerifyHelper(imageValue, outputIfv)); + } +} + static void QuantizedModels() { // load a model std::wstring filePath = FileHelpers::GetModulePath() + L"onnxzoo_lotus_inception_v1-dq.onnx"; @@ -1408,6 +1452,7 @@ const ScenarioTestsApi& getapi() { Scenario8SetDeviceSampleCPU, Scenario17DevDiagnostics, Scenario22ImageBindingAsCPUTensor, + Scenario23NominalPixelRange, QuantizedModels, EncryptedStream, Scenario3SoftwareBitmapInputBinding, @@ -1450,6 +1495,7 @@ const ScenarioTestsApi& getapi() { api.Scenario20bLoadBindEvalReplacementCustomOperatorCPU = SkipTest; api.Scenario21RunModel2ChainZ = SkipTest; api.Scenario22ImageBindingAsGPUTensor = SkipTest; + api.Scenario23NominalPixelRange = SkipTest; api.MsftQuantizedModels = SkipTest; api.SyncVsAsync = SkipTest; api.CustomCommandQueueWithFence = SkipTest; @@ -1480,6 +1526,7 @@ const ScenarioTestsApi& getapi() { api.Scenario21RunModel2ChainZ = SkipTest; api.Scenario22ImageBindingAsCPUTensor = SkipTest; 
api.Scenario22ImageBindingAsGPUTensor = SkipTest; + api.Scenario23NominalPixelRange = SkipTest; api.CustomCommandQueueWithFence = SkipTest; api.ReuseVideoFrame = SkipTest; api.D2DInterop = SkipTest; diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.h b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.h index 4293323b24..5202b6c18b 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.h +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.h @@ -14,6 +14,7 @@ struct ScenarioTestsApi VoidTest Scenario8SetDeviceSampleCPU; VoidTest Scenario17DevDiagnostics; VoidTest Scenario22ImageBindingAsCPUTensor; + VoidTest Scenario23NominalPixelRange; VoidTest QuantizedModels; VoidTest EncryptedStream; VoidTest Scenario3SoftwareBitmapInputBinding; @@ -54,6 +55,7 @@ WINML_TEST(ScenarioCppWinrtTests, Scenario8SetDeviceSampleDefault) WINML_TEST(ScenarioCppWinrtTests, Scenario8SetDeviceSampleCPU) WINML_TEST(ScenarioCppWinrtTests, Scenario17DevDiagnostics) WINML_TEST(ScenarioCppWinrtTests, Scenario22ImageBindingAsCPUTensor) +WINML_TEST(ScenarioCppWinrtTests, Scenario23NominalPixelRange) WINML_TEST(ScenarioCppWinrtTests, QuantizedModels) WINML_TEST(ScenarioCppWinrtTests, EncryptedStream) WINML_TEST(ScenarioCppWinrtTests, Scenario3SoftwareBitmapInputBinding)