support Normalized_0_1 and Normalized_1_1 (#4800)

* support Normalized_0_1 and Normalized_1_1

* add tests for Normalized_1_1

* fix build error

* fix imagetests failure

* support detensorization and add more tests

* fix build

* remove added models

* disable gpu tests for CPU pipeline

* refactor based on comments and moved two added models

* merge Normalizer and Denormalizer into NominalRangeConverter

* add comments

* little change
This commit is contained in:
Xiang Zhang 2020-08-24 13:13:50 -07:00 committed by GitHub
parent 268d2283c0
commit 824fcbfd9d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 317 additions and 95 deletions

View file

@ -323,6 +323,7 @@ add_library(winml_lib_image STATIC
${winml_lib_api_image_dir}/inc/ImageConverter.h
${winml_lib_api_image_dir}/inc/TensorToVideoFrameConverter.h
${winml_lib_api_image_dir}/inc/VideoFrameToTensorConverter.h
${winml_lib_api_image_dir}/inc/NominalRangeConverter.h
${winml_lib_api_image_dir}/CpuDetensorizer.h
${winml_lib_api_image_dir}/CpuTensorizer.h
${winml_lib_api_image_dir}/pch.h
@ -333,6 +334,7 @@ add_library(winml_lib_image STATIC
${winml_lib_api_image_dir}/ImageConverter.cpp
${winml_lib_api_image_dir}/TensorToVideoFrameConverter.cpp
${winml_lib_api_image_dir}/VideoFrameToTensorConverter.cpp
${winml_lib_api_image_dir}/NominalRangeConverter.cpp
)
# Compiler options

View file

@ -4,6 +4,7 @@
#pragma once
#include "inc/ImageConversionTypes.h"
#include "inc/NominalRangeConverter.h"
namespace _winml {
@ -13,7 +14,8 @@ class CpuDetensorizer {
static HRESULT Detensorize(
_In_ ImageTensorChannelType formatFrom,
_In_ ImageTensorChannelType formatTo,
_In_ const T* pCPUTensor,
_In_ ImageNominalPixelRange pixelRange,
_In_ T* pCPUTensor,
_In_ uint32_t bufferWidth,
_In_ uint32_t tensorHeight,
_In_ uint32_t tensorWidth,
@ -30,6 +32,8 @@ class CpuDetensorizer {
uint32_t end = bufferWidth * tensorHeight;
size_t tensorPlaneSize = tensorWidth * tensorHeight;
auto nominalRangeConverter = NominalRangeConverter(pixelRange);
if (formatFrom == formatTo && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) {
for (uint32_t i = 0; i < tensorHeight; i++) {
BYTE* pPixel = pData;
@ -40,7 +44,8 @@ class CpuDetensorizer {
pCPUTensor + tensorPlaneSize * 2 + i * tensorWidth,
tensorWidth,
pPixel,
bytesPerPixel);
bytesPerPixel,
nominalRangeConverter);
pData += bufferWidth;
}
@ -54,7 +59,8 @@ class CpuDetensorizer {
pCPUTensor + i * tensorWidth,
tensorWidth,
pPixel,
bytesPerPixel);
bytesPerPixel,
nominalRangeConverter);
pData += bufferWidth;
}
@ -62,7 +68,7 @@ class CpuDetensorizer {
// just replicate the gray data across each channel
for (uint32_t i = 0; i < end; i += bufferWidth) {
for (uint32_t j = i; j < i + bytesPerRow; j += 4) {
BYTE bGray = DetensorizeValue<T>(pCPUTensor);
BYTE bGray = DetensorizeValue<T>(pCPUTensor, nominalRangeConverter);
pData[j] = bGray;
pData[j + 1] = bGray;
pData[j + 2] = bGray;
@ -73,7 +79,7 @@ class CpuDetensorizer {
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && formatTo == kImageTensorChannelTypeGRAY8) {
for (uint32_t i = 0; i < end; i += bufferWidth) {
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
BYTE bGray = DetensorizeValue<T>(pCPUTensor);
BYTE bGray = DetensorizeValue<T>(pCPUTensor, nominalRangeConverter);
pData[j] = bGray;
pCPUTensor++;
}
@ -83,9 +89,9 @@ class CpuDetensorizer {
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
BYTE red, green, blue;
blue = DetensorizeValue(pCPUTensor);
green = DetensorizeValue(pCPUTensor + tensorPlaneSize);
red = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2);
blue = DetensorizeValue(pCPUTensor, nominalRangeConverter);
green = DetensorizeValue(pCPUTensor + tensorPlaneSize, nominalRangeConverter);
red = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2, nominalRangeConverter);
pData[j] = static_cast<BYTE>(0.2126f * red + 0.7152f * green + 0.0722f * blue);
pCPUTensor++;
@ -96,9 +102,9 @@ class CpuDetensorizer {
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
BYTE red, green, blue;
red = DetensorizeValue(pCPUTensor);
green = DetensorizeValue(pCPUTensor + tensorPlaneSize);
blue = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2);
red = DetensorizeValue(pCPUTensor, nominalRangeConverter);
green = DetensorizeValue(pCPUTensor + tensorPlaneSize, nominalRangeConverter);
blue = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2, nominalRangeConverter);
pData[j] = static_cast<BYTE>(0.2126f * red + 0.7152f * green + 0.0722f * blue);
pCPUTensor++;
@ -114,18 +120,21 @@ class CpuDetensorizer {
private:
template <typename T>
static float ReadTensor(const T* pCPUTensor) {
return *pCPUTensor;
static float ReadTensor(const T* pCPUTensor, const NominalRangeConverter& nominalRangeConverter) {
return nominalRangeConverter.Denormalize(*pCPUTensor);
}
template <>
static float ReadTensor<DirectX::PackedVector::HALF>(const DirectX::PackedVector::HALF* pCPUTensor) {
return DirectX::PackedVector::XMConvertHalfToFloat(*pCPUTensor);
static float ReadTensor<DirectX::PackedVector::HALF>(
const DirectX::PackedVector::HALF* pCPUTensor,
const NominalRangeConverter& nominalRangeConverter) {
return nominalRangeConverter.Denormalize(
DirectX::PackedVector::XMConvertHalfToFloat(*pCPUTensor));
}
template <typename T>
static BYTE DetensorizeValue(const T* pCPUTensor) {
return static_cast<BYTE>(std::max(0.0f, std::min(255.0f, ReadTensor(pCPUTensor) + 0.5f)));
static BYTE DetensorizeValue(const T* pCPUTensor, const NominalRangeConverter& nominalRangeConverter) {
return static_cast<BYTE>(std::max(0.0f, std::min(255.0f, ReadTensor(pCPUTensor, nominalRangeConverter) + 0.5f)));
}
template <typename T>
@ -135,14 +144,15 @@ class CpuDetensorizer {
const T* zChannel,
uint32_t tensorWidth,
BYTE* pData,
uint32_t bytesPerPixel) {
uint32_t bytesPerPixel,
const NominalRangeConverter& nominalRangeConverter) {
BYTE* pPixel = pData;
uint32_t tensorWidthRemaining = tensorWidth;
while (tensorWidthRemaining > 0) {
pPixel[0] = DetensorizeValue(xChannel);
pPixel[1] = DetensorizeValue(yChannel);
pPixel[2] = DetensorizeValue(zChannel);
pPixel[0] = DetensorizeValue(xChannel, nominalRangeConverter);
pPixel[1] = DetensorizeValue(yChannel, nominalRangeConverter);
pPixel[2] = DetensorizeValue(zChannel, nominalRangeConverter);
pPixel[3] = 255;
pPixel += 4;
@ -161,7 +171,9 @@ class CpuDetensorizer {
const float* zChannel,
uint32_t tensorWidth,
BYTE* pData,
uint32_t bytesPerPixel) {
uint32_t bytesPerPixel,
const NominalRangeConverter& nominalRangeConverter
) {
BYTE* pPixel = pData;
uint32_t tensorWidthRemaining = tensorWidth;
@ -175,22 +187,22 @@ class CpuDetensorizer {
while (tensorWidthRemaining >= 8) {
// Load, saturate, and convert to ints, 8 - 32 bit floats from X channel
__m128i vXIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(xChannel), maxv));
__m128i vXIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(xChannel + 4), maxv));
__m128i vXIntsLo = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(xChannel)), maxv));
__m128i vXIntsHi = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(xChannel + 4)), maxv));
// Pack 32 bit ints into 16 bit ints
__m128i vXWords = _mm_packs_epi32(vXIntsLo, vXIntsHi);
// Load, saturate, and convert to ints, 8 - 32 bit floats from Y channel
__m128i vYIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(yChannel), maxv));
__m128i vYIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(yChannel + 4), maxv));
__m128i vYIntsLo = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(yChannel)), maxv));
__m128i vYIntsHi = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(yChannel + 4)), maxv));
// Pack 32 bit ints into 16 bit ints
__m128i vYWords = _mm_packs_epi32(vYIntsLo, vYIntsHi);
// Load, saturate, and convert to ints, 8 - 32 bit floats from Z channel
__m128i vZIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(zChannel), maxv));
__m128i vZIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(zChannel + 4), maxv));
__m128i vZIntsLo = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(zChannel)), maxv));
__m128i vZIntsHi = _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(zChannel + 4)), maxv));
// Pack 32 bit ints into 16 bit ints
__m128i vZWords = _mm_packs_epi32(vZIntsLo, vZIntsHi);
@ -221,9 +233,9 @@ class CpuDetensorizer {
// Anything remaining deal with it one at a time
while (tensorWidthRemaining > 0) {
pPixel[0] = DetensorizeValue(xChannel);
pPixel[1] = DetensorizeValue(yChannel);
pPixel[2] = DetensorizeValue(zChannel);
pPixel[0] = DetensorizeValue(xChannel, nominalRangeConverter);
pPixel[1] = DetensorizeValue(yChannel, nominalRangeConverter);
pPixel[2] = DetensorizeValue(zChannel, nominalRangeConverter);
pPixel[3] = 255;
pPixel += bytesPerPixel;

View file

@ -4,6 +4,7 @@
#pragma once
#include "inc/ImageConversionTypes.h"
#include "inc/NominalRangeConverter.h"
namespace _winml {
@ -13,6 +14,7 @@ class CpuTensorizer {
static HRESULT TensorizeData(
_In_ ImageTensorChannelType formatFrom,
_In_ ImageTensorChannelType formatTo,
_In_ ImageNominalPixelRange pixelRange,
_In_ BYTE* pBuffer,
_In_ UINT32 bufferWidth,
_In_ const wgi::BitmapBounds& inputBounds,
@ -33,6 +35,8 @@ class CpuTensorizer {
uint32_t xElements = inputBounds.Width - inputBounds.X;
uint32_t yElements = inputBounds.Height - inputBounds.Y;
auto nominalRangeConverter = NominalRangeConverter(pixelRange);
if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) {
// Convert BGR8 -> BGR8 or RGB8 -> RGB8
for (uint32_t y = 0; y < yElements; y++) {
@ -42,7 +46,8 @@ class CpuTensorizer {
pCPUTensor + (inputBounds.Height * inputBounds.Width) + y * inputBounds.Width,
pCPUTensor + (inputBounds.Height * inputBounds.Width) * 2 + y * inputBounds.Width,
xElements,
bytesPerPixel);
bytesPerPixel,
nominalRangeConverter);
}
} else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) {
// Convert RGB8 -> BGR8 or BGR8 -> RGB8
@ -53,7 +58,8 @@ class CpuTensorizer {
pCPUTensor + (inputBounds.Height * inputBounds.Width) + y * inputBounds.Width,
pCPUTensor + y * inputBounds.Width,
xElements,
bytesPerPixel);
bytesPerPixel,
nominalRangeConverter);
}
} else if (formatTo == kImageTensorChannelTypeGRAY8 && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) {
// Convert BGR8 -> GRAY8 or RGB8 -> GRAY8
@ -66,7 +72,7 @@ class CpuTensorizer {
float green = float(pBuffer[j + 1]);
float blue = float(pBuffer[j + blueIncrement]);
float gray = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(static_cast<BYTE>(gray));
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(static_cast<BYTE>(gray), nominalRangeConverter);
pixelInd++;
}
}
@ -74,9 +80,9 @@ class CpuTensorizer {
// Convert GRAY8 -> BGR8 or GRAY8 -> RGB8
for (UINT32 i = start; i < end; i += bufferWidth) {
for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) {
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(pBuffer[j]);
pCPUTensor[(inputBounds.Height * inputBounds.Width) + pixelInd] = ConvertByteToFloat<T>(pBuffer[j]);
pCPUTensor[(inputBounds.Height * inputBounds.Width * 2) + pixelInd] = ConvertByteToFloat<T>(pBuffer[j]);
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
pCPUTensor[(inputBounds.Height * inputBounds.Width) + pixelInd] = ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
pCPUTensor[(inputBounds.Height * inputBounds.Width * 2) + pixelInd] = ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
pixelInd++;
}
}
@ -84,7 +90,7 @@ class CpuTensorizer {
// Convert GRAY8 -> GRAY8
for (UINT32 i = start; i < end; i += bufferWidth) {
for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) {
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(pBuffer[j]);
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
pixelInd++;
}
}
@ -97,16 +103,17 @@ class CpuTensorizer {
}
private:
template <typename T>
static T ConvertByteToFloat(const BYTE& input);
static T ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter);
template <>
static float ConvertByteToFloat(const BYTE& input) {
return static_cast<float>(input);
static float ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter) {
return nominalRangeConverter.Normalize(static_cast<float>(input));
}
template <>
static DirectX::PackedVector::HALF ConvertByteToFloat(const BYTE& input) {
return DirectX::PackedVector::XMConvertFloatToHalf(input);
static DirectX::PackedVector::HALF ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter) {
return nominalRangeConverter.Normalize(DirectX::PackedVector::XMConvertFloatToHalf(input));
}
template <typename T>
@ -116,29 +123,30 @@ class CpuTensorizer {
_Inout_ T* yChannel,
_Inout_ T* zChannel,
uint32_t pixelElements,
uint32_t bytesPerPixel) {
uint32_t bytesPerPixel,
const NominalRangeConverter& nominalRangeConverter) {
UINT32 j;
for (j = 0; j < (pixelElements & 0xFFFFFFFC); j += 4) {
xChannel[j] = ConvertByteToFloat<T>(pBuffer[0]);
yChannel[j] = ConvertByteToFloat<T>(pBuffer[1]);
zChannel[j] = ConvertByteToFloat<T>(pBuffer[2]);
xChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[4]);
yChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[5]);
zChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[6]);
xChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[8]);
yChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[9]);
zChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[10]);
xChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[12]);
yChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[13]);
zChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[14]);
xChannel[j] = ConvertByteToFloat<T>(pBuffer[0], nominalRangeConverter);
yChannel[j] = ConvertByteToFloat<T>(pBuffer[1], nominalRangeConverter);
zChannel[j] = ConvertByteToFloat<T>(pBuffer[2], nominalRangeConverter);
xChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[4], nominalRangeConverter);
yChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[5], nominalRangeConverter);
zChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[6], nominalRangeConverter);
xChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[8], nominalRangeConverter);
yChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[9], nominalRangeConverter);
zChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[10], nominalRangeConverter);
xChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[12], nominalRangeConverter);
yChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[13], nominalRangeConverter);
zChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[14], nominalRangeConverter);
pBuffer += bytesPerPixel * 4;
}
for (; j < pixelElements; j++) {
xChannel[j] = ConvertByteToFloat<T>(pBuffer[0]);
yChannel[j] = ConvertByteToFloat<T>(pBuffer[1]);
zChannel[j] = ConvertByteToFloat<T>(pBuffer[2]);
xChannel[j] = ConvertByteToFloat<T>(pBuffer[0], nominalRangeConverter);
yChannel[j] = ConvertByteToFloat<T>(pBuffer[1], nominalRangeConverter);
zChannel[j] = ConvertByteToFloat<T>(pBuffer[2], nominalRangeConverter);
pBuffer += bytesPerPixel;
}
}
@ -151,7 +159,8 @@ class CpuTensorizer {
_Inout_ float* yChannel,
_Inout_ float* zChannel,
uint32_t pixelElements,
uint32_t bytesPerPixel) {
uint32_t bytesPerPixel,
const NominalRangeConverter& nominalRangeConverter) {
assert(bytesPerPixel == 4);
__m128i ZeroVector = _mm_setzero_si128();
@ -189,8 +198,8 @@ class CpuTensorizer {
__m128i vXIntsHi = _mm_unpackhi_epi16(vXWords, ZeroVector);
// store 256 bits of X channel Floats
_mm_storeu_ps(xChannel, _mm_cvtepi32_ps(vXIntsLo));
_mm_storeu_ps(xChannel + 4, _mm_cvtepi32_ps(vXIntsHi));
_mm_storeu_ps(xChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vXIntsLo)));
_mm_storeu_ps(xChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vXIntsHi)));
xChannel += 8;
// unpack again for Y
@ -199,8 +208,8 @@ class CpuTensorizer {
__m128i vYIntsLo = _mm_unpacklo_epi16(vYWords, ZeroVector);
__m128i vYIntsHi = _mm_unpackhi_epi16(vYWords, ZeroVector);
_mm_storeu_ps(yChannel, _mm_cvtepi32_ps(vYIntsLo));
_mm_storeu_ps(yChannel + 4, _mm_cvtepi32_ps(vYIntsHi));
_mm_storeu_ps(yChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vYIntsLo)));
_mm_storeu_ps(yChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vYIntsHi)));
yChannel += 8;
// unpack again for Z
@ -209,8 +218,8 @@ class CpuTensorizer {
__m128i vZIntsLo = _mm_unpacklo_epi16(vZWords, ZeroVector);
__m128i vZIntsHi = _mm_unpackhi_epi16(vZWords, ZeroVector);
_mm_storeu_ps(zChannel, _mm_cvtepi32_ps(vZIntsLo));
_mm_storeu_ps(zChannel + 4, _mm_cvtepi32_ps(vZIntsHi));
_mm_storeu_ps(zChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vZIntsLo)));
_mm_storeu_ps(zChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vZIntsHi)));
zChannel += 8;
pBuffer += 32;
@ -230,7 +239,7 @@ class CpuTensorizer {
__m128i vInts2 = _mm_unpacklo_epi16(vWords1, ZeroVector);
__m128i vInts3 = _mm_unpackhi_epi16(vWords1, ZeroVector);
// Convert to floats
// Normalize to floats
__m128 vFloats0 = _mm_cvtepi32_ps(vInts0);
__m128 vFloats1 = _mm_cvtepi32_ps(vInts1);
__m128 vFloats2 = _mm_cvtepi32_ps(vInts2);
@ -240,9 +249,9 @@ class CpuTensorizer {
_MM_TRANSPOSE4_PS(vFloats0, vFloats1, vFloats2, vFloats3);
// Drop alpha channel transposed to vFloats3 write out rest
_mm_storeu_ps(xChannel, vFloats0);
_mm_storeu_ps(yChannel, vFloats1);
_mm_storeu_ps(zChannel, vFloats2);
_mm_storeu_ps(xChannel, nominalRangeConverter.Normalize(vFloats0));
_mm_storeu_ps(yChannel, nominalRangeConverter.Normalize(vFloats1));
_mm_storeu_ps(zChannel, nominalRangeConverter.Normalize(vFloats2));
xChannel += 4;
yChannel += 4;
@ -253,9 +262,9 @@ class CpuTensorizer {
// Any remainder just do one at a time
for (uint32_t j = 0; j < pixelElements; j++) {
xChannel[j] = static_cast<float>(pBuffer[0]);
yChannel[j] = static_cast<float>(pBuffer[1]);
zChannel[j] = static_cast<float>(pBuffer[2]);
xChannel[j] = nominalRangeConverter.Normalize(static_cast<float>(pBuffer[0]));
yChannel[j] = nominalRangeConverter.Normalize(static_cast<float>(pBuffer[1]));
zChannel[j] = nominalRangeConverter.Normalize(static_cast<float>(pBuffer[2]));
pBuffer += bytesPerPixel;
}
}

View file

@ -0,0 +1,59 @@
#include "pch.h"
#include "inc/NominalRangeConverter.h"
namespace _winml {

// Builds a converter between raw pixel values in [0, 255] and the nominal
// pixel range selected by |pixelRange|.
//
//   Normalization   (pixel  -> tensor): val / scale - shift
//   Denormalization (tensor -> pixel) : (val + shift) * scale
//
// Any value that is not one of the three known ranges falls back to the
// identity mapping so that scale/shift are never left uninitialized.
NominalRangeConverter::NominalRangeConverter(ImageNominalPixelRange pixelRange) {
  switch (pixelRange) {
    case ImageNominalPixelRange::kNormalized_0_1:
      scale = 255.f;
      shift = 0;
      break;
    case ImageNominalPixelRange::kNormalized_1_1:
      scale = (255.f / 2.f);
      shift = 1;
      break;
    case ImageNominalPixelRange::kNominalRange_0_255:
    default:
      scale = 1.f;
      shift = 0;
      break;
  }
}

// Normalization, per nominal range:
//   kNominalRange_0_255: [0, 255]               --> [0, 255]
//   kNormalized_0_1    : [0, 255] / 255         --> [0, 1]
//   kNormalized_1_1    : [0, 255] * 2 / 255 - 1 --> [-1, 1]
float NominalRangeConverter::Normalize(float val) const {
  return val / scale - shift;
}

// HALF is a typedef for a 16-bit unsigned integer that stores the bit pattern
// of a half-precision float, so arithmetic must not be applied to it directly.
// Widen to float, normalize, then pack the result back into a HALF.
DirectX::PackedVector::HALF NominalRangeConverter::Normalize(DirectX::PackedVector::HALF val) const {
  const float widened = DirectX::PackedVector::XMConvertHalfToFloat(val);
  return DirectX::PackedVector::XMConvertFloatToHalf(Normalize(widened));
}

// Normalizes four packed floats at once using the same formula as the scalar overload.
__m128 NominalRangeConverter::Normalize(__m128 sse_data) const {
  const __m128 sse_scale = _mm_set1_ps(scale);
  const __m128 sse_shift = _mm_set1_ps(static_cast<float>(shift));
  const __m128 sse_dived = _mm_div_ps(sse_data, sse_scale);
  return _mm_sub_ps(sse_dived, sse_shift);
}

// Denormalization, per nominal range:
//   kNominalRange_0_255: [0, 255]               --> [0, 255]
//   kNormalized_0_1    : ([0, 1] + 0) * 255     --> [0, 255]
//   kNormalized_1_1    : ([-1, 1] + 1) * 255 / 2 --> [0, 255]
float NominalRangeConverter::Denormalize(float val) const {
  return scale * (val + shift);
}

// See Normalize(HALF): the value is widened to float before any arithmetic
// and packed back into a HALF afterwards.
DirectX::PackedVector::HALF NominalRangeConverter::Denormalize(DirectX::PackedVector::HALF val) const {
  const float widened = DirectX::PackedVector::XMConvertHalfToFloat(val);
  return DirectX::PackedVector::XMConvertFloatToHalf(Denormalize(widened));
}

// Denormalizes four packed floats at once using the same formula as the scalar overload.
__m128 NominalRangeConverter::Denormalize(__m128 sse_data) const {
  const __m128 sse_scale = _mm_set1_ps(scale);
  const __m128 sse_shift = _mm_set1_ps(static_cast<float>(shift));
  const __m128 sse_added = _mm_add_ps(sse_data, sse_shift);
  return _mm_mul_ps(sse_added, sse_scale);
}

}  // namespace _winml

View file

@ -613,8 +613,24 @@ void TensorToVideoFrameConverter::ConvertCPUTensorToSoftwareBitmap(
ImageTensorChannelType targetChannelType = _winmli::GetChannelTypeFromSoftwareBitmap(softwareBitmap);
if (tensorDesc.dataType == kImageTensorDataTypeFloat32) {
WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize<float>(tensorDesc.channelType, targetChannelType, static_cast<float*>(pCPUTensor), bufferWidth, height, width, pData));
WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize<float>(
tensorDesc.channelType,
targetChannelType,
tensorDesc.pixelRange,
static_cast<float*>(pCPUTensor),
bufferWidth,
height,
width,
pData));
} else if (tensorDesc.dataType == kImageTensorDataTypeFloat16) {
WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize<DirectX::PackedVector::HALF>(tensorDesc.channelType, targetChannelType, static_cast<DirectX::PackedVector::HALF*>(pCPUTensor), bufferWidth, height, width, pData));
WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize<DirectX::PackedVector::HALF>(
tensorDesc.channelType,
targetChannelType,
tensorDesc.pixelRange,
static_cast<DirectX::PackedVector::HALF*>(pCPUTensor),
bufferWidth,
height,
width,
pData));
}
}

View file

@ -558,8 +558,22 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor(
ImageTensorChannelType channelType = _winmli::GetChannelTypeFromSoftwareBitmap(softwareBitmap);
if (tensorDesc.dataType == _winml::kImageTensorDataTypeFloat32) {
WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData<float>(channelType, tensorDesc.channelType, pData, bufferWidth, inputBounds, reinterpret_cast<float*>(pCPUTensor)));
WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData<float>(
channelType,
tensorDesc.channelType,
tensorDesc.pixelRange,
pData,
bufferWidth,
inputBounds,
reinterpret_cast<float*>(pCPUTensor)));
} else if (tensorDesc.dataType == _winml::kImageTensorDataTypeFloat16) {
WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData<DirectX::PackedVector::HALF>(channelType, tensorDesc.channelType, pData, bufferWidth, inputBounds, reinterpret_cast<DirectX::PackedVector::HALF*>(pCPUTensor)));
WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData<DirectX::PackedVector::HALF>(
channelType,
tensorDesc.channelType,
tensorDesc.pixelRange,
pData,
bufferWidth,
inputBounds,
reinterpret_cast<DirectX::PackedVector::HALF*>(pCPUTensor)));
}
}

View file

@ -25,9 +25,17 @@ enum ImageTensorChannelType {
ImageTensorChannelType_COUNT
};
// Nominal range of the pixel values stored in an image tensor.
// NominalRangeConverter maps between these ranges and raw [0, 255] pixels.
enum ImageNominalPixelRange {
// Tensor values stay in [0, 255] (identity conversion).
kNominalRange_0_255,
// Tensor values are scaled into [0, 1].
kNormalized_0_1,
// Tensor values are scaled and shifted into [-1, 1].
kNormalized_1_1,
// Trailing sentinel; as the last enumerator its value equals the number of ranges above.
ImageNominalPixelRange_COUNT
};
// Describes how an image is laid out as a tensor; consumed by the CPU
// tensorizer/detensorizer when converting between bitmaps and tensors.
struct ImageTensorDescription {
// Element type of the tensor (e.g. float32 or float16).
ImageTensorDataType dataType;
// Channel ordering/format of the tensor (e.g. BGR8, RGB8, GRAY8).
ImageTensorChannelType channelType;
// Nominal range the pixel values are normalized to inside the tensor.
ImageNominalPixelRange pixelRange;
// Tensor dimensions; the descriptor factory fills these in NCHW order
// (sizes[0] = batch, sizes[2] = height, sizes[3] = width).
int64_t sizes[kImageTensorDimensionCountMax];
};
} // namespace _winml

View file

@ -0,0 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "inc/ImageConversionTypes.h"
namespace _winml {
// Converts pixel values between the raw [0, 255] range and the nominal pixel
// range declared for an image tensor (see ImageNominalPixelRange).
//
//   Normalize   : pixel  -> tensor value, computed as val / scale - shift
//   Denormalize : tensor -> pixel value,  computed as (val + shift) * scale
//
// Overloads are provided for scalar float, half-precision (HALF) and four
// packed floats (__m128).
class NominalRangeConverter {
 public:
  NominalRangeConverter() = delete;
  // Selects scale/shift for the requested nominal range.
  NominalRangeConverter(ImageNominalPixelRange pixelRange);

  float Normalize(float val) const;
  DirectX::PackedVector::HALF Normalize(DirectX::PackedVector::HALF val) const;
  __m128 Normalize(__m128 sse_data) const;

  float Denormalize(float val) const;
  DirectX::PackedVector::HALF Denormalize(DirectX::PackedVector::HALF val) const;
  __m128 Denormalize(__m128 sse_data) const;

 private:
  // Default to the identity mapping so the members are never indeterminate,
  // even if the constructor is handed a range it does not recognize.
  float scale = 1.f;
  int32_t shift = 0;
};
} // namespace _winml

View file

@ -40,7 +40,9 @@ static const char* c_unsupported_color_spaces[] =
static const char* c_nominal_range_key = "Image.NominalPixelRange";
static const char* c_supported_nominal_ranges[] =
{
"NominalRange_0_255"};
"NominalRange_0_255",
"Normalized_0_1",
"Normalized_1_1"};
namespace _winml {

View file

@ -174,8 +174,10 @@ static unsigned GetSizeFromTensorDataType(_winml::ImageTensorDataType type) {
FAIL_FAST_HR(E_INVALIDARG);
}
static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorKind tensorKind, wgi::BitmapPixelFormat pixelFormat,
uint32_t batchSize, uint32_t width, uint32_t height) {
static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorKind tensorKind,
wgi::BitmapPixelFormat pixelFormat,
ImageNominalPixelRange pixelRange,
uint32_t batchSize, uint32_t width, uint32_t height) {
_winml::ImageTensorDescription tensorDescription = {};
tensorDescription.dataType = GetTensorDataTypeFromTensorKind(tensorKind);
tensorDescription.sizes[0] = batchSize;
@ -192,6 +194,17 @@ static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorK
} else {
THROW_HR(E_NOTIMPL);
}
if (pixelRange == ImageNominalPixelRange::ImageNominalPixelRange_NominalRange_0_255) {
tensorDescription.pixelRange = _winml::ImageNominalPixelRange::kNominalRange_0_255;
} else if (pixelRange == ImageNominalPixelRange::ImageNominalPixelRange_Normalized_0_1) {
tensorDescription.pixelRange = _winml::ImageNominalPixelRange::kNormalized_0_1;
} else if (pixelRange == ImageNominalPixelRange::ImageNominalPixelRange_Normalized_1_1) {
tensorDescription.pixelRange = _winml::ImageNominalPixelRange::kNormalized_1_1;
} else {
THROW_HR(E_NOTIMPL);
}
tensorDescription.sizes[2] = height;
tensorDescription.sizes[3] = width;
@ -375,8 +388,15 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
THROW_HR(WINML_ERR_SIZE_MISMATCH);
}
}
// Set up ImageNominalPixelRange
ImageNominalPixelRange pixelRange = ImageNominalPixelRange::ImageNominalPixelRange_NominalRange_0_255; //default;
if (spImageDescriptor) {
pixelRange = spImageDescriptor->GetNominalPixelRange();
}
//NCHW layout
auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), m_batchSize, descriptorWidth, descriptorHeight);
auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), pixelRange, m_batchSize, descriptorWidth, descriptorHeight);
return ImageResourceMetadata{bounds, imageTensorDescriptor};
}

View file

@ -1,20 +1,20 @@
 OnnxMLTools
0.1.0.0000"onnxml:í
0.1.0.0000"onnxml:û
%
input_39
input_40add_3Add"Addkeras_Add_ImageNet_smallZ$
input_39
input_40add_3Add"Addkeras_Add_ImageNet_smallZ+
input_39



И
Z$
input_40
2IMAGEZ+
input_40



И
b^
2IMAGEb^
add_3U
LH


View file

@ -1,20 +1,20 @@
 OnnxMLTools
0.1.0.0000"onnxml:í
0.1.0.0000"onnxml:û
%
input_39
input_40add_3Add"Addkeras_Add_ImageNet_smallZ$
input_39
input_40add_3Add"Addkeras_Add_ImageNet_smallZ+
input_39



И
Z$
input_40
2IMAGEZ+
input_40



И
b^
2IMAGEb^
add_3U
LH


View file

@ -616,12 +616,12 @@ TEST_F(ImageTests, ImageMetaDataTest) {
// supported image metadata
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_255.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Bgra8, true);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataRgb8_SRGB_0_255.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Rgba8, true);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Bgra8, true);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx", BitmapAlphaMode::Premultiplied, BitmapPixelFormat::Bgra8, true);
// unsupported image metadata
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgra8_SRGB_0_255.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataRgba8_SRGB_0_255.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Rgba8, false);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_16_235.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false);
ValidateOutputImageMetaData(L"Add_ImageNet1920WithImageMetadataBgr8_LINEAR_0_255.onnx", BitmapAlphaMode::Straight, BitmapPixelFormat::Bgra8, false);
}

View file

@ -1028,6 +1028,50 @@ static void Scenario22ImageBindingAsGPUTensor() {
encoder.FlushAsync().get();
}
static void Scenario23NominalPixelRange() {
std::wstring modulePath = FileHelpers::GetModulePath();
std::wstring inputImagePath = modulePath + L"1080.jpg";
// The following models have single op "add", with different metadata
std::vector<std::wstring> modelPaths = {
// Normalized_0_1 and image output
modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_0_1.onnx",
// Normalized_1_1 and image output
modulePath + L"Add_ImageNet1920WithImageMetadataBgr8_SRGB_1_1.onnx"
};
for (uint32_t model_i = 0; model_i < modelPaths.size(); model_i++) {
// load model and create session
auto model = LearningModel::LoadFromFilePath(modelPaths[model_i]);
auto session = LearningModelSession(model, LearningModelDevice(LearningModelDeviceKind::DirectX));
auto binding = LearningModelBinding(session);
SoftwareBitmap softwareBitmap = FileHelpers::GetSoftwareBitmapFromFile(inputImagePath);
auto videoFrame = VideoFrame::CreateWithSoftwareBitmap(softwareBitmap);
auto imageValue = ImageFeatureValue::CreateFromVideoFrame(videoFrame);
// Create Zero tensor
auto inputShape = std::vector<int64_t>{ 1, 3, 1080, 1920 };
auto inputData = std::vector<float>(3 * 1080 * 1920, 0);
auto zeroValue =
TensorFloat::CreateFromIterable(
inputShape,
winrt::single_threaded_vector<float>(std::move(inputData)).GetView());
// bind inputs
binding.Bind(L"input_39", imageValue);
binding.Bind(L"input_40", zeroValue);
VideoFrame outputimage(BitmapPixelFormat::Bgra8, 1920, 1080);
ImageFeatureValue outputIfv = ImageFeatureValue::CreateFromVideoFrame(outputimage);
binding.Bind(L"add_3", outputIfv);
winrt::hstring correlationId;
session.EvaluateAsync(binding, correlationId).get();
WINML_EXPECT_TRUE(VerifyHelper(imageValue, outputIfv));
}
}
static void QuantizedModels() {
// load a model
std::wstring filePath = FileHelpers::GetModulePath() + L"onnxzoo_lotus_inception_v1-dq.onnx";
@ -1408,6 +1452,7 @@ const ScenarioTestsApi& getapi() {
Scenario8SetDeviceSampleCPU,
Scenario17DevDiagnostics,
Scenario22ImageBindingAsCPUTensor,
Scenario23NominalPixelRange,
QuantizedModels,
EncryptedStream,
Scenario3SoftwareBitmapInputBinding,
@ -1450,6 +1495,7 @@ const ScenarioTestsApi& getapi() {
api.Scenario20bLoadBindEvalReplacementCustomOperatorCPU = SkipTest;
api.Scenario21RunModel2ChainZ = SkipTest;
api.Scenario22ImageBindingAsGPUTensor = SkipTest;
api.Scenario23NominalPixelRange = SkipTest;
api.MsftQuantizedModels = SkipTest;
api.SyncVsAsync = SkipTest;
api.CustomCommandQueueWithFence = SkipTest;
@ -1480,6 +1526,7 @@ const ScenarioTestsApi& getapi() {
api.Scenario21RunModel2ChainZ = SkipTest;
api.Scenario22ImageBindingAsCPUTensor = SkipTest;
api.Scenario22ImageBindingAsGPUTensor = SkipTest;
api.Scenario23NominalPixelRange = SkipTest;
api.CustomCommandQueueWithFence = SkipTest;
api.ReuseVideoFrame = SkipTest;
api.D2DInterop = SkipTest;

View file

@ -14,6 +14,7 @@ struct ScenarioTestsApi
VoidTest Scenario8SetDeviceSampleCPU;
VoidTest Scenario17DevDiagnostics;
VoidTest Scenario22ImageBindingAsCPUTensor;
VoidTest Scenario23NominalPixelRange;
VoidTest QuantizedModels;
VoidTest EncryptedStream;
VoidTest Scenario3SoftwareBitmapInputBinding;
@ -54,6 +55,7 @@ WINML_TEST(ScenarioCppWinrtTests, Scenario8SetDeviceSampleDefault)
WINML_TEST(ScenarioCppWinrtTests, Scenario8SetDeviceSampleCPU)
WINML_TEST(ScenarioCppWinrtTests, Scenario17DevDiagnostics)
WINML_TEST(ScenarioCppWinrtTests, Scenario22ImageBindingAsCPUTensor)
WINML_TEST(ScenarioCppWinrtTests, Scenario23NominalPixelRange)
WINML_TEST(ScenarioCppWinrtTests, QuantizedModels)
WINML_TEST(ScenarioCppWinrtTests, EncryptedStream)
WINML_TEST(ScenarioCppWinrtTests, Scenario3SoftwareBitmapInputBinding)