mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-17 21:10:43 +00:00
Enable C++20 builds for the DML EP and WinML API: 1) Add the missing typename for templated types. 2) Add an unmove helper for inline references to rvalue temporaries. This is okay because, per the standard, a temporary bound to a reference parameter in a function call exists until the end of the full expression containing that function call; if the function returns a reference that outlives the full expression, it becomes a dangling reference. 3) static is no longer needed for template specializations. --------- Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
265 lines
9.6 KiB
C++
265 lines
9.6 KiB
C++
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
#pragma once
|
|
|
|
#include "inc/ImageConversionTypes.h"
|
|
#include "inc/NominalRangeConverter.h"
|
|
|
|
namespace _winml {
|
|
|
|
class CpuDetensorizer {
|
|
public:
|
|
template <typename T>
|
|
static HRESULT Detensorize(
|
|
_In_ ImageTensorChannelType formatFrom,
|
|
_In_ ImageTensorChannelType formatTo,
|
|
_In_ winml::LearningModelPixelRange pixelRange,
|
|
_In_ T* pCPUTensor,
|
|
_In_ uint32_t bufferWidth,
|
|
_In_ uint32_t tensorHeight,
|
|
_In_ uint32_t tensorWidth,
|
|
_Inout_ BYTE* pData
|
|
) {
|
|
#pragma warning(push)
|
|
#pragma warning(disable : 26014 \
|
|
) // warning about possible out of bounds accesing pData, but input is checked for BGRA8 format, so uiCapacity should be in multiples of 4 \
|
|
// output is BGRA8: so blue at i, green is at i + 1, red is at i + 2
|
|
|
|
uint32_t bytesPerPixel = formatTo == kImageTensorChannelTypeGRAY8 ? 1 : 4;
|
|
|
|
// bufferWidth may have padding because of optimization, but bytesPerRow includes only the real tensor data. We need to jump
|
|
// over bufferWidth's extra padding
|
|
uint32_t bytesPerRow = tensorWidth * bytesPerPixel;
|
|
uint32_t end = bufferWidth * tensorHeight;
|
|
size_t tensorPlaneSize = tensorWidth * tensorHeight;
|
|
|
|
auto nominalRangeConverter = NominalRangeConverter(pixelRange);
|
|
|
|
if (formatFrom == formatTo && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) {
|
|
for (uint32_t i = 0; i < tensorHeight; i++) {
|
|
BYTE* pPixel = pData;
|
|
|
|
InterleaveRowFloatToByte(
|
|
pCPUTensor + i * tensorWidth,
|
|
pCPUTensor + tensorPlaneSize + i * tensorWidth,
|
|
pCPUTensor + tensorPlaneSize * 2 + i * tensorWidth,
|
|
tensorWidth,
|
|
pPixel,
|
|
bytesPerPixel,
|
|
nominalRangeConverter
|
|
);
|
|
|
|
pData += bufferWidth;
|
|
}
|
|
} else if ((formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) || (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8)) {
|
|
for (uint32_t i = 0; i < tensorHeight; i++) {
|
|
BYTE* pPixel = pData;
|
|
|
|
InterleaveRowFloatToByte(
|
|
pCPUTensor + tensorPlaneSize * 2 + i * tensorWidth,
|
|
pCPUTensor + tensorPlaneSize + i * tensorWidth,
|
|
pCPUTensor + i * tensorWidth,
|
|
tensorWidth,
|
|
pPixel,
|
|
bytesPerPixel,
|
|
nominalRangeConverter
|
|
);
|
|
|
|
pData += bufferWidth;
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) {
|
|
// just replicate the gray data across each channel
|
|
for (uint32_t i = 0; i < end; i += bufferWidth) {
|
|
for (uint32_t j = i; j < i + bytesPerRow; j += 4) {
|
|
BYTE bGray = DetensorizeValue<T>(pCPUTensor, nominalRangeConverter);
|
|
pData[j] = bGray;
|
|
pData[j + 1] = bGray;
|
|
pData[j + 2] = bGray;
|
|
pData[j + 3] = 255;
|
|
pCPUTensor++;
|
|
}
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && formatTo == kImageTensorChannelTypeGRAY8) {
|
|
for (uint32_t i = 0; i < end; i += bufferWidth) {
|
|
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
|
|
BYTE bGray = DetensorizeValue<T>(pCPUTensor, nominalRangeConverter);
|
|
pData[j] = bGray;
|
|
pCPUTensor++;
|
|
}
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeGRAY8) {
|
|
for (uint32_t i = 0; i < end; i += bufferWidth) {
|
|
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
|
|
BYTE red, green, blue;
|
|
|
|
blue = DetensorizeValue(pCPUTensor, nominalRangeConverter);
|
|
green = DetensorizeValue(pCPUTensor + tensorPlaneSize, nominalRangeConverter);
|
|
red = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2, nominalRangeConverter);
|
|
|
|
pData[j] = static_cast<BYTE>(0.2126f * red + 0.7152f * green + 0.0722f * blue);
|
|
pCPUTensor++;
|
|
}
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeGRAY8) {
|
|
for (uint32_t i = 0; i < end; i += bufferWidth) {
|
|
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
|
|
BYTE red, green, blue;
|
|
|
|
red = DetensorizeValue(pCPUTensor, nominalRangeConverter);
|
|
green = DetensorizeValue(pCPUTensor + tensorPlaneSize, nominalRangeConverter);
|
|
blue = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2, nominalRangeConverter);
|
|
|
|
pData[j] = static_cast<BYTE>(0.2126f * red + 0.7152f * green + 0.0722f * blue);
|
|
pCPUTensor++;
|
|
}
|
|
}
|
|
}
|
|
#pragma warning(pop)
|
|
else {
|
|
return E_INVALIDARG;
|
|
}
|
|
return S_OK;
|
|
}
|
|
|
|
private:
|
|
template <typename T>
|
|
static float ReadTensor(const T* pCPUTensor, const NominalRangeConverter& nominalRangeConverter) {
|
|
return nominalRangeConverter.Denormalize(*pCPUTensor);
|
|
}
|
|
|
|
// clang-format off
|
|
template <>
|
|
#if _MSVC_LANG < 202002L
|
|
static
|
|
#endif
|
|
float ReadTensor<DirectX::PackedVector::HALF>(
|
|
const DirectX::PackedVector::HALF* pCPUTensor, const NominalRangeConverter& nominalRangeConverter
|
|
) {
|
|
return nominalRangeConverter.Denormalize(DirectX::PackedVector::XMConvertHalfToFloat(*pCPUTensor));
|
|
}
|
|
|
|
template <typename T>
|
|
static BYTE DetensorizeValue(const T* pCPUTensor, const NominalRangeConverter& nominalRangeConverter) {
|
|
return static_cast<BYTE>(std::max(0.0f, std::min(255.0f, ReadTensor(pCPUTensor, nominalRangeConverter) + 0.5f)));
|
|
}
|
|
|
|
template <typename T>
|
|
static void InterleaveRowFloatToByte(
|
|
const T* xChannel,
|
|
const T* yChannel,
|
|
const T* zChannel,
|
|
uint32_t tensorWidth,
|
|
BYTE* pData,
|
|
uint32_t bytesPerPixel,
|
|
const NominalRangeConverter& nominalRangeConverter
|
|
) {
|
|
BYTE* pPixel = pData;
|
|
uint32_t tensorWidthRemaining = tensorWidth;
|
|
|
|
while (tensorWidthRemaining > 0) {
|
|
pPixel[0] = DetensorizeValue(xChannel, nominalRangeConverter);
|
|
pPixel[1] = DetensorizeValue(yChannel, nominalRangeConverter);
|
|
pPixel[2] = DetensorizeValue(zChannel, nominalRangeConverter);
|
|
pPixel[3] = 255;
|
|
|
|
pPixel += 4;
|
|
xChannel++;
|
|
yChannel++;
|
|
zChannel++;
|
|
tensorWidthRemaining--;
|
|
}
|
|
}
|
|
|
|
// clang-format off
|
|
#if defined(_M_AMD64) || defined(_M_IX86)
// SSE specialization for float tensors, compiled only for x86/x64 targets.
//
// Pixels are converted eight at a time: each channel is denormalized, limited to 255 from above,
// converted to 32-bit integers and narrowed to bytes with saturating packs (the signed->unsigned pack
// is what clamps negative values to 0). The X, Y, Z and alpha bytes are then interleaved into
// XYZA XYZA ... order and stored with two unaligned 16-byte writes. Any pixels left over (fewer than
// eight) are converted one at a time with DetensorizeValue.
//
// Note: the vector path converts with _mm_cvtps_epi32, while the scalar tail adds 0.5 and truncates.
//
//   xChannel, yChannel, zChannel  sources for output bytes 0, 1 and 2 of every pixel.
//   tensorWidth                   number of pixels to convert.
//   pData                         start of the destination row.
//   bytesPerPixel                 distance in bytes between consecutive output pixels. The vector path
//                                 stores 32 contiguous bytes per 8 pixels, i.e. it assumes 4.
//   nominalRangeConverter         converter used to denormalize the tensor values.
template <>
#if _MSVC_LANG < 202002L
static
#endif
void InterleaveRowFloatToByte(
  const float* xChannel,
  const float* yChannel,
  const float* zChannel,
  uint32_t tensorWidth,
  BYTE* pData,
  uint32_t bytesPerPixel,
  const NominalRangeConverter& nominalRangeConverter
) {
  BYTE* pPixel = pData;
  uint32_t tensorWidthRemaining = tensorWidth;

  const __m128 maxv = _mm_set1_ps(255.0f);

  // Alpha register: every 16-bit lane holds 0x00FF, which packs down to a byte value of 255.
  const __m128i alpha = _mm_set1_epi16(0x00FF);

  // Load 4 floats, denormalize, limit to 255 from above and convert to 32-bit ints.
  const auto loadFourAsInts = [&](const float* source) -> __m128i {
    return _mm_cvtps_epi32(_mm_min_ps(nominalRangeConverter.Denormalize(_mm_loadu_ps(source)), maxv));
  };

  // Load 8 floats from one channel and pack the converted values into eight 16-bit ints.
  const auto loadEightAsWords = [&](const float* source) -> __m128i {
    const __m128i intsLo = loadFourAsInts(source);
    const __m128i intsHi = loadFourAsInts(source + 4);
    return _mm_packs_epi32(intsLo, intsHi);
  };

  while (tensorWidthRemaining >= 8) {
    const __m128i vXWords = loadEightAsWords(xChannel);
    const __m128i vYWords = loadEightAsWords(yChannel);
    const __m128i vZWords = loadEightAsWords(zChannel);

    // Pack 16 bit ints into 8 bit uints: [X0..X7 Z0..Z7] and [Y0..Y7 A0..A7]
    const __m128i vXZBytes = _mm_packus_epi16(vXWords, vZWords);
    const __m128i vYABytes = _mm_packus_epi16(vYWords, alpha);

    // Interleave bytes into XY order
    const __m128i vXYBytesInterleaved = _mm_unpacklo_epi8(vXZBytes, vYABytes);
    // Interleave bytes into ZA order
    const __m128i vZABytesInterleaved = _mm_unpackhi_epi8(vXZBytes, vYABytes);

    // Interleave 16 bits to get XYZA XYZA ordering
    const __m128i vPixelBytesLo = _mm_unpacklo_epi16(vXYBytesInterleaved, vZABytesInterleaved);
    const __m128i vPixelBytesHi = _mm_unpackhi_epi16(vXYBytesInterleaved, vZABytesInterleaved);

    // Write out bytes now in proper order
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pPixel), vPixelBytesLo);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pPixel + 16), vPixelBytesHi);

    xChannel += 8;
    yChannel += 8;
    zChannel += 8;
    pPixel += 8 * static_cast<size_t>(bytesPerPixel);
    tensorWidthRemaining -= 8;
  }

  // Anything remaining deal with it one at a time
  while (tensorWidthRemaining > 0) {
    pPixel[0] = DetensorizeValue(xChannel, nominalRangeConverter);
    pPixel[1] = DetensorizeValue(yChannel, nominalRangeConverter);
    pPixel[2] = DetensorizeValue(zChannel, nominalRangeConverter);
    pPixel[3] = 255;

    pPixel += bytesPerPixel;
    ++xChannel;
    ++yChannel;
    ++zChannel;
    --tensorWidthRemaining;
  }
}
#endif
|
|
};
|
|
} // namespace _winml
|