mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-17 21:10:43 +00:00
Enable cpp20 builds for DML EP and WinML API.
1) Add missing `typename` for templated types.
2) Add an unmove helper for inline references to rvalue temporaries. This is okay because, per the standard, a temporary bound to a reference parameter in a function call exists until the end of the full expression containing that function call; if the function returns a reference which outlives the full expression, it becomes a dangling reference.
3) `static` is now not needed for template specializations.
---------
Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
296 lines
12 KiB
C++
296 lines
12 KiB
C++
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
#pragma once
|
|
|
|
#include "inc/ImageConversionTypes.h"
|
|
#include "inc/NominalRangeConverter.h"
|
|
|
|
namespace _winml {
|
|
|
|
class CpuTensorizer {
|
|
public:
|
|
template <typename T>
|
|
static HRESULT TensorizeData(
|
|
_In_ ImageTensorChannelType formatFrom,
|
|
_In_ ImageTensorChannelType formatTo,
|
|
_In_ winml::LearningModelPixelRange pixelRange,
|
|
_In_ BYTE* pBuffer,
|
|
_In_ UINT32 bufferWidth,
|
|
_In_ const wgi::BitmapBounds& inputBounds,
|
|
_Inout_ T* pCPUTensor
|
|
) {
|
|
#pragma warning(push)
|
|
#pragma warning(disable : 26014 \
|
|
) // warning about possible out of bounds accesing pData, but input is checked for BGRA8 format, so uiCapacity should be in multiples of 4 \
|
|
// input is BGRA8: so blue at i, green is at i + 1, red is at i + 2
|
|
|
|
uint32_t bytesPerPixel = formatFrom == kImageTensorChannelTypeGRAY8 ? 1 : 4;
|
|
|
|
// bufferWidth may have padding because of optimization, but bytesPerRow includes only the real tensor data. We need to jump
|
|
// over bufferWidth's extra padding
|
|
uint32_t bytesPerRow = inputBounds.Width * bytesPerPixel;
|
|
uint32_t start = (inputBounds.Y * bufferWidth) + (inputBounds.X * bytesPerPixel);
|
|
uint32_t end = start + bufferWidth * inputBounds.Height;
|
|
uint32_t pixelInd = 0;
|
|
|
|
uint32_t xElements = inputBounds.Width - inputBounds.X;
|
|
uint32_t yElements = inputBounds.Height - inputBounds.Y;
|
|
|
|
auto nominalRangeConverter = NominalRangeConverter(pixelRange);
|
|
|
|
if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) {
|
|
// Convert BGR8 -> BGR8 or RGB8 -> RGB8
|
|
for (uint64_t y = 0; y < yElements; y++) {
|
|
DeinterleaveRowByteToFloat(
|
|
pBuffer + y * bufferWidth + start,
|
|
pCPUTensor + y * inputBounds.Width,
|
|
pCPUTensor + (inputBounds.Height * inputBounds.Width) + y * inputBounds.Width,
|
|
pCPUTensor + (inputBounds.Height * inputBounds.Width) * 2 + y * inputBounds.Width,
|
|
xElements,
|
|
bytesPerPixel,
|
|
nominalRangeConverter
|
|
);
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) {
|
|
// Convert RGB8 -> BGR8 or BGR8 -> RGB8
|
|
for (uint32_t y = 0; y < yElements; y++) {
|
|
DeinterleaveRowByteToFloat(
|
|
pBuffer + y * bufferWidth + start,
|
|
pCPUTensor + (inputBounds.Height * inputBounds.Width) * 2 + y * inputBounds.Width,
|
|
pCPUTensor + (inputBounds.Height * inputBounds.Width) + y * inputBounds.Width,
|
|
pCPUTensor + y * inputBounds.Width,
|
|
xElements,
|
|
bytesPerPixel,
|
|
nominalRangeConverter
|
|
);
|
|
}
|
|
} else if (formatTo == kImageTensorChannelTypeGRAY8 && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) {
|
|
// Convert BGR8 -> GRAY8 or RGB8 -> GRAY8
|
|
uint32_t blueIncrement = formatFrom == kImageTensorChannelTypeBGR8 ? 0 : 2;
|
|
uint32_t redIncrement = formatFrom == kImageTensorChannelTypeBGR8 ? 2 : 0;
|
|
|
|
for (UINT32 i = start; i < end; i += bufferWidth) {
|
|
for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) {
|
|
float red = float(pBuffer[j + redIncrement]);
|
|
float green = float(pBuffer[j + 1]);
|
|
float blue = float(pBuffer[j + blueIncrement]);
|
|
float gray = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
|
|
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(static_cast<BYTE>(gray), nominalRangeConverter);
|
|
pixelInd++;
|
|
}
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) {
|
|
// Convert GRAY8 -> BGR8 or GRAY8 -> RGB8
|
|
for (UINT32 i = start; i < end; i += bufferWidth) {
|
|
for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) {
|
|
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
|
|
pCPUTensor[(inputBounds.Height * inputBounds.Width) + pixelInd] =
|
|
ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
|
|
pCPUTensor[(inputBounds.Height * inputBounds.Width * 2) + pixelInd] =
|
|
ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
|
|
pixelInd++;
|
|
}
|
|
}
|
|
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && formatTo == kImageTensorChannelTypeGRAY8) {
|
|
// Convert GRAY8 -> GRAY8
|
|
for (UINT32 i = start; i < end; i += bufferWidth) {
|
|
for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) {
|
|
pCPUTensor[pixelInd] = ConvertByteToFloat<T>(pBuffer[j], nominalRangeConverter);
|
|
pixelInd++;
|
|
}
|
|
}
|
|
}
|
|
#pragma warning(pop)
|
|
else {
|
|
return E_INVALIDARG;
|
|
}
|
|
return S_OK;
|
|
}
|
|
|
|
private:
|
|
template <typename T>
|
|
static T ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter);
|
|
|
|
// clang-format off
|
|
template <>
|
|
#if _MSVC_LANG < 202002L
|
|
static
|
|
#endif
|
|
float ConvertByteToFloat(const BYTE& input, const NominalRangeConverter& nominalRangeConverter) {
|
|
return nominalRangeConverter.Normalize(static_cast<float>(input));
|
|
}
|
|
|
|
// clang-format off
|
|
template <>
|
|
#if _MSVC_LANG < 202002L
|
|
static
|
|
#endif
|
|
DirectX::PackedVector::HALF ConvertByteToFloat(
|
|
const BYTE& input,
|
|
const NominalRangeConverter& nominalRangeConverter
|
|
) {
|
|
return nominalRangeConverter.Normalize(DirectX::PackedVector::XMConvertFloatToHalf(input));
|
|
}
|
|
|
|
template <typename T>
|
|
static void DeinterleaveRowByteToFloat(
|
|
_In_ BYTE* pBuffer,
|
|
_Inout_ T* xChannel,
|
|
_Inout_ T* yChannel,
|
|
_Inout_ T* zChannel,
|
|
uint32_t pixelElements,
|
|
uint32_t bytesPerPixel,
|
|
const NominalRangeConverter& nominalRangeConverter
|
|
) {
|
|
UINT32 j;
|
|
|
|
for (j = 0; j < (pixelElements & 0xFFFFFFFC); j += 4) {
|
|
xChannel[j] = ConvertByteToFloat<T>(pBuffer[0], nominalRangeConverter);
|
|
yChannel[j] = ConvertByteToFloat<T>(pBuffer[1], nominalRangeConverter);
|
|
zChannel[j] = ConvertByteToFloat<T>(pBuffer[2], nominalRangeConverter);
|
|
xChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[4], nominalRangeConverter);
|
|
yChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[5], nominalRangeConverter);
|
|
zChannel[j + 1] = ConvertByteToFloat<T>(pBuffer[6], nominalRangeConverter);
|
|
xChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[8], nominalRangeConverter);
|
|
yChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[9], nominalRangeConverter);
|
|
zChannel[j + 2] = ConvertByteToFloat<T>(pBuffer[10], nominalRangeConverter);
|
|
xChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[12], nominalRangeConverter);
|
|
yChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[13], nominalRangeConverter);
|
|
zChannel[j + 3] = ConvertByteToFloat<T>(pBuffer[14], nominalRangeConverter);
|
|
pBuffer += bytesPerPixel * 4;
|
|
}
|
|
|
|
for (; j < pixelElements; j++) {
|
|
xChannel[j] = ConvertByteToFloat<T>(pBuffer[0], nominalRangeConverter);
|
|
yChannel[j] = ConvertByteToFloat<T>(pBuffer[1], nominalRangeConverter);
|
|
zChannel[j] = ConvertByteToFloat<T>(pBuffer[2], nominalRangeConverter);
|
|
pBuffer += bytesPerPixel;
|
|
}
|
|
}
|
|
|
|
// clang-format off
|
|
#if defined(_M_AMD64) || defined(_M_IX86)
|
|
template <>
|
|
#if _MSVC_LANG < 202002L
|
|
static
|
|
#endif
|
|
void DeinterleaveRowByteToFloat(
|
|
_In_ BYTE* pBuffer,
|
|
_Inout_ float* xChannel,
|
|
_Inout_ float* yChannel,
|
|
_Inout_ float* zChannel,
|
|
uint32_t pixelElements,
|
|
uint32_t bytesPerPixel,
|
|
const NominalRangeConverter& nominalRangeConverter
|
|
) {
|
|
assert(bytesPerPixel == 4);
|
|
|
|
__m128i ZeroVector = _mm_setzero_si128();
|
|
while (pixelElements >= 8) {
|
|
// Load 8 Pixels into 2 Registers
|
|
// vBytes0 = X0 Y0 Z0 A0 X1 Y1...
|
|
// vBytes0 = X4 Y4 Z4 A4 X2 Y2...
|
|
__m128i vBytes0 = _mm_loadu_si128((__m128i*)pBuffer);
|
|
__m128i vBytes1 = _mm_loadu_si128((__m128i*)(pBuffer + 16));
|
|
|
|
// Shuffle to get
|
|
// vi0 = X0 X4 Y0 Y4...A1 A5 (A is Alpha which is ignored)
|
|
// vi1 = X2 X6 Y2 Y6...A2 A6
|
|
__m128i vi0 = _mm_unpacklo_epi8(vBytes0, vBytes1);
|
|
__m128i vi1 = _mm_unpackhi_epi8(vBytes0, vBytes1);
|
|
|
|
// Shuffle again to get
|
|
// vi0 = X0 X2 X4 X6...A4 A6 (All even byes)
|
|
// vi1 = X1 X3 X5 X7...A3 A7 (All odd bytes)
|
|
__m128i vi2 = _mm_unpacklo_epi8(vi0, vi1);
|
|
__m128i vi3 = _mm_unpackhi_epi8(vi0, vi1);
|
|
|
|
// Shuffle last time to get desired order
|
|
// vi0 = X0 X1 X2 X3...Y6 Y7 (All even byes)
|
|
// vi1 = Z0 Z1 Z2 Z3...A6 A7 (All odd bytes)
|
|
__m128i vi4 = _mm_unpacklo_epi8(vi2, vi3);
|
|
__m128i vi5 = _mm_unpackhi_epi8(vi2, vi3);
|
|
|
|
// unpack with zeros to get 16 bit ints
|
|
// vXWords = X0 X1...X6 X7
|
|
__m128i vXWords = _mm_unpacklo_epi8(vi4, ZeroVector);
|
|
|
|
// unpack again with zeros to get 32 bit ints
|
|
__m128i vXIntsLo = _mm_unpacklo_epi16(vXWords, ZeroVector);
|
|
__m128i vXIntsHi = _mm_unpackhi_epi16(vXWords, ZeroVector);
|
|
|
|
// store 256 bits of X channel Floats
|
|
_mm_storeu_ps(xChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vXIntsLo)));
|
|
_mm_storeu_ps(xChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vXIntsHi)));
|
|
xChannel += 8;
|
|
|
|
// unpack again for Y
|
|
__m128i vYWords = _mm_unpackhi_epi8(vi4, ZeroVector);
|
|
|
|
__m128i vYIntsLo = _mm_unpacklo_epi16(vYWords, ZeroVector);
|
|
__m128i vYIntsHi = _mm_unpackhi_epi16(vYWords, ZeroVector);
|
|
|
|
_mm_storeu_ps(yChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vYIntsLo)));
|
|
_mm_storeu_ps(yChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vYIntsHi)));
|
|
yChannel += 8;
|
|
|
|
// unpack again for Z
|
|
__m128i vZWords = _mm_unpacklo_epi8(vi5, ZeroVector);
|
|
|
|
__m128i vZIntsLo = _mm_unpacklo_epi16(vZWords, ZeroVector);
|
|
__m128i vZIntsHi = _mm_unpackhi_epi16(vZWords, ZeroVector);
|
|
|
|
_mm_storeu_ps(zChannel, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vZIntsLo)));
|
|
_mm_storeu_ps(zChannel + 4, nominalRangeConverter.Normalize(_mm_cvtepi32_ps(vZIntsHi)));
|
|
zChannel += 8;
|
|
|
|
pBuffer += 32;
|
|
pixelElements -= 8;
|
|
}
|
|
if (pixelElements >= 4) {
|
|
// load 4 pixels = 16 values
|
|
__m128i vBytes = _mm_loadu_si128((__m128i*)pBuffer);
|
|
|
|
// unpack to 16 bits
|
|
__m128i vWords0 = _mm_unpacklo_epi8(vBytes, ZeroVector);
|
|
__m128i vWords1 = _mm_unpackhi_epi8(vBytes, ZeroVector);
|
|
|
|
// unpack to 32 bits
|
|
__m128i vInts0 = _mm_unpacklo_epi16(vWords0, ZeroVector);
|
|
__m128i vInts1 = _mm_unpackhi_epi16(vWords0, ZeroVector);
|
|
__m128i vInts2 = _mm_unpacklo_epi16(vWords1, ZeroVector);
|
|
__m128i vInts3 = _mm_unpackhi_epi16(vWords1, ZeroVector);
|
|
|
|
// Normalize to floats
|
|
__m128 vFloats0 = _mm_cvtepi32_ps(vInts0);
|
|
__m128 vFloats1 = _mm_cvtepi32_ps(vInts1);
|
|
__m128 vFloats2 = _mm_cvtepi32_ps(vInts2);
|
|
__m128 vFloats3 = _mm_cvtepi32_ps(vInts3);
|
|
|
|
// We want have row but need cols so transpose 4x4 matrix
|
|
_MM_TRANSPOSE4_PS(vFloats0, vFloats1, vFloats2, vFloats3);
|
|
|
|
// Drop alpha channel transposed to vFloats3 write out rest
|
|
_mm_storeu_ps(xChannel, nominalRangeConverter.Normalize(vFloats0));
|
|
_mm_storeu_ps(yChannel, nominalRangeConverter.Normalize(vFloats1));
|
|
_mm_storeu_ps(zChannel, nominalRangeConverter.Normalize(vFloats2));
|
|
|
|
xChannel += 4;
|
|
yChannel += 4;
|
|
zChannel += 4;
|
|
pBuffer += 4 * 4;
|
|
pixelElements -= 4;
|
|
}
|
|
|
|
// Any remainder just do one at a time
|
|
for (uint32_t j = 0; j < pixelElements; j++) {
|
|
xChannel[j] = nominalRangeConverter.Normalize(static_cast<float>(pBuffer[0]));
|
|
yChannel[j] = nominalRangeConverter.Normalize(static_cast<float>(pBuffer[1]));
|
|
zChannel[j] = nominalRangeConverter.Normalize(static_cast<float>(pBuffer[2]));
|
|
pBuffer += bytesPerPixel;
|
|
}
|
|
}
|
|
#endif
|
|
};
|
|
} // namespace _winml
|