mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-19 02:03:52 +00:00
Don't reduce warning level for CUDA build on Windows (#19663)
### Description
<!-- Describe your changes. -->
Address warnings so all the ORT projects build with /W4 on Windows. Mainly:
- unused parameters
- variables shadowing other ones

### Motivation and Context
<!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
#19588 started on this.
This commit is contained in:
parent
a788514027
commit
db59cec82f
55 changed files with 315 additions and 219 deletions
|
|
@ -1274,11 +1274,7 @@ endif()
|
|||
#Dependencies end. In the next we'll enable "treat warning as error"
|
||||
|
||||
#Adjust warning flags
|
||||
if (onnxruntime_USE_CUDA)
|
||||
set_msvc_c_cpp_compiler_warning_level(3)
|
||||
else()
|
||||
set_msvc_c_cpp_compiler_warning_level(4)
|
||||
endif()
|
||||
set_msvc_c_cpp_compiler_warning_level(4)
|
||||
|
||||
set(onnxruntime_DELAYLOAD_FLAGS "")
|
||||
|
||||
|
|
|
|||
|
|
@ -141,18 +141,22 @@
|
|||
if (HAS_GUARD_CF)
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /guard:cf>")
|
||||
endif()
|
||||
|
||||
if (HAS_QSPECTRE)
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /Qspectre>")
|
||||
endif()
|
||||
|
||||
foreach(ORT_FLAG ${ORT_WARNING_FLAGS})
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler \"${ORT_FLAG}\">")
|
||||
endforeach()
|
||||
|
||||
# CUDA 11.3+ supports parallel compilation
|
||||
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads
|
||||
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.3)
|
||||
option(onnxruntime_NVCC_THREADS "Number of threads that NVCC can use for compilation." 1)
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
|
||||
endif()
|
||||
|
||||
if (UNIX)
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler -Wno-reorder>"
|
||||
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-reorder>")
|
||||
|
|
@ -162,6 +166,13 @@
|
|||
#mutex.cuh(91): warning C4834: discarding return value of function with 'nodiscard' attribute
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4834>")
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4127>")
|
||||
if (MSVC)
|
||||
# the VS warnings for 'Conditional Expression is Constant' are spurious as they don't handle multiple conditions
|
||||
# e.g. `if (std::is_same_v<T, float> && not_a_const)` will generate the warning even though constexpr cannot
|
||||
# be used due to `&& not_a_const`. This affects too many places for it to be reasonable to disable at a finer
|
||||
# granularity.
|
||||
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers)
|
||||
|
|
@ -187,7 +198,7 @@
|
|||
target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (onnxruntime_USE_TRITON_KERNEL)
|
||||
# compile triton kernel, generate .a and .h files
|
||||
include(onnxruntime_compile_triton_kernel.cmake)
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ struct CudaContext : public CustomOpContext {
|
|||
|
||||
template <typename T>
|
||||
T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) {
|
||||
if (sizeof(T) > sizeof(void*)) {
|
||||
if constexpr (sizeof(T) > sizeof(void*)) {
|
||||
ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT);
|
||||
}
|
||||
const auto& ort_api = Ort::GetApi();
|
||||
|
|
|
|||
|
|
@ -843,11 +843,11 @@ void InvokeAddBiasTransposeTrt(
|
|||
|
||||
template <>
|
||||
void LaunchAddBiasTransposeTrt(
|
||||
cudaStream_t stream, const int max_threads_per_block,
|
||||
const int batch_size, const int sequence_length,
|
||||
const int num_heads, const int head_size,
|
||||
const float* biases, const float* query, const float* key, const float* value, float* output,
|
||||
bool is_cross_attention, int kv_sequence_length) {
|
||||
cudaStream_t /*stream*/, const int /*max_threads_per_block*/,
|
||||
const int /*batch_size*/, const int /*sequence_length*/,
|
||||
const int /*num_heads*/, const int /*head_size*/,
|
||||
const float* /*biases*/, const float* /*query*/, const float* /*key*/, const float* /*value*/, float* /*output*/,
|
||||
bool /*is_cross_attention*/, int /*kv_sequence_length*/) {
|
||||
ORT_ENFORCE(false, "Shall not call this since fused kernel does not support float input.");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,12 +58,12 @@ size_t AlignSize(size_t bytes) {
|
|||
return bytesAligned;
|
||||
}
|
||||
|
||||
void CumulatedSequenceLengthCache::Initialize(int32_t sequence_length, cudaStream_t stream) {
|
||||
if (this->sequence_length != sequence_length) {
|
||||
void CumulatedSequenceLengthCache::Initialize(int32_t seq_length, cudaStream_t stream) {
|
||||
if (this->sequence_length != seq_length) {
|
||||
ORT_ENFORCE(buffer.get() != nullptr && this->max_batch_size > 0);
|
||||
LaunchTrtSequenceOffset(reinterpret_cast<int32_t*>(buffer.get()), nullptr,
|
||||
this->max_batch_size, sequence_length, stream);
|
||||
this->sequence_length = sequence_length;
|
||||
this->max_batch_size, seq_length, stream);
|
||||
this->sequence_length = seq_length;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -213,9 +213,9 @@ Status FusedTrtCrossAttention(
|
|||
|
||||
template <>
|
||||
Status FusedTrtCrossAttention<float>(
|
||||
cudaStream_t stream,
|
||||
contrib::AttentionParameters& parameters,
|
||||
AttentionData<float>& data) {
|
||||
cudaStream_t /*stream*/,
|
||||
contrib::AttentionParameters& /*parameters*/,
|
||||
AttentionData<float>& /*data*/) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED,
|
||||
"Trt fused cross attention does not support float tensor");
|
||||
}
|
||||
|
|
@ -276,9 +276,9 @@ Status FusedTrtSelfAttention(
|
|||
// Template Specialization for float type
|
||||
template <>
|
||||
Status FusedTrtSelfAttention<float>(
|
||||
cudaStream_t stream,
|
||||
contrib::AttentionParameters& parameters,
|
||||
AttentionData<float>& data) {
|
||||
cudaStream_t /*stream*/,
|
||||
contrib::AttentionParameters& /*parameters*/,
|
||||
AttentionData<float>& /*data*/) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED,
|
||||
"Trt fused attention does not support float tensor");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -231,7 +231,7 @@ Status PrepareQkv_MHA_PackedQKV(contrib::AttentionParameters& parameters,
|
|||
AttentionData<T>& data,
|
||||
cudaStream_t stream,
|
||||
int max_threads_per_block,
|
||||
T* q, T* k, T* v, AttentionQkvFormat& qkv_format) {
|
||||
T* /*q*/, T* /*k*/, T* /*v*/, AttentionQkvFormat& qkv_format) {
|
||||
const int batch_size = parameters.batch_size;
|
||||
const int sequence_length = parameters.sequence_length;
|
||||
const int num_heads = parameters.num_heads;
|
||||
|
|
@ -279,7 +279,7 @@ Status PrepareQkv_MHA_PackedKV(contrib::AttentionParameters& parameters,
|
|||
AttentionData<T>& data,
|
||||
cudaStream_t stream,
|
||||
int max_threads_per_block,
|
||||
T* q, T* k, T* v, AttentionQkvFormat& qkv_format) {
|
||||
T* /*q*/, T* k, T* /*v*/, AttentionQkvFormat& qkv_format) {
|
||||
const int batch_size = parameters.batch_size;
|
||||
const int kv_sequence_length = parameters.kv_sequence_length;
|
||||
const int num_heads = parameters.num_heads;
|
||||
|
|
|
|||
|
|
@ -242,18 +242,18 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) {
|
|||
using AlignedAK = AttentionKernel<T, ArchTag, true, queries_per_block, keys_per_block, single_value_iteration>;
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 6287)
|
||||
#pragma warning(disable : 6287 4189) // kAligned is used via capture so 4189 warning seems incorrect
|
||||
#endif
|
||||
// Run a more efficient kernel with `isAligned=True` when memory is correctly aligned.
|
||||
bool is_aligned = params.qk_head_size % AlignedAK::kAlignmentQ == 0 &&
|
||||
params.qk_head_size % AlignedAK::kAlignmentK == 0 &&
|
||||
params.v_head_size % AlignedAK::kAlignmentV == 0;
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
DISPATCH_BOOL(is_aligned, kIsAligned, ([&]() {
|
||||
LaunchCutlassFmha<T, ArchTag, kIsAligned, queries_per_block, keys_per_block, single_value_iteration>(params);
|
||||
}));
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename ArchTag>
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ Status DecoderQkvToContext(
|
|||
const cudaDeviceProp& device_prop,
|
||||
Stream* ort_stream,
|
||||
cublasHandle_t& cublas,
|
||||
const size_t element_size,
|
||||
const size_t /*element_size*/,
|
||||
const int batch_size,
|
||||
const int sequence_length,
|
||||
const int kv_sequence_length,
|
||||
|
|
|
|||
|
|
@ -451,7 +451,7 @@ __global__ void PastToTotalSeqlen(int32_t* seqlens_k,
|
|||
// Convert Past to Total sequence length tensor
|
||||
Status LaunchGetSeqlenBuff(contrib::GroupQueryAttentionParameters& parameters, int32_t* seqlens_k,
|
||||
int32_t* seqlens_k_buff, bool is_total, cudaStream_t stream,
|
||||
const int threads_per_block) {
|
||||
const int /*threads_per_block*/) {
|
||||
if (parameters.is_prompt) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
@ -655,7 +655,7 @@ Status EfficientAttention(
|
|||
template <typename T>
|
||||
Status QkvToContext(
|
||||
const cudaDeviceProp& device_prop,
|
||||
cublasHandle_t& cublas,
|
||||
cublasHandle_t& /*cublas*/,
|
||||
Stream* ort_stream,
|
||||
contrib::GroupQueryAttentionParameters& parameters,
|
||||
GroupQueryAttentionData<T>& data) {
|
||||
|
|
|
|||
|
|
@ -440,7 +440,7 @@ Status LaunchTransposeRemovePadding(
|
|||
|
||||
template <typename T>
|
||||
Status FusedScaledDotProductAttention(
|
||||
const cudaDeviceProp& device_prop,
|
||||
const cudaDeviceProp& /*device_prop*/,
|
||||
cudaStream_t stream,
|
||||
PackedAttentionParameters& parameters,
|
||||
PackedAttentionData<T>& data) {
|
||||
|
|
|
|||
|
|
@ -381,7 +381,7 @@ void InvokeTranspose(
|
|||
const T* query, const T* key, const T* value, const T* bias, T* output,
|
||||
const int batch_size, const int sequence_length,
|
||||
const int num_heads, const int qk_head_size, const int v_head_size,
|
||||
AttentionQkvFormat source_format, AttentionQkvFormat target_format,
|
||||
[[maybe_unused]] AttentionQkvFormat source_format, AttentionQkvFormat target_format,
|
||||
const int32_t* token_offset, int32_t token_count,
|
||||
cudaStream_t stream) {
|
||||
if (key != nullptr && value != nullptr) {
|
||||
|
|
@ -551,7 +551,7 @@ void LaunchTranspose(
|
|||
|
||||
template <typename T>
|
||||
Status FusedAttentionTrt(
|
||||
const cudaDeviceProp& device_prop,
|
||||
const cudaDeviceProp& /*device_prop*/,
|
||||
cudaStream_t stream,
|
||||
PackedAttentionParameters& parameters,
|
||||
PackedMultiHeadAttentionData<T>& data) {
|
||||
|
|
|
|||
|
|
@ -82,8 +82,6 @@ Status RotaryEmbedding<T>::ComputeInternal(OpKernelContext* context) const {
|
|||
interleaved,
|
||||
device_prop.maxThreadsPerBlock,
|
||||
parameters.transposed);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ Status LaunchRotaryEmbeddingKernel(
|
|||
const int num_heads,
|
||||
const int head_size,
|
||||
const int rotary_embedding_dim,
|
||||
const int max_sequence_length,
|
||||
const int /*max_sequence_length*/,
|
||||
const int position_ids_format,
|
||||
const bool interleaved,
|
||||
const int max_threads_per_block,
|
||||
|
|
|
|||
|
|
@ -53,9 +53,9 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
|
||||
~mhaImpl() {}
|
||||
|
||||
void setup(const int S, const int B) {
|
||||
void setup(const int seq_len, const int B) {
|
||||
// For bert and vit, use flash attention when sequence length is larger than the threshold.
|
||||
use_flash_attention = is_flash_attention(S);
|
||||
use_flash_attention = is_flash_attention(seq_len);
|
||||
|
||||
params.force_unroll = use_flash_attention;
|
||||
|
||||
|
|
@ -68,26 +68,26 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
warps_n = 1;
|
||||
} else {
|
||||
if (sm == 70) {
|
||||
if (S == 64 || S == 96) {
|
||||
if (seq_len == 64 || seq_len == 96) {
|
||||
warps_m = 2;
|
||||
warps_n = 2;
|
||||
} else if (S == 128) {
|
||||
} else if (seq_len == 128) {
|
||||
warps_m = 1;
|
||||
warps_n = 4;
|
||||
} else if (S == 256 || S == 384) {
|
||||
} else if (seq_len == 256 || seq_len == 384) {
|
||||
warps_m = 1;
|
||||
warps_n = 8;
|
||||
} else {
|
||||
ORT_ENFORCE(false, "Unsupported sequence length");
|
||||
}
|
||||
} else {
|
||||
if (S == 32 || S == 64 || S == 96 || S == 128) {
|
||||
if (seq_len == 32 || seq_len == 64 || seq_len == 96 || seq_len == 128) {
|
||||
warps_m = 2;
|
||||
warps_n = 2;
|
||||
} else if (S == 192 || S == 256) {
|
||||
} else if (seq_len == 192 || seq_len == 256) {
|
||||
warps_m = 1;
|
||||
warps_n = 4;
|
||||
} else if (S == 384) {
|
||||
} else if (seq_len == 384) {
|
||||
warps_m = 1;
|
||||
warps_n = 8;
|
||||
} else {
|
||||
|
|
@ -99,7 +99,7 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
// The number of threads per CTA.
|
||||
threads_per_cta = warps_m * warps_n * warps_k * 32;
|
||||
// The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M dimension.
|
||||
xmmas_m = (S + 16 * warps_m - 1) / (16 * warps_m);
|
||||
xmmas_m = (seq_len + 16 * warps_m - 1) / (16 * warps_m);
|
||||
|
||||
const float scale_bmm1 = interface->mScale;
|
||||
const float scale_softmax = 1.f; // Seems to be only required for int8
|
||||
|
|
@ -111,7 +111,7 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
|
||||
params.b = B;
|
||||
params.h = interface->mNumHeads;
|
||||
params.s = S;
|
||||
params.s = seq_len;
|
||||
params.d = interface->mHeadSize;
|
||||
|
||||
params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half);
|
||||
|
|
@ -121,7 +121,7 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
has_causal_mask = false;
|
||||
}
|
||||
|
||||
void setup_causal_masked_fmha(const int S, const int B) {
|
||||
void setup_causal_masked_fmha(const int seq_len, const int B) {
|
||||
const float scale_bmm1 = interface->mScale;
|
||||
const float scale_softmax = 1.f; // Seems to be only required for int8
|
||||
const float scale_bmm2 = 1.f;
|
||||
|
|
@ -132,7 +132,7 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
|
||||
params.b = B;
|
||||
params.h = interface->mNumHeads;
|
||||
params.s = S;
|
||||
params.s = seq_len;
|
||||
params.d = interface->mHeadSize;
|
||||
|
||||
params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half);
|
||||
|
|
@ -182,30 +182,30 @@ class FusedMHARunnerFP16v2::mhaImpl {
|
|||
return max_seq_len;
|
||||
}
|
||||
|
||||
int S = max_seq_len;
|
||||
int seq_len = max_seq_len;
|
||||
if (max_seq_len <= 32) {
|
||||
S = (sm == 70) ? 64 : 32;
|
||||
seq_len = (sm == 70) ? 64 : 32;
|
||||
} else if (max_seq_len <= 64) {
|
||||
S = 64;
|
||||
seq_len = 64;
|
||||
} else if (max_seq_len <= 96) {
|
||||
S = 96;
|
||||
seq_len = 96;
|
||||
} else if (max_seq_len <= 128) {
|
||||
S = 128;
|
||||
seq_len = 128;
|
||||
} else if (max_seq_len <= 192) {
|
||||
S = (sm == 70) ? 256 : 192;
|
||||
seq_len = (sm == 70) ? 256 : 192;
|
||||
} else if (max_seq_len <= 256) {
|
||||
S = 256;
|
||||
seq_len = 256;
|
||||
} else if (max_seq_len <= 384) {
|
||||
S = 384;
|
||||
seq_len = 384;
|
||||
}
|
||||
|
||||
return S;
|
||||
return seq_len;
|
||||
}
|
||||
|
||||
protected:
|
||||
bool is_flash_attention(const int S) const {
|
||||
bool is_flash_attention(const int seq_len) const {
|
||||
ORT_ENFORCE(interface->mHasCausalMask == false);
|
||||
return interface->mEnableFlashAttention && S >= kMinSequenceLengthFlashAttention;
|
||||
return interface->mEnableFlashAttention && seq_len >= kMinSequenceLengthFlashAttention;
|
||||
}
|
||||
|
||||
private:
|
||||
|
|
@ -232,12 +232,12 @@ FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(const int numHeads,
|
|||
pimpl(new mhaImpl(this)) {
|
||||
}
|
||||
|
||||
void FusedMHARunnerFP16v2::setup(const int S, const int B) {
|
||||
MHARunner::setup(S, B);
|
||||
void FusedMHARunnerFP16v2::setup(const int seq_len, const int B) {
|
||||
MHARunner::setup(seq_len, B);
|
||||
if (mHasCausalMask) {
|
||||
pimpl->setup_causal_masked_fmha(S, B);
|
||||
pimpl->setup_causal_masked_fmha(seq_len, B);
|
||||
} else {
|
||||
pimpl->setup(S, B);
|
||||
pimpl->setup(seq_len, B);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -136,10 +136,10 @@ struct GroupNormNHWCParams {
|
|||
bool use_silu,
|
||||
bool broadcast_skip,
|
||||
int channels_per_block) {
|
||||
int32_t channels_per_group = num_channels / num_groups;
|
||||
int32_t channels_per_group_in = num_channels / num_groups;
|
||||
// channels_per_block is computed in PrePack.
|
||||
// If the gamma is not an initializer, channels_per_block might be zero after PrePack. If that happens, compute it here.
|
||||
if (channels_per_block < channels_per_group) {
|
||||
if (channels_per_block < channels_per_group_in) {
|
||||
channels_per_block = GetChannelsPerBlock(num_channels, num_groups);
|
||||
}
|
||||
|
||||
|
|
@ -167,7 +167,7 @@ struct GroupNormNHWCParams {
|
|||
this->hw_per_block = DivUp(this->hw, blocks_per_hw);
|
||||
|
||||
this->channels_per_block = channels_per_block;
|
||||
this->channels_per_group = channels_per_group;
|
||||
this->channels_per_group = channels_per_group_in;
|
||||
this->hwc = this->hw * this->c;
|
||||
this->inv_hw_channels_per_group = 1.F / (float)(this->hw * this->channels_per_group);
|
||||
this->groups_per_block = channels_per_block / this->channels_per_group;
|
||||
|
|
|
|||
|
|
@ -78,9 +78,9 @@ struct Inverse::ComputeImpl {
|
|||
cudaStream_t stream = ort_stream ? static_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
|
||||
|
||||
// Make a copy of the input which will serve as a workspace as well.
|
||||
if (std::is_same<T, float>::value || std::is_same<T, MLFloat16>::value) {
|
||||
if constexpr (std::is_same<T, float>::value || std::is_same<T, MLFloat16>::value) {
|
||||
IAllocatorUniquePtr<float> input_workspace = inst->GetScratchBuffer<float>(input_count, ort_stream);
|
||||
if (std::is_same<T, MLFloat16>::value) {
|
||||
if constexpr (std::is_same<T, MLFloat16>::value) {
|
||||
// Convert from MLFloat16(half) to float
|
||||
Impl_Cast<CudaT, float>(stream, reinterpret_cast<const CudaT*>(input.Data<MLFloat16>()), input_workspace.get(), input_count);
|
||||
} else {
|
||||
|
|
@ -96,7 +96,7 @@ struct Inverse::ComputeImpl {
|
|||
// Need to compute ptrs for output buffers
|
||||
// Output for MLFloat
|
||||
IAllocatorUniquePtr<float*> output_ptrs = inst->GetScratchBuffer<float*>(n_batches, ort_stream);
|
||||
if (std::is_same<T, MLFloat16>::value) {
|
||||
if constexpr (std::is_same<T, MLFloat16>::value) {
|
||||
IAllocatorUniquePtr<float> ml_float_output = inst->GetScratchBuffer<float>(input_count, ort_stream);
|
||||
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, ml_float_output.get(), num_batches, rows, output_ptrs));
|
||||
// Do the inverse
|
||||
|
|
@ -112,7 +112,7 @@ struct Inverse::ComputeImpl {
|
|||
ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
|
||||
// We are done here
|
||||
}
|
||||
} else if (std::is_same<T, double>::value) {
|
||||
} else if constexpr (std::is_same<T, double>::value) {
|
||||
IAllocatorUniquePtr<double> input_workspace = inst->GetScratchBuffer<double>(static_cast<int>(input_count), ort_stream);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data<double>(), sizeof(double) * input_count,
|
||||
cudaMemcpyDeviceToDevice, stream));
|
||||
|
|
|
|||
|
|
@ -97,8 +97,8 @@ void ComplexMul_Impl(
|
|||
const TArray<int64_t>* rhs_padded_strides,
|
||||
const T* rhs_data,
|
||||
const TArray<onnxruntime::cuda::fast_divmod>* fdm_output_strides,
|
||||
const onnxruntime::cuda::fast_divmod& fdm_H,
|
||||
const onnxruntime::cuda::fast_divmod& fdm_C,
|
||||
const onnxruntime::cuda::fast_divmod& /*fdm_H*/,
|
||||
const onnxruntime::cuda::fast_divmod& /*fdm_C*/,
|
||||
T* output_data,
|
||||
int64_t count,
|
||||
int64_t lhs_size,
|
||||
|
|
|
|||
|
|
@ -174,7 +174,7 @@ Status GemmFloat8::ComputeGemm(
|
|||
int32_t dtype_A, int32_t dtype_B,
|
||||
int32_t dtype_C, int32_t dtype_Y,
|
||||
const TensorShape& shape_A, const TensorShape& shape_B,
|
||||
const TensorShape& shape_C, const TensorShape& shape_Y,
|
||||
const TensorShape& shape_C, const TensorShape& /*shape_Y*/,
|
||||
bool trans_A, bool trans_B, const void* p_input_a, const void* p_input_b,
|
||||
const void* p_input_c, const void* p_scale_a, const void* p_scale_b,
|
||||
const void* p_scale_y, void* p_output_y, int M, int N, int K, int lda,
|
||||
|
|
|
|||
|
|
@ -202,7 +202,7 @@ struct MoeFCGemm {
|
|||
total_rows_before_expert(total_rows_before_expert),
|
||||
gemm_n(gemm_n),
|
||||
gemm_k(gemm_k),
|
||||
host_problem_sizes(nullptr) {
|
||||
host_problem_sizes(host_problem_sizes) {
|
||||
if (platform::is_same<uint8_t, ElementB>::value || platform::is_same<uint4b_t, ElementB>::value) {
|
||||
assert(weight_scales);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,12 @@
|
|||
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
|
||||
#endif
|
||||
|
||||
// Ignore CUTLASS warning C4100: unreferenced formal parameter
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4100)
|
||||
#endif
|
||||
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/layout/matrix.h"
|
||||
|
|
@ -36,6 +42,10 @@
|
|||
#include "layout_traits_helper.h"
|
||||
#include "moe_cutlass_kernel.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
|
@ -149,10 +159,10 @@ void generic_moe_gemm_kernelLauncher(const T* A, const WeightType* B, const T* w
|
|||
template <typename T, typename WeightType, typename arch, typename EpilogueTag, typename ThreadblockShape,
|
||||
typename WarpShape, int Stages, typename Enable = void>
|
||||
struct dispatch_stages {
|
||||
static void dispatch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C,
|
||||
int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, int num_experts,
|
||||
CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream,
|
||||
int* occupancy = nullptr) {
|
||||
static void dispatch(const T* /*A*/, const WeightType* /*B*/, const T* /*weight_scales*/, const T* /*biases*/,
|
||||
T* /*C*/, int64_t* /*total_rows_before_expert*/, int64_t /*gemm_n*/, int64_t /*gemm_k*/,
|
||||
int /*num_experts*/, CutlassGemmConfig /*gemm_config*/, int /*multi_processor_count*/,
|
||||
cudaStream_t /*stream*/, [[maybe_unused]] int* occupancy = nullptr) {
|
||||
std::string err_msg = "Cutlass fpA_intB gemm. Not instantiates for arch " +
|
||||
std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
|
||||
ORT_THROW("[FT Error][dispatch_stages::dispatch] " + err_msg);
|
||||
|
|
@ -221,9 +231,10 @@ template <
|
|||
typename T, typename WeightType, typename arch, typename EpilogueTag,
|
||||
typename std::enable_if<!std::is_same<T, float>::value && std::is_same<T, WeightType>::value>::type* = nullptr>
|
||||
void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C,
|
||||
int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k,
|
||||
int num_experts, CutlassGemmConfig gemm_config, int sm_version,
|
||||
int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) {
|
||||
int64_t* total_rows_before_expert, int64_t /*total_rows*/,
|
||||
int64_t gemm_n, int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config,
|
||||
int /*sm_version*/, int multi_processor_count, cudaStream_t stream,
|
||||
int* occupancy = nullptr) {
|
||||
switch (gemm_config.tile_config) {
|
||||
case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64:
|
||||
dispatch_gemm_config<T, WeightType, arch, EpilogueTag, cutlass::gemm::GemmShape<32, 128, 64>,
|
||||
|
|
@ -300,8 +311,8 @@ void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weig
|
|||
template <typename T, typename WeightType, typename arch, typename EpilogueTag,
|
||||
typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
|
||||
void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C,
|
||||
int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k,
|
||||
int num_experts, CutlassGemmConfig gemm_config, int sm_version,
|
||||
int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n, int64_t gemm_k,
|
||||
int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/,
|
||||
int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) {
|
||||
switch (gemm_config.tile_config) {
|
||||
case CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8:
|
||||
|
|
|
|||
|
|
@ -370,7 +370,7 @@ struct TopkConstants {
|
|||
|
||||
template <typename T, int EXPERTS, int WARPS_PER_TB>
|
||||
void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T* output, int* indices, int* source_row,
|
||||
int num_rows, int num_experts, int k, cudaStream_t stream) {
|
||||
int num_rows, int /*num_experts*/, int k, cudaStream_t stream) {
|
||||
static constexpr unsigned long MAX_BYTES_PER_LDG = 16;
|
||||
|
||||
static constexpr int BYTES_PER_LDG = std::min((int)MAX_BYTES_PER_LDG, (int)sizeof(T) * EXPERTS);
|
||||
|
|
@ -599,7 +599,7 @@ void CutlassMoeFCRunner<T, WeightType, Enable>::run_moe_fc(
|
|||
static constexpr bool scales_required =
|
||||
std::is_same<WeightType, uint8_t>::value || std::is_same<WeightType, cutlass::uint4b_t>::value;
|
||||
|
||||
if (scales_required) {
|
||||
if constexpr (scales_required) {
|
||||
if (fc1_scales == nullptr) {
|
||||
ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for first matmul is a null pointer");
|
||||
} else if (fc2_scales == nullptr) {
|
||||
|
|
|
|||
|
|
@ -276,13 +276,13 @@ struct MoeProblemVisitor<ProblemSizeHelper, ThreadblockShape, GroupScheduleMode:
|
|||
return true;
|
||||
}
|
||||
|
||||
static size_t get_workspace_size(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count,
|
||||
int32_t block_count) {
|
||||
static size_t get_workspace_size(const cutlass::gemm::GemmCoord* /*host_problem_sizes_ptr*/,
|
||||
int32_t /*problem_count*/, int32_t /*block_count*/) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void host_precompute(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count,
|
||||
int32_t block_count, void* host_workspace_ptr) {}
|
||||
static void host_precompute(const cutlass::gemm::GemmCoord* /*host_problem_sizes_ptr*/, int32_t /*problem_count*/,
|
||||
int32_t /*block_count*/, void* /*host_workspace_ptr*/) {}
|
||||
};
|
||||
|
||||
} // namespace kernel
|
||||
|
|
|
|||
|
|
@ -154,7 +154,7 @@ Status QAttention<T, int8_t>::ComputeInternal(OpKernelContext* context) const {
|
|||
CudaT dequant_scale;
|
||||
CudaT input_scale = *(reinterpret_cast<const CudaT*>(input_scale_tensor->Data<T>()));
|
||||
CudaT weight_scale = *(reinterpret_cast<const CudaT*>(weight_scale_tensor->Data<T>()));
|
||||
if (sizeof(T) == 2) {
|
||||
if constexpr (sizeof(T) == 2) {
|
||||
dequant_scale = __float2half(__half2float(input_scale) * __half2float(weight_scale));
|
||||
} else {
|
||||
dequant_scale = input_scale * weight_scale;
|
||||
|
|
|
|||
|
|
@ -142,7 +142,7 @@ inline void debug_print([[maybe_unused]] const T* arr,
|
|||
std::cout << "========" << name << std::endl;
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
if (i % w == 0) std::cout << std::endl;
|
||||
if (std::is_same<T, int8_t>().value) {
|
||||
if constexpr (std::is_same<T, int8_t>::value) {
|
||||
std::cout << (int)buf[i] << ", ";
|
||||
} else {
|
||||
std::cout << buf[i] << ", ";
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ QOrderBatchInt8MatrixTransposeKernel(const int8_t* src, const int8_t* dst, const
|
|||
}
|
||||
}
|
||||
|
||||
Status QOrderBatchTransposeInt8Matrix(cudaStream_t stream, const cudaDeviceProp& device_prop,
|
||||
Status QOrderBatchTransposeInt8Matrix(cudaStream_t stream, const cudaDeviceProp& /*device_prop*/,
|
||||
const int batch_size, const int rows, const int cols,
|
||||
const int8_t* input, int8_t* output) {
|
||||
ORT_ENFORCE(rows % 4 == 0 && cols % 4 == 0, "Matrix rows and cols must be divisible by 4!");
|
||||
|
|
|
|||
|
|
@ -389,7 +389,7 @@ QOrderDequantizeKernel_Strict(const int8_t* __restrict__ src, const __half* __re
|
|||
}
|
||||
}
|
||||
|
||||
Status QOrderDequantize_Strict(cudaStream_t stream, const cudaDeviceProp& device_prop,
|
||||
Status QOrderDequantize_Strict(cudaStream_t stream, const cudaDeviceProp& /*device_prop*/,
|
||||
const int8_t* src, __half* dst, float scale, size_t N) {
|
||||
ORT_RETURN_IF(N & 0x3LL, "N can not divide by 4!");
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,22 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
|
||||
// cub.cuh includes device/dispatch_radix_sort.cuh which has assignment in conditional expressions
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4706)
|
||||
#endif
|
||||
#include <cub/cub.cuh>
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
#include <cub/util_type.cuh>
|
||||
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "cub/util_type.cuh"
|
||||
#include <cub/cub.cuh>
|
||||
#include <cub/device/device_segmented_radix_sort.cuh>
|
||||
|
||||
#include "contrib_ops/cuda/bert/utils.cuh"
|
||||
#include "contrib_ops/cuda/transformers/generation_cuda_impl.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -131,41 +131,33 @@ class CUDAExecutionProvider : public IExecutionProvider {
|
|||
|
||||
template <typename T>
|
||||
const T* GetConstOnes(size_t count, cudaStream_t stream) {
|
||||
constexpr bool is_float = std::is_same<T, float>::value;
|
||||
constexpr bool is_double = std::is_same<T, double>::value;
|
||||
constexpr bool is_half = std::is_same<T, half>::value;
|
||||
constexpr bool is_BFloat16 = std::is_same<T, BFloat16>::value;
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
constexpr bool is_Float8E4M3FN = std::is_same<T, Float8E4M3FN>::value;
|
||||
constexpr bool is_Float8E5M2 = std::is_same<T, Float8E5M2>::value;
|
||||
#endif
|
||||
if (is_float) {
|
||||
if constexpr (std::is_same<T, float>::value) {
|
||||
if (!constant_ones_float_) {
|
||||
constant_ones_float_ = cuda::CreateConstantOnes<float>();
|
||||
}
|
||||
return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(stream, count));
|
||||
} else if (is_double) {
|
||||
} else if constexpr (std::is_same<T, double>::value) {
|
||||
if (!constant_ones_double_) {
|
||||
constant_ones_double_ = cuda::CreateConstantOnes<double>();
|
||||
}
|
||||
return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(stream, count));
|
||||
} else if (is_half) {
|
||||
} else if constexpr (std::is_same<T, half>::value) {
|
||||
if (!constant_ones_half_) {
|
||||
constant_ones_half_ = cuda::CreateConstantOnes<half>();
|
||||
}
|
||||
return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(stream, count));
|
||||
} else if (is_BFloat16) {
|
||||
} else if constexpr (std::is_same<T, BFloat16>::value) {
|
||||
if (!constant_ones_bfloat16_) {
|
||||
constant_ones_bfloat16_ = cuda::CreateConstantOnes<BFloat16>();
|
||||
}
|
||||
return reinterpret_cast<const T*>(constant_ones_bfloat16_->GetBuffer(stream, count));
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
} else if (is_Float8E4M3FN) {
|
||||
} else if constexpr (std::is_same<T, Float8E4M3FN>::value) {
|
||||
if (!constant_ones_float8e4m3fn_) {
|
||||
constant_ones_float8e4m3fn_ = cuda::CreateConstantOnes<Float8E4M3FN>();
|
||||
}
|
||||
return reinterpret_cast<const T*>(constant_ones_float8e4m3fn_->GetBuffer(stream, count));
|
||||
} else if (is_Float8E5M2) {
|
||||
} else if constexpr (std::is_same<T, Float8E5M2>::value) {
|
||||
if (!constant_ones_float8e5m2_) {
|
||||
constant_ones_float8e5m2_ = cuda::CreateConstantOnes<Float8E5M2>();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -160,7 +160,6 @@ cudnnDataType_t CudnnTensor::GetDataType<half>() {
|
|||
template <>
|
||||
cudnnDataType_t CudnnTensor::GetDataType<BFloat16>() {
|
||||
ORT_THROW("cuDNN doesn't support BFloat16.");
|
||||
return CUDNN_DATA_FLOAT;
|
||||
}
|
||||
|
||||
template <>
|
||||
|
|
|
|||
|
|
@ -127,9 +127,10 @@ struct OP_Cast {
|
|||
UnaryElementWiseImpl(stream, input_data, output_data, OP_Cast<InT, OutT>(), count); \
|
||||
}
|
||||
|
||||
#define IMPL_CAST_IMPL_THROW(InT, OutT) \
|
||||
void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { \
|
||||
ORT_THROW("Cast from " #InT " to " #OutT " must define saturate."); \
|
||||
#define IMPL_CAST_IMPL_THROW(InT, OutT) \
|
||||
void Explicit_Impl_Cast(cudaStream_t /*stream*/, const InT* /*input_data*/, OutT* /*output_data*/, \
|
||||
size_t /*count*/) { \
|
||||
ORT_THROW("Cast from " #InT " to " #OutT " must define saturate."); \
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
|
|
|||
|
|
@ -97,11 +97,11 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream,
|
|||
|
||||
template <typename T, bool NHWC>
|
||||
Status Conv<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
|
||||
bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) {
|
||||
bool& is_packed, PrePackedWeights* /*prepacked_weights*/) {
|
||||
is_packed = false;
|
||||
// only layout of weight input is adjusted via PrePack
|
||||
if (NHWC && is_nhwc_domain_) { // InputTensors::IN_W
|
||||
if (input_idx == 1) {
|
||||
if constexpr (NHWC) {
|
||||
if (is_nhwc_domain_ && input_idx == 1) { // InputTensors::IN_W
|
||||
// Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group}
|
||||
auto orig_shape = tensor.Shape();
|
||||
|
||||
|
|
@ -123,6 +123,10 @@ Status Conv<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr
|
|||
CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream()));
|
||||
is_packed = true;
|
||||
}
|
||||
} else {
|
||||
ORT_UNUSED_PARAMETER(tensor);
|
||||
ORT_UNUSED_PARAMETER(input_idx);
|
||||
ORT_UNUSED_PARAMETER(alloc);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
|
@ -149,8 +153,11 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
|||
|
||||
// Make sure input and weight are 4D for NHWC since we set 4D descriptor for NHWC.
|
||||
constexpr bool channels_last = NHWC;
|
||||
if (channels_last && (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4)) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Number of dimensions of X and W should be 4 for channels_last format (NHWC)");
|
||||
if constexpr (channels_last) {
|
||||
if (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
|
||||
"Number of dimensions of X and W should be 4 for channels_last format (NHWC)");
|
||||
}
|
||||
}
|
||||
|
||||
// set B
|
||||
|
|
@ -403,7 +410,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
|||
default:
|
||||
perf.algo = kDefaultConvAlgo;
|
||||
CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory));
|
||||
if (std::is_same<T, MLFloat16>::value) {
|
||||
|
||||
if constexpr (std::is_same<T, MLFloat16>::value) {
|
||||
perf.mathType = CUDNN_TENSOR_OP_MATH;
|
||||
} else if (std::is_same<T, float>::value && !UseTF32()) {
|
||||
perf.mathType = CUDNN_FMA_MATH;
|
||||
|
|
|
|||
|
|
@ -195,7 +195,7 @@ class Conv : public CudaKernel {
|
|||
}
|
||||
|
||||
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
|
||||
bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override;
|
||||
bool& is_packed, PrePackedWeights* prepacked_weights) override;
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
using namespace onnxruntime::cuda;
|
||||
|
||||
// NOTE: This was originally a contrib op with 3 type constraints. The ONNX spec merges 'T' and 'V'.
|
||||
// the kernel is templatized on all three for backwards compatibility, but in ONNX usage T == V.
|
||||
template <typename T, typename U, typename V, bool simplified>
|
||||
|
|
|
|||
|
|
@ -29,8 +29,6 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
using namespace onnxruntime::cuda;
|
||||
|
||||
template <typename U, bool simplified>
|
||||
__device__ void cuWelfordOnlineSum(
|
||||
const U curr,
|
||||
|
|
|
|||
|
|
@ -305,7 +305,6 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
if (!weight_cached_) {
|
||||
const Tensor& W = *ctx->Input<Tensor>(RNN_Input_Index::W);
|
||||
const Tensor& R = *ctx->Input<Tensor>(RNN_Input_Index::R);
|
||||
const Tensor* B = ctx->Input<Tensor>(RNN_Input_Index::B);
|
||||
ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data_size_in_bytes, w_data, w_desc,
|
||||
rnn_desc, ctx->GetComputeStream()));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ __global__ void FastGeluKernel2(const half2 a, const half2 b, const half2 c, int
|
|||
}
|
||||
|
||||
template <>
|
||||
Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length,
|
||||
Status LaunchFastGeluKernel(const cudaDeviceProp& /*prop*/, cudaStream_t stream, int input_length, int bias_length,
|
||||
const float* input, const float* bias, float* output, bool /*use_half2*/) {
|
||||
constexpr int blockSize = 256;
|
||||
const int gridSize = (input_length + blockSize - 1) / blockSize;
|
||||
|
|
@ -73,7 +73,7 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int
|
|||
}
|
||||
|
||||
template <>
|
||||
Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length,
|
||||
Status LaunchFastGeluKernel(const cudaDeviceProp& /*prop*/, cudaStream_t stream, int input_length, int bias_length,
|
||||
const double* input, const double* bias, double* output, bool /*use_half2*/) {
|
||||
constexpr int blockSize = 256;
|
||||
const int gridSize = (input_length + blockSize - 1) / blockSize;
|
||||
|
|
@ -108,7 +108,7 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int
|
|||
}
|
||||
|
||||
template <>
|
||||
Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length,
|
||||
Status LaunchFastGeluKernel(const cudaDeviceProp& /*prop*/, cudaStream_t stream, int input_length, int bias_length,
|
||||
const BFloat16* input, const BFloat16* bias, BFloat16* output, bool /*use_half2*/) {
|
||||
constexpr int blockSize = 256;
|
||||
|
||||
|
|
|
|||
|
|
@ -680,10 +680,10 @@ template <typename T>
|
|||
void ResizeTrilinearUpsample(
|
||||
cudaStream_t stream,
|
||||
int rank,
|
||||
const UpsampleMode upsample_mode,
|
||||
const UpsampleMode /*upsample_mode*/,
|
||||
ResizeCoordinateTransformationMode coordinate_transform_mode,
|
||||
gsl::span<const int64_t> input_shape,
|
||||
gsl::span<const int64_t> output_shape,
|
||||
gsl::span<const int64_t> /*input_shape*/,
|
||||
gsl::span<const int64_t> /*output_shape*/,
|
||||
int64_t batch_size, int64_t num_channels,
|
||||
std::tuple<int64_t, int64_t, int64_t> inferred_input_dims,
|
||||
std::tuple<int64_t, int64_t, int64_t> inferred_output_dims,
|
||||
|
|
@ -832,11 +832,11 @@ void ResizeTrilinearUpsample(
|
|||
template <class T>
|
||||
void ResizeBiLinearUpsample(cudaStream_t stream,
|
||||
int rank,
|
||||
const UpsampleMode upsample_mode,
|
||||
const UpsampleMode /*upsample_mode*/,
|
||||
ResizeCoordinateTransformationMode coordinate_transform_mode,
|
||||
gsl::span<const int64_t> input_shape,
|
||||
gsl::span<const int64_t> output_shape,
|
||||
int64_t batch_size, int64_t num_channels,
|
||||
gsl::span<const int64_t> /*input_shape*/,
|
||||
gsl::span<const int64_t> /*output_shape*/,
|
||||
int64_t /*batch_size*/, int64_t num_channels,
|
||||
std::tuple<int64_t, int64_t, int64_t> inferred_input_dims,
|
||||
std::tuple<int64_t, int64_t, int64_t> inferred_output_dims,
|
||||
std::tuple<float, float, float> inferred_dim_rscales,
|
||||
|
|
@ -959,10 +959,10 @@ void ResizeBiLinearUpsample(cudaStream_t stream,
|
|||
template <typename T>
|
||||
void ResizeBicubicUpsample(cudaStream_t stream,
|
||||
int rank,
|
||||
const UpsampleMode upsample_mode,
|
||||
const UpsampleMode /*upsample_mode*/,
|
||||
ResizeCoordinateTransformationMode coordinate_transform_mode,
|
||||
gsl::span<const int64_t> input_shape,
|
||||
gsl::span<const int64_t> output_shape,
|
||||
gsl::span<const int64_t> /*input_shape*/,
|
||||
gsl::span<const int64_t> /*output_shape*/,
|
||||
int64_t batch_size, int64_t num_channels,
|
||||
std::tuple<int64_t, int64_t, int64_t> inferred_input_dims,
|
||||
std::tuple<int64_t, int64_t, int64_t> inferred_output_dims,
|
||||
|
|
|
|||
|
|
@ -609,7 +609,7 @@ void ResizeNearestImpl(
|
|||
const size_t N,
|
||||
bool extrapolation_enabled,
|
||||
const T extrapolation_value,
|
||||
float cubic_coeff_a,
|
||||
float /*cubic_coeff_a*/,
|
||||
ResizeCoordinateTransformationMode transform_coordinate,
|
||||
ResizeNearestMode calc_nearest_pixel,
|
||||
int64_t* /* prefix_dim_sum */,
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ bool CanDoTranspose3D(const cudaDeviceProp& prop, size_t rank, const gsl::span<c
|
|||
} break
|
||||
|
||||
Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray<int64_t>& input_shape,
|
||||
const TArray<int64_t>& input_strides, const void* input_data, void* output_data, int64_t N,
|
||||
const TArray<int64_t>& input_strides, const void* input_data, void* output_data, int64_t /*N*/,
|
||||
const dim3& grid_size, const dim3& block_size) {
|
||||
switch (element_size) {
|
||||
HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t);
|
||||
|
|
@ -248,10 +248,10 @@ __global__ void Transpose4DKernelParallelizeOneElementPerThread(
|
|||
}
|
||||
|
||||
bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop,
|
||||
size_t element_size,
|
||||
size_t /*element_size*/,
|
||||
int32_t rank,
|
||||
const gsl::span<const int64_t>& input_dims,
|
||||
const gsl::span<const size_t>& permutations,
|
||||
const gsl::span<const size_t>& /*permutations*/,
|
||||
dim3& grid_size, dim3& block_size) {
|
||||
if (rank == 4) {
|
||||
// dims[3]: block.x
|
||||
|
|
|
|||
|
|
@ -130,27 +130,11 @@ void LoadOrtTritonKernel() {
|
|||
std::call_once(load_ort_triton_kernel_flag, TryToLoadKernel);
|
||||
}
|
||||
|
||||
Status LaunchTritonKernel(cudaStream_t stream, std::string fname,
|
||||
int grid0, int grid1, int grid2, void* args, size_t args_size) {
|
||||
#ifdef USE_TRITON_KERNEL
|
||||
if (ort_triton_kernel_map.count(fname) == 0) {
|
||||
// Return unsupported status if function name not found in registry.
|
||||
// This error status will be used by TunableOp
|
||||
std::ostringstream message_stream;
|
||||
message_stream << "Can't find ort triton kernel name: " << fname;
|
||||
std::string message = message_stream.str();
|
||||
TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(true, message);
|
||||
}
|
||||
auto idx = ort_triton_kernel_map[fname];
|
||||
return LaunchTritonKernel(stream, idx, grid0, grid1, grid2, args, args_size);
|
||||
#else
|
||||
return Status::OK();
|
||||
#endif
|
||||
}
|
||||
|
||||
Status LaunchTritonKernel(cudaStream_t stream, size_t idx,
|
||||
int grid0, int grid1, int grid2, void* args, size_t args_size) {
|
||||
|
||||
#ifdef USE_TRITON_KERNEL
|
||||
Status LaunchTritonKernel(cudaStream_t stream, size_t idx, int grid0, int grid1, int grid2,
|
||||
void* args, size_t args_size) {
|
||||
if (idx >= ort_triton_kernel_metadata.size()) {
|
||||
// Return unsupported status when idx exceeds the size of ort_triton_kernel_metadata.
|
||||
// This error status will be used by TunableOp
|
||||
|
|
@ -181,11 +165,37 @@ Status LaunchTritonKernel(cudaStream_t stream, size_t idx,
|
|||
nullptr,
|
||||
(void**)&config),
|
||||
"Launching kernel failed.");
|
||||
#endif
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status LaunchTritonKernel(cudaStream_t stream, std::string fname, int grid0, int grid1, int grid2,
|
||||
void* args, size_t args_size) {
|
||||
if (ort_triton_kernel_map.count(fname) == 0) {
|
||||
// Return unsupported status if function name not found in registry.
|
||||
// This error status will be used by TunableOp
|
||||
std::ostringstream message_stream;
|
||||
message_stream << "Can't find ort triton kernel name: " << fname;
|
||||
std::string message = message_stream.str();
|
||||
TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(true, message);
|
||||
}
|
||||
auto idx = ort_triton_kernel_map[fname];
|
||||
return LaunchTritonKernel(stream, idx, grid0, grid1, grid2, args, args_size);
|
||||
}
|
||||
|
||||
#else
|
||||
Status LaunchTritonKernel(cudaStream_t /*stream*/, std::string /*fname*/, int /*grid0*/, int /*grid1*/, int /*grid2*/,
|
||||
void* /*args*/, size_t /*args_size*/) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status LaunchTritonKernel(cudaStream_t /*stream*/, size_t /*idx*/, int /*grid0*/, int /*grid1*/, int /*grid2*/,
|
||||
void* /*args*/, size_t /*args_size*/) {
|
||||
return Status::OK();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
const TritonKernelMetaData* GetOrtTritonKernelMetadata(size_t idx) {
|
||||
if (idx >= ort_triton_kernel_metadata.size()) {
|
||||
return nullptr;
|
||||
|
|
|
|||
20
onnxruntime/core/providers/tensorrt/nv_includes.h
Normal file
20
onnxruntime/core/providers/tensorrt/nv_includes.h
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
#pragma once
|
||||
|
||||
// File to include the required TRT headers with workarounds for warnings we can't fix.
|
||||
|
||||
// Ignore warning C4100: unreferenced formal parameter
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4100)
|
||||
#endif
|
||||
|
||||
#include <NvInfer.h>
|
||||
#include <NvInferPlugin.h>
|
||||
#include <NvInferRuntime.h>
|
||||
#include <NvOnnxParser.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
|
@ -6,7 +6,7 @@
|
|||
#include <string>
|
||||
#include <filesystem>
|
||||
|
||||
#include "NvInfer.h"
|
||||
#include "core/providers/tensorrt/nv_includes.h"
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#define ORT_API_MANUAL_INIT
|
||||
#include "core/session/onnxruntime_cxx_api.h"
|
||||
#include "core/common/common.h"
|
||||
#include "core/common/narrow.h"
|
||||
#include "core/common/safeint.h"
|
||||
#include "tensorrt_execution_provider.h"
|
||||
#include "tensorrt_execution_provider_utils.h"
|
||||
|
|
@ -137,10 +138,10 @@ std::vector<std::string> SplitToStringVec(std::string const& s, char separator)
|
|||
return splitted;
|
||||
}
|
||||
|
||||
nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_sting) {
|
||||
nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_string) {
|
||||
nvinfer1::TacticSources disabledTactics = 0;
|
||||
nvinfer1::TacticSources enabledTactics = 0;
|
||||
std::vector<std::string> tacticList = SplitToStringVec(tactic_sting, ',');
|
||||
std::vector<std::string> tacticList = SplitToStringVec(tactic_string, ',');
|
||||
for (auto& t : tacticList) {
|
||||
bool enable{false};
|
||||
if (t.front() == '+') {
|
||||
|
|
@ -151,8 +152,8 @@ nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_sting) {
|
|||
t.erase(0, 1);
|
||||
|
||||
const auto toUpper = [](std::string& sourceName) {
|
||||
std::transform(
|
||||
sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); });
|
||||
std::transform(sourceName.begin(), sourceName.end(), sourceName.begin(),
|
||||
[](char c) { return onnxruntime::narrow<char>(std::toupper(c)); });
|
||||
return sourceName;
|
||||
};
|
||||
|
||||
|
|
@ -288,7 +289,8 @@ void CudaCall<cudnnStatus_t, true>(cudnnStatus_t retCode, const char* exprString
|
|||
return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg, file, line);
|
||||
}
|
||||
|
||||
void* OutputAllocator::reallocateOutput(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept {
|
||||
void* OutputAllocator::reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size,
|
||||
uint64_t /*alignment*/) noexcept {
|
||||
// Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
|
||||
// even for empty tensors, so allocate a dummy byte.
|
||||
size = std::max(size, static_cast<uint64_t>(1));
|
||||
|
|
@ -304,7 +306,7 @@ void* OutputAllocator::reallocateOutput(char const* tensorName, void* currentMem
|
|||
return outputPtr;
|
||||
}
|
||||
|
||||
void OutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept {
|
||||
void OutputAllocator::notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept {
|
||||
output_shapes.clear();
|
||||
output_shapes.reserve(dims.nbDims);
|
||||
for (int i = 0; i < dims.nbDims; i++) {
|
||||
|
|
@ -613,20 +615,22 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector<nvinfer1::IOptimizatio
|
|||
tensor_shape_values[input_name].resize(shape_size);
|
||||
switch (tensor_type) {
|
||||
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
|
||||
auto input = std::make_unique<int32_t[]>(shape_size);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData<int32_t>(), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream));
|
||||
auto input_shape = std::make_unique<int32_t[]>(shape_size);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_shape.get(), input_tensor.GetTensorData<int32_t>(),
|
||||
shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream));
|
||||
for (int j = 0; j < shape_size; ++j) {
|
||||
tensor_shape_values[input_name][j] = input[j];
|
||||
tensor_shape_values[input_name][j] = input_shape[j];
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
|
||||
auto input = std::make_unique<int64_t[]>(shape_size);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData<int64_t>(), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream));
|
||||
auto input_shape = std::make_unique<int64_t[]>(shape_size);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_shape.get(), input_tensor.GetTensorData<int64_t>(),
|
||||
shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream));
|
||||
for (int j = 0; j < shape_size; ++j) {
|
||||
tensor_shape_values[input_name][j] = static_cast<int32_t>(input[j]);
|
||||
tensor_shape_values[input_name][j] = static_cast<int32_t>(input_shape[j]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -974,7 +978,7 @@ Status BindContextOutput(Ort::KernelContext& ctx,
|
|||
* we are waiting for ORT core to support "assign" memory address to ORT context output. Some work needs to be done in the ORT memory planner to make it aware of this memory support.
|
||||
*/
|
||||
Status BindKernelOutput(Ort::KernelContext& ctx,
|
||||
OrtMemoryInfo* mem_info,
|
||||
OrtMemoryInfo* /*mem_info*/,
|
||||
DDSOutputAllocatorMap& allocator_map,
|
||||
char const* output_name,
|
||||
size_t output_index,
|
||||
|
|
@ -1143,7 +1147,8 @@ TensorrtExecutionProvider::PerThreadContext& TensorrtExecutionProvider::GetPerTh
|
|||
|
||||
// get or create a context
|
||||
if (context_state_.retired_context_pool.empty()) {
|
||||
context = std::make_shared<PerThreadContext>(info_.device_id, info_.has_user_compute_stream, stream_);
|
||||
context = std::make_shared<PerThreadContext>(narrow<OrtDevice::DeviceId>(info_.device_id),
|
||||
info_.has_user_compute_stream, stream_);
|
||||
} else {
|
||||
context = context_state_.retired_context_pool.back();
|
||||
context_state_.retired_context_pool.pop_back();
|
||||
|
|
@ -1163,7 +1168,11 @@ TensorrtExecutionProvider::PerThreadContext& TensorrtExecutionProvider::GetPerTh
|
|||
}
|
||||
|
||||
TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info)
|
||||
: IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, info_(info), device_id_(info.device_id) {
|
||||
: IExecutionProvider{onnxruntime::kTensorrtExecutionProvider,
|
||||
OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT,
|
||||
narrow<OrtDevice::DeviceId>(info.device_id))},
|
||||
info_(info),
|
||||
device_id_(info.device_id) {
|
||||
InitProviderOrtApi();
|
||||
|
||||
CUDA_CALL_THROW(cudaSetDevice(device_id_));
|
||||
|
|
@ -1655,7 +1664,8 @@ void TensorrtExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() {
|
|||
|
||||
std::vector<AllocatorPtr> TensorrtExecutionProvider::CreatePreferredAllocators() {
|
||||
AllocatorCreationInfo default_memory_info(
|
||||
[](OrtDevice::DeviceId device_id) { return CreateCUDAAllocator(device_id, onnxruntime::CUDA); }, device_id_);
|
||||
[](OrtDevice::DeviceId device_id) { return CreateCUDAAllocator(device_id, onnxruntime::CUDA); },
|
||||
narrow<OrtDevice::DeviceId>(device_id_));
|
||||
|
||||
AllocatorCreationInfo pinned_allocator_info(
|
||||
[](OrtDevice::DeviceId device_id) {
|
||||
|
|
@ -3036,7 +3046,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
|
|||
std::unordered_set<std::string> input_names;
|
||||
std::unordered_map<std::string, std::vector<int32_t>> tensor_shape_values;
|
||||
|
||||
OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id_), device_id_);
|
||||
OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow<OrtDevice::DeviceId>(device_id_));
|
||||
OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_);
|
||||
if (alloc_ == nullptr) {
|
||||
Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_));
|
||||
}
|
||||
|
|
@ -3603,7 +3614,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
|
|||
// int num_inputs = static_cast<int>(input_indexes.size());
|
||||
int num_outputs = static_cast<int>(output_indexes.size());
|
||||
|
||||
OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id_), device_id_);
|
||||
OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow<OrtDevice::DeviceId>(device_id_));
|
||||
OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_);
|
||||
if (alloc_ == nullptr) {
|
||||
Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,8 +5,9 @@
|
|||
#include <ctime>
|
||||
#include <cudnn.h>
|
||||
#include <cublas_v2.h>
|
||||
#include "NvInfer.h"
|
||||
#include "NvOnnxParser.h"
|
||||
|
||||
#include "core/providers/tensorrt/nv_includes.h"
|
||||
|
||||
#include "core/platform/ort_mutex.h"
|
||||
#include "core/providers/cuda/cuda_graph.h"
|
||||
#include "tensorrt_execution_provider_info.h"
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <unordered_set>
|
||||
|
||||
#include "core/framework/provider_options.h"
|
||||
#include "tensorrt_execution_provider_custom_ops.h"
|
||||
#include "tensorrt_execution_provider.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <NvInferPlugin.h>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace onnxruntime {
|
||||
extern TensorrtLogger& GetTensorrtLogger();
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ using namespace onnxruntime;
|
|||
namespace onnxruntime {
|
||||
|
||||
common::Status LoadDynamicLibrary(onnxruntime::PathString library_name);
|
||||
common::Status CreateTensorRTCustomOpDomainList(std::vector<OrtCustomOpDomain*>& domain_list, const std::string extra_plugin_lib_paths);
|
||||
common::Status CreateTensorRTCustomOpDomainList(std::vector<OrtCustomOpDomain*>& domain_list,
|
||||
const std::string extra_plugin_lib_paths);
|
||||
common::Status CreateTensorRTCustomOpDomainList(TensorrtExecutionProviderInfo& info);
|
||||
void ReleaseTensorRTCustomOpDomain(OrtCustomOpDomain* domain);
|
||||
void ReleaseTensorRTCustomOpDomainList(std::vector<OrtCustomOpDomain*>& custom_op_domain_list);
|
||||
|
|
@ -23,16 +24,22 @@ struct TensorRTCustomKernel {
|
|||
: compute_stream_(compute_stream) {
|
||||
}
|
||||
|
||||
void Compute(OrtKernelContext* context){}; // The implementation is in TensorRT plugin. No need to implement it here.
|
||||
void Compute(OrtKernelContext* /*context*/){
|
||||
// The implementation is in TensorRT plugin. No need to implement it here.
|
||||
};
|
||||
|
||||
private:
|
||||
void* compute_stream_;
|
||||
};
|
||||
|
||||
struct TensorRTCustomOp : Ort::CustomOpBase<TensorRTCustomOp, TensorRTCustomKernel> {
|
||||
explicit TensorRTCustomOp(const char* provider, void* compute_stream) : provider_(provider), compute_stream_(compute_stream) {}
|
||||
explicit TensorRTCustomOp(const char* provider, void* compute_stream) : provider_(provider),
|
||||
compute_stream_(compute_stream) {
|
||||
}
|
||||
|
||||
void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const { return new TensorRTCustomKernel(info, compute_stream_); };
|
||||
void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const {
|
||||
return new TensorRTCustomKernel(info, compute_stream_);
|
||||
};
|
||||
|
||||
const char* GetName() const { return name_; };
|
||||
|
||||
|
|
@ -46,7 +53,9 @@ struct TensorRTCustomOp : Ort::CustomOpBase<TensorRTCustomOp, TensorRTCustomKern
|
|||
|
||||
ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; };
|
||||
|
||||
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t) const { return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC; };
|
||||
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t) const {
|
||||
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC;
|
||||
};
|
||||
|
||||
size_t GetOutputTypeCount() const { return num_outputs_; };
|
||||
|
||||
|
|
@ -54,7 +63,9 @@ struct TensorRTCustomOp : Ort::CustomOpBase<TensorRTCustomOp, TensorRTCustomKern
|
|||
|
||||
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; };
|
||||
|
||||
OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t) const { return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC; };
|
||||
OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t) const {
|
||||
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC;
|
||||
};
|
||||
|
||||
bool GetVariadicInputHomogeneity() const {
|
||||
return false; // heterogenous
|
||||
|
|
|
|||
|
|
@ -463,12 +463,12 @@ std::vector<MLFloat16> QK_Transpose(MLFloat16* q_matrix, MLFloat16* k_transpose_
|
|||
|
||||
// Softmax_QK_Transpose
|
||||
template <typename T>
|
||||
std::vector<T> Softmax_QK_Transpose(T* qk_transpose_matrix,
|
||||
int batch_size, int num_heads, int sequence_length, int total_sequence_length, int head_size);
|
||||
std::vector<T> Softmax_QK_Transpose(T* qk_transpose_matrix, int batch_size, int num_heads,
|
||||
int sequence_length, int total_sequence_length, int head_size);
|
||||
|
||||
template <>
|
||||
std::vector<float> Softmax_QK_Transpose(float* qk_transpose_matrix,
|
||||
int batch_size, int num_heads, int sequence_length, int total_sequence_length, int head_size) {
|
||||
std::vector<float> Softmax_QK_Transpose(float* qk_transpose_matrix, int batch_size, int num_heads,
|
||||
int sequence_length, int total_sequence_length, int /*head_size*/) {
|
||||
if (sequence_length != 1) {
|
||||
throw std::runtime_error("Not supported");
|
||||
}
|
||||
|
|
@ -506,8 +506,8 @@ std::vector<float> Softmax_QK_Transpose(float* qk_transpose_matrix,
|
|||
}
|
||||
|
||||
template <>
|
||||
std::vector<MLFloat16> Softmax_QK_Transpose(MLFloat16* qk_transpose_matrix,
|
||||
int batch_size, int num_heads, int sequence_length, int total_sequence_length, int head_size) {
|
||||
std::vector<MLFloat16> Softmax_QK_Transpose(MLFloat16* qk_transpose_matrix, int batch_size, int num_heads,
|
||||
int sequence_length, int total_sequence_length, int /*head_size*/) {
|
||||
if (sequence_length != 1) {
|
||||
throw std::runtime_error("Not supported");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -380,7 +380,7 @@ void RunRandomNormalGpuTest(const std::vector<int64_t> dims, const float mean, c
|
|||
test.AddOutput("Y", dims, fp16_data);
|
||||
}
|
||||
|
||||
auto output_verifier = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
|
||||
auto output_verifier = [&](const std::vector<OrtValue>& fetches, const std::string& /*provider_type*/) {
|
||||
// Only one output, and mean of output values are near attribute mean.
|
||||
ASSERT_EQ(fetches.size(), 1u);
|
||||
const auto& output_tensor = fetches[0].Get<Tensor>();
|
||||
|
|
@ -472,7 +472,7 @@ void RunRandomUniformGpuTest(const std::vector<int64_t> dims, const float low, c
|
|||
test.AddOutput("Y", dims, fp16_data);
|
||||
}
|
||||
|
||||
auto output_verifier = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
|
||||
auto output_verifier = [&](const std::vector<OrtValue>& fetches, const std::string& /*provider_type*/) {
|
||||
// Only one output. Each value in output tensor is between low and high.
|
||||
// Mean of output values are near attribute mean of low and high.
|
||||
ASSERT_EQ(fetches.size(), 1u);
|
||||
|
|
|
|||
|
|
@ -32,17 +32,30 @@ void ortenv_setup() {
|
|||
}
|
||||
|
||||
#ifdef USE_TENSORRT
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4100) // Ignore warning C4100: unreferenced formal parameter.
|
||||
#endif
|
||||
|
||||
// TensorRT will load/unload libraries as builder objects are created and torn down. This will happen for
|
||||
// every single unit test, which leads to excessive test execution time due to that overhead.
|
||||
// Nvidia suggests keeping a placeholder builder object around to avoid this.
|
||||
#include "NvInfer.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
class DummyLogger : public nvinfer1::ILogger {
|
||||
public:
|
||||
DummyLogger(Severity verbosity) {}
|
||||
void log(Severity severity, const char* msg) noexcept override {}
|
||||
DummyLogger(Severity /*verbosity*/) {}
|
||||
void log(Severity /*severity*/, const char* /*msg*/) noexcept override {}
|
||||
};
|
||||
DummyLogger trt_logger(nvinfer1::ILogger::Severity::kWARNING);
|
||||
|
||||
auto const placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(trt_logger));
|
||||
|
||||
#endif
|
||||
|
||||
#define TEST_MAIN main
|
||||
|
|
|
|||
|
|
@ -311,11 +311,9 @@ template <typename T, typename TOut>
|
|||
static std::vector<OrtValue> RunSCELossWithEP(const char* op,
|
||||
int opset_version,
|
||||
const char* domain,
|
||||
std::function<std::unique_ptr<IExecutionProvider>()>
|
||||
ep_creator,
|
||||
std::function<std::unique_ptr<IExecutionProvider>()> ep_creator,
|
||||
const std::string& reduction,
|
||||
const std::int64_t ignore_index,
|
||||
const double error_tolerance,
|
||||
const std::vector<int64_t>* X_dims,
|
||||
const std::vector<int64_t>* index_dims,
|
||||
const std::vector<int64_t>* weight_dims,
|
||||
|
|
@ -403,7 +401,7 @@ static void TestSCELoss(const char* op, int opset_version,
|
|||
cpu_fetches = RunSCELossWithEP<float, float>(
|
||||
op, opset_version, domain,
|
||||
[]() -> std::unique_ptr<IExecutionProvider> { return DefaultCpuExecutionProvider(); },
|
||||
reduction, ignore_index, error_tolerance,
|
||||
reduction, ignore_index,
|
||||
X_dims, index_dims, weight_dims,
|
||||
Y_dims, log_prob_dims,
|
||||
X_data_temp, index_data, weight_data_temp);
|
||||
|
|
@ -411,7 +409,7 @@ static void TestSCELoss(const char* op, int opset_version,
|
|||
cpu_fetches = RunSCELossWithEP<T, float>(
|
||||
op, opset_version, domain,
|
||||
[]() -> std::unique_ptr<IExecutionProvider> { return DefaultCpuExecutionProvider(); },
|
||||
reduction, ignore_index, error_tolerance,
|
||||
reduction, ignore_index,
|
||||
X_dims, index_dims, weight_dims,
|
||||
Y_dims, log_prob_dims,
|
||||
X_data, index_data, weight_data);
|
||||
|
|
@ -429,7 +427,7 @@ static void TestSCELoss(const char* op, int opset_version,
|
|||
return DefaultRocmExecutionProvider();
|
||||
#endif
|
||||
},
|
||||
reduction, ignore_index, error_tolerance,
|
||||
reduction, ignore_index,
|
||||
X_dims, index_dims, weight_dims,
|
||||
Y_dims, log_prob_dims,
|
||||
X_data, index_data, weight_data);
|
||||
|
|
|
|||
|
|
@ -105,7 +105,8 @@ struct AlgoSearch<T_BwdDataPerf> {
|
|||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED};
|
||||
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
|
||||
ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward data algorithms.");
|
||||
static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos,
|
||||
"Missing cuDNN convolution backward data algorithms.");
|
||||
int perf_count;
|
||||
std::unique_ptr<T_BwdDataPerf[]> candidates = std::make_unique<T_BwdDataPerf[]>(num_algos);
|
||||
if (args.params.algo_mode == OrtCudnnConvAlgoSearchHeuristic) {
|
||||
|
|
@ -146,7 +147,9 @@ struct AlgoSearch<T_BwdFilterPerf> {
|
|||
|
||||
// NOTE: - 1 because ALGO_WINOGRAD is not implemented.
|
||||
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
|
||||
ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms.");
|
||||
static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos,
|
||||
"Missing cuDNN convolution backward filter algorithms.");
|
||||
|
||||
std::unique_ptr<T_BwdFilterPerf[]> candidates = std::make_unique<T_BwdFilterPerf[]>(num_algos);
|
||||
int perf_count;
|
||||
if (args.params.algo_mode == OrtCudnnConvAlgoSearchHeuristic) {
|
||||
|
|
@ -188,7 +191,9 @@ struct AlgoSearch<T_FwdPerf> {
|
|||
};
|
||||
|
||||
static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
|
||||
ORT_ENFORCE(sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms.");
|
||||
static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos,
|
||||
"Missing cuDNN convolution backward filter algorithms.");
|
||||
|
||||
std::unique_ptr<T_FwdPerf[]> candidates = std::make_unique<T_FwdPerf[]>(num_algos);
|
||||
int perf_count;
|
||||
if (args.params.algo_mode == OrtCudnnConvAlgoSearchHeuristic) {
|
||||
|
|
|
|||
|
|
@ -53,7 +53,6 @@ Status ConvTransposeGrad<T>::ComputeInputGradient(onnxruntime::Stream* stream, c
|
|||
algo_perf.algo, workspace.get(), algo_perf.memory, &zero, args.y_tensor, args.y_data));
|
||||
return Status::OK();
|
||||
});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -71,7 +70,6 @@ Status ConvTransposeGrad<T>::ComputeWeightGradient(onnxruntime::Stream* stream,
|
|||
algo_perf.algo, workspace.get(), algo_perf.memory, &zero, args.w_desc, args.dw_data));
|
||||
return Status::OK();
|
||||
});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
|
|||
|
|
@ -30,8 +30,6 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
using namespace onnxruntime::cuda;
|
||||
|
||||
namespace {
|
||||
// This is the un-specialized struct. Note that we prevent instantiation of this
|
||||
// struct by putting an undefined symbol in the function body so it won't compile.
|
||||
|
|
|
|||
|
|
@ -619,7 +619,7 @@ CudaKernel::CudaAsyncBuffer<LambMultiTensorSyncRangeAndLock> compute_tensor_rang
|
|||
|
||||
template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename TBuf>
|
||||
void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(
|
||||
cudaStream_t stream,
|
||||
cudaStream_t /*stream*/,
|
||||
ChunkGroup<4> chunk_group,
|
||||
const CudaKernel& kernel,
|
||||
void* reduction_buffer,
|
||||
|
|
|
|||
|
|
@ -93,8 +93,17 @@ steps:
|
|||
$ccache_parent_dir = (Split-Path -parent $ccache_path)
|
||||
Copy-Item "C:\ProgramData\chocolatey\lib\ccache\tools\ccache-4.7.4-windows-x86_64\ccache.exe" -Destination "C:\ProgramData\chocolatey\bin\cl.exe"
|
||||
Get-ChildItem $ccache_parent_dir
|
||||
ccache --version
|
||||
}
|
||||
|
||||
"ccache info:"
|
||||
ccache --version
|
||||
ccache --show-config
|
||||
|
||||
"cl.exe from path: $((Get-Command cl).Path). Version:"
|
||||
(cl.exe -?) -match 'Compiler Version'
|
||||
"C:\ProgramData\chocolatey\bin\cl.exe version:"
|
||||
(C:\ProgramData\chocolatey\bin\cl.exe -?) -match 'Compiler Version'
|
||||
|
||||
displayName: Install ccache and update PATH to use linked versions of gcc, cc, etc
|
||||
|
||||
- ${{ if eq(parameters.WITHCACHE, true) }}:
|
||||
|
|
|
|||
Loading…
Reference in a new issue