diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc index 664d01ab42..5757c4b388 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc @@ -132,7 +132,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { Tensor* output = context->Output(0, output_shape); cublasHandle_t cublas = GetCublasHandle(context); - const size_t element_size = sizeof(T); + constexpr size_t element_size = sizeof(T); // Use GEMM for fully connection. int m = batch_size * sequence_length;