diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm.cc b/onnxruntime/core/providers/cuda/nn/instance_norm.cc index cd0a13418d..c40c27cdf1 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/instance_norm.cc @@ -100,10 +100,19 @@ Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) co CudnnTensor stats_desc; ORT_RETURN_IF_ERROR(stats_desc.Set(std::array{1, stats_count, 1, 1}, CudnnTensor::GetDataType())); + const size_t stats_byte_count = stats_count * sizeof(CudaT); + + // Mean & Variance are inputs & outputs and must be initialized to zero to work properly auto mean = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mean.get(), 0, stats_byte_count, Stream())); auto variance = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(variance.get(), 0, stats_byte_count, Stream())); + + // We must set the scale & bias inputs to zero as they are inputs to the calculation auto unused_scale = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream())); auto unused_bias = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream())); // first, compute mean and variance per-instance per-channel using cudnnBatchNorm training CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardTraining(