mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary: MT-team with urikz found out that their convergence discrepancy with another version of the model was caused by numerical stability issues in softmax. These were caused by our implementation not implementing the optimization to avoid doing exp(log(x)) for softmax-crossentropy. This diff fixes that. This does not require any changes to the current models since the output of SoftmaxWithLoss is still the exponentiated items. I also did a little bit of cleanup on the code; for some reason we were passing tensors to SoftmaxCPU() instead of pointers. Reviewed By: urikz Differential Revision: D4901888 fbshipit-source-id: 62e785ecdd87e33742292b191e91b4f43912e4c0
55 lines
1.4 KiB
C++
55 lines
1.4 KiB
C++
#include <cmath>

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
// Computes softmax (or log-softmax when `logarithmic` is true) row-wise over
// an N x D matrix.
//
// Args:
//   context:        CPU execution context used by the math helpers.
//   N, D:           number of rows / columns of the input.
//   Xdata:          input matrix, N x D (read-only).
//   Ydata:          output matrix, N x D; receives softmax(X) or log-softmax(X).
//   scale:          scratch buffer of length N; receives per-row sum of
//                   exp(X - rowmax).
//   sum_multiplier: vector of length D, expected to be all ones (used to
//                   broadcast per-row values via GEMM/GEMV).
//   logarithmic:    if true, write log-softmax instead of softmax.
//   rowmax:         scratch buffer of length N; receives per-row max of X.
void SoftmaxCPU(
    CPUContext& context,
    const int N,
    const int D,
    const float* Xdata,
    float* Ydata,
    float* scale,
    const float* sum_multiplier,
    bool logarithmic,
    float* rowmax) {
  // Per-row max, used for numerical stability: softmax is invariant under
  // subtracting a per-row constant, and exp(x - max) cannot overflow.
  math::RowwiseMax<float, CPUContext>(N, D, Xdata, rowmax, &context);
  // Put the intermediate result X - max(X) into Y
  context.template Copy<float, CPUContext, CPUContext>(N * D, Xdata, Ydata);
  // Subtract the max (for numerical reasons): Y += (-1) * rowmax * ones^T,
  // i.e. broadcast-subtract rowmax from every column of Y.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasNoTrans,
      N,
      D,
      1,
      -1,
      rowmax,
      sum_multiplier,
      1,
      Ydata,
      &context);
  // Exponentiation
  math::Exp<float, CPUContext>(N * D, Ydata, Ydata, &context);
  // Per-row sum of exp(X - max): scale = Y * sum_multiplier.
  math::Gemv<float, CPUContext>(
      CblasNoTrans, N, D, 1, Ydata, sum_multiplier, 0, scale, &context);
  // Do division
  // TODO(Yangqing): maybe implement it more beautifully?
  if (!logarithmic) {
    for (int i = 0; i < N; ++i) {
      for (int j = 0; j < D; ++j) {
        Ydata[i * D + j] /= scale[i];
      }
    }
  } else {
    // Log-softmax: recompute from Xdata directly instead of taking
    // log(Ydata), avoiding the exp/log round trip that caused the
    // numerical-stability issue. Clamp the row sum away from zero before
    // the log. Use logf/1e-20f to stay in single precision throughout
    // (avoids an implicit float->double->float round trip).
    for (int i = 0; i < N; ++i) {
      for (int j = 0; j < D; ++j) {
        Ydata[i * D + j] =
            Xdata[i * D + j] - rowmax[i] - logf(fmaxf(scale[i], 1e-20f));
      }
    }
  }
}
|
|
|
|
} // namespace caffe2
|