pytorch/caffe2/operators/softmax_shared.cc
Aapo Kyrola 0a4c5756df Logitzy SoftmaxWithLoss
Summary:
The MT team, working with urikz, found that their convergence discrepancy with another version of the model was caused by numerical stability issues in softmax: our implementation did not apply the optimization that avoids computing exp(log(x)) for softmax-crossentropy. This diff fixes that.
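
As a minimal illustration (not part of this diff), the stable log-softmax stays in log space: it subtracts the row max and then subtracts the log of the sum of exponentials, instead of exponentiating first and taking the log of the result. The helper name below is hypothetical.

#include <algorithm>
#include <cmath>
#include <vector>

// Directly computes log(softmax(x))_i = x_i - max(x) - log(sum_j exp(x_j - max(x))).
std::vector<float> LogSoftmax(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  for (const float v : x) {
    sum += std::exp(v - m);  // arguments are <= 0, so exp cannot overflow
  }
  const float logsum = std::log(sum);
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = x[i] - m - logsum;  // never takes the log of an underflowed exp()
  }
  return y;
}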

This does not require any changes to current models, since the output of SoftmaxWithLoss is still the exponentiated values.

I also did a bit of cleanup on the code; for some reason we were passing tensors to SoftmaxCPU() instead of pointers.

Reviewed By: urikz

Differential Revision: D4901888

fbshipit-source-id: 62e785ecdd87e33742292b191e91b4f43912e4c0
2017-04-17 16:40:20 -07:00

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
void SoftmaxCPU(
CPUContext& context,
const int N,
const int D,
const float* Xdata,
float* Ydata,
float* scale,
const float* sum_multiplier,
bool logarithmic,
float* rowmax) {
math::RowwiseMax<float, CPUContext>(N, D, Xdata, rowmax, &context);
// Put the intermediate result X - max(X) into Y
context.template Copy<float, CPUContext, CPUContext>(N * D, Xdata, Ydata);
// Subtract the max (for nomuerical reasons)
math::Gemm<float, CPUContext>(
CblasNoTrans,
CblasNoTrans,
N,
D,
1,
-1,
rowmax,
sum_multiplier,
1,
Ydata,
&context);
// Exponentiation
math::Exp<float, CPUContext>(N * D, Ydata, Ydata, &context);
math::Gemv<float, CPUContext>(
CblasNoTrans, N, D, 1, Ydata, sum_multiplier, 0, scale, &context);
// Do division
// TODO(Yangqing): maybe implement it more beautifully?
if (!logarithmic) {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < D; ++j) {
Ydata[i * D + j] /= scale[i];
}
}
} else {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < D; ++j) {
Ydata[i * D + j] =
Xdata[i * D + j] - rowmax[i] - log(fmaxf(scale[i], 1e-20));
}
}
}
}
} // namespace caffe2
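
For reference, a minimal sketch of how a caller might drive SoftmaxCPU. The function name RunSoftmaxExample, the sizes, and the buffer setup are illustrative only, and the sketch assumes SoftmaxCPU is declared in caffe2/operators/softmax_shared.h.

#include <vector>

#include "caffe2/core/context.h"
#include "caffe2/operators/softmax_shared.h"

void RunSoftmaxExample() {
  const int N = 4;   // rows (e.g. batch size)
  const int D = 10;  // columns (e.g. number of classes)
  caffe2::CPUContext context;
  std::vector<float> X(N * D, 0.0f);           // input logits
  std::vector<float> Y(N * D);                 // output probabilities (or log-probs)
  std::vector<float> scale(N);                 // per-row sum of exp(x - max)
  std::vector<float> rowmax(N);                // per-row maximum
  std::vector<float> sum_multiplier(D, 1.0f);  // all-ones vector for broadcasts
  caffe2::SoftmaxCPU(
      context,
      N,
      D,
      X.data(),
      Y.data(),
      scale.data(),
      sum_multiplier.data(),
      /*logarithmic=*/false,
      rowmax.data());
}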