pytorch/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
Jongsoo Park e95282ab28 [caffe2] make fused rowwise quant/dequant op work for N-dim tensors (#33426)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33426

Make 2/4/8-bit fused rowwise conversion operators more general to work for N-dim tensors

Test Plan: CI

Reviewed By: ellie-wen

Differential Revision: D19943136

fbshipit-source-id: 47008544dd7e1d11a346d34f35449e0fcc0e7ee0
2020-02-19 23:29:42 -08:00


#include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h"
#include "c10/util/Registry.h"
namespace caffe2 {
namespace {
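// Element-wise fp16 -> fp32 widening used by the ops whose input is at::Half.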
void convertfp16fp32(float* dst, const at::Half* src, size_t N) {
for (size_t i = 0; i < N; i++) {
dst[i] = src[i];
}
}
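// Element-wise fp32 -> fp16 narrowing used by the ops whose output is at::Half.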
void convertfp32fp16(at::Half* dst, const float* src, size_t N) {
for (size_t i = 0; i < N; i++) {
dst[i] = src[i];
}
}
} // namespace
REGISTER_CPU_OPERATOR(
FloatToFused8BitRowwiseQuantized,
FloatToFused8BitRowwiseQuantizedOp<
float,
float,
nullptr,
false,
CPUContext>);
OPERATOR_SCHEMA(FloatToFused8BitRowwiseQuantized)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1, X.dims(X.dims().size() - 1) + 2 * sizeof(float));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_UINT8);
return out;
})
.SetDoc(R"DOC(
Applies 8-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to an 8-bit number between 0 and
255. To later de-quantize values, the scale (range / 255) and offset
(bias) are stored alongside the data. More precisely, each row contains
the quantized values as uint8 elements, and the last 8 bytes
of each row in the output matrix are a float storing the scale
followed by another float containing the bias.
For an N-dimensional input tensor, the first N-1 dimensions are interpreted
as rows and the last dimension is interpreted as the columns. For example,
an input tensor with dimension 5x2x4 is interpreted as 10 rows and 4 columns.
)DOC")
.Input(0, "input", "Float32 input data")
.Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(FloatToFused8BitRowwiseQuantized);
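// Illustrative sketch (kept in a comment, not used by the op): how a single
// row could be quantized into the fused layout described above, assuming a
// fp32 scale/bias footer. The helper name, the zero-range guard, and the
// plain loops are hypothetical; the real kernel is provided by
// FloatToFused8BitRowwiseQuantizedOp. For an N-dim input, rows is the product
// of the first N-1 dims and cols is the last dim.
// (If extracted, the snippet needs <algorithm>, <cmath>, <cstdint>, <cstring>.)
//
//   void QuantizeRowExample(const float* row, size_t cols, uint8_t* out) {
//     float minimum = row[0];
//     float maximum = row[0];
//     for (size_t i = 1; i < cols; ++i) {
//       minimum = std::min(minimum, row[i]);
//       maximum = std::max(maximum, row[i]);
//     }
//     const float range = maximum - minimum;
//     const float scale = range == 0.0f ? 1.0f : range / 255.0f;
//     const float bias = minimum;
//     for (size_t i = 0; i < cols; ++i) {
//       // Each value is mapped to an integer in [0, 255].
//       out[i] = static_cast<uint8_t>(std::lrintf((row[i] - bias) / scale));
//     }
//     // Footer: scale then bias, 4 bytes each, appended after the cols
//     // quantized bytes (output row width = cols + 2 * sizeof(float)).
//     std::memcpy(out + cols, &scale, sizeof(float));
//     std::memcpy(out + cols + sizeof(float), &bias, sizeof(float));
//   }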
REGISTER_CPU_OPERATOR(
FloatToFused8BitRowwiseQuantizedHalfScaleBias,
FloatToFused8BitRowwiseQuantizedOp<
float,
at::Half,
nullptr,
false,
CPUContext>);
OPERATOR_SCHEMA(FloatToFused8BitRowwiseQuantizedHalfScaleBias)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1,
X.dims(X.dims().size() - 1) + 2 * sizeof(at::Half));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_UINT8);
return out;
})
.SetDoc(R"DOC(
Applies 8-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to an 8-bit number between 0 and
255. To later de-quantize values, the scale (range / 255) and offset
(bias) are stored alongside the data. More precisely, each row contains
the quantized values as uint8 elements, and the last 4 bytes
of each row in the output matrix are a half float storing the scale
followed by another half float containing the bias.
)DOC")
.Input(0, "input", "Float32 input data")
.Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(FloatToFused8BitRowwiseQuantizedHalfScaleBias);
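// Per-row output layout sketch for a hypothetical 4-column input row, showing
// why the shape inference above adds 2 * sizeof(at::Half) = 4 bytes per row
// while the fp32 variant adds 2 * sizeof(float) = 8 bytes:
//
//   FloatToFused8BitRowwiseQuantized:
//     [ q0 q1 q2 q3 | scale: fp32 (4 B) | bias: fp32 (4 B) ]  -> 12 bytes/row
//   FloatToFused8BitRowwiseQuantizedHalfScaleBias:
//     [ q0 q1 q2 q3 | scale: fp16 (2 B) | bias: fp16 (2 B) ]  ->  8 bytes/row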
REGISTER_CPU_OPERATOR(
HalfFloatToFused8BitRowwiseQuantized,
FloatToFused8BitRowwiseQuantizedOp<
at::Half,
float,
convertfp16fp32,
true,
CPUContext>);
OPERATOR_SCHEMA(HalfFloatToFused8BitRowwiseQuantized)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1, X.dims(X.dims().size() - 1) + 2 * sizeof(float));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_UINT8);
return out;
})
.SetDoc(R"DOC(
Applies 8-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to an 8-bit number between 0 and
255. To later de-quantize values, the scale (range / 255) and offset
(bias) are stored alongside the data. More precisely, each row contains
the quantized values as uint8 elements, and the last 8 bytes
of each row in the output matrix are a float storing the scale
followed by another float containing the bias.
)DOC")
.Input(0, "input", "Float16 input data")
.Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(HalfFloatToFused8BitRowwiseQuantized);
REGISTER_CPU_OPERATOR(
HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias,
FloatToFused8BitRowwiseQuantizedOp<
at::Half,
at::Half,
convertfp16fp32,
true,
CPUContext>);
OPERATOR_SCHEMA(HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1,
X.dims(X.dims().size() - 1) + 2 * sizeof(at::Half));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_UINT8);
return out;
})
.SetDoc(R"DOC(
Applies 8-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to an 8-bit number between 0 and
255. To later de-quantize values, the scale (range / 255) and offset
(bias) are stored alongside the data. More precisely, each row contains
the quantized values as uint8 elements, and the last 4 bytes
of each row in the output matrix are a half float storing the scale
followed by another half float containing the bias.
)DOC")
.Input(0, "input", "Float16 input data")
.Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias);
REGISTER_CPU_OPERATOR(
Fused8BitRowwiseQuantizedToFloat,
Fused8BitRowwiseQuantizedToFloatOp<
float,
float,
nullptr,
false,
CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedToFloat)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1, X.dims(X.dims().size() - 1) - 2 * sizeof(float));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_FLOAT);
return out;
})
.SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused8BitRowwiseQuantized operator. The input is expected to
encode the scale as a 32-bit float in the second-to-last 4 bytes of each
row, followed by the bias as a 32-bit float in the last 4 bytes, and the
quantized values in the preceding bytes of the row. The output is a
matrix containing only the values, but de-quantized. De-quantization is
performed by multiplying each value by its row's scale and then adding the
row's bias. The de-quantized values will thus not be exactly equal to
the original, un-quantized floating point values.
)DOC")
.Input(
0,
"scale_bias_quantized_input",
"Fused scale, bias and quantized data")
.Output(0, "float_output", "Float32 data");
NO_GRADIENT(Fused8BitRowwiseQuantizedToFloat);
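// Illustrative sketch (kept in a comment, not used by the op): per-row
// de-quantization of the fused layout described above. The helper name is
// hypothetical; the real kernel is provided by
// Fused8BitRowwiseQuantizedToFloatOp. Here cols is the row width minus the
// 8-byte scale/bias footer.
// (If extracted, the snippet needs <cstdint> and <cstring>.)
//
//   void DequantizeRowExample(const uint8_t* row, size_t cols, float* out) {
//     float scale;
//     float bias;
//     std::memcpy(&scale, row + cols, sizeof(float));
//     std::memcpy(&bias, row + cols + sizeof(float), sizeof(float));
//     for (size_t i = 0; i < cols; ++i) {
//       // Multiply by the row's scale and add the row's bias.
//       out[i] = row[i] * scale + bias;
//     }
//   }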
REGISTER_CPU_OPERATOR(
Fused8BitRowwiseQuantizedHalfScaleBiasToFloat,
Fused8BitRowwiseQuantizedToFloatOp<
float,
at::Half,
nullptr,
false,
CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedHalfScaleBiasToFloat)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1,
X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_FLOAT);
return out;
})
.SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused8BitRowwiseQuantizedHalfScaleBias operator. The input is expected
to encode the scale as a 16-bit float in the second-to-last 2 bytes of each
row, followed by the bias as a 16-bit float in the last 2 bytes, and the
quantized values in the preceding bytes of the row. The output is a
matrix containing only the values, but de-quantized. De-quantization is
performed by multiplying each value by its row's scale and then adding the
row's bias. The de-quantized values will thus not be exactly equal to
the original, un-quantized floating point values.
)DOC")
.Input(
0,
"scale_bias_quantized_input",
"Fused scale, bias and quantized data")
.Output(0, "float_output", "Float32 data");
NO_GRADIENT(Fused8BitRowwiseQuantizedHalfScaleBiasToFloat);
REGISTER_CPU_OPERATOR(
Fused8BitRowwiseQuantizedToHalfFloat,
Fused8BitRowwiseQuantizedToFloatOp<
at::Half,
float,
convertfp32fp16,
true,
CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedToHalfFloat)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1, X.dims(X.dims().size() - 1) - 2 * sizeof(float));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_FLOAT16);
return out;
})
.SetDoc(R"DOC(
De-quantizes the result of the
HalfFloatToFused8BitRowwiseQuantized operator. The input is expected to
encode the scale as a 32-bit float in the second-to-last 4 bytes of each
row, followed by the bias as a 32-bit float in the last 4 bytes, and the
quantized values in the preceding bytes of the row. The output is a
matrix containing only the values, but de-quantized. De-quantization is
performed by multiplying each value by its row's scale and then adding the
row's bias. The de-quantized values will thus not be exactly equal to
the original, un-quantized floating point values.
)DOC")
.Input(
0,
"scale_bias_quantized_input",
"Fused scale, bias and quantized data")
.Output(0, "float16_output", "Float16 data");
NO_GRADIENT(Fused8BitRowwiseQuantizedToHalfFloat);
REGISTER_CPU_OPERATOR(
Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat,
Fused8BitRowwiseQuantizedToFloatOp<
at::Half,
at::Half,
convertfp32fp16,
true,
CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction([](const OperatorDef& /* def */,
const vector<TensorShape>& in) {
vector<TensorShape> out;
TensorShape X = in[0];
X.set_dims(
X.dims().size() - 1,
X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half));
out.push_back(std::move(X));
out[0].set_data_type(TensorProto_DataType_FLOAT16);
return out;
})
.SetDoc(R"DOC(
De-quantizes the result of the
HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias operator. The input is
expected to encode the scale as a 16-bit float in the second-to-last 2 bytes
of each row, followed by the bias as a 16-bit float in the last 2 bytes, and
the quantized values in the preceding bytes of the row. The output is a
matrix containing only the values, but de-quantized. De-quantization is
performed by multiplying each value by its row's scale and then adding the
row's bias. The de-quantized values will thus not be exactly equal to
the original, un-quantized floating point values.
)DOC")
.Input(
0,
"scale_bias_quantized_input",
"Fused scale, bias and quantized data")
.Output(0, "float16_output", "Float16 data");
NO_GRADIENT(Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat);
} // namespace caffe2
// Type alias to work around the commas in the template argument list, which
// would otherwise be split into separate arguments of the export macro below.
using Fused8BitRowwiseQuantizedToFloatCPUOp =
caffe2::Fused8BitRowwiseQuantizedToFloatOp<
float,
float,
nullptr,
false,
caffe2::CPUContext>;
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
Fused8BitRowwiseQuantizedToFloat,
"_caffe2::Fused8BitRowwiseQuantizedToFloat(Tensor scale_bias_quantized_input) -> Tensor",
Fused8BitRowwiseQuantizedToFloatCPUOp);