#include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h"
|
|
#include "c10/util/Registry.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
namespace {
|
|
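
// Element-wise fp16 <-> fp32 conversion helpers. They are passed as the
// conversion function-pointer template argument of the operators registered
// below whenever the external data type differs from the float type used
// internally for the quantization math.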
void convertfp16fp32(float* dst, const at::Half* src, size_t N) {
  for (size_t i = 0; i < N; i++) {
    dst[i] = src[i];
  }
}

void convertfp32fp16(at::Half* dst, const float* src, size_t N) {
  for (size_t i = 0; i < N; i++) {
    dst[i] = src[i];
  }
}
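
// A minimal scalar sketch of the 8-bit row-wise quantization performed by the
// operators below, kept for documentation only (guarded out of the build): it
// is not called by the registered ops, which dispatch to optimized kernels in
// the templated implementations. The fused layout per row of `cols` values is
//   [cols x uint8 quantized data][scale][bias]
// with q = round((x - bias) / scale), scale = (max - min) / 255, bias = min.
#if 0 // illustrative only; assumes <cstring> for std::memcpy if enabled
void quantizeRowExample(const float* src, unsigned char* dst, size_t cols) {
  float minimum = src[0];
  float maximum = src[0];
  for (size_t i = 1; i < cols; i++) {
    minimum = src[i] < minimum ? src[i] : minimum;
    maximum = src[i] > maximum ? src[i] : maximum;
  }
  // Guard against a zero range (constant row) to avoid division by zero.
  const float range = maximum - minimum;
  const float scale = range == 0 ? 1.0f : range / 255.0f;
  for (size_t i = 0; i < cols; i++) {
    dst[i] = static_cast<unsigned char>((src[i] - minimum) / scale + 0.5f);
  }
  // Append the de-quantization parameters after the data bytes.
  std::memcpy(dst + cols, &scale, sizeof(float));
  std::memcpy(dst + cols + sizeof(float), &minimum, sizeof(float));
}
#endif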

} // namespace

REGISTER_CPU_OPERATOR(
    FloatToFused8BitRowwiseQuantized,
    FloatToFused8BitRowwiseQuantizedOp<
        float,
        float,
        nullptr,
        false,
        CPUContext>);
OPERATOR_SCHEMA(FloatToFused8BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) + 2 * sizeof(float));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
.SetDoc(R"DOC(
|
|
Applies 8-bit row-wise quantization by determining the range
|
|
(maximum - minimum) and offset (minimum value) of each row in the input
|
|
matrix, and then scaling each element to an 8-bit number between 0 and
|
|
255. To later de-quantize values, the scale (range / 255) and offset
|
|
(bias) are stored alongside the data. More precisely, each row contains
|
|
int8 elements for each quantized element, and the last 8 bytes
|
|
of each row in the output matrix are a float storing the scale
|
|
followed by another float containing the scale.
|
|
For N-dimensional input tensor, the first N-1 dimensions are interpreted as
|
|
rows and the last dimension is interpreted as a column. For example, an
|
|
input tensor with dimension 5x2x4 is interpreted as 10 rows and 4 columns.
|
|
)
|
|
)DOC")
|
|
.Input(0, "input", "Float32 input data")
|
|
.Output(0, "output", "Fused scale, bias and quantized data");
|
|
NO_GRADIENT(FloatToFused8BitRowwiseQuantized);
|
|
|
|
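
// Shape example: a 5x2x4 float input produces a 5x2x12 uint8 output, since
// each row of 4 values becomes 4 quantized bytes plus a 4-byte scale and a
// 4-byte bias, matching the shape-inference function above.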

REGISTER_CPU_OPERATOR(
    FloatToFused8BitRowwiseQuantizedHalfScaleBias,
    FloatToFused8BitRowwiseQuantizedOp<
        float,
        at::Half,
        nullptr,
        false,
        CPUContext>);
OPERATOR_SCHEMA(FloatToFused8BitRowwiseQuantizedHalfScaleBias)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) + 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
.SetDoc(R"DOC(
|
|
Applies 8-bit row-wise quantization by determining the range
|
|
(maximum - minimum) and offset (minimum value) of each row in the input
|
|
matrix, and then scaling each element to an 8-bit number between 0 and
|
|
255. To later de-quantize values, the scale (range / 255) and offset
|
|
(bias) are stored alongside the data. More precisely, each row contains
|
|
int8 elements for each quantized element, and the last 4 bytes
|
|
of each row in the output matrix are a half float storing the scale
|
|
followed by another half float containing the scale.)
|
|
)DOC")
|
|
.Input(0, "input", "Float32 input data")
|
|
.Output(0, "output", "Fused scale, bias and quantized data");
|
|
NO_GRADIENT(FloatToFused8BitRowwiseQuantizedHalfScaleBias);
|
|
|
|
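
// The HalfScaleBias variants store the scale and bias as fp16, shrinking the
// per-row metadata from 8 bytes to 4 at the cost of some precision in the
// de-quantization parameters.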

REGISTER_CPU_OPERATOR(
    HalfFloatToFused8BitRowwiseQuantized,
    FloatToFused8BitRowwiseQuantizedOp<
        at::Half,
        float,
        convertfp16fp32,
        true,
        CPUContext>);
OPERATOR_SCHEMA(HalfFloatToFused8BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) + 2 * sizeof(float));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
.SetDoc(R"DOC(
|
|
Applies 8-bit row-wise quantization by determining the range
|
|
(maximum - minimum) and offset (minimum value) of each row in the input
|
|
matrix, and then scaling each element to an 8-bit number between 0 and
|
|
255. To later de-quantize values, the scale (range / 255) and offset
|
|
(bias) are stored alongside the data. More precisely, each row contains
|
|
int8 elements for each quantized element, and the last 8 bytes
|
|
of each row in the output matrix are a float storing the scale
|
|
followed by another float containing the scale.)
|
|
)DOC")
|
|
.Input(0, "input", "Float16 input data")
|
|
.Output(0, "output", "Fused scale, bias and quantized data");
|
|
NO_GRADIENT(HalfFloatToFused8BitRowwiseQuantized);
|
|
|
|
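
// The HalfFloat* variants take fp16 input; the convertfp16fp32 helper and the
// `true` template argument tell the op to convert each row to fp32 before
// running the same quantization math as the fp32-input operators.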

REGISTER_CPU_OPERATOR(
    HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias,
    FloatToFused8BitRowwiseQuantizedOp<
        at::Half,
        at::Half,
        convertfp16fp32,
        true,
        CPUContext>);
OPERATOR_SCHEMA(HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) + 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
.SetDoc(R"DOC(
|
|
Applies 8-bit row-wise quantization by determining the range
|
|
(maximum - minimum) and offset (minimum value) of each row in the input
|
|
matrix, and then scaling each element to an 8-bit number between 0 and
|
|
255. To later de-quantize values, the scale (range / 255) and offset
|
|
(bias) are stored alongside the data. More precisely, each row contains
|
|
int8 elements for each quantized element, and the last 4 bytes
|
|
of each row in the output matrix are a float storing the scale
|
|
followed by another float containing the scale.)
|
|
)DOC")
|
|
.Input(0, "input", "Float16 input data")
|
|
.Output(0, "output", "Fused scale, bias and quantized data");
|
|
NO_GRADIENT(HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias);
|
|
|
|
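
// De-quantization inverts the mapping above: for each stored byte q, the
// original value is approximated as q * scale + bias, using the scale and
// bias recovered from the tail of the same row.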

REGISTER_CPU_OPERATOR(
    Fused8BitRowwiseQuantizedToFloat,
    Fused8BitRowwiseQuantizedToFloatOp<
        float,
        float,
        nullptr,
        false,
        CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) - 2 * sizeof(float));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT);
      return out;
    })
.SetDoc(R"DOC(
|
|
De-quantizes the result of the
|
|
FloatToFused8BitRowwiseQuantized operator. The input is expected to
|
|
encode the scale as a 32-bit float in the second to the last 4 bytes of each
|
|
row, followed by the bias as a 32-bit float in the next 4 bytes, and the
|
|
quantized values in the preceding bytes of the row. The output is a
|
|
matrix containing only the values, but de-quantized. De-quantization is
|
|
performed by multiplying each value by its row's scale and bias
|
|
parameters. The de-quantized values will thus not be exactly equal to
|
|
the original, un-quantized floating point values.
|
|
)DOC")
|
|
.Input(
|
|
0,
|
|
"scale_bias_quantized_input",
|
|
"Fused scale, bias and quantized data")
|
|
.Output(0, "float_output", "Float32 data");
|
|
NO_GRADIENT(Fused8BitRowwiseQuantizedToFloat);
|
|
|
|

REGISTER_CPU_OPERATOR(
    Fused8BitRowwiseQuantizedHalfScaleBiasToFloat,
    Fused8BitRowwiseQuantizedToFloatOp<
        float,
        at::Half,
        nullptr,
        false,
        CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedHalfScaleBiasToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT);
      return out;
    })
.SetDoc(R"DOC(
|
|
De-quantizes the result of the
|
|
FloatToFused8BitRowwiseQuantized operator. The input is expected to
|
|
encode the scale as a 16-bit float in the second to the last 2 bytes of each
|
|
row, followed by the bias as a 16-bit float in the next 2 bytes, and the
|
|
quantized values in the preceding bytes of the row. The output is a
|
|
matrix containing only the values, but de-quantized. De-quantization is
|
|
performed by multiplying each value by its row's scale and bias
|
|
parameters. The de-quantized values will thus not be exactly equal to
|
|
the original, un-quantized floating point values.
|
|
)DOC")
|
|
.Input(
|
|
0,
|
|
"scale_bias_quantized_input",
|
|
"Fused scale, bias and quantized data")
|
|
.Output(0, "float_output", "Float32 data");
|
|
NO_GRADIENT(Fused8BitRowwiseQuantizedHalfScaleBiasToFloat);
|
|
|
|

REGISTER_CPU_OPERATOR(
    Fused8BitRowwiseQuantizedToHalfFloat,
    Fused8BitRowwiseQuantizedToFloatOp<
        at::Half,
        float,
        convertfp32fp16,
        true,
        CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedToHalfFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) - 2 * sizeof(float));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT16);
      return out;
    })
.SetDoc(R"DOC(
|
|
De-quantizes the result of the
|
|
HalfFloatToFused8BitRowwiseQuantized operator. The input is expected to
|
|
encode the scale as a 32-bit float in the second to the last 4 bytes of each
|
|
row, followed by the bias as a 32-bit float in the next 4 bytes, and the
|
|
quantized values in the preceding bytes of the row. The output is a
|
|
matrix containing only the values, but de-quantized. De-quantization is
|
|
performed by multiplying each value by its row's scale and bias
|
|
parameters. The de-quantized values will thus not be exactly equal to
|
|
the original, un-quantized floating point values.
|
|
)DOC")
|
|
.Input(
|
|
0,
|
|
"scale_bias_quantized_input",
|
|
"Fused scale, bias and quantized data")
|
|
.Output(0, "float16_output", "Float16 data");
|
|
NO_GRADIENT(Fused8BitRowwiseQuantizedToHalfFloat);
|
|
|
|

REGISTER_CPU_OPERATOR(
    Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat,
    Fused8BitRowwiseQuantizedToFloatOp<
        at::Half,
        at::Half,
        convertfp32fp16,
        true,
        CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT16);
      return out;
    })
.SetDoc(R"DOC(
|
|
De-quantizes the result of the
|
|
FloatToFused8BitRowwiseQuantized operator. The input is expected to
|
|
encode the scale as a 16-bit float in the second to the last 2 bytes of each
|
|
row, followed by the bias as a 16-bit float in the next 2 bytes, and the
|
|
quantized values in the preceding bytes of the row. The output is a
|
|
matrix containing only the values, but de-quantized. De-quantization is
|
|
performed by multiplying each value by its row's scale and bias
|
|
parameters. The de-quantized values will thus not be exactly equal to
|
|
the original, un-quantized floating point values.
|
|
)DOC")
|
|
.Input(
|
|
0,
|
|
"scale_bias_quantized_input",
|
|
"Fused scale, bias and quantized data")
|
|
.Output(0, "float_output", "Float32 data");
|
|
NO_GRADIENT(Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat);
|
|
|
|

} // namespace caffe2

// Alias to work around the commas in the template argument list, which would
// otherwise be treated as macro argument separators by
// C10_EXPORT_CAFFE2_OP_TO_C10_CPU below.
using Fused8BitRowwiseQuantizedToFloatCPUOp =
    caffe2::Fused8BitRowwiseQuantizedToFloatOp<
        float,
        float,
        nullptr,
        false,
        caffe2::CPUContext>;

C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    Fused8BitRowwiseQuantizedToFloat,
    "_caffe2::Fused8BitRowwiseQuantizedToFloat(Tensor scale_bias_quantized_input) -> Tensor",
    Fused8BitRowwiseQuantizedToFloatCPUOp);
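
// Once exported to c10, the op is reachable from Python through the
// dispatcher, e.g. (assuming a build with caffe2 ops enabled):
//   torch.ops._caffe2.Fused8BitRowwiseQuantizedToFloat(scale_bias_quantized_input)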