#include "caffe2/operators/fused_rowwise_nbit_conversion_ops.h"
|
|
#include "c10/util/Registry.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
using std::uint16_t;
|
|
using std::vector;
|
|
|
|
namespace internal {
|
|
void convertfp32fp16(at::Half* dst, const float* src, size_t N) {
|
|
for (size_t i = 0; i < N; i++) {
|
|
dst[i] = src[i];
|
|
}
|
|
}
|
|
|
|
} // namespace internal
|
|
|
|
REGISTER_CPU_OPERATOR(
    FloatToFused4BitRowwiseQuantized,
    FloatToFusedNBitRowwiseQuantizedOp<4, float, internal::convertfp32fp32>);
OPERATOR_SCHEMA(FloatToFused4BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      // divide by 2 and round up, then add 4 bytes for the fp16 scale and bias
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) + 1) / 2 + 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies 4-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to a 4-bit number between 0 and
15. To later de-quantize values, the scale (range / 15) and zero_point
are stored alongside the data. More precisely, each row first has quantized
values, and then the 2-byte fp16 scale and 2-byte fp16 zero_offset.
)DOC")
    .Input(0, "input", "Float32 input data")
    .Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(FloatToFused4BitRowwiseQuantized);

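// Illustration (not part of the operator schema): the shape inference above
// implies the following fused row layout for the 4-bit case. For a row of 11
// float values:
//   packed values : (11 + 1) / 2 = 6 bytes (two 4-bit values per byte)
//   scale         : 2 bytes (fp16)
//   zero_offset   : 2 bytes (fp16)
//   total         : 6 + 2 * sizeof(at::Half) = 10 bytes per row, dtype uint8
//
// A minimal usage sketch, assuming the standard Caffe2 Workspace API and the
// hypothetical blob names "data" and "data_q":
//
//   Workspace ws;
//   // ... feed a 2-D float tensor into the "data" blob ...
//   OperatorDef def = CreateOperatorDef(
//       "FloatToFused4BitRowwiseQuantized",
//       "",
//       std::vector<std::string>{"data"},
//       std::vector<std::string>{"data_q"});
//   ws.RunOperatorOnce(def);
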
REGISTER_CPU_OPERATOR(
    HalfToFused4BitRowwiseQuantized,
    FloatToFusedNBitRowwiseQuantizedOp<4, at::Half, internal::convertfp16fp32>);
OPERATOR_SCHEMA(HalfToFused4BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) + 1) / 2 + 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies 4-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to a 4-bit number between 0 and
15. To later de-quantize values, the scale (range / 15) and zero_point
are stored alongside the data. More precisely, each row first has quantized
values, and then the 2-byte fp16 scale and 2-byte fp16 zero_offset.
)DOC")
    .Input(0, "input", "Float16 input data")
    .Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(HalfToFused4BitRowwiseQuantized);

REGISTER_CPU_OPERATOR(
    Fused4BitRowwiseQuantizedToFloat,
    FusedNBitRowwiseQuantizedToFloatOp<4, float, internal::convertfp32fp32>);
OPERATOR_SCHEMA(Fused4BitRowwiseQuantizedToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half)) * 2);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT);
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused4BitRowwiseQuantized operator. The input is expected to first have
quantized values, then the 2-byte fp16 scale and 2-byte fp16 zero_offset. The
output is a matrix containing only the values, but de-quantized.
De-quantization is performed by multiplying each value by its row's scale and
then adding the row's zero_point. The de-quantized values will thus not be
exactly equal to the original, un-quantized floating point values.
)DOC")
    .Input(
        0,
        "scale_bias_quantized_input",
        "Fused scale, bias and quantized data")
    .Output(0, "float_output", "Float32 data");
NO_GRADIENT(Fused4BitRowwiseQuantizedToFloat);

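// Illustration (not part of the operator schema): this is the inverse of the
// layout produced by FloatToFused4BitRowwiseQuantized. For a fused row of 10
// uint8 bytes the inferred output width is (10 - 2 * sizeof(at::Half)) * 2 =
// 12 columns; note this can over-report by one column when the original row
// length was odd, because the 4-bit packing rounds up to whole bytes.
// Conceptually, each stored 4-bit value q is mapped back as
//   x ~= q * scale + zero_point,
// with scale ~= (row_max - row_min) / 15 and zero_point ~= row_min, so the
// reconstruction is lossy.
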
REGISTER_CPU_OPERATOR(
    Fused4BitRowwiseQuantizedToHalf,
    FusedNBitRowwiseQuantizedToFloatOp<4, at::Half, internal::convertfp32fp16>);
OPERATOR_SCHEMA(Fused4BitRowwiseQuantizedToHalf)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half)) * 2);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT16);
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused4BitRowwiseQuantized operator. The input is expected to first have
quantized values, then the 2-byte fp16 scale and 2-byte fp16 zero_offset. The
output is a matrix containing only the values, but de-quantized.
De-quantization is performed by multiplying each value by its row's scale and
then adding the row's zero_point. The de-quantized values will thus not be
exactly equal to the original, un-quantized floating point values.
)DOC")
    .Input(
        0,
        "scale_bias_quantized_input",
        "Fused scale, bias and quantized data")
    .Output(0, "float16_output", "Float16 data");
NO_GRADIENT(Fused4BitRowwiseQuantizedToHalf);

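// The registrations below expose the same quantization ops under the "GREEDY"
// engine (selected via the engine field of the OperatorDef). The only
// difference from the default registrations above is the trailing boolean
// template argument which, as the /*GREEDY*/ annotation suggests, appears to
// switch the op to an alternative strategy for choosing the per-row
// quantization parameters; the input/output schema is unchanged.
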
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FloatToFused4BitRowwiseQuantized,
    GREEDY,
    FloatToFusedNBitRowwiseQuantizedOp<
        4,
        float,
        internal::convertfp32fp32,
        true /*GREEDY*/>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    HalfToFused4BitRowwiseQuantized,
    GREEDY,
    FloatToFusedNBitRowwiseQuantizedOp<
        4,
        at::Half,
        internal::convertfp16fp32,
        true /*GREEDY*/>);

REGISTER_CPU_OPERATOR(
    FloatToFused2BitRowwiseQuantized,
    FloatToFusedNBitRowwiseQuantizedOp<2, float, internal::convertfp32fp32>);
OPERATOR_SCHEMA(FloatToFused2BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      // divide by 4 and round up, then add 4 bytes for the fp16 scale and bias
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) + 3) / 4 + 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies 2-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to a 2-bit number between 0 and
3. To later de-quantize values, the scale (range / 3) and zero_point
are stored alongside the data. More precisely, each row first has quantized
values, and then the 2-byte fp16 scale and 2-byte fp16 zero_offset.
)DOC")
    .Input(0, "input", "Float32 input data")
    .Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(FloatToFused2BitRowwiseQuantized);

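// Illustration (not part of the operator schema): the 2-bit variants differ
// from the 4-bit ones only in packing density (four 2-bit values per byte)
// and in the scale denominator (range / 3). For a row of 10 float values:
//   packed values : (10 + 3) / 4 = 3 bytes
//   scale + bias  : 2 * sizeof(at::Half) = 4 bytes
//   total         : 7 bytes per row, dtype uint8
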
REGISTER_CPU_OPERATOR(
    HalfToFused2BitRowwiseQuantized,
    FloatToFusedNBitRowwiseQuantizedOp<2, at::Half, internal::convertfp16fp32>);
OPERATOR_SCHEMA(HalfToFused2BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) + 3) / 4 + 2 * sizeof(at::Half));
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies 2-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to a 2-bit number between 0 and
3. To later de-quantize values, the scale (range / 3) and zero_point
are stored alongside the data. More precisely, each row first has quantized
values, and then the 2-byte fp16 scale and 2-byte fp16 zero_offset.
)DOC")
    .Input(0, "input", "Float16 input data")
    .Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(HalfToFused2BitRowwiseQuantized);

REGISTER_CPU_OPERATOR(
    Fused2BitRowwiseQuantizedToFloat,
    FusedNBitRowwiseQuantizedToFloatOp<2, float, internal::convertfp32fp32>);
OPERATOR_SCHEMA(Fused2BitRowwiseQuantizedToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half)) * 4);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT);
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused2BitRowwiseQuantized operator. The input is expected to first have
quantized values, then the 2-byte fp16 scale and 2-byte fp16 zero_offset. The
output is a matrix containing only the values, but de-quantized.
De-quantization is performed by multiplying each value by its row's scale and
then adding the row's zero_point. The de-quantized values will thus not be
exactly equal to the original, un-quantized floating point values.
)DOC")
    .Input(
        0,
        "scale_bias_quantized_input",
        "Fused scale, bias and quantized data")
    .Output(0, "float_output", "Float32 data");
NO_GRADIENT(Fused2BitRowwiseQuantizedToFloat);

REGISTER_CPU_OPERATOR(
    Fused2BitRowwiseQuantizedToHalf,
    FusedNBitRowwiseQuantizedToFloatOp<2, at::Half, internal::convertfp32fp16>);
OPERATOR_SCHEMA(Fused2BitRowwiseQuantizedToHalf)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(
          X.dims().size() - 1,
          (X.dims(X.dims().size() - 1) - 2 * sizeof(at::Half)) * 4);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT16);
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused2BitRowwiseQuantized operator. The input is expected to first have
quantized values, then the 2-byte fp16 scale and 2-byte fp16 zero_offset. The
output is a matrix containing only the values, but de-quantized.
De-quantization is performed by multiplying each value by its row's scale and
then adding the row's zero_point. The de-quantized values will thus not be
exactly equal to the original, un-quantized floating point values.
)DOC")
    .Input(
        0,
        "scale_bias_quantized_input",
        "Fused scale, bias and quantized data")
    .Output(0, "float16_output", "Float16 data");
NO_GRADIENT(Fused2BitRowwiseQuantizedToHalf);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FloatToFused2BitRowwiseQuantized,
    GREEDY,
    FloatToFusedNBitRowwiseQuantizedOp<
        2,
        float,
        internal::convertfp32fp32,
        true /*GREEDY*/>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    HalfToFused2BitRowwiseQuantized,
    GREEDY,
    FloatToFusedNBitRowwiseQuantizedOp<
        2,
        at::Half,
        internal::convertfp16fp32,
        true /*GREEDY*/>);

} // namespace caffe2