mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-26 22:35:43 +00:00
BFP schemas: Change block dimension type to Int (#13169)
* Change block dimension type to Int from Ints. * In response to feedback that the block dimension corresponds to the reduction dimension of the consuming matrix multiplication. There is always only 1 reduction dimension.
This commit is contained in:
parent
cf075fcbad
commit
b09dd11ece
3 changed files with 565 additions and 232 deletions
|
|
@ -20,6 +20,7 @@ Do not modify directly.*
|
|||
* <a href="#com.microsoft.DecoderAttention">com.microsoft.DecoderAttention</a>
|
||||
* <a href="#com.microsoft.DequantizeBFP">com.microsoft.DequantizeBFP</a>
|
||||
* <a href="#com.microsoft.DequantizeLinear">com.microsoft.DequantizeLinear</a>
|
||||
* <a href="#com.microsoft.DequantizeWithOrder">com.microsoft.DequantizeWithOrder</a>
|
||||
* <a href="#com.microsoft.DynamicQuantizeLSTM">com.microsoft.DynamicQuantizeLSTM</a>
|
||||
* <a href="#com.microsoft.DynamicQuantizeMatMul">com.microsoft.DynamicQuantizeMatMul</a>
|
||||
* <a href="#com.microsoft.EmbedLayerNormalization">com.microsoft.EmbedLayerNormalization</a>
|
||||
|
|
@ -57,11 +58,14 @@ Do not modify directly.*
|
|||
* <a href="#com.microsoft.QLinearReduceMean">com.microsoft.QLinearReduceMean</a>
|
||||
* <a href="#com.microsoft.QLinearSigmoid">com.microsoft.QLinearSigmoid</a>
|
||||
* <a href="#com.microsoft.QLinearSoftmax">com.microsoft.QLinearSoftmax</a>
|
||||
* <a href="#com.microsoft.QOrderedAttention">com.microsoft.QOrderedAttention</a>
|
||||
* <a href="#com.microsoft.QOrderedGelu">com.microsoft.QOrderedGelu</a>
|
||||
* <a href="#com.microsoft.QOrderedLayerNormalization">com.microsoft.QOrderedLayerNormalization</a>
|
||||
* <a href="#com.microsoft.QOrderedLongformerAttention">com.microsoft.QOrderedLongformerAttention</a>
|
||||
* <a href="#com.microsoft.QOrderedMatMul">com.microsoft.QOrderedMatMul</a>
|
||||
* <a href="#com.microsoft.QuantizeBFP">com.microsoft.QuantizeBFP</a>
|
||||
* <a href="#com.microsoft.QuantizeLinear">com.microsoft.QuantizeLinear</a>
|
||||
* <a href="#com.microsoft.QuantizeWithOrder">com.microsoft.QuantizeWithOrder</a>
|
||||
* <a href="#com.microsoft.Range">com.microsoft.Range</a>
|
||||
* <a href="#com.microsoft.ReduceSumInteger">com.microsoft.ReduceSumInteger</a>
|
||||
* <a href="#com.microsoft.Rfft">com.microsoft.Rfft</a>
|
||||
|
|
@ -989,7 +993,9 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
|
||||
### <a name="com.microsoft.DequantizeBFP"></a><a name="com.microsoft.dequantizebfp">**com.microsoft.DequantizeBFP**</a>
|
||||
|
||||
The BFP dequantization operator. It consumes the raw BFP data and some metadata such as the shape and strides of the original tensor and computes the dequantized tensor.
|
||||
The BFP dequantization operator.
|
||||
It consumes the raw BFP data and some metadata such as the shape and strides of the original tensor and computes the dequantized tensor.
|
||||
More documentation on the BFP format can be found in this paper: https://www.microsoft.com/en-us/research/publication/pushing-the-limits-of-narrow-precision-inferencing-at-cloud-scale-with-microsoft-floating-point/
|
||||
|
||||
#### Version
|
||||
|
||||
|
|
@ -1000,8 +1006,8 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
<dl>
|
||||
<dt><tt>bfp_type</tt> : int (required)</dt>
|
||||
<dd>The type of BFP - must match with the BFPType enum</dd>
|
||||
<dt><tt>block_dims</tt> : list of ints</dt>
|
||||
<dd>Numbers within a bounding box will span across these dimensions.Any dimension not in this list is the same for all numbers within a bounding box.As an example, consider a 2D tensor with shape [d0, d1] and block_dims equal to [1].Within a bounding box, all elements will be within the same row but will be from different columnns.The default is the last dimension.</dd>
|
||||
<dt><tt>block_dim</tt> : int</dt>
|
||||
<dd>Each bounding box spans this dimension. Typically, the block dimension corresponds to the reduction dimension of the matrix multiplication that consumes the output of this operator. For example, for a 2D matrix multiplication A@W, QuantizeBFP(A) would use block_dim 1 and QuantizeBFP(W) would use block_dim 0. The default is the last dimension.</dd>
|
||||
<dt><tt>dtype</tt> : int</dt>
|
||||
<dd>The datatype to dequantize to.</dd>
|
||||
</dl>
|
||||
|
|
@ -1081,6 +1087,53 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.DequantizeWithOrder"></a><a name="com.microsoft.dequantizewithorder">**com.microsoft.DequantizeWithOrder**</a>
|
||||
|
||||
Dequantize input matrix to specific layout used in cublaslt. attr to specify output type, float16 or float32
|
||||
|
||||
#### Version
|
||||
|
||||
This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
|
||||
|
||||
#### Attributes
|
||||
|
||||
<dl>
|
||||
<dt><tt>order_input</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.</dd>
|
||||
<dt><tt>order_output</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of output matrix</dd>
|
||||
<dt><tt>to</tt> : int (required)</dt>
|
||||
<dd>The output data type, only support TensorProto_DataType_FLOAT (1) and TensorProto_DataType_FLOAT16 (10)</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>input</tt> : Q</dt>
|
||||
<dd>TODO: input tensor of (ROWS, COLS). If less than 2d, will broadcast to (1, X). If 3d, it is treated as (B, ROWS, COLS)</dd>
|
||||
<dt><tt>scale_input</tt> : S</dt>
|
||||
<dd>scale of the input</dd>
|
||||
</dl>
|
||||
|
||||
#### Outputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>output</tt> : F</dt>
|
||||
<dd>output tensor</dd>
|
||||
</dl>
|
||||
|
||||
#### Type Constraints
|
||||
|
||||
<dl>
|
||||
<dt><tt>Q</tt> : tensor(int8)</dt>
|
||||
<dd>Constrain input and output types to int8 tensors.</dd>
|
||||
<dt><tt>F</tt> : tensor(float16), tensor(float)</dt>
|
||||
<dd>Constrain to float types</dd>
|
||||
<dt><tt>S</tt> : tensor(float)</dt>
|
||||
<dd>Constrain Scale to float32 types</dd>
|
||||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.DynamicQuantizeLSTM"></a><a name="com.microsoft.dynamicquantizelstm">**com.microsoft.DynamicQuantizeLSTM**</a>
|
||||
|
||||
#### Version
|
||||
|
|
@ -2916,6 +2969,105 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.QOrderedAttention"></a><a name="com.microsoft.qorderedattention">**com.microsoft.QOrderedAttention**</a>
|
||||
|
||||
Quantized version of simplified Multi-Head Self Attention(using int8 with specific matrix Layout).
|
||||
Multi-Head Self Attention that can be either unidirectional (like GPT-2) or bidirectional (like BERT).
|
||||
The mask_index input is optional. Besides raw attention mask with shape (batch_size, past_sequence_length + sequence_length)
|
||||
or (batch_size, sequence_length, past_sequence_length + sequence_length) with value 0 for masked and 1 otherwise,
|
||||
we also support other two formats: When input has right-side padding, mask_index is one dimension with shape (batch_size),
|
||||
where value of each element is the end position, or valid length of actual sequence excluding padding. When input has
|
||||
left-side padding, mask_index has shape (2 * batch_size), where the values are the exclusive end positions followed by
|
||||
the inclusive start positions. When unidirectional is 1, each token only attends to previous tokens. For GPT-2, both past
|
||||
and present state are optional. Present state could appear in output even when past state is not in input.
|
||||
Current version does not support past/present, extra_add and qkv_hidden_sizes.
|
||||
TODO: Support them if needed in the future.
|
||||
|
||||
#### Version
|
||||
|
||||
This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
|
||||
|
||||
#### Attributes
|
||||
|
||||
<dl>
|
||||
<dt><tt>num_heads</tt> : int (required)</dt>
|
||||
<dd>Number of attention heads</dd>
|
||||
<dt><tt>order_input</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.</dd>
|
||||
<dt><tt>order_output</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of global bias</dd>
|
||||
<dt><tt>order_weight</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of weight matrix</dd>
|
||||
<dt><tt>qkv_hidden_sizes</tt> : list of ints</dt>
|
||||
<dd>Hidden layer sizes of Q, K, V paths in Attention</dd>
|
||||
<dt><tt>unidirectional</tt> : int</dt>
|
||||
<dd>Whether every token can only attend to previous tokens. Default value is 0.</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs (17 - 20)
|
||||
|
||||
<dl>
|
||||
<dt><tt>input</tt> : Q</dt>
|
||||
<dd>3D input tensor with shape (batch_size, sequence_length, input_hidden_size)</dd>
|
||||
<dt><tt>scale_input</tt> : S</dt>
|
||||
<dd>scale of the input, scalar value (per tensor) currently.</dd>
|
||||
<dt><tt>scale_Q_gemm</tt> : S</dt>
|
||||
<dd>scale of the gemm - scalar (per-tensor quantization)</dd>
|
||||
<dt><tt>scale_K_gemm</tt> : S</dt>
|
||||
<dd>scale of the gemm - scalar (per-tensor quantization)</dd>
|
||||
<dt><tt>scale_V_gemm</tt> : S</dt>
|
||||
<dd>scale of the gemm - scalar (per-tensor quantization)</dd>
|
||||
<dt><tt>Q_weight</tt> : Q</dt>
|
||||
<dd>2D input tensor with shape (input_hidden_size, hidden_size), where hidden_size = num_heads * head_size</dd>
|
||||
<dt><tt>K_weight</tt> : Q</dt>
|
||||
<dd>2D input tensor with shape (input_hidden_size, hidden_size), where hidden_size = num_heads * head_size</dd>
|
||||
<dt><tt>V_weight</tt> : Q</dt>
|
||||
<dd>2D input tensor with shape (input_hidden_size, hidden_size), where hidden_size = num_heads * head_size</dd>
|
||||
<dt><tt>scale_Q_weight</tt> : S</dt>
|
||||
<dd>scale of the weight (scalar for per-tensor quantization or 1-D of dims [hidden_size] for per-channel quantization)</dd>
|
||||
<dt><tt>scale_K_weight</tt> : S</dt>
|
||||
<dd>scale of the weight (scalar for per-tensor quantization or 1-D of dims [hidden_size] for per-channel quantization)</dd>
|
||||
<dt><tt>scale_V_weight</tt> : S</dt>
|
||||
<dd>scale of the weight (scalar for per-tensor quantization or 1-D of dims [hidden_size] for per-channel quantization)</dd>
|
||||
<dt><tt>Q_bias</tt> : S</dt>
|
||||
<dd>1D input tensor with shape (hidden_size)</dd>
|
||||
<dt><tt>K_bias</tt> : S</dt>
|
||||
<dd>1D input tensor with shape (hidden_size)</dd>
|
||||
<dt><tt>V_bias</tt> : S</dt>
|
||||
<dd>1D input tensor with shape (hidden_size)</dd>
|
||||
<dt><tt>scale_QKT_gemm</tt> (optional) : S</dt>
|
||||
<dd>scale of the gemm - scalar (per-tensor quantization)</dd>
|
||||
<dt><tt>scale_QKT_softmax</tt> (optional) : S</dt>
|
||||
<dd>scale of the softmax result - scalar (per-tensor quantization)</dd>
|
||||
<dt><tt>scale_values_gemm</tt> : S</dt>
|
||||
<dd>scale of the gemm - scalar (per-tensor quantization). Also this is the output scale for the operator.</dd>
|
||||
<dt><tt>mask_index</tt> (optional) : G</dt>
|
||||
<dd>Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, past_sequence_length + sequence_length) or (batch_size, sequence_length, past_sequence_length + sequence_length), or index with shape (batch_size) or (2 * batch_size).</dd>
|
||||
<dt><tt>past</tt> (optional) : Q</dt>
|
||||
<dd>past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size).</dd>
|
||||
<dt><tt>extra_add</tt> (optional) : S</dt>
|
||||
<dd>additional add to QxK' with shape (batch_size, num_heads, sequence_length, sequence_length).</dd>
|
||||
</dl>
|
||||
|
||||
#### Outputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>output</tt> : Q</dt>
|
||||
<dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
|
||||
</dl>
|
||||
|
||||
#### Type Constraints
|
||||
|
||||
<dl>
|
||||
<dt><tt>Q</tt> : tensor(int8)</dt>
|
||||
<dd>Constrain input and output types to int8 tensors.</dd>
|
||||
<dt><tt>S</tt> : tensor(float)</dt>
|
||||
<dd>Constrain scales to float32 tensors.</dd>
|
||||
<dt><tt>G</tt> : tensor(int32)</dt>
|
||||
<dd>Constrain to integer types</dd>
|
||||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.QOrderedGelu"></a><a name="com.microsoft.qorderedgelu">**com.microsoft.QOrderedGelu**</a>
|
||||
|
||||
Ordered Quantize Gelu.
|
||||
|
|
@ -2928,9 +3080,9 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
|
||||
<dl>
|
||||
<dt><tt>order_X</tt> : int</dt>
|
||||
<dd>cublasLt order of input X. Default is ROW MAJOR.</dd>
|
||||
<dd>cublasLt order of input X. Optional. See the schema of QuantizeWithOrder for order definition.</dd>
|
||||
<dt><tt>order_Y</tt> : int</dt>
|
||||
<dd>cublasLt order of matrix Y, must be same as order_X. Default is ROW MAJOR.</dd>
|
||||
<dd>cublasLt order of matrix Y, must be same as order_X if specified together. Optional.</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs
|
||||
|
|
@ -2977,7 +3129,7 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
<dt><tt>epsilon</tt> : float</dt>
|
||||
<dd>The epsilon value to use to avoid division by zero.</dd>
|
||||
<dt><tt>order_X</tt> : int</dt>
|
||||
<dd>cublasLt order of input X. Default is ROW MAJOR.</dd>
|
||||
<dd>cublasLt order of input X. Default is ROW MAJOR. See the schema of QuantizeWithOrder for order definition.</dd>
|
||||
<dt><tt>order_Y</tt> : int</dt>
|
||||
<dd>cublasLt order of matrix Y, must be same as order_X. Default is ROW MAJOR.</dd>
|
||||
</dl>
|
||||
|
|
@ -3016,9 +3168,9 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.QOrderedMatMul"></a><a name="com.microsoft.qorderedmatmul">**com.microsoft.QOrderedMatMul**</a>
|
||||
### <a name="com.microsoft.QOrderedLongformerAttention"></a><a name="com.microsoft.qorderedlongformerattention">**com.microsoft.QOrderedLongformerAttention**</a>
|
||||
|
||||
TODO
|
||||
Quantized version of Longformer Self Attention (using int8 with specific matrix Layout).
|
||||
|
||||
#### Version
|
||||
|
||||
|
|
@ -3027,12 +3179,100 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
#### Attributes
|
||||
|
||||
<dl>
|
||||
<dt><tt>order_A</tt> : int</dt>
|
||||
<dd>cublasLt order of matrix A. Default is ROW MAJOR.</dd>
|
||||
<dt><tt>order_B</tt> : int</dt>
|
||||
<dd>cublasLt order of matrix B. Default is ROW MAJOR.</dd>
|
||||
<dt><tt>order_Y</tt> : int</dt>
|
||||
<dd>cublasLt order of matrix Y and optional matrix C. Default is ROW MAJOR.</dd>
|
||||
<dt><tt>num_heads</tt> : int (required)</dt>
|
||||
<dd>Number of attention heads</dd>
|
||||
<dt><tt>order_global_weight</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of weight matrix</dd>
|
||||
<dt><tt>order_input</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.</dd>
|
||||
<dt><tt>order_output</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of global bias</dd>
|
||||
<dt><tt>order_weight</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of weight matrix</dd>
|
||||
<dt><tt>window</tt> : int (required)</dt>
|
||||
<dd>One-sided attention window length W, or half of the total window length</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>input</tt> : Q</dt>
|
||||
<dd>3D input tensor with shape (batch_size, sequence_length, hidden_size), hidden_size = num_heads * head_size</dd>
|
||||
<dt><tt>scale_input</tt> : S</dt>
|
||||
<dd>scale of the input</dd>
|
||||
<dt><tt>weight</tt> : Q</dt>
|
||||
<dd>2D input tensor with shape (hidden_size, 3 * hidden_size)</dd>
|
||||
<dt><tt>scale_weight</tt> : S</dt>
|
||||
<dd>scale of the weight</dd>
|
||||
<dt><tt>bias</tt> : S</dt>
|
||||
<dd>1D input tensor with shape (3 * hidden_size), fp32 only currently.</dd>
|
||||
<dt><tt>scale_bias</tt> : S</dt>
|
||||
<dd>reserved. (not used as add bias need float value in cublasLt for normal order.)</dd>
|
||||
<dt><tt>scale_qkv_gemm</tt> : S</dt>
|
||||
<dd>scale of the output for fused kqv gemm</dd>
|
||||
<dt><tt>mask</tt> : F</dt>
|
||||
<dd>Attention mask with shape (batch_size, sequence_length)</dd>
|
||||
<dt><tt>global_weight</tt> : Q</dt>
|
||||
<dd>2D input tensor with shape (hidden_size, 3 * hidden_size)</dd>
|
||||
<dt><tt>scale_global_weight</tt> : S</dt>
|
||||
<dd>scale of the global_weight</dd>
|
||||
<dt><tt>global_bias</tt> : S</dt>
|
||||
<dd>1D input tensor with shape (3 * hidden_size)</dd>
|
||||
<dt><tt>scale_global_gemm</tt> : S</dt>
|
||||
<dd>scale of the global_qkv_gemm</dd>
|
||||
<dt><tt>global</tt> : G</dt>
|
||||
<dd>Global attention flags with shape (batch_size, sequence_length)</dd>
|
||||
<dt><tt>scale_output</tt> : S</dt>
|
||||
<dd>scale of the output</dd>
|
||||
</dl>
|
||||
|
||||
#### Outputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>output</tt> : Q</dt>
|
||||
<dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
|
||||
</dl>
|
||||
|
||||
#### Type Constraints
|
||||
|
||||
<dl>
|
||||
<dt><tt>Q</tt> : tensor(int8)</dt>
|
||||
<dd>Constrain input and output types to int8 tensors.</dd>
|
||||
<dt><tt>S</tt> : tensor(float)</dt>
|
||||
<dd>Constrain scales to float32 tensors.</dd>
|
||||
<dt><tt>G</tt> : tensor(int32)</dt>
|
||||
<dd>Constrain to integer types</dd>
|
||||
<dt><tt>F</tt> : tensor(float16)</dt>
|
||||
<dd>Be compatible with float version.</dd>
|
||||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.QOrderedMatMul"></a><a name="com.microsoft.qorderedmatmul">**com.microsoft.QOrderedMatMul**</a>
|
||||
|
||||
Quantize (Int8) MatMul with order. Implement Y = alpha * A * B + bias + beta * C. Matrix A, B, C, Y are all int8 matrix.
|
||||
Two types of order combinations are supported:
|
||||
*) When order_B is ORDER_COL, order_A must be ORDER_ROW.
|
||||
bias is vector of {#cols of Y} of float32, C should be batch 1/batch_A. B could be of batch 1 or batch_A.
|
||||
Note B is reorder to ORDER_COL, or Transposed. Not Transposed first and then Reordered here.
|
||||
*) When order_B is specified as ORDER_COL4_4R2_8C or ORDER_COL32_2R_4R4, order_A must be ORDER_COL32.
|
||||
MatMul will be implemented using alpha(A * B) + beta * C => Y.
|
||||
bias is not supported here. B in fact is transposed first then reordered into ORDER_COL4_4R2_8C or ORDER_COL32_2R_4R4 here.
|
||||
order_Y and order_C will be same as order_A.
|
||||
Support per column quantized weight, ie, scale_B is 1-D vector of size [#cols of matrix B].
|
||||
|
||||
#### Version
|
||||
|
||||
This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
|
||||
|
||||
#### Attributes
|
||||
|
||||
<dl>
|
||||
<dt><tt>order_A</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of matrix A. See the schema of QuantizeWithOrder for order definition.</dd>
|
||||
<dt><tt>order_B</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of matrix B</dd>
|
||||
<dt><tt>order_Y</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of matrix Y and optional matrix C</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs (5 - 8)
|
||||
|
|
@ -3041,19 +3281,19 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
<dt><tt>A</tt> : Q</dt>
|
||||
<dd>3-dimensional matrix A</dd>
|
||||
<dt><tt>scale_A</tt> : S</dt>
|
||||
<dd>scale of the input A</dd>
|
||||
<dd>scale of the input A.</dd>
|
||||
<dt><tt>B</tt> : Q</dt>
|
||||
<dd>2-dimensional matrix B</dd>
|
||||
<dd>2-dimensional matrix B. Transposed if order_B is ORDER_COL.</dd>
|
||||
<dt><tt>scale_B</tt> : S</dt>
|
||||
<dd>scale of the input B</dd>
|
||||
<dd>scale of the input B. Scalar or 1-D float32.</dd>
|
||||
<dt><tt>scale_Y</tt> : S</dt>
|
||||
<dd>scale of the output Y</dd>
|
||||
<dd>scale of the output Y.</dd>
|
||||
<dt><tt>bias</tt> (optional) : S</dt>
|
||||
<dd>1d bias</dd>
|
||||
<dd>1d bias, not scaled with scale_Y.</dd>
|
||||
<dt><tt>C</tt> (optional) : Q</dt>
|
||||
<dd>3d or 2d matrix C. if 2d expand to 3d first. Shape[0] should be 1 or same as A.shape[0] </dd>
|
||||
<dt><tt>scale_C</tt> (optional) : S</dt>
|
||||
<dd>scale of the input A</dd>
|
||||
<dd>scale of the input C.</dd>
|
||||
</dl>
|
||||
|
||||
#### Outputs
|
||||
|
|
@ -3076,6 +3316,7 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
### <a name="com.microsoft.QuantizeBFP"></a><a name="com.microsoft.quantizebfp">**com.microsoft.QuantizeBFP**</a>
|
||||
|
||||
The BFP quantization operator. It consumes a full precision tensor and computes a BFP tensor.
|
||||
More documentation on the BFP format can be found in this paper: https://www.microsoft.com/en-us/research/publication/pushing-the-limits-of-narrow-precision-inferencing-at-cloud-scale-with-microsoft-floating-point/
|
||||
|
||||
#### Version
|
||||
|
||||
|
|
@ -3086,8 +3327,8 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
<dl>
|
||||
<dt><tt>bfp_type</tt> : int (required)</dt>
|
||||
<dd>The type of BFP - must match with the BFPType enum</dd>
|
||||
<dt><tt>block_dims</tt> : list of ints</dt>
|
||||
<dd>Numbers within a bounding box will span across these dimensions.Any dimension not in this list is the same for all numbers within a bounding box.As an example, consider a 2D tensor with shape [d0, d1] and block_dims equal to [1].Within a bounding box, all elements will be within the same row but will be from different columnns.The default is the last dimension.</dd>
|
||||
<dt><tt>block_dim</tt> : int</dt>
|
||||
<dd>Each bounding box spans this dimension. Typically, the block dimension corresponds to the reduction dimension of the matrix multiplication that consumes the output of this operator. For example, for a 2D matrix multiplication A@W, QuantizeBFP(A) would use block_dim 1 and QuantizeBFP(W) would use block_dim 0. The default is the last dimension.</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs
|
||||
|
|
@ -3166,6 +3407,51 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.QuantizeWithOrder"></a><a name="com.microsoft.quantizewithorder">**com.microsoft.QuantizeWithOrder**</a>
|
||||
|
||||
Quantize input matrix to specific layout used in cublaslt.
|
||||
|
||||
#### Version
|
||||
|
||||
This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
|
||||
|
||||
#### Attributes
|
||||
|
||||
<dl>
|
||||
<dt><tt>order_input</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of input matrix. ORDER_COL = 0, ORDER_ROW = 1, ORDER_COL32 = 2, ORDER_COL4_4R2_8C = 3, ORDER_COL32_2R_4R4 = 4. Please refer to https://docs.nvidia.com/cuda/cublas/index.html#cublasLtOrder_t for their meaning.</dd>
|
||||
<dt><tt>order_output</tt> : int (required)</dt>
|
||||
<dd>cublasLt order of output matrix.</dd>
|
||||
</dl>
|
||||
|
||||
#### Inputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>input</tt> : F</dt>
|
||||
<dd>TODO: input tensor of (ROWS, COLS). If less than 2d, will broadcast to (1, X). If 3d, it is treated as (B, ROWS, COLS)</dd>
|
||||
<dt><tt>scale_input</tt> : S</dt>
|
||||
<dd>scale of the input</dd>
|
||||
</dl>
|
||||
|
||||
#### Outputs
|
||||
|
||||
<dl>
|
||||
<dt><tt>output</tt> : Q</dt>
|
||||
<dd>output tensor</dd>
|
||||
</dl>
|
||||
|
||||
#### Type Constraints
|
||||
|
||||
<dl>
|
||||
<dt><tt>Q</tt> : tensor(int8)</dt>
|
||||
<dd>Constrain input and output types to int8 tensors.</dd>
|
||||
<dt><tt>F</tt> : tensor(float16), tensor(float)</dt>
|
||||
<dd>Constrain to float types</dd>
|
||||
<dt><tt>S</tt> : tensor(float)</dt>
|
||||
<dd>Constrain Scale to float32 types</dd>
|
||||
</dl>
|
||||
|
||||
|
||||
### <a name="com.microsoft.Range"></a><a name="com.microsoft.range">**com.microsoft.Range**</a>
|
||||
|
||||
Creates a sequence of numbers that begins at `start` and extends by increments of `delta`
|
||||
|
|
|
|||
|
|
@ -211,19 +211,21 @@ ONNX_MS_OPERATOR_SET_SCHEMA(DequantizeLinear, 1,
|
|||
}));
|
||||
|
||||
static const char* QuantizeBFP_ver1_doc = R"DOC(
|
||||
The BFP quantization operator. It consumes a full precision tensor and computes an BFP tensor.)DOC";
|
||||
The BFP quantization operator. It consumes a full precision tensor and computes an BFP tensor.
|
||||
More documentation on the BFP format can be found in this paper: https://www.microsoft.com/en-us/research/publication/pushing-the-limits-of-narrow-precision-inferencing-at-cloud-scale-with-microsoft-floating-point/)DOC";
|
||||
|
||||
ONNX_MS_OPERATOR_SET_SCHEMA(
|
||||
QuantizeBFP, 1,
|
||||
OpSchema()
|
||||
.Attr("bfp_type", "The type of BFP - must match with the BFPType enum", AttributeProto::INT)
|
||||
.Attr("block_dims",
|
||||
"Numbers within a bounding box will span across these dimensions."
|
||||
"Any dimension not in this list is the same for all numbers within a bounding box."
|
||||
"As an example, consider a 2D tensor with shape [d0, d1] and block_dims equal to [1]."
|
||||
"Within a bounding box, all elements will be within the same row but will be from different columnns."
|
||||
.Attr("block_dim",
|
||||
"Each bounding box spans this dimension."
|
||||
"Typically, the block dimension corresponds to the reduction dimension of the matrix multipication that "
|
||||
"consumes the output of this operator."
|
||||
"For example, for a 2D matrix multiplication A@W, QuantizeBFP(A) would use block_dim 1 and "
|
||||
"QuantizeBFP(W) would use block_dim 0."
|
||||
"The default is the last dimension.",
|
||||
AttributeProto::INTS, std::vector<int64_t>{-1})
|
||||
AttributeProto::INT, static_cast<int64_t>(-1))
|
||||
.Input(0, "x", "N-D full precision input tensor to be quantized.", "T1")
|
||||
.Output(0, "y", "1-D, contiguous BFP data", "T2")
|
||||
.Output(1, "shape", "Shape of x", "T3")
|
||||
|
|
@ -254,19 +256,22 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
|
|||
}));
|
||||
|
||||
static const char* DequantizeBFP_ver1_doc = R"DOC(
|
||||
The BFP dequantization operator. It consumes the raw BFP data and some metadata such as the shape and strides of the original tensor and computes the dequantized tensor.)DOC";
|
||||
The BFP dequantization operator.
|
||||
It consumes the raw BFP data and some metadata such as the shape and strides of the original tensor and computes the dequantized tensor.
|
||||
More documentation on the BFP format can be found in this paper: https://www.microsoft.com/en-us/research/publication/pushing-the-limits-of-narrow-precision-inferencing-at-cloud-scale-with-microsoft-floating-point/)DOC";
|
||||
|
||||
ONNX_MS_OPERATOR_SET_SCHEMA(
|
||||
DequantizeBFP, 1,
|
||||
OpSchema()
|
||||
.Attr("bfp_type", "The type of BFP - must match with the BFPType enum", AttributeProto::INT)
|
||||
.Attr("block_dims",
|
||||
"Numbers within a bounding box will span across these dimensions."
|
||||
"Any dimension not in this list is the same for all numbers within a bounding box."
|
||||
"As an example, consider a 2D tensor with shape [d0, d1] and block_dims equal to [1]."
|
||||
"Within a bounding box, all elements will be within the same row but will be from different columnns."
|
||||
.Attr("block_dim",
|
||||
"Each bounding box spans this dimension."
|
||||
"Typically, the block dimension corresponds to the reduction dimension of the matrix multipication that "
|
||||
"consumes the output of this operator."
|
||||
"For example, for a 2D matrix multiplication A@W, QuantizeBFP(A) would use block_dim 1 and "
|
||||
"QuantizeBFP(W) would use block_dim 0."
|
||||
"The default is the last dimension.",
|
||||
AttributeProto::INTS, std::vector<int64_t>{-1})
|
||||
AttributeProto::INT, static_cast<int64_t>(-1))
|
||||
.Attr("dtype", "The datatype to dequantize to.", AttributeProto::INT,
|
||||
static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT)) // default
|
||||
.Input(0, "x", "1-D, contiguous, raw, BFP data to be de-quantized.", "T1")
|
||||
|
|
@ -975,51 +980,61 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
|
|||
.TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float32 tensors.")
|
||||
.TypeAndShapeInferenceFunction(EmbedLayerNormalizationShapeInference));
|
||||
|
||||
ONNX_MS_OPERATOR_SET_SCHEMA(
|
||||
QuantizeWithOrder,
|
||||
1,
|
||||
OpSchema()
|
||||
.SetDoc(R"DOC(Quantize input matrix to specific layout used in cublaslt.)DOC")
|
||||
.Attr("order_input",
|
||||
"cublasLt order of input matrix. ORDER_COL = 0, ORDER_ROW = 1, ORDER_COL32 = 2, ORDER_COL4_4R2_8C = 3, ORDER_COL32_2R_4R4 = 4. "
|
||||
"Please refer https://docs.nvidia.com/cuda/cublas/index.html#cublasLtOrder_t for their meaning.",
|
||||
AttributeProto::INT)
|
||||
.Attr("order_output", "cublasLt order of output matrix.", AttributeProto::INT)
|
||||
.Input(0, "input", "TODO: input tensor of (ROWS, COLS). if less than 2d, will broadcast to (1, X). If 3d, it is treated as (B, ROWS, COS)", "F")
|
||||
.Input(1, "scale_input", "scale of the input", "S")
|
||||
.Output(0, "output", "output tensor", "Q")
|
||||
.TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
|
||||
.TypeConstraint("F", {"tensor(float16)", "tensor(float)"}, "Constrain to float types")
|
||||
.TypeConstraint("S", {"tensor(float)"}, "Constrain Scale to float32 types")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto::INT8, 0);
|
||||
if (!hasInputShape(ctx, 0)) return;
|
||||
auto& input_shape = getInputShape(ctx, 0);
|
||||
updateOutputShape(ctx, 0, input_shape);
|
||||
}));
|
||||
ONNX_MS_OPERATOR_SET_SCHEMA(
|
||||
QuantizeWithOrder, 1,
|
||||
OpSchema()
|
||||
.SetDoc(R"DOC(Quantize input matrix to specific layout used in cublaslt.)DOC")
|
||||
.Attr("order_input",
|
||||
"cublasLt order of input matrix. ORDER_COL = 0, ORDER_ROW = 1, ORDER_COL32 = 2, ORDER_COL4_4R2_8C = 3, "
|
||||
"ORDER_COL32_2R_4R4 = 4. "
|
||||
"Please refer https://docs.nvidia.com/cuda/cublas/index.html#cublasLtOrder_t for their meaning.",
|
||||
AttributeProto::INT)
|
||||
.Attr("order_output", "cublasLt order of output matrix.", AttributeProto::INT)
|
||||
.Input(0, "input",
|
||||
"TODO: input tensor of (ROWS, COLS). if less than 2d, will broadcast to (1, X). If 3d, it is treated as "
|
||||
"(B, ROWS, COS)",
|
||||
"F")
|
||||
.Input(1, "scale_input", "scale of the input", "S")
|
||||
.Output(0, "output", "output tensor", "Q")
|
||||
.TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
|
||||
.TypeConstraint("F", {"tensor(float16)", "tensor(float)"}, "Constrain to float types")
|
||||
.TypeConstraint("S", {"tensor(float)"}, "Constrain Scale to float32 types")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto::INT8, 0);
|
||||
if (!hasInputShape(ctx, 0)) return;
|
||||
auto& input_shape = getInputShape(ctx, 0);
|
||||
updateOutputShape(ctx, 0, input_shape);
|
||||
}));
|
||||
|
||||
ONNX_MS_OPERATOR_SET_SCHEMA(
|
||||
DequantizeWithOrder,
|
||||
1,
|
||||
OpSchema()
|
||||
.SetDoc(R"DOC(Dequantize input matrix to specific layout used in cublaslt. attr to specify output type, float16 or float32)DOC")
|
||||
.Attr("order_input", "cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.", AttributeProto::INT)
|
||||
.Attr("order_output", "cublasLt order of output matrix", AttributeProto::INT)
|
||||
.Attr("to", "The output data type, only support TensorProto_DataType_FLOAT (1) and TensorProto_DataType_FLOAT16 (10)", AttributeProto::INT)
|
||||
.Input(0, "input", "TODO: input tensor of (ROWS, COLS). if less than 2d, will broadcast to (1, X). If 3d, it is treated as (B, ROWS, COS)", "Q")
|
||||
.Input(1, "scale_input", "scale of the input", "S")
|
||||
.Output(0, "output", "output tensor", "F")
|
||||
.TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
|
||||
.TypeConstraint("F", {"tensor(float16)", "tensor(float)"}, "Constrain to float types")
|
||||
.TypeConstraint("S", {"tensor(float)"}, "Constrain Scale to float32 types")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromAttributeToOutput(ctx, "to", 0);
|
||||
if (!hasInputShape(ctx, 0)) return;
|
||||
auto& input_shape = getInputShape(ctx, 0);
|
||||
updateOutputShape(ctx, 0, input_shape);
|
||||
}));
|
||||
ONNX_MS_OPERATOR_SET_SCHEMA(
|
||||
DequantizeWithOrder, 1,
|
||||
OpSchema()
|
||||
.SetDoc(
|
||||
R"DOC(Dequantize input matrix to specific layout used in cublaslt. attr to specify output type, float16 or float32)DOC")
|
||||
.Attr("order_input",
|
||||
"cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.",
|
||||
AttributeProto::INT)
|
||||
.Attr("order_output", "cublasLt order of output matrix", AttributeProto::INT)
|
||||
.Attr("to",
|
||||
"The output data type, only support TensorProto_DataType_FLOAT (1) and TensorProto_DataType_FLOAT16 (10)",
|
||||
AttributeProto::INT)
|
||||
.Input(0, "input",
|
||||
"TODO: input tensor of (ROWS, COLS). if less than 2d, will broadcast to (1, X). If 3d, it is treated as "
|
||||
"(B, ROWS, COS)",
|
||||
"Q")
|
||||
.Input(1, "scale_input", "scale of the input", "S")
|
||||
.Output(0, "output", "output tensor", "F")
|
||||
.TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
|
||||
.TypeConstraint("F", {"tensor(float16)", "tensor(float)"}, "Constrain to float types")
|
||||
.TypeConstraint("S", {"tensor(float)"}, "Constrain Scale to float32 types")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromAttributeToOutput(ctx, "to", 0);
|
||||
if (!hasInputShape(ctx, 0)) return;
|
||||
auto& input_shape = getInputShape(ctx, 0);
|
||||
updateOutputShape(ctx, 0, input_shape);
|
||||
}));
|
||||
|
||||
constexpr const char* QOrderedMatMul_ver1_doc = R"DOC(
|
||||
constexpr const char* QOrderedMatMul_ver1_doc = R"DOC(
|
||||
Quantize (Int8) MatMul with order. Implement Y = alpha * A * B + bias + beta * C. Matrix A, B, C, Y are all int8 matrix.
|
||||
Two type of order combination supported:
|
||||
*) When order_B is ORDER_COL, order_A must be ORDER_ROW.
|
||||
|
|
@ -1032,31 +1047,32 @@ order_Y and order_C will be same as order_A.
|
|||
Support per column quantized weight, ie, scale_B is 1-D vector of size [#cols of matrix B].
|
||||
)DOC";
|
||||
|
||||
// Schema registration for the com.microsoft contrib operator QOrderedMatMul, version 1.
// The operator documentation comes from QOrderedMatMul_ver1_doc.
//
// The schema is registered exactly once; registering the same (name, version) pair a
// second time would define the same schema twice.
//
// Inputs 0..4 are required (A, scale_A, B, scale_B, scale_Y); inputs 5..7 (bias, C, scale_C)
// are optional. All matrices use constraint Q (int8); bias and scales use S (float32).
ONNX_MS_OPERATOR_SET_SCHEMA(
    QOrderedMatMul, 1,
    OpSchema()
        .SetDoc(QOrderedMatMul_ver1_doc)
        .Attr("order_A", "cublasLt order of matrix A. See the schema of QuantizeWithOrder for order definition.",
              AttributeProto::INT)
        .Attr("order_B", "cublasLt order of matrix B", AttributeProto::INT)
        .Attr("order_Y", "cublasLt order of matrix Y and optional matrix C", AttributeProto::INT)
        .Input(0, "A", "3-dimensional matrix A", "Q")
        .Input(1, "scale_A", "scale of the input A.", "S")
        .Input(2, "B", "2-dimensional matrix B. Transposed if order_B is ORDER_COL.", "Q")
        .Input(3, "scale_B", "scale of the input B. Scalar or 1-D float32.", "S")
        .Input(4, "scale_Y", "scale of the output Y.", "S")
        .Input(5, "bias", "1d bias, not scaled with scale_Y.", "S", OpSchema::Optional)
        .Input(6, "C", "3d or 2d matrix C. if 2d expand to 3d first. Shape[0] should be 1 or same as A.shape[0] ", "Q",
               OpSchema::Optional)
        // scale_C belongs to input C (previously described as the scale of A).
        .Input(7, "scale_C", "scale of the input C.", "S", OpSchema::Optional)
        .Output(0, "Y", "Matrix multiply results from A * B", "Q")
        .TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
        .TypeConstraint("S", {"tensor(float)"}, "Constrain bias and scales to float32")
        .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
          // The output element type follows input 0 (A).
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          // Shape inference uses inputs 0 (A) and 2 (B); input 1 is scale_A, not a matrix.
          ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 2);
        }));
|
||||
|
||||
static const char* Attention_QOrdered_doc = R"DOC(
|
||||
static const char* Attention_QOrdered_doc = R"DOC(
|
||||
Quantized version of simplified Multi-Head Self Attention(using int8 with specific matrix Layout).
|
||||
Multi-Head Self Attention that can be either unidirectional (like GPT-2) or bidirectional (like BERT).
|
||||
The mask_index input is optional. Besides raw attention mask with shape (batch_size, past_sequence_length + sequence_length)
|
||||
|
|
@ -1070,128 +1086,159 @@ Current version does not support past/present, extra_add and qkv_hidden_sizes.
|
|||
TODO: Support them if needed in the future.
|
||||
)DOC";
|
||||
|
||||
// Schema registration for the com.microsoft contrib operator QOrderedAttention, version 1.
// The operator documentation comes from Attention_QOrdered_doc.
//
// The schema is registered exactly once; registering the same (name, version) pair a
// second time would define the same schema twice.
//
// Inputs 0..13 and 16 are required; inputs 14, 15, 17, 18 and 19 are optional.
// Type and shape of the output are propagated from the first input.
ONNX_MS_OPERATOR_SET_SCHEMA(
    QOrderedAttention, 1,
    OpSchema()
        .SetDoc(Attention_QOrdered_doc)
        .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
        .Attr("unidirectional", "Whether every token can only attend to previous tokens. Default value is 0.",
              AttributeProto::INT, static_cast<int64_t>(0))
        .Attr("qkv_hidden_sizes", "Hidden layer sizes of Q, K, V paths in Attention", AttributeProto::INTS,
              OPTIONAL_VALUE)
        .Attr("order_input",
              "cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.",
              AttributeProto::INT)
        .Attr("order_weight", "cublasLt order of weight matrix", AttributeProto::INT)
        // This operator has no "global bias" input; the attribute describes the output.
        .Attr("order_output", "cublasLt order of output matrix", AttributeProto::INT)
        .Input(0, "input", "3D input tensor with shape (batch_size, sequence_length, input_hidden_size)", "Q")
        .Input(1, "scale_input", "scale of the input, scalar value (per tensor) currently.", "S")
        .Input(2, "scale_Q_gemm", "scale of the gemm - scalar (per-tensor quantization)", "S")
        .Input(3, "scale_K_gemm", "scale of the gemm - scalar (per-tensor quantization)", "S")
        .Input(4, "scale_V_gemm", "scale of the gemm - scalar (per-tensor quantization)", "S")
        .Input(5, "Q_weight",
               "2D input tensor with shape (input_hidden_size, hidden_size), where hidden_size = num_heads * head_size",
               "Q")
        .Input(6, "K_weight",
               "2D input tensor with shape (input_hidden_size, hidden_size), where hidden_size = num_heads * head_size",
               "Q")
        .Input(7, "V_weight",
               "2D input tensor with shape (input_hidden_size, hidden_size), where hidden_size = num_heads * head_size",
               "Q")
        .Input(8, "scale_Q_weight",
               "scale of the weight (scalar for per-tensor quantization or 1-D of dims [hidden_size] for per-channel "
               "quantization)",
               "S")
        .Input(9, "scale_K_weight",
               "scale of the weight (scalar for per-tensor quantization or 1-D of dims [hidden_size] for per-channel "
               "quantization)",
               "S")
        .Input(10, "scale_V_weight",
               "scale of the weight (scalar for per-tensor quantization or 1-D of dims [hidden_size] for per-channel "
               "quantization)",
               "S")
        .Input(11, "Q_bias", "1D input tensor with shape (hidden_size)", "S")
        .Input(12, "K_bias", "1D input tensor with shape (hidden_size)", "S")
        .Input(13, "V_bias", "1D input tensor with shape (hidden_size)", "S")
        .Input(14, "scale_QKT_gemm", "scale of the gemm - scalar (per-tensor quantization)", "S", OpSchema::Optional)
        .Input(15, "scale_QKT_softmax", "scale of the softmax result - scalar (per-tensor quantization)", "S",
               OpSchema::Optional)
        .Input(16, "scale_values_gemm",
               "scale of the gemm - scalar (per-tensor quantization). Also this is the output scale for the operator.",
               "S")
        // Adjacent literals are concatenated; the trailing space before "or" keeps the
        // rendered text from reading "...sequence_length)or (batch_size...".
        .Input(17, "mask_index",
               "Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, "
               "past_sequence_length + sequence_length) "
               "or (batch_size, sequence_length, past_sequence_length + sequence_length), or index with shape "
               "(batch_size) or (2 * batch_size).",
               "G", OpSchema::Optional)
        .Input(18, "past",
               "past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size).",
               "Q", OpSchema::Optional)
        .Input(19, "extra_add",
               "additional add to QxK' with shape (batch_size, num_heads, sequence_length, sequence_length).", "S",
               OpSchema::Optional)
        .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, hidden_size)", "Q")
        .TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
        .TypeConstraint("S", {"tensor(float)"}, "Constrain scales to float32 tensors.")
        .TypeConstraint("G", {"tensor(int32)"}, "Constrain to integer types")
        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
|
||||
|
||||
// Schema registration for the com.microsoft contrib operator QOrderedLayerNormalization,
// version 1.
//
// The schema is registered exactly once; registering the same (name, version) pair a
// second time would define the same schema twice.
//
// Inputs : X (int8), scale_X (float32), scale/gamma (float16 or float), optional bias B
//          (float16 or float), scale_Y (float32).
// Output : Y (int8).
ONNX_MS_OPERATOR_SET_SCHEMA(QOrderedLayerNormalization, 1,
                            OpSchema()
                                .SetDoc("QOrderedLayerNormalization")
                                .Attr("axis",
                                      "The first normalization dimension: normalization "
                                      "will be performed along dimensions axis "
                                      ": rank(inputs).",
                                      AttributeProto::INT, static_cast<int64_t>(-1))
                                .Attr("epsilon", "The epsilon value to use to avoid division by zero.",
                                      AttributeProto::FLOAT, 1e-5f)
                                .Attr("order_X",
                                      "cublasLt order of input X. Default is ROW MAJOR. See the schema of "
                                      "QuantizeWithOrder for order definition.",
                                      AttributeProto::INT, static_cast<int64_t>(1))
                                .Attr("order_Y",
                                      "cublasLt order of matrix Y, must be same as order_X. Default is ROW MAJOR.",
                                      AttributeProto::INT, static_cast<int64_t>(1))
                                .AllowUncheckedAttributes()
                                .Input(0, "X", "Input data tensor from the previous layer.", "Q")
                                .Input(1, "scale_X", "scale of the quantized X", "S")
                                .Input(2, "scale", "Scale tensor, i.e., gamma vector.", "F")
                                .Input(3, "B", "Bias tensor.", "F", OpSchema::Optional)
                                // scale_Y is the scale of the output Y (previously described as X).
                                .Input(4, "scale_Y", "scale of the quantized Y", "S")
                                .Output(0, "Y", "Output data tensor.", "Q")
                                .TypeConstraint("F", {"tensor(float16)", "tensor(float)"},
                                                "Constrain input gamma and bias could be float16/float tensors. "
                                                "float may get better precision, float16 runs faster.")
                                .TypeConstraint("S", {"tensor(float)"}, "quantization scale must be float tensors.")
                                .TypeConstraint("Q", {"tensor(int8)"}, "quantization tensor must be int8 tensors.")
                                .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
                                  propagateShapeAndTypeFromFirstInput(ctx);
                                  // NOTE(review): the helper above presumably already propagates the
                                  // element type of input 0, which would make this call redundant —
                                  // confirm against the ONNX shape inference helpers before removing.
                                  propagateElemTypeFromInputToOutput(ctx, 0, 0);
                                }));
|
||||
|
||||
// Schema registration for the com.microsoft contrib operator QOrderedGelu, version 1.
//
// The schema is registered exactly once; registering the same (name, version) pair a
// second time would define the same schema twice.
//
// Inputs : X (int8), scale_X (float32), scale_Y (float32). Output: Y (int8).
// Both order attributes are optional. Type and shape of Y are propagated from the first input.
ONNX_MS_OPERATOR_SET_SCHEMA(
    QOrderedGelu, 1,
    OpSchema()
        .SetDoc(R"DOC(Ordered Quantize Gelu.)DOC")
        .Attr("order_X",
              "cublasLt order of input X. Optional. See the schema of QuantizeWithOrder for order definition.",
              AttributeProto::INT, OPTIONAL_VALUE)
        .Attr("order_Y", "cublasLt order of matrix Y, must be same as order_X if specified together. Optional.",
              AttributeProto::INT, OPTIONAL_VALUE)
        // The descriptions name the input "X" to match the declared input names
        // (they previously referred to a non-existent "A").
        .Input(0, "X", "N-dimensional input X", "Q")
        .Input(1, "scale_X", "scale of the input X", "S")
        .Input(2, "scale_Y", "scale of the output Y", "S")
        .Output(0, "Y", "Output of the Gelu", "Q")
        .TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
        .TypeConstraint("S", {"tensor(float)"}, "Constrain scales to float32")
        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
|
||||
|
||||
// Schema registration for the com.microsoft contrib operator QOrderedLongformerAttention,
// version 1.
//
// The schema is registered exactly once; registering the same (name, version) pair a
// second time would define the same schema twice.
//
// All 14 inputs are required. Quantized tensors use constraint Q (int8), scales and biases
// use S (float32), the attention mask uses F (float16) and the global flags use G (int32).
// Type and shape of the output are propagated from the first input.
ONNX_MS_OPERATOR_SET_SCHEMA(
    QOrderedLongformerAttention, 1,
    OpSchema()
        .SetDoc(R"DOC(Quantized version of Longformer Self Attention (using int8 with specific matrix Layout).)DOC")
        .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
        .Attr("window", "One sided attention windows length W, or half of total window length", AttributeProto::INT)
        .Attr("order_input",
              "cublasLt order of input matrix. See the schema of QuantizeWithOrder for order definition.",
              AttributeProto::INT)
        .Attr("order_weight", "cublasLt order of weight matrix", AttributeProto::INT)
        // The two descriptions below were copy-pasted from neighbouring attributes; they now
        // describe the attribute they are attached to.
        .Attr("order_global_weight", "cublasLt order of global weight matrix", AttributeProto::INT)
        .Attr("order_output", "cublasLt order of output matrix", AttributeProto::INT)
        .Input(0, "input",
               "3D input tensor with shape (batch_size, sequence_length, hidden_size), hidden_size = num_heads * "
               "head_size",
               "Q")
        .Input(1, "scale_input", "scale of the input", "S")
        .Input(2, "weight", "2D input tensor with shape (hidden_size, 3 * hidden_size)", "Q")
        .Input(3, "scale_weight", "scale of the weight", "S")
        .Input(4, "bias", "1D input tensor with shape (3 * hidden_size), fp32 only currently.", "S")
        .Input(5, "scale_bias", "reserved. (not used as add bias need float value in cublasLt for normal order.)", "S")
        .Input(6, "scale_qkv_gemm", "scale of the output for fused kqv gemm", "S")
        .Input(7, "mask", "Attention mask with shape (batch_size, sequence_length)", "F")
        .Input(8, "global_weight", "2D input tensor with shape (hidden_size, 3 * hidden_size)", "Q")
        .Input(9, "scale_global_weight", "scale of the global_weight", "S")
        .Input(10, "global_bias", "1D input tensor with shape (3 * hidden_size)", "S")
        .Input(11, "scale_global_gemm", "scale of the global_qkv_gemm", "S")
        .Input(12, "global", "Global attention flags with shape (batch_size, sequence_length)", "G")
        .Input(13, "scale_output", "scale of the output", "S")
        .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, hidden_size)", "Q")
        .TypeConstraint("Q", {"tensor(int8)"}, "Constrain input and output types to int8 tensors.")
        .TypeConstraint("S", {"tensor(float)"}, "Constrain scales to float32 tensors.")
        .TypeConstraint("G", {"tensor(int32)"}, "Constrain to integer types")
        .TypeConstraint("F", {"tensor(float16)"}, "Be compatible with float version.")
        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -34,11 +34,11 @@ TEST(QuantizeBFPTest, CreateQuantizeGraph) {
|
|||
bfp_type.set_i(static_cast<int64_t>(onnxruntime::contrib::BFPType::BFP_1_8_8_16));
|
||||
bfp_type.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
|
||||
attributes["bfp_type"] = bfp_type;
|
||||
ONNX_NAMESPACE::AttributeProto block_dims;
|
||||
block_dims.set_name("block_dims");
|
||||
block_dims.add_ints(1); // bounding box is over dimension 1
|
||||
block_dims.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INTS);
|
||||
attributes["block_dims"] = block_dims;
|
||||
ONNX_NAMESPACE::AttributeProto block_dim;
|
||||
block_dim.set_name("block_dim");
|
||||
block_dim.set_i(1); // bounding box is over dimension 1
|
||||
block_dim.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
|
||||
attributes["block_dim"] = block_dim;
|
||||
|
||||
std::vector<onnxruntime::NodeArg*> output_defs;
|
||||
ONNX_NAMESPACE::TypeProto y_byte;
|
||||
|
|
@ -91,11 +91,11 @@ TEST(DequantizeBFPTest, CreateDequantizeGraph) {
|
|||
bfp_type.set_i(static_cast<int64_t>(onnxruntime::contrib::BFPType::BFP_1_8_8_16));
|
||||
bfp_type.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
|
||||
attributes["bfp_type"] = bfp_type;
|
||||
ONNX_NAMESPACE::AttributeProto block_dims;
|
||||
block_dims.set_name("block_dims");
|
||||
block_dims.add_ints(1); // bounding box is over dimension 1
|
||||
block_dims.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INTS);
|
||||
attributes["block_dims"] = block_dims;
|
||||
ONNX_NAMESPACE::AttributeProto block_dim;
|
||||
block_dim.set_name("block_dim");
|
||||
block_dim.set_i(1); // bounding box is over dimension 1
|
||||
block_dim.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
|
||||
attributes["block_dim"] = block_dim;
|
||||
ONNX_NAMESPACE::AttributeProto dtype;
|
||||
dtype.set_name("dtype");
|
||||
dtype.set_i(static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT));
|
||||
|
|
|
|||
Loading…
Reference in a new issue