mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-24 22:17:32 +00:00
BUG #23273 This PR makes the following optimizations: 1. When the number of output channels is one, 1) calculate the offset before the input-channel loop to reduce index-to-offset calculations, 2) split `inputChannelsPerGroup` into `inputChannelsPerGroupInt` and `inputChannelsRemainder` parts so that we can always access 4 data elements at a time for `inputChannelsPerGroupInt`. 2. Use a precise initial value to reduce useless loop iterations. Thanks to @jiangzhaoming for the suggestions on this. With this PR, ConvTranspose goes from 8.4s to 3.7s on Intel Meteor Lake. On NV RTX 2000 Ada, it goes from 2.7s to 1.6s. |
||
|---|---|---|
| .. | ||
| _example.jsonc | ||
| abs-int32.jsonc | ||
| abs.jsonc | ||
| absr.jsonc | ||
| abss.jsonc | ||
| acos.jsonc | ||
| add.jsonc | ||
| add_int32.jsonc | ||
| add_zero-sized.jsonc | ||
| and.jsonc | ||
| asin.jsonc | ||
| attention.jsonc | ||
| batch-norm.jsonc | ||
| bias-add.jsonc | ||
| bias-split-gelu.jsonc | ||
| cast.jsonc | ||
| ceil.jsonc | ||
| clip.jsonc | ||
| concat.jsonc | ||
| concat_int32.jsonc | ||
| concat_zero-sized.jsonc | ||
| conv-transpose.jsonc | ||
| conv.jsonc | ||
| conv1d.jsonc | ||
| conv3dncdhw.jsonc | ||
| cos.jsonc | ||
| cumsum.jsonc | ||
| depth-to-space.jsonc | ||
| dequantize-linear-int4.jsonc | ||
| dequantizelinear.jsonc | ||
| div.jsonc | ||
| div_int32.jsonc | ||
| einsum.jsonc | ||
| equal.jsonc | ||
| exp.jsonc | ||
| expand.jsonc | ||
| fast-gelu.jsonc | ||
| floor.jsonc | ||
| fused-conv.jsonc | ||
| fused-conv3dncdhw.jsonc | ||
| gather-block-quantized.jsonc | ||
| gather-elements.jsonc | ||
| gather-nd.jsonc | ||
| gather.jsonc | ||
| gelu.jsonc | ||
| gemm.jsonc | ||
| global-average-pool.jsonc | ||
| greater.jsonc | ||
| group-query-attention.jsonc | ||
| identity.jsonc | ||
| image-scaler.jsonc | ||
| instance-norm.jsonc | ||
| layer-norm.jsonc | ||
| leaky-relu.jsonc | ||
| less.jsonc | ||
| log.jsonc | ||
| matmul-broadcast.jsonc | ||
| matmul.jsonc | ||
| matmulnbits.jsonc | ||
| max-pool.jsonc | ||
| mul.jsonc | ||
| mul_int32.jsonc | ||
| multihead-attention.jsonc | ||
| neg-int32.jsonc | ||
| neg.jsonc | ||
| not.jsonc | ||
| or.jsonc | ||
| pad-big.jsonc | ||
| pad.jsonc | ||
| pad_f16.jsonc | ||
| pow-big-number.jsonc | ||
| pow.jsonc | ||
| pow_int32.jsonc | ||
| quick-gelu.jsonc | ||
| reduce-min.jsonc | ||
| relu.jsonc | ||
| reshape-int32.jsonc | ||
| reshape-pack.jsonc | ||
| reshape.jsonc | ||
| resize-pack.jsonc | ||
| resize.jsonc | ||
| rotary-embedding.jsonc | ||
| scatternd.jsonc | ||
| shape.jsonc | ||
| simplified-layer-norm.jsonc | ||
| sin.jsonc | ||
| skip-layer-norm.jsonc | ||
| skip-simplified-layer-norm.jsonc | ||
| slice.jsonc | ||
| softmax.jsonc | ||
| split.jsonc | ||
| sqrt.jsonc | ||
| sub.jsonc | ||
| sub_int32.jsonc | ||
| tan.jsonc | ||
| tanh.jsonc | ||
| tile.jsonc | ||
| transpose.jsonc | ||
| transpose_int32_uint32.jsonc | ||
| upsample.jsonc | ||
| where.jsonc | ||
| where_broadcast.jsonc | ||
| xor.jsonc | ||