mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-29 03:30:52 +00:00
### Description <!-- Describe your changes. --> BUG #22031 In the demucs model, there are many MatMul ops with shapes like the following: `input[0]: [3448,1,512] | float32, input[1]: [512,1536] | float32, output[0]: [3448,1,1536] | float32` For shapes of this kind, the batch size is large but M = 1. Our current algorithm partitions tiles based on [M, N], which is inefficient for such shapes. This PR reshapes the inputs to improve MatMul performance. Before: [3448,1,512] x [512,1536] = [3448,1,1536] After: [1, 3448, 512] x [512, 1536] = [1, 3448, 1536], and the output is then reshaped back to [3448, 1, 1536]. The overall MatMul time in the demucs model drops from 4418.17 ms to 1778.45 ms on my iGPUs. --------- Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com> |
||
|---|---|---|
| .. | ||
| _example.jsonc | ||
| abs-int32.jsonc | ||
| abs.jsonc | ||
| absr.jsonc | ||
| abss.jsonc | ||
| acos.jsonc | ||
| add.jsonc | ||
| add_int32.jsonc | ||
| add_zero-sized.jsonc | ||
| and.jsonc | ||
| asin.jsonc | ||
| attention.jsonc | ||
| batch-norm.jsonc | ||
| bias-add.jsonc | ||
| bias-split-gelu.jsonc | ||
| cast.jsonc | ||
| ceil.jsonc | ||
| clip.jsonc | ||
| concat.jsonc | ||
| concat_int32.jsonc | ||
| concat_zero-sized.jsonc | ||
| conv-transpose.jsonc | ||
| conv.jsonc | ||
| conv1d.jsonc | ||
| conv3dncdhw.jsonc | ||
| cos.jsonc | ||
| cumsum.jsonc | ||
| depth-to-space.jsonc | ||
| dequantize-linear-int4.jsonc | ||
| dequantizelinear.jsonc | ||
| div.jsonc | ||
| div_int32.jsonc | ||
| einsum.jsonc | ||
| equal.jsonc | ||
| exp.jsonc | ||
| expand.jsonc | ||
| fast-gelu.jsonc | ||
| floor.jsonc | ||
| fused-conv.jsonc | ||
| fused-conv3dncdhw.jsonc | ||
| gather-block-quantized.jsonc | ||
| gather-elements.jsonc | ||
| gather.jsonc | ||
| gelu.jsonc | ||
| gemm.jsonc | ||
| global-average-pool.jsonc | ||
| greater.jsonc | ||
| group-query-attention.jsonc | ||
| identity.jsonc | ||
| image-scaler.jsonc | ||
| instance-norm.jsonc | ||
| layer-norm.jsonc | ||
| leaky-relu.jsonc | ||
| less.jsonc | ||
| log.jsonc | ||
| matmul-broadcast.jsonc | ||
| matmul.jsonc | ||
| matmulnbits.jsonc | ||
| max-pool.jsonc | ||
| mul.jsonc | ||
| mul_int32.jsonc | ||
| multihead-attention.jsonc | ||
| neg-int32.jsonc | ||
| neg.jsonc | ||
| not.jsonc | ||
| or.jsonc | ||
| pad-big.jsonc | ||
| pad.jsonc | ||
| pad_f16.jsonc | ||
| pow-big-number.jsonc | ||
| pow.jsonc | ||
| pow_int32.jsonc | ||
| quick-gelu.jsonc | ||
| reduce-min.jsonc | ||
| relu.jsonc | ||
| reshape-int32.jsonc | ||
| reshape-pack.jsonc | ||
| reshape.jsonc | ||
| resize-pack.jsonc | ||
| resize.jsonc | ||
| rotary-embedding.jsonc | ||
| shape.jsonc | ||
| simplified-layer-norm.jsonc | ||
| sin.jsonc | ||
| skip-layer-norm.jsonc | ||
| skip-simplified-layer-norm.jsonc | ||
| slice.jsonc | ||
| softmax.jsonc | ||
| split.jsonc | ||
| sqrt.jsonc | ||
| sub.jsonc | ||
| sub_int32.jsonc | ||
| tan.jsonc | ||
| tanh.jsonc | ||
| tile.jsonc | ||
| transpose.jsonc | ||
| transpose_int32_uint32.jsonc | ||
| upsample.jsonc | ||
| where.jsonc | ||
| where_broadcast.jsonc | ||
| xor.jsonc | ||