From d981b153d32534c6f06e5dea0df33d1b1a707cf3 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 6 Feb 2025 20:26:25 +0200
Subject: [PATCH] [webgpu/js] Optimize resize webgpu op & fix precision issues
 (#23591)

### Description
<!-- Describe your changes. -->

This PR is a follow-up to
https://github.com/microsoft/onnxruntime/pull/23488 and partially
improves upon https://github.com/microsoft/onnxruntime/issues/23403. It
does the following:
- Prevents unnecessary shader recompilation (shader-cache misses) for the
'nearest' resize operation.
- Fixes precision (off-by-one) errors with the asymmetric coordinate
transform. When running the Kokoro TTS model, the values of
`/decoder/decoder/generator/f0_upsamp/Resize_output_0` differ at the end
bounds due to precision issues when dividing 21600 by 72 (this should be
300, but seemingly results in 299.999, which causes issues when
flooring).

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

I did a deep dive over the weekend to try to fix Kokoro TTS on WebGPU and
found that the above node had a large difference. Thinking this was a
major issue, I spent some time fixing it. It turns out that it only
happens for a small number of values, leading to a high maximum error,
but most values are correct (as seen below).

BEFORE:
```
[/decoder/decoder/generator/f0_upsamp/Resize_output_0] atol: 78.6640682220459 | rtol: 24.13991587587724 | avgDiff: 0.009967932171121087 | medianDiff: 0.000030517578125
```

AFTER:
```
[/decoder/decoder/generator/f0_upsamp/Resize_output_0] atol: 0.0011138916015625 | rtol: 0.0020059924232260704 | avgDiff: 0.00008570214675873825 | medianDiff: 0.000030517578125
```

So, although the fix has a very small impact on the final output
(waveform) of this model, the bug could manifest more severely in other
models.

BEFORE:
```
[waveform] atol: 0.04784199967980385 | rtol: 1366.0462001093495 | avgDiff: 0.0009544936942737713 | medianDiff: 0.00015346752479672432
```

AFTER:
```
[waveform] atol: 0.04775865003466606 | rtol: 1354.7002460360852 | avgDiff: 0.000954830244055033 | medianDiff: 0.00015274062752723694
```
---
 js/web/lib/wasm/jsep/webgpu/ops/resize.ts | 36 +++++++++++++++--------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts
index edc0ea12b6..a0abfc0270 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts
@@ -157,6 +157,16 @@ const validateInputs = (
   }
 };
 
+const getSafeIntegerDivision = (a: string, b: string, c: string, dType: string): string => `
+  // The whole part and the fractional part are calculated separately due to inaccuracy of floating
+  // point division. As an example, f32(21) / f32(7) may evaluate to 2.99... instead of 3, causing an
+  // offset-by-one error later in floor().
+  let big = (${a}) * (${b});
+  let whole = ${dType}(big / (${c}));
+  let fract = ${dType}(big % (${c})) / ${dType}(${c});
+  return whole + fract;
+`;
+
 const getOriginalCoordinateFromResizedCoordinate = (
   coordinateTransferMode: CoordinateTransformMode,
   dType: string,
@@ -166,7 +176,13 @@ const getOriginalCoordinateFromResizedCoordinate = (
   (() => {
     switch (coordinateTransferMode) {
       case 'asymmetric':
-        return `return ${dType}(xResized) / ${dType}(xScale);`;
+        return `
+          if (xScale < 1.0 || floor(xScale) != xScale) {
+            return ${dType}(xResized) / ${dType}(xScale);
+          } else {
+            ${getSafeIntegerDivision('xResized', 'lengthOriginal', 'lengthResized', dType)}
+          }
+        `;
       case 'pytorch_half_pixel':
         return `if (lengthResized > 1) {
                     return (${dType}(xResized) + 0.5) / ${dType}(xScale) - 0.5;
@@ -179,13 +195,7 @@ const getOriginalCoordinateFromResizedCoordinate = (
         return `if (lengthResized == 1) {
                     return 0.0;
                   } else {
-                    // The whole part and the fractional part are calculated separately due to inaccuracy of floating
-                    // point division. As an example, f32(21) / f32(7) may evaluate to 2.99... instead of 3, causing an
-                    // offset-by-one error later in floor().
-                    let whole = ${dType}(xResized * (lengthOriginal - 1) / (lengthResized - 1));
-                    let fract =
-                        ${dType}(xResized * (lengthOriginal - 1) % (lengthResized - 1)) / ${dType}(lengthResized - 1);
-                    return whole + fract;
+                    ${getSafeIntegerDivision('xResized', 'lengthOriginal - 1', 'lengthResized - 1', dType)}
                   }`;
       case 'tf_crop_and_resize':
         return `if (lengthResized > 1) {
@@ -375,7 +385,7 @@ const calculateInputIndicesFromOutputIndices = (
             input_index = u32(original_idx);
           }
         }
-        ${input.indicesSet('input_indices', 'i', ' input_index')}
+        ${input.indicesSet('input_indices', 'i', 'input_index')}
       }
       return input_indices;
     }`;
@@ -758,9 +768,11 @@ const createResizeProgramInfo = (
   return {
     name: 'Resize',
     shaderCache: {
-      hint: `${attributes.cacheKey}|${opsetVersion}|${scales.length > 0 ? scales : ''}|${
-        sizes.length > 0 ? sizes : ''
-      }|${roi.length > 0 ? roi : ''}|${noScale}|${inputShape}`,
+      hint: `${attributes.cacheKey}|${opsetVersion}|${
+        scales.length > 0 ? (attributes.mode === 'cubic' ? scales : scales.length) : ''
+      }|${sizes.length > 0 ? sizes : ''}|${roi.length > 0 ? roi : ''}|${noScale}|${
+        attributes.mode === 'nearest' ? inputShape.length : inputShape
+      }`,
       inputDependencies: ['rank'],
     },
     getShaderSource,