aom: Optimize multiply_and_scale() by precomputation (fd059)

From fd059efcfecfac7be90b3aed3c2159a38fa3cd5d Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <[EMAIL REDACTED]>
Date: Thu, 29 Feb 2024 14:23:42 -0800
Subject: [PATCH] Optimize multiply_and_scale() by precomputation

multiply_and_scale() is called in nested loops. Precompute the
components of w (w1 and w2) and pass the components of w to
multiply_and_scale().

Bug: b:319140742
Bug: oss-fuzz:66474
Change-Id: I1ecf42dd4a7a552739136cbfc21f16aaba8f4523
(cherry picked from commit 9ae3ff78826cb7ed785ebe999acefc3426d1820f)
---
 av1/encoder/pickrst.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 6753c9edc..369529a4f 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -1103,13 +1103,26 @@ static INLINE int wrap_index(int i, int wiener_win) {
   return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i);
 }
 
-// Calculates x * w / WIENER_TAP_SCALE_FACTOR. The multiplication may overflow,
-// so we do the multiplication by components and combine it with the division.
-static INLINE int64_t multiply_and_scale(int64_t x, int32_t w) {
-  // Let w = w1 * WIENER_TAP_SCALE_FACTOR + w2
-  const int32_t w1 = w / WIENER_TAP_SCALE_FACTOR;
-  const int32_t w2 = w - w1 * WIENER_TAP_SCALE_FACTOR;
+// Splits each w[i] into smaller components w1[i] and w2[i] such that
+// w[i] = w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i].
+static INLINE void split_wiener_filter_coefficients(int wiener_win,
+                                                    const int32_t *w,
+                                                    int32_t *w1, int32_t *w2) {
+  for (int i = 0; i < wiener_win; i++) {
+    w1[i] = w[i] / WIENER_TAP_SCALE_FACTOR;
+    w2[i] = w[i] - w1[i] * WIENER_TAP_SCALE_FACTOR;
+    assert(w[i] == w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i]);
+  }
+}
+
+// Calculates x * w / WIENER_TAP_SCALE_FACTOR, where
+// w = w1 * WIENER_TAP_SCALE_FACTOR + w2.
+//
+// The multiplication x * w may overflow, so we multiply x by the components of
+// w (w1 and w2) and combine the multiplication with the division.
+static INLINE int64_t multiply_and_scale(int64_t x, int32_t w1, int32_t w2) {
   // Let y = x * w / WIENER_TAP_SCALE_FACTOR
+  //       = x * (w1 * WIENER_TAP_SCALE_FACTOR + w2) / WIENER_TAP_SCALE_FACTOR
   const int64_t y = x * w1 + x * w2 / WIENER_TAP_SCALE_FACTOR;
   return y;
 }
@@ -1190,6 +1203,7 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
   int i, j;
   int64_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+  int32_t b1[WIENER_WIN], b2[WIENER_WIN];
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
   memset(A, 0, sizeof(A));
@@ -1200,6 +1214,7 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
       A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR;
     }
   }
+  split_wiener_filter_coefficients(wiener_win, b, b1, b2);
 
   for (i = 0; i < wiener_win; i++) {
     for (j = 0; j < wiener_win; j++) {
@@ -1217,7 +1232,8 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
           // multiplication with the last division.
           const int64_t x = Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
                             WIENER_TAP_SCALE_FACTOR;
-          B[ll * wiener_halfwin1 + kk] += multiply_and_scale(x, b[j]);
+          // b[j] = b1[j] * WIENER_TAP_SCALE_FACTOR + b2[j]
+          B[ll * wiener_halfwin1 + kk] += multiply_and_scale(x, b1[j], b2[j]);
         }
       }
     }
@@ -1257,6 +1273,7 @@ static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
   int i, j;
   int64_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+  int32_t a1[WIENER_WIN], a2[WIENER_WIN];
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
   memset(A, 0, sizeof(A));
@@ -1267,6 +1284,7 @@ static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
       A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR;
     }
   }
+  split_wiener_filter_coefficients(wiener_win, a, a1, a2);
 
   for (i = 0; i < wiener_win; i++) {
     const int ii = wrap_index(i, wiener_win);
@@ -1284,7 +1302,8 @@ static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
           // multiplication with the last division.
           const int64_t x = Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
                             WIENER_TAP_SCALE_FACTOR;
-          B[jj * wiener_halfwin1 + ii] += multiply_and_scale(x, a[l]);
+          // a[l] = a1[l] * WIENER_TAP_SCALE_FACTOR + a2[l]
+          B[jj * wiener_halfwin1 + ii] += multiply_and_scale(x, a1[l], a2[l]);
         }
       }
     }