aom: Fix accumulator value for Neon implementation of compute_stats

From 90defa923d158539c837810db0f1fa39e2838fd2 Mon Sep 17 00:00:00 2001
From: Salome Thirot <[EMAIL REDACTED]>
Date: Fri, 2 Aug 2024 14:13:41 +0100
Subject: [PATCH] Fix accumulator value for Neon implementation of
 compute_stats

Fix the value of the maximum number of pixels that can be computed in a
signed 32-bit integer. That value assumed accumulating in an unsigned
32-bit integer, which is wrong and could trigger an integer overflow for
10-bit and 12-bit.

This was not detected by the unit tests, but they will be amended in a
subsequent commit.

Change-Id: I0214792293ad03e3aa0d331c8320f416e5df8d81
---
 av1/encoder/arm/highbd_pickrst_neon.c | 16 ++++++++--------
 av1/encoder/arm/pickrst_neon.c        |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c
index 60beca2dc0..cfc5e0c7e8 100644
--- a/av1/encoder/arm/highbd_pickrst_neon.c
+++ b/av1/encoder/arm/highbd_pickrst_neon.c
@@ -454,10 +454,10 @@ static INLINE void highbd_compute_stats_win7_neon(
   const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160);
   const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176);
 
-  // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
-  // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
-  // be as high as 32768/2048/128 for the compute stats.
-  const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+  // We can accumulate up to 32768/2048/128 8/10/12-bit multiplication results
+  // in a signed 32-bit integer. We are processing 2 pixels at a time, so the
+  // accumulator max can be as high as 16384/1024/64 for the compute stats.
+  const int acc_cnt_max = (1 << (31 - 2 * bit_depth)) >> 1;
   int acc_cnt = acc_cnt_max;
   const int src_next = src_stride - width;
   const int dgd_next = dgd_stride - width;
@@ -736,10 +736,10 @@ static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
   const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64);
   const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80);
 
-  // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
-  // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
-  // be as high as 32768/2048/128 for the compute stats.
-  const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+  // We can accumulate up to 32768/2048/128 8/10/12-bit multiplication results
+  // in a signed 32-bit integer. We are processing 2 pixels at a time, so the
+  // accumulator max can be as high as 16384/1024/64 for the compute stats.
+  const int acc_cnt_max = (1 << (31 - 2 * bit_depth)) >> 1;
   int acc_cnt = acc_cnt_max;
   const int src_next = src_stride - width;
   const int dgd_next = dgd_stride - width;
diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c
index 015378ac98..b6fba99d8d 100644
--- a/av1/encoder/arm/pickrst_neon.c
+++ b/av1/encoder/arm/pickrst_neon.c
@@ -176,10 +176,10 @@ int64_t av1_lowbd_pixel_proj_error_neon(
   return sse;
 }
 
-// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are
-// processing 2 pixels at a time, so the accumulator max can be as high as 32768
-// for the compute stats.
-#define STAT_ACCUMULATOR_MAX 32768
+// We can accumulate up to 32768 8-bit multiplication results in a signed
+// 32-bit integer. We are processing 2 pixels at a time, so the accumulator max
+// can be as high as 16384 for the compute stats.
+#define STAT_ACCUMULATOR_MAX 16384
 
 static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
 #if AOM_ARCH_AARCH64