From 90defa923d158539c837810db0f1fa39e2838fd2 Mon Sep 17 00:00:00 2001
From: Salome Thirot <[EMAIL REDACTED]>
Date: Fri, 2 Aug 2024 14:13:41 +0100
Subject: [PATCH] Fix accumulator value for Neon implementation of
compute_stats
Fix the value of the maximum number of pixels that can be computed in a
signed 32-bit integer. That value assumed accumulating in an unsigned
32-bit integer, which is wrong and could trigger an integer overflow for
10-bit and 12-bit.
This was not detected by the unit tests, but they will be amended in a
subsequent commit.
Change-Id: I0214792293ad03e3aa0d331c8320f416e5df8d81
---
av1/encoder/arm/highbd_pickrst_neon.c | 16 ++++++++--------
av1/encoder/arm/pickrst_neon.c | 8 ++++----
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c
index 60beca2dc0..cfc5e0c7e8 100644
--- a/av1/encoder/arm/highbd_pickrst_neon.c
+++ b/av1/encoder/arm/highbd_pickrst_neon.c
@@ -454,10 +454,10 @@ static INLINE void highbd_compute_stats_win7_neon(
const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160);
const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176);
- // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
- // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
- // be as high as 32768/2048/128 for the compute stats.
- const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ // We can accumulate up to 32768/2048/128 8/10/12-bit multiplication results
+ // in a signed 32-bit integer. We are processing 2 pixels at a time, so the
+ // accumulator max can be as high as 16384/1024/64 for the compute stats.
+ const int acc_cnt_max = (1 << (31 - 2 * bit_depth)) >> 1;
int acc_cnt = acc_cnt_max;
const int src_next = src_stride - width;
const int dgd_next = dgd_stride - width;
@@ -736,10 +736,10 @@ static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64);
const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80);
- // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
- // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
- // be as high as 32768/2048/128 for the compute stats.
- const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ // We can accumulate up to 32768/2048/128 8/10/12-bit multiplication results
+ // in a signed 32-bit integer. We are processing 2 pixels at a time, so the
+ // accumulator max can be as high as 16384/1024/64 for the compute stats.
+ const int acc_cnt_max = (1 << (31 - 2 * bit_depth)) >> 1;
int acc_cnt = acc_cnt_max;
const int src_next = src_stride - width;
const int dgd_next = dgd_stride - width;
diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c
index 015378ac98..b6fba99d8d 100644
--- a/av1/encoder/arm/pickrst_neon.c
+++ b/av1/encoder/arm/pickrst_neon.c
@@ -176,10 +176,10 @@ int64_t av1_lowbd_pixel_proj_error_neon(
return sse;
}
-// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are
-// processing 2 pixels at a time, so the accumulator max can be as high as 32768
-// for the compute stats.
-#define STAT_ACCUMULATOR_MAX 32768
+// We can accumulate up to 32768 8-bit multiplication results in a signed
+// 32-bit integer. We are processing 2 pixels at a time, so the accumulator max
+// can be as high as 16384 for the compute stats.
+#define STAT_ACCUMULATOR_MAX 16384
static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
#if AOM_ARCH_AARCH64