aom: Handle w=2 case in aom_highbd_convolve_copy_neon()

From 34d29d40dd03eaaec297eacf5d2da2a8c864f35e Mon Sep 17 00:00:00 2001
From: Wan-Teh Chang <[EMAIL REDACTED]>
Date: Thu, 27 Jun 2024 10:28:24 -0700
Subject: [PATCH] Handle w=2 case in aom_highbd_convolve_copy_neon()

Fix the incorrect assumption that if w < 8, then w == 4. The width may
also be 2, which must not take the copy4 path that handles four pixels
per row.

Tested:
cmake ../aom -G Ninja -DFORCE_HIGHBITDEPTH_DECODING=1
ninja
./test_libaom --gtest_filter=*TestVectorTest*

cmake ../aom -G Ninja -DSANITIZE=address
ninja
./test_libaom --gtest_filter=*ConvolveCopy*

Bug: 349832592
Change-Id: I2600d00b097a94a079c4827cdc894d02cf03c42e
---
 aom_dsp/arm/aom_convolve_copy_neon.c | 13 +++++++++-
 test/av1_convolve_test.cc            | 39 +++++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
index b90b1bd0e1..447ae37e56 100644
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -57,7 +57,18 @@ void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
 void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride, int w,
                                    int h) {
-  if (w < 8) {  // copy4
+  if (w < 4) {  // copy2
+    do {
+      memcpy(dst, src, 4);
+      src += src_stride;
+      dst += dst_stride;
+
+      memcpy(dst, src, 4);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 4) {  // copy4
     uint16x4_t s0, s1;
     do {
       s0 = vld1_u16(src);
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index aac8006e50..79bb942d0b 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -9,6 +9,10 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
 #include <ostream>
 #include <set>
 #include <vector>
@@ -218,12 +222,12 @@ class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
 
   // Check that two 8-bit output buffers are identical.
   void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
-                            int height) {
+                            int height, ptrdiff_t stride = kOutputStride) {
     ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
     for (int j = 0; j < height; ++j) {
       if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
-        p1 += kOutputStride;
-        p2 += kOutputStride;
+        p1 += stride;
+        p2 += stride;
         continue;
       }
       for (int i = 0; i < width; ++i) {
@@ -236,12 +240,12 @@ class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
 
   // Check that two 16-bit output buffers are identical.
   void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
-                            int height) {
+                            int height, ptrdiff_t stride = kOutputStride) {
     ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
     for (int j = 0; j < height; ++j) {
       if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
-        p1 += kOutputStride;
-        p2 += kOutputStride;
+        p1 += stride;
+        p2 += stride;
         continue;
       }
       for (int i = 0; i < width; ++i) {
@@ -1122,6 +1126,17 @@ class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> {
     DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
     GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
     AssertOutputBufferEq(reference, test, width, height);
+
+    // Test again with dst_stride=width.
+    std::unique_ptr<uint8_t[]> reference2(new (std::nothrow)
+                                              uint8_t[width * height]);
+    ASSERT_NE(reference2, nullptr);
+    aom_convolve_copy_c(input, width, reference2.get(), width, width, height);
+    std::unique_ptr<uint8_t[]> test2(new (std::nothrow)
+                                         uint8_t[width * height]);
+    ASSERT_NE(test2, nullptr);
+    GetParam().TestFunction()(input, width, test2.get(), width, width, height);
+    AssertOutputBufferEq(reference2.get(), test2.get(), width, height, width);
   }
 };
 
@@ -1169,6 +1184,18 @@ class AV1ConvolveCopyHighbdTest
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
     AssertOutputBufferEq(reference, test, width, height);
+
+    // Test again with dst_stride=width.
+    std::unique_ptr<uint16_t[]> reference2(new (std::nothrow)
+                                               uint16_t[width * height]);
+    ASSERT_NE(reference2, nullptr);
+    aom_highbd_convolve_copy_c(input, width, reference2.get(), width, width,
+                               height);
+    std::unique_ptr<uint16_t[]> test2(new (std::nothrow)
+                                          uint16_t[width * height]);
+    ASSERT_NE(test2, nullptr);
+    GetParam().TestFunction()(input, width, test2.get(), width, width, height);
+    AssertOutputBufferEq(reference2.get(), test2.get(), width, height, width);
   }
 };