aom: Restore a couple of SIMD functions for x86

From 8fbd5fb331159dddeb22a7ad0806785ed15bb318 Mon Sep 17 00:00:00 2001
From: Rachel Barker <[EMAIL REDACTED]>
Date: Mon, 4 Mar 2024 06:14:19 +0000
Subject: [PATCH] Restore a couple of SIMD functions for x86

For normal use of libaom, we expect SSE4.1 support. Therefore in commit
8e0b26f, we removed many SSE2 and SSSE3 functions which would never be
used on machines which support SSE4.1.

However, Valgrind on 32-bit x86 only supports up to SSSE3. This means
that the Valgrind tests became much slower, and caused timeouts.

To address this, restore the SSSE3 implementations of the lowbd forward
transform and of various CDEF functions. These are enabled only for
32-bit builds - 64-bit Valgrind does support SSE4.1, so these functions
are not required in 64-bit mode.

BUG: b/322787141
Change-Id: I9067a9dc89678a191e6e8b9148c555c24a424b1f
---
 av1/av1.cmake                       |  6 +++
 av1/common/av1_rtcd_defs.pl         | 36 +++++++++------
 av1/common/x86/cdef_block_ssse3.c   | 51 ++++++++++++++++++++++
 av1/encoder/x86/av1_fwd_txfm_sse2.c | 40 +++++++++++++++++
 test/av1_fwd_txfm2d_test.cc         | 38 +++++++++++++---
 test/cdef_test.cc                   | 68 ++++++++++++++++++++++++++++-
 6 files changed, 220 insertions(+), 19 deletions(-)
 create mode 100644 av1/common/x86/cdef_block_ssse3.c

diff --git a/av1/av1.cmake b/av1/av1.cmake
index f0587ac9d..672e21cc4 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -275,6 +275,12 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/resize_ssse3.c")
 
+# Fallbacks to support Valgrind on 32-bit x86
+if(AOM_ARCH_X86)
+  list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+              "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c")
+endif()
+
 list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
             "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 522bb8127..5ae4b60e7 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -77,6 +77,16 @@ ()
 }
 forward_decls qw/av1_common_forward_decls/;
 
+# Fallbacks for Valgrind support
+# For normal use, we require SSE4.1. However, 32-bit Valgrind does not support
+# SSE4.1, so we include fallbacks for some critical functions to improve
+# performance. Both variables default to '' so they are defined on every arch.
+$sse2_x86 = $ssse3_x86 = '';
+if ($opts{arch} eq "x86") {
+  $sse2_x86 = 'sse2';
+  $ssse3_x86 = 'ssse3';
+}
+
 # functions that are 64 bit only.
 $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
@@ -345,7 +355,7 @@ ()
 
   #fwd txfm
   add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
-  specialize qw/av1_lowbd_fwd_txfm sse4_1 avx2 neon/;
+  specialize qw/av1_lowbd_fwd_txfm sse4_1 avx2 neon/, $sse2_x86;
 
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/;
@@ -521,21 +531,21 @@ ()
 # structs as arguments, which makes the v256 type of the intrinsics
 # hard to support, so optimizations for this target are disabled.
 if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-  specialize qw/cdef_find_dir sse4_1 avx2 neon/;
-  specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/;
+  specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86";
 
-  specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86";
 
-  specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86";
 
-  specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/;
-  specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/;
+  specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+  specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
 }
 
 # WARPED_MOTION / GLOBAL_MOTION functions
diff --git a/av1/common/x86/cdef_block_ssse3.c b/av1/common/x86/cdef_block_ssse3.c
new file mode 100644
index 000000000..14eb6c9e3
--- /dev/null
+++ b/av1/common/x86/cdef_block_ssse3.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Include SSSE3 CDEF code only for 32-bit x86, to support Valgrind.
+// For normal use, we require SSE4.1, so cdef_*_sse4_1 will be used instead of
+// these functions. However, 32-bit Valgrind does not support SSE4.1, so we
+// include a fallback to SSSE3 to improve performance
+
+#include "config/aom_config.h"
+
+#if !AOM_ARCH_X86
+#error "cdef_block_ssse3.c is included for compatibility with 32-bit x86 only"
+#endif  // !AOM_ARCH_X86
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2,
+                              int stride, int32_t *var_out_1st,
+                              int32_t *var_out_2nd, int coeff_shift,
+                              int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  // Process first 8x8.
+  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+  // Process second 8x8.
+  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride,
+                                         const uint8_t *src, int sstride,
+                                         int width, int height) {
+  int j;
+  for (int i = 0; i < height; i++) {
+    for (j = 0; j < (width & ~0x7); j += 8) {
+      v64 row = v64_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+    }
+    for (; j < width; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index cf6c3a20f..31cc37db7 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -2637,3 +2637,43 @@ void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
     store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
   }
 }
+
+// Include top-level function only for 32-bit x86, to support Valgrind.
+// For normal use, we require SSE4.1, so av1_lowbd_fwd_txfm_sse4_1 will be used
+// instead of this function. However, 32-bit Valgrind does not support SSE4.1,
+// so we include a fallback to SSE2 to improve performance
+#if AOM_ARCH_X86
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+  av1_lowbd_fwd_txfm2d_4x4_sse2,    // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_sse2,    // 8x8 transform
+  av1_lowbd_fwd_txfm2d_16x16_sse2,  // 16x16 transform
+  av1_lowbd_fwd_txfm2d_32x32_sse2,  // 32x32 transform
+  NULL,                             // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_sse2,    // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_sse2,    // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_sse2,   // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_sse2,   // 16x8 transform
+  av1_lowbd_fwd_txfm2d_16x32_sse2,  // 16x32 transform
+  av1_lowbd_fwd_txfm2d_32x16_sse2,  // 32x16 transform
+  NULL,                             // 32x64 transform
+  NULL,                             // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_sse2,   // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_sse2,   // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_sse2,   // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+
+  if ((fwd_txfm2d_func == NULL) ||
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4))
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  else
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+}
+#endif  // AOM_ARCH_X86
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index 95ff48079..4a5a63454 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -443,13 +443,41 @@ using ::testing::Combine;
 using ::testing::Values;
 using ::testing::ValuesIn;
 
-#if HAVE_SSE4_1
-static TX_SIZE fwd_txfm_for_sse41[] = {
-  TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_64X64, TX_4X8,   TX_8X4,
-  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
-  TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16,
+#if AOM_ARCH_X86 && HAVE_SSE2
+static TX_SIZE fwd_txfm_for_sse2[] = {
+  TX_4X4,
+  TX_8X8,
+  TX_16X16,
+  TX_32X32,
+  // TX_64X64,
+  TX_4X8,
+  TX_8X4,
+  TX_8X16,
+  TX_16X8,
+  TX_16X32,
+  TX_32X16,
+  // TX_32X64,
+  // TX_64X32,
+  TX_4X16,
+  TX_16X4,
+  TX_8X32,
+  TX_32X8,
+  TX_16X64,
+  TX_64X16,
 };
 
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest,
+                         Combine(ValuesIn(fwd_txfm_for_sse2),
+                                 Values(av1_lowbd_fwd_txfm_sse2)));
+#endif  // AOM_ARCH_X86 && HAVE_SSE2
+
+#if HAVE_SSE4_1
+static TX_SIZE fwd_txfm_for_sse41[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32,
+                                        TX_64X64, TX_4X8,   TX_8X4,   TX_8X16,
+                                        TX_16X8,  TX_16X32, TX_32X16, TX_32X64,
+                                        TX_64X32, TX_4X16,  TX_16X4,  TX_8X32,
+                                        TX_32X8,  TX_16X64, TX_64X16 };
+
 INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest,
                          Combine(ValuesIn(fwd_txfm_for_sse41),
                                  Values(av1_lowbd_fwd_txfm_sse4_1)));
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
index 041d083bb..ac0591f6a 100644
--- a/test/cdef_test.cc
+++ b/test/cdef_test.cc
@@ -614,7 +614,7 @@ TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
 
 using std::make_tuple;
 
-#if (HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
+#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
 static const CdefFilterBlockFunctions kCdefFilterFuncC[] = {
   { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c,
     &cdef_filter_8_3_c }
@@ -626,6 +626,49 @@ static const CdefFilterBlockFunctions kCdefFilterHighbdFuncC[] = {
 };
 #endif
 
+#if AOM_ARCH_X86 && HAVE_SSSE3
+static const CdefFilterBlockFunctions kCdefFilterFuncSsse3[] = {
+  { &cdef_filter_8_0_ssse3, &cdef_filter_8_1_ssse3, &cdef_filter_8_2_ssse3,
+    &cdef_filter_8_3_ssse3 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSsse3[] = {
+  { &cdef_filter_16_0_ssse3, &cdef_filter_16_1_ssse3, &cdef_filter_16_2_ssse3,
+    &cdef_filter_16_3_ssse3 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
+                                                      &cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_ssse3)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_ssse3)));
+#endif
+
 #if HAVE_SSE4_1
 static const CdefFilterBlockFunctions kCdefFilterFuncSse4_1[] = {
   { &cdef_filter_8_0_sse4_1, &cdef_filter_8_1_sse4_1, &cdef_filter_8_2_sse4_1,
@@ -757,6 +800,29 @@ INSTANTIATE_TEST_SUITE_P(
 #endif
 
 // Test speed for all supported architectures
+#if AOM_ARCH_X86 && HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFSpeedTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
+                                                      &cdef_find_dir_dual_c)));
+#endif
+
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, CDEFSpeedTest,