aom: Move SVE-Neon bridge helper functions to separate header file

SDLPushPostBot · February 27, 2024, 9:58pm

From 6511f8aec707dead347aa12aef2646a5a78d4df0 Mon Sep 17 00:00:00 2001
From: Salome Thirot <[EMAIL REDACTED]>
Date: Mon, 26 Feb 2024 14:54:08 +0000
Subject: [PATCH] Move SVE-Neon bridge helper functions to separate header file

Rename dot_sve.h to aom_neon_sve_bridge.h and extend it with more helper
functions making use of the SVE-Neon bridge, and create a similar file
(aom_neon_sve2_bridge.h) for SVE2 helpers.

Change-Id: Iaf1273e70fccdd48043c970264f60afaad64dd8d
---
 aom_dsp/arm/aom_neon_sve2_bridge.h            | 36 +++++++++++++++++++
 .../arm/{dot_sve.h => aom_neon_sve_bridge.h}  | 19 +++++++---
 aom_dsp/arm/avg_sve.c                         |  2 +-
 aom_dsp/arm/blk_sse_sum_sve.c                 |  2 +-
 aom_dsp/arm/highbd_convolve8_sve.c            |  2 +-
 aom_dsp/arm/highbd_sse_sve.c                  |  2 +-
 aom_dsp/arm/highbd_variance_sve.c             |  2 +-
 aom_dsp/arm/sum_squares_sve.c                 |  2 +-
 av1/common/arm/highbd_convolve_sve2.c         | 21 ++---------
 av1/common/arm/highbd_warp_plane_sve.c        |  2 +-
 av1/common/arm/warp_plane_sve.c               |  2 +-
 av1/encoder/arm/neon/av1_error_sve.c          |  2 +-
 av1/encoder/arm/neon/wedge_utils_sve.c        |  2 +-
 13 files changed, 62 insertions(+), 34 deletions(-)
 create mode 100644 aom_dsp/arm/aom_neon_sve2_bridge.h
 rename aom_dsp/arm/{dot_sve.h => aom_neon_sve_bridge.h} (74%)

diff --git a/aom_dsp/arm/aom_neon_sve2_bridge.h b/aom_dsp/arm/aom_neon_sve2_bridge.h
new file mode 100644
index 0000000000..6e7d2d6365
--- /dev/null
+++ b/aom_dsp/arm/aom_neon_sve2_bridge.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
+#define AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon_sve_bridge.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+// We can access instructions exclusive to the SVE2 instruction set from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+
+static INLINE int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1,
+                                     uint16x8_t tbl) {
+  svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+                                      svset_neonq_s16(svundef_s16(), s1));
+  return svget_neonq_s16(
+      svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif  // AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
diff --git a/aom_dsp/arm/dot_sve.h b/aom_dsp/arm/aom_neon_sve_bridge.h
similarity index 74%
rename from aom_dsp/arm/dot_sve.h
rename to aom_dsp/arm/aom_neon_sve_bridge.h
index a02716933d..3da80e22ba 100644
--- a/aom_dsp/arm/dot_sve.h
+++ b/aom_dsp/arm/aom_neon_sve_bridge.h
@@ -8,16 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AOM_AOM_DSP_ARM_DOT_SVE_H_
-#define AOM_AOM_DSP_ARM_DOT_SVE_H_
+#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
+#define AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
 
 #include <arm_neon_sve_bridge.h>
 
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_config.h"
 
-// Dot product instructions operating on 16-bit input elements are exclusive to
-// the SVE instruction set. However, we can access these instructions from a
+// We can access instructions exclusive to the SVE instruction set from a
 // predominantly Neon context by making use of the Neon-SVE bridge intrinsics
 // to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
 // vector (if it's longer than 128 bits) being "don't care".
@@ -44,4 +43,14 @@ static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
                                  svset_neonq_s16(svundef_s16(), s0),  \
                                  svset_neonq_s16(svundef_s16(), f), lane))
 
-#endif  // AOM_AOM_DSP_ARM_DOT_SVE_H_
+static INLINE uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) {
+  return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), s),
+                                   svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+static INLINE int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) {
+  return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s),
+                                   svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif  // AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
diff --git a/aom_dsp/arm/avg_sve.c b/aom_dsp/arm/avg_sve.c
index bbf5a9447c..57a546501a 100644
--- a/aom_dsp/arm/avg_sve.c
+++ b/aom_dsp/arm/avg_sve.c
@@ -14,7 +14,7 @@
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
 
diff --git a/aom_dsp/arm/blk_sse_sum_sve.c b/aom_dsp/arm/blk_sse_sum_sve.c
index 18bdc5dbfe..f538346d8b 100644
--- a/aom_dsp/arm/blk_sse_sum_sve.c
+++ b/aom_dsp/arm/blk_sse_sum_sve.c
@@ -15,7 +15,7 @@
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_config.h"
 
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 
 static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride,
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index b00f4d38f9..46131b9736 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -16,7 +16,7 @@
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 
 static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
diff --git a/aom_dsp/arm/highbd_sse_sve.c b/aom_dsp/arm/highbd_sse_sve.c
index b267da5cfb..9ea13ab67a 100644
--- a/aom_dsp/arm/highbd_sse_sve.c
+++ b/aom_dsp/arm/highbd_sse_sve.c
@@ -10,7 +10,7 @@
 
 #include <arm_neon.h>
 
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
diff --git a/aom_dsp/arm/highbd_variance_sve.c b/aom_dsp/arm/highbd_variance_sve.c
index a2c30a1688..ad1f55e367 100644
--- a/aom_dsp/arm/highbd_variance_sve.c
+++ b/aom_dsp/arm/highbd_variance_sve.c
@@ -16,7 +16,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/aom_filter.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/variance.h"
 
diff --git a/aom_dsp/arm/sum_squares_sve.c b/aom_dsp/arm/sum_squares_sve.c
index 724e43859e..c7e6dfcb02 100644
--- a/aom_dsp/arm/sum_squares_sve.c
+++ b/aom_dsp/arm/sum_squares_sve.c
@@ -11,7 +11,7 @@
 
 #include <arm_neon.h>
 
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c
index 5b6cb45c1f..c297117ad3 100644
--- a/av1/common/arm/highbd_convolve_sve2.c
+++ b/av1/common/arm/highbd_convolve_sve2.c
@@ -16,7 +16,8 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/aom_neon_sve2_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
@@ -27,19 +28,6 @@ DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = {
   4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
 };
 
-static INLINE int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) {
-  return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s),
-                                   svset_neonq_u16(svundef_u16(), tbl)));
-}
-
-static INLINE int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1,
-                                     uint16x8_t tbl) {
-  svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
-                                      svset_neonq_s16(svundef_s16(), s1));
-  return svget_neonq_s16(
-      svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
-}
-
 static INLINE uint16x4_t convolve12_4_x(
     int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
     const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) {
@@ -259,11 +247,6 @@ DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = {
 };
 // clang-format on
 
-static INLINE uint16x8_t aom_tbl_u16(uint16x8_t src, uint16x8_t table) {
-  return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), src),
-                                   svset_neonq_u16(svundef_u16(), table)));
-}
-
 static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter,
                                        int64x2_t offset,
                                        uint16x8x2_t permute_tbl,
diff --git a/av1/common/arm/highbd_warp_plane_sve.c b/av1/common/arm/highbd_warp_plane_sve.c
index 3653012ade..87e033fd00 100644
--- a/av1/common/arm/highbd_warp_plane_sve.c
+++ b/av1/common/arm/highbd_warp_plane_sve.c
@@ -15,7 +15,7 @@
 #include <arm_neon_sve_bridge.h>
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
index 9528adc624..c70b066174 100644
--- a/av1/common/arm/warp_plane_sve.c
+++ b/av1/common/arm/warp_plane_sve.c
@@ -11,7 +11,7 @@
 
 #include <arm_neon.h>
 
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "warp_plane_neon.h"
 
 DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {
diff --git a/av1/encoder/arm/neon/av1_error_sve.c b/av1/encoder/arm/neon/av1_error_sve.c
index 63aad0b785..52803a9838 100644
--- a/av1/encoder/arm/neon/av1_error_sve.c
+++ b/av1/encoder/arm/neon/av1_error_sve.c
@@ -14,7 +14,7 @@
 #include "config/aom_config.h"
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 
 int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
diff --git a/av1/encoder/arm/neon/wedge_utils_sve.c b/av1/encoder/arm/neon/wedge_utils_sve.c
index b15811c72d..521601a3f3 100644
--- a/av1/encoder/arm/neon/wedge_utils_sve.c
+++ b/av1/encoder/arm/neon/wedge_utils_sve.c
@@ -12,7 +12,7 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/sum_neon.h"
 #include "av1/common/reconinter.h"