aom: Add SVE2 impl of HBD dist_wtd_convolve_x for 8-tap filters

From 7a4971371a5e88a48d55530fdd7783041e57b2a9 Mon Sep 17 00:00:00 2001
From: Salome Thirot <[EMAIL REDACTED]>
Date: Thu, 7 Mar 2024 11:03:20 +0000
Subject: [PATCH] Add SVE2 impl of HBD dist_wtd_convolve_x for 8-tap filters

Add SVE2 implementation of av1_highbd_dist_wtd_convolve_x for 8-tap
filters, as well as the corresponding tests. This implementation uses
the same averaging helpers as the Neon one, so move them to a separate
header file. This gives up to 20% uplift over the Neon implementation.

Change-Id: I22acd5be2f5e2bdaecd4301df39f2f6b82214b7c
---
 av1/av1.cmake                                 |   1 +
 .../arm/highbd_compound_convolve_neon.c       | 259 +---------------
 .../arm/highbd_compound_convolve_neon.h       | 278 ++++++++++++++++++
 .../arm/highbd_compound_convolve_sve2.c       | 158 ++++++++++
 av1/common/av1_rtcd_defs.pl                   |   2 +-
 test/av1_convolve_test.cc                     |   6 +
 6 files changed, 445 insertions(+), 259 deletions(-)
 create mode 100644 av1/common/arm/highbd_compound_convolve_neon.h
 create mode 100644 av1/common/arm/highbd_compound_convolve_sve2.c

diff --git a/av1/av1.cmake b/av1/av1.cmake
index 37af5231cb..32645f6065 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -481,6 +481,7 @@ if(CONFIG_AV1_HIGHBITDEPTH)
               "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c")
 
   list(APPEND AOM_AV1_COMMON_INTRIN_SVE2
+              "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_sve2.c"
               "${AOM_ROOT}/av1/common/arm/highbd_convolve_sve2.c")
 
   list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
index fc03a2ee04..05773393d7 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.c
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -20,266 +20,9 @@
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
+#include "av1/common/arm/highbd_compound_convolve_neon.h"
 #include "av1/common/arm/highbd_convolve_neon.h"
 
-#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS
-
-static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
-                                           int src_stride, uint16_t *dst_ptr,
-                                           int dst_stride, int w, int h,
-                                           ConvolveParams *conv_params,
-                                           const int offset, const int bd) {
-  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
-  const int ref_stride = conv_params->dst_stride;
-  const uint16x4_t offset_vec = vdup_n_u16(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
-  if (w == 4) {
-    do {
-      const uint16x4_t src = vld1_u16(src_ptr);
-      const uint16x4_t ref = vld1_u16(ref_ptr);
-
-      uint16x4_t avg = vhadd_u16(src, ref);
-      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
-
-      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
-      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
-
-      vst1_u16(dst_ptr, d0_u16);
-
-      src_ptr += src_stride;
-      ref_ptr += ref_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  } else {
-    do {
-      int width = w;
-      const uint16_t *src = src_ptr;
-      const uint16_t *ref = ref_ptr;
-      uint16_t *dst = dst_ptr;
-      do {
-        const uint16x8_t s = vld1q_u16(src);
-        const uint16x8_t r = vld1q_u16(ref);
-
-        uint16x8_t avg = vhaddq_u16(s, r);
-        int32x4_t d0_lo =
-            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
-        int32x4_t d0_hi =
-            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
-
-        uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2),
-                                     vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2));
-        d0 = vminq_u16(d0, max);
-        vst1q_u16(dst, d0);
-
-        src += 8;
-        ref += 8;
-        dst += 8;
-        width -= 8;
-      } while (width != 0);
-
-      src_ptr += src_stride;
-      ref_ptr += ref_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  }
-}
-
-static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
-                                        uint16_t *dst_ptr, int dst_stride,
-                                        int w, int h,
-                                        ConvolveParams *conv_params,
-                                        const int offset, const int bd) {
-  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
-  const int ref_stride = conv_params->dst_stride;
-  const uint16x4_t offset_vec = vdup_n_u16(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
-  if (w == 4) {
-    do {
-      const uint16x4_t src = vld1_u16(src_ptr);
-      const uint16x4_t ref = vld1_u16(ref_ptr);
-
-      uint16x4_t avg = vhadd_u16(src, ref);
-      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
-
-      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
-      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
-
-      vst1_u16(dst_ptr, d0_u16);
-
-      src_ptr += src_stride;
-      ref_ptr += ref_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  } else {
-    do {
-      int width = w;
-      const uint16_t *src = src_ptr;
-      const uint16_t *ref = ref_ptr;
-      uint16_t *dst = dst_ptr;
-      do {
-        const uint16x8_t s = vld1q_u16(src);
-        const uint16x8_t r = vld1q_u16(ref);
-
-        uint16x8_t avg = vhaddq_u16(s, r);
-        int32x4_t d0_lo =
-            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
-        int32x4_t d0_hi =
-            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
-
-        uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT),
-                                     vqrshrun_n_s32(d0_hi, ROUND_SHIFT));
-        d0 = vminq_u16(d0, max);
-        vst1q_u16(dst, d0);
-
-        src += 8;
-        ref += 8;
-        dst += 8;
-        width -= 8;
-      } while (width != 0);
-
-      src_ptr += src_stride;
-      ref_ptr += ref_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  }
-}
-
-static INLINE void highbd_12_dist_wtd_comp_avg_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
-  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
-  const int ref_stride = conv_params->dst_stride;
-  const uint32x4_t offset_vec = vdupq_n_u32(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
-  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
-
-  // Weighted averaging
-  if (w == 4) {
-    do {
-      const uint16x4_t src = vld1_u16(src_ptr);
-      const uint16x4_t ref = vld1_u16(ref_ptr);
-
-      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
-      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
-      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
-      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
-
-      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
-      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
-
-      vst1_u16(dst_ptr, d0_u16);
-
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      ref_ptr += ref_stride;
-    } while (--h != 0);
-  } else {
-    do {
-      int width = w;
-      const uint16_t *src = src_ptr;
-      const uint16_t *ref = ref_ptr;
-      uint16_t *dst = dst_ptr;
-      do {
-        const uint16x8_t s = vld1q_u16(src);
-        const uint16x8_t r = vld1q_u16(ref);
-
-        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
-        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
-        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
-        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
-
-        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
-        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
-        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
-        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
-
-        uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2),
-                                      vqrshrun_n_s32(d1, ROUND_SHIFT - 2));
-        d01 = vminq_u16(d01, max);
-        vst1q_u16(dst, d01);
-
-        src += 8;
-        ref += 8;
-        dst += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      ref_ptr += ref_stride;
-    } while (--h != 0);
-  }
-}
-
-static INLINE void highbd_dist_wtd_comp_avg_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
-  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
-  const int ref_stride = conv_params->dst_stride;
-  const uint32x4_t offset_vec = vdupq_n_u32(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
-  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
-
-  // Weighted averaging
-  if (w == 4) {
-    do {
-      const uint16x4_t src = vld1_u16(src_ptr);
-      const uint16x4_t ref = vld1_u16(ref_ptr);
-
-      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
-      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
-      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
-      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
-
-      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
-      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
-
-      vst1_u16(dst_ptr, d0_u16);
-
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      ref_ptr += ref_stride;
-    } while (--h != 0);
-  } else {
-    do {
-      int width = w;
-      const uint16_t *src = src_ptr;
-      const uint16_t *ref = ref_ptr;
-      uint16_t *dst = dst_ptr;
-      do {
-        const uint16x8_t s = vld1q_u16(src);
-        const uint16x8_t r = vld1q_u16(ref);
-
-        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
-        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
-        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
-        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
-
-        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
-        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
-        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
-        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
-
-        uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT),
-                                      vqrshrun_n_s32(d1, ROUND_SHIFT));
-        d01 = vminq_u16(d01, max);
-        vst1q_u16(dst, d01);
-
-        src += 8;
-        ref += 8;
-        dst += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      ref_ptr += ref_stride;
-    } while (--h != 0);
-  }
-}
-
 static INLINE uint16x4_t highbd_12_convolve6_4(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
diff --git a/av1/common/arm/highbd_compound_convolve_neon.h b/av1/common/arm/highbd_compound_convolve_neon.h
new file mode 100644
index 0000000000..efe70440fa
--- /dev/null
+++ b/av1/common/arm/highbd_compound_convolve_neon.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+
+#define ROUND_SHIFT (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)
+
+static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
+                                           int src_stride, uint16_t *dst_ptr,
+                                           int dst_stride, int w, int h,
+                                           ConvolveParams *conv_params,
+                                           const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint16x4_t offset_vec = vdup_n_u16(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  // Per pixel: halving-add src and ref, remove offset, round-shift, clamp.
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+      // Truncating halving add, then a widening subtract of the offset.
+      uint16x4_t avg = vhadd_u16(src, ref);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+      // Rounding shift by ROUND_SHIFT - 2; negative values saturate to 0.
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+        // Same steps as the w == 4 path, on eight pixels at a time.
+        uint16x8_t avg = vhaddq_u16(s, r);
+        int32x4_t d0_lo =
+            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+        int32x4_t d0_hi =
+            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+
+        uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2),
+                                     vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2));
+        d0 = vminq_u16(d0, max);
+        vst1q_u16(dst, d0);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);  // exits only when w is a multiple of 8
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+                                        uint16_t *dst_ptr, int dst_stride,
+                                        int w, int h,
+                                        ConvolveParams *conv_params,
+                                        const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint16x4_t offset_vec = vdup_n_u16(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  // Per pixel: halving-add src and ref, remove offset, round-shift, clamp.
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+      // Truncating halving add, then a widening subtract of the offset.
+      uint16x4_t avg = vhadd_u16(src, ref);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+      // Rounding shift by ROUND_SHIFT; negative values saturate to 0.
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+        // Same steps as the w == 4 path, on eight pixels at a time.
+        uint16x8_t avg = vhaddq_u16(s, r);
+        int32x4_t d0_lo =
+            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+        int32x4_t d0_hi =
+            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+
+        uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT),
+                                     vqrshrun_n_s32(d0_hi, ROUND_SHIFT));
+        d0 = vminq_u16(d0, max);
+        vst1q_u16(dst, d0);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);  // exits only when w is a multiple of 8
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_12_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint32x4_t offset_vec = vdupq_n_u32(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+  // Weighted averaging of ref (weight fwd_offset) and src (weight bck_offset),
+  // followed by offset removal, rounding shift and clamping to (1 << bd) - 1.
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+      // Widening multiply-accumulate, then drop DIST_PRECISION_BITS.
+      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+      // Rounding shift by ROUND_SHIFT - 2; negative values saturate to 0.
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+        // Low and high halves are weighted separately in 32-bit lanes.
+        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+
+        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+
+        uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2),
+                                      vqrshrun_n_s32(d1, ROUND_SHIFT - 2));
+        d01 = vminq_u16(d01, max);
+        vst1q_u16(dst, d01);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);  // exits only when w is a multiple of 8
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint32x4_t offset_vec = vdupq_n_u32(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+  // Weighted averaging of ref (weight fwd_offset) and src (weight bck_offset),
+  // followed by offset removal, rounding shift and clamping to (1 << bd) - 1.
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+      // Widening multiply-accumulate, then drop DIST_PRECISION_BITS.
+      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+      // Rounding shift by ROUND_SHIFT; negative values saturate to 0.
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+        // Low and high halves are weighted separately in 32-bit lanes.
+        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+
+        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+
+        uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT),
+                                      vqrshrun_n_s32(d1, ROUND_SHIFT));
+        d01 = vminq_u16(d01, max);
+        vst1q_u16(dst, d01);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);  // exits only when w is a multiple of 8
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  }
+}
diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c
new file mode 100644
index 0000000000..0447b5587c
--- /dev/null
+++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/aom_neon_sve2_bridge.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_compound_convolve_neon.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
+                                       int64x2_t offset, int32x4_t shift) {
+  int64x2_t sum[8];  // sum[i]: filter applied to s0[i], split over two lanes
+  sum[0] = aom_sdotq_s16(offset, s0[0], filter);
+  sum[1] = aom_sdotq_s16(offset, s0[1], filter);
+  sum[2] = aom_sdotq_s16(offset, s0[2], filter);
+  sum[3] = aom_sdotq_s16(offset, s0[3], filter);
+  sum[4] = aom_sdotq_s16(offset, s0[4], filter);
+  sum[5] = aom_sdotq_s16(offset, s0[5], filter);
+  sum[6] = aom_sdotq_s16(offset, s0[6], filter);
+  sum[7] = aom_sdotq_s16(offset, s0[7], filter);
+  // Pairwise add folds each sum's two 64-bit lanes into one total.
+  sum[0] = vpaddq_s64(sum[0], sum[1]);
+  sum[2] = vpaddq_s64(sum[2], sum[3]);
+  sum[4] = vpaddq_s64(sum[4], sum[5]);
+  sum[6] = vpaddq_s64(sum[6], sum[7]);
+  // Narrow to 32 bits; lanes are now in output order 0..3 and 4..7.
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
+  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));
+  // vshlq_s32 shifts right (no rounding) when the count in shift is negative.
+  sum0123 = vshlq_s32(sum0123, shift);
+  sum4567 = vshlq_s32(sum4567, shift);
+  // Saturate to unsigned 16 bits and join the two halves.
+  return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
+}
+
+static INLINE void highbd_dist_wtd_convolve_x_sve2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    int width, int height, const int16_t *x_filter_ptr,
+    ConvolveParams *conv_params, const int offset) {
+  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);  // right shift
+  const int64x2_t offset_vec = vdupq_n_s64(offset);
+  // Offset goes in the low lane only: convolve8_8_x pairwise-adds both lanes.
+  const int64x2_t offset_lo =
+      vcombine_s64(vget_low_s64(offset_vec), vdup_n_s64(0));
+  const int16x8_t filter = vld1q_s16(x_filter_ptr);
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int w = width;
+    // Each inner iteration filters a tile 8 pixels wide and 4 rows tall.
+    do {
+      int16x8_t s0[8], s1[8], s2[8], s3[8];  // rows 0..3 of the tile
+      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5], &s0[6], &s0[7]);
+      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5], &s1[6], &s1[7]);
+      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5], &s2[6], &s2[7]);
+      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5], &s3[6], &s3[7]);
+      // One call per row; each returns eight filtered pixels.
+      uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, shift);
+      uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, shift);
+      uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, shift);
+      uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, shift);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      w -= 8;
+    } while (w != 0);  // exits only when width is a multiple of 8
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    height -= 4;
+  } while (height != 0);  // exits only when height is a multiple of 4
+}
+
+void av1_highbd_dist_wtd_convolve_x_sve2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+  // This kernel only handles 8-tap filters; anything else goes to Neon.
+  if (x_filter_taps != 8) {
+    av1_highbd_dist_wtd_convolve_x_neon(src, src_stride, dst, dst_stride, w, h,
+                                        filter_params_x, subpel_x_qn,
+                                        conv_params, bd);
+    return;
+  }
+
+  int dst16_stride = conv_params->dst_stride;
+  const int im_stride = MAX_SB_SIZE;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int offset_avg = (1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1));
+  const int offset_convolve = (1 << (conv_params->round_0 - 1)) +  // rounding
+                              (1 << (bd + FILTER_BITS)) +
+                              (1 << (bd + FILTER_BITS - 1));
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  // Rewind src so the taps start horiz_offset pixels before each output.
+  src -= horiz_offset;
+  // Averaging: filter into im_block, then combine with conv_params->dst.
+  if (conv_params->do_average) {
+    highbd_dist_wtd_convolve_x_sve2(src, src_stride, im_block, im_stride, w, h,
+                                    x_filter_ptr, conv_params, offset_convolve);
+    // bd == 12 has dedicated helpers; they shift by ROUND_SHIFT - 2.
+    if (conv_params->use_dist_wtd_comp_avg) {
+      if (bd == 12) {
+        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+                                         w, h, conv_params, offset_avg, bd);
+
+      } else {
+        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+                                      h, conv_params, offset_avg, bd);
+      }
+
+    } else {
+      if (bd == 12) {
+        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                conv_params, offset_avg, bd);
+
+      } else {
+        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                             conv_params, offset_avg, bd);
+      }
+    }
+  } else {  // no averaging: filter straight into conv_params->dst
+    highbd_dist_wtd_convolve_x_sve2(src, src_stride, dst16, dst16_stride, w, h,
+                                    x_filter_ptr, conv_params, offset_convolve);
+  }
+}
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f0afc48ad8..79107c6fb0 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -607,7 +607,7 @@ ()
   specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
   if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/;
-    specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon/;
+    specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon sve2/;
     specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon/;
     specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/;
     specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon sve2/;
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index cf82ef3941..12997dbfed 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -1958,6 +1958,12 @@ INSTANTIATE_TEST_SUITE_P(
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_neon));
 #endif
 
+#if HAVE_SVE2
+INSTANTIATE_TEST_SUITE_P(
+    SVE2, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sve2));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 ////////////////////////////////////////////////