aom: merge aom_asm_stubs.c and highbd_convolve_sse2.c

From 8cbb78176c4f9247dd232492f9e29f130ab40a58 Mon Sep 17 00:00:00 2001
From: James Zern <[EMAIL REDACTED]>
Date: Fri, 26 Apr 2024 12:11:47 -0700
Subject: [PATCH] merge aom_asm_stubs.c and highbd_convolve_sse2.c

This allows the functions in highbd_convolve_sse2.c to be made static.
This fixes some -Wmissing-prototypes warnings.

This change is similar to what was done in libvpx:
c67a2e76a subpixel_8t sse2: resolve missing declarations

Bug: aomedia:3416
Change-Id: I473da99c88edfec47ca5b3384a74d3f076b565d9
---
 aom_dsp/aom_dsp.cmake                     |  1 -
 aom_dsp/x86/aom_asm_stubs.c               | 61 ---------------
 aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 90 ++++++++++++-----------
 aom_dsp/x86/highbd_convolve_sse2.c        | 79 ++++++++++++++------
 4 files changed, 103 insertions(+), 128 deletions(-)
 delete mode 100644 aom_dsp/x86/aom_asm_stubs.c

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 27099d36b2..6d8e5a961b 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -58,7 +58,6 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2
 
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
             "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
             "${AOM_ROOT}/aom_dsp/x86/convolve.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
             "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
deleted file mode 100644
index 6c7fdd6eb1..0000000000
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-
-#if HAVE_SSE2
-#if CONFIG_AV1_HIGHBITDEPTH
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
-
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-
-// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
-//                                      ptrdiff_t src_stride,
-//                                      uint8_t *dst,
-//                                      ptrdiff_t dst_stride,
-//                                      const int16_t *filter_x,
-//                                      int x_step_q4,
-//                                      const int16_t *filter_y,
-//                                      int y_step_q4,
-//                                      int w, int h, int bd);
-// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
-//                                     ptrdiff_t src_stride,
-//                                     uint8_t *dst,
-//                                     ptrdiff_t dst_stride,
-//                                     const int16_t *filter_x,
-//                                     int x_step_q4,
-//                                     const int16_t *filter_y,
-//                                     int y_step_q4,
-//                                     int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
-#endif
-#endif  // HAVE_SSE2
diff --git a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
index d392225906..f84f8fa1f7 100644
--- a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -202,14 +202,15 @@
 
 SECTION .text
 
-;void aom_filter_block1d4_v8_sse2
+;void aom_highbd_filter_block1d4_v8_sse2
 ;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
+;    const uint16_t  *src_ptr,
+;    const ptrdiff_t  src_pitch,
+;    uint16_t        *output_ptr,
+;    ptrdiff_t        out_pitch,
+;    unsigned int     output_height,
+;    const int16_t   *filter,
+;    int              bd
 ;)
 globalsym(aom_highbd_filter_block1d4_v8_sse2)
 sym(aom_highbd_filter_block1d4_v8_sse2):
@@ -272,14 +273,15 @@ sym(aom_highbd_filter_block1d4_v8_sse2):
     pop         rbp
     ret
 
-;void aom_filter_block1d8_v8_sse2
+;void aom_highbd_filter_block1d8_v8_sse2
 ;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
+;    const uint16_t  *src_ptr,
+;    const ptrdiff_t  src_pitch,
+;    uint16_t        *output_ptr,
+;    ptrdiff_t        out_pitch,
+;    unsigned int     output_height,
+;    const int16_t   *filter,
+;    int              bd
 ;)
 globalsym(aom_highbd_filter_block1d8_v8_sse2)
 sym(aom_highbd_filter_block1d8_v8_sse2):
@@ -331,14 +333,15 @@ sym(aom_highbd_filter_block1d8_v8_sse2):
     pop         rbp
     ret
 
-;void aom_filter_block1d16_v8_sse2
+;void aom_highbd_filter_block1d16_v8_sse2
 ;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
+;    const uint16_t  *src_ptr,
+;    const ptrdiff_t  src_pitch,
+;    uint16_t        *output_ptr,
+;    ptrdiff_t        out_pitch,
+;    unsigned int     output_height,
+;    const int16_t   *filter,
+;    int              bd
 ;)
 globalsym(aom_highbd_filter_block1d16_v8_sse2)
 sym(aom_highbd_filter_block1d16_v8_sse2):
@@ -394,14 +397,15 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-;void aom_filter_block1d4_h8_sse2
+;void aom_highbd_filter_block1d4_h8_sse2
 ;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
+;    const uint16_t  *src_ptr,
+;    const ptrdiff_t  src_pitch,
+;    uint16_t        *output_ptr,
+;    ptrdiff_t        out_pitch,
+;    unsigned int     output_height,
+;    const int16_t   *filter,
+;    int              bd
 ;)
 globalsym(aom_highbd_filter_block1d4_h8_sse2)
 sym(aom_highbd_filter_block1d4_h8_sse2):
@@ -469,14 +473,15 @@ sym(aom_highbd_filter_block1d4_h8_sse2):
     pop         rbp
     ret
 
-;void aom_filter_block1d8_h8_sse2
+;void aom_highbd_filter_block1d8_h8_sse2
 ;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
+;    const uint16_t  *src_ptr,
+;    const ptrdiff_t  src_pitch,
+;    uint16_t        *output_ptr,
+;    ptrdiff_t        out_pitch,
+;    unsigned int     output_height,
+;    const int16_t   *filter,
+;    int              bd
 ;)
 globalsym(aom_highbd_filter_block1d8_h8_sse2)
 sym(aom_highbd_filter_block1d8_h8_sse2):
@@ -535,14 +540,15 @@ sym(aom_highbd_filter_block1d8_h8_sse2):
     pop         rbp
     ret
 
-;void aom_filter_block1d16_h8_sse2
+;void aom_highbd_filter_block1d16_h8_sse2
 ;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
+;    const uint16_t  *src_ptr,
+;    const ptrdiff_t  src_pitch,
+;    uint16_t        *output_ptr,
+;    ptrdiff_t        out_pitch,
+;    unsigned int     output_height,
+;    const int16_t   *filter,
+;    int              bd
 ;)
 globalsym(aom_highbd_filter_block1d16_h8_sse2)
 sym(aom_highbd_filter_block1d16_h8_sse2):
diff --git a/aom_dsp/x86/highbd_convolve_sse2.c b/aom_dsp/x86/highbd_convolve_sse2.c
index a2bb283222..40201aa193 100644
--- a/aom_dsp/x86/highbd_convolve_sse2.c
+++ b/aom_dsp/x86/highbd_convolve_sse2.c
@@ -15,10 +15,9 @@
 
 // -----------------------------------------------------------------------------
 
-void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
-                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
-                                        ptrdiff_t dst_pitch, uint32_t height,
-                                        const int16_t *filter, int bd) {
+static void aom_highbd_filter_block1d4_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   __m128i filtersReg;
   __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
   __m128i srcReg23_lo, srcReg34_lo;
@@ -101,10 +100,9 @@ void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
   }
 }
 
-void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
-                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
-                                        ptrdiff_t dst_pitch, uint32_t height,
-                                        const int16_t *filter, int bd) {
+static void aom_highbd_filter_block1d4_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   __m128i filtersReg;
   __m128i addFilterReg64;
   __m128i secondFilters, thirdFilters;
@@ -153,10 +151,9 @@ void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
   }
 }
 
-void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
-                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
-                                        ptrdiff_t dst_pitch, uint32_t height,
-                                        const int16_t *filter, int bd) {
+static void aom_highbd_filter_block1d8_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   __m128i filtersReg;
   __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
   __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
@@ -262,10 +259,9 @@ void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
   }
 }
 
-void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
-                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
-                                        ptrdiff_t dst_pitch, uint32_t height,
-                                        const int16_t *filter, int bd) {
+static void aom_highbd_filter_block1d8_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   __m128i filtersReg;
   __m128i addFilterReg64;
   __m128i secondFilters, thirdFilters;
@@ -330,22 +326,57 @@ void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
   }
 }
 
-void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
-                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
-                                         ptrdiff_t dst_pitch, uint32_t height,
-                                         const int16_t *filter, int bd) {
+static void aom_highbd_filter_block1d16_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                      height, filter, bd);
   aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
                                      dst_pitch, height, filter, bd);
 }
 
-void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
-                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
-                                         ptrdiff_t dst_pitch, uint32_t height,
-                                         const int16_t *filter, int bd) {
+static void aom_highbd_filter_block1d16_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                      height, filter, bd);
   aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
                                      dst_pitch, height, filter, bd);
 }
+
+// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+
+// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
+
+// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
+//                                      ptrdiff_t src_stride,
+//                                      uint8_t *dst,
+//                                      ptrdiff_t dst_stride,
+//                                      const int16_t *filter_x,
+//                                      int x_step_q4,
+//                                      const int16_t *filter_y,
+//                                      int y_step_q4,
+//                                      int w, int h, int bd);
+// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
+//                                     ptrdiff_t src_stride,
+//                                     uint8_t *dst,
+//                                     ptrdiff_t dst_stride,
+//                                     const int16_t *filter_x,
+//                                     int x_step_q4,
+//                                     const int16_t *filter_y,
+//                                     int y_step_q4,
+//                                     int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)