aom: Remove unused SIMD functions

From 8e0b26f4e830cfe39541195352119c6f86459f5e Mon Sep 17 00:00:00 2001
From: Rachel Barker <[EMAIL REDACTED]>
Date: Wed, 24 Jan 2024 15:46:02 +0000
Subject: [PATCH] Remove unused SIMD functions

We have several functions that are implemented for multiple SSE
versions, e.g. SSE2 + SSSE3 or SSE2 + SSSE3 + SSE4.1. Our baseline
requirement is SSE4.1, so the older versions of these functions are
never used in practice but still take up space in the library.
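
To see why, consider how the RTCD machinery selects an implementation:
at init time each specialization overwrites the same function pointer,
so the newest ISA supported by the CPU always wins. A minimal C sketch
of that pattern follows (simplified from the generated aom_dsp_rtcd.h;
the HAS_* flag values and the kernel prototypes here are stand-ins):

  #include <stddef.h>
  #include <stdint.h>

  /* Stand-ins for the HAS_* CPU feature flags in aom_ports/x86.h. */
  #define HAS_SSE2 (1 << 0)
  #define HAS_SSSE3 (1 << 1)

  typedef void convolve8_fn(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h);

  /* Prototypes only; the real kernels live in aom_dsp/x86/. */
  convolve8_fn aom_convolve8_horiz_c, aom_convolve8_horiz_sse2,
      aom_convolve8_horiz_ssse3;

  /* Function pointer the rest of the codebase calls through. */
  convolve8_fn *aom_convolve8_horiz = aom_convolve8_horiz_c;

  static void setup_rtcd(int flags) {
    aom_convolve8_horiz = aom_convolve8_horiz_c;
    if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
    if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
    /* With an SSE4.1 baseline, HAS_SSSE3 is always set, so the SSE2
     * assignment above is always overwritten: the SSE2 kernel is
     * unreachable and only adds to binary size. */
  }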

Remove these outdated SSE implementations, leaving just the newest
version.

Functions affected:
* aom_convolve8_horiz and aom_convolve8_vert
* aom_sub_pixel(_avg)_variance_MxN
* Many CDEF functions
* av1_lowbd_fwd_txfm (*)
* av1_dist_wtd_convolve_2d

(*): The top-level function av1_lowbd_fwd_txfm_sse2 is never used, but
the individual transform functions (av1_lowbd_fwd_txfm2d_MxN_sse2) can
still be called via av1_lowbd_fwd_txfm_sse4_1, so they must be kept.
To ensure these functions remain tested, their tests are moved under
the SSE4.1 test suite.
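
As a rough sketch of why (illustrative only: placeholder types, and
which block sizes have dedicated SSE4.1 kernels is invented here), the
SSE4.1 top-level function dispatches through a per-transform-size
table whose entries for sizes lacking an SSE4.1 kernel still point at
the SSE2 one:

  #include <stdint.h>

  /* Placeholders for TX_SIZE/TX_TYPE/TxfmParam from av1/common/. */
  typedef int TX_TYPE;
  typedef enum { TX_4X4, TX_64X64, TX_SIZES_ALL } TX_SIZE;
  typedef struct { TX_SIZE tx_size; TX_TYPE tx_type; int bd; } TxfmParam;

  typedef void fwd_txfm2d_fn(const int16_t *input, int32_t *output,
                             int stride, TX_TYPE tx_type, int bd);

  /* Prototypes only; the definitions stay in av1/encoder/x86/. */
  fwd_txfm2d_fn av1_lowbd_fwd_txfm2d_4x4_sse2;
  fwd_txfm2d_fn av1_lowbd_fwd_txfm2d_64x64_sse4_1;

  /* Sizes with no dedicated SSE4.1 kernel keep their SSE2 entry, so
   * those kernels cannot be deleted even though the SSE2 top-level
   * entry point can. */
  static fwd_txfm2d_fn *const fwd_txfm2d_table[TX_SIZES_ALL] = {
    av1_lowbd_fwd_txfm2d_4x4_sse2,      /* TX_4X4 */
    av1_lowbd_fwd_txfm2d_64x64_sse4_1,  /* TX_64X64 */
  };

  void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, int32_t *coeff,
                                 int diff_stride, TxfmParam *param) {
    fwd_txfm2d_table[param->tx_size](src_diff, coeff, diff_stride,
                                     param->tx_type, param->bd);
  }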

Savings:
Source code: ~2000 lines
libaom.so (x86-64 Linux, full build): 133 KB (~1.6%)
libaom.so (x86-64 Linux, realtime only): 1.8 KB (~0.1%)

No change to encoder output or speed (for SSE4.1+ machines)

Change-Id: Idc136c59d8adb42c0a97cb6e3e7b2d03a021f65f
---
 aom_dsp/aom_dsp.cmake                         |   7 +-
 aom_dsp/aom_dsp_rtcd_defs.pl                  |  94 +--
 aom_dsp/x86/aom_asm_stubs.c                   |  34 -
 aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c     | 569 ----------------
 aom_dsp/x86/aom_subpixel_8t_sse2.asm          | 615 ------------------
 aom_dsp/x86/aom_subpixel_bilinear_sse2.asm    | 295 ---------
 ...nce_sse2.asm => subpel_variance_ssse3.asm} |  28 -
 aom_dsp/x86/variance_sse2.c                   |   4 -
 av1/av1.cmake                                 |   2 -
 av1/common/av1_rtcd_defs.pl                   |  28 +-
 av1/common/x86/cdef_block_sse2.c              |  40 --
 av1/common/x86/cdef_block_ssse3.c             |  40 --
 av1/common/x86/jnt_convolve_sse2.c            | 229 -------
 av1/encoder/x86/av1_fwd_txfm_sse2.c           |  34 -
 test/av1_convolve_test.cc                     |   5 -
 test/av1_fwd_txfm2d_test.cc                   |  35 +-
 test/cdef_test.cc                             | 134 +---
 test/convolve_test.cc                         |   6 -
 test/variance_test.cc                         |  58 --
 19 files changed, 68 insertions(+), 2189 deletions(-)
 delete mode 100644 aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
 delete mode 100644 aom_dsp/x86/aom_subpixel_8t_sse2.asm
 delete mode 100644 aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
 rename aom_dsp/x86/{subpel_variance_sse2.asm => subpel_variance_ssse3.asm} (98%)
 delete mode 100644 av1/common/x86/cdef_block_sse2.c
 delete mode 100644 av1/common/x86/cdef_block_ssse3.c

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 653f690741..66b4a6e96d 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -52,15 +52,12 @@ list(APPEND AOM_DSP_COMMON_SOURCES
 list(APPEND AOM_DSP_COMMON_ASM_SSE2
             "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
             "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
             "${AOM_ROOT}/aom_dsp/x86/convolve.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
@@ -208,7 +205,6 @@ if(CONFIG_AV1_ENCODER)
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
               "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
               "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64
@@ -227,6 +223,9 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c")
 
+  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
+              "${AOM_ROOT}/aom_dsp/x86/subpel_variance_ssse3.asm")
+
   list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
               "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
               "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 7bb156ac59..076577454c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -498,8 +498,8 @@ ()
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
 specialize qw/aom_convolve_copy       neon                        sse2 avx2/;
-specialize qw/aom_convolve8_horiz     neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert      neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_horiz     neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert      neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3";
 
 add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/aom_scaled_2d ssse3 neon/;
@@ -1406,39 +1406,39 @@ ()
   specialize qw/aom_variance4x8       sse2      neon neon_dotprod/;
   specialize qw/aom_variance4x4       sse2      neon neon_dotprod/;
 
-  specialize qw/aom_sub_pixel_variance128x128   avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance128x64    avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x128    avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x64     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x32     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x16     avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8      avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x16           neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x8            neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x4            neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x8            neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x4            neon sse2 ssse3/;
-
-  specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance128x64  avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x128  avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x64   avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x32   avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x64   avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x32   avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x16   avx2 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x32        neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x16        neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x8         neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x16         neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x8          neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x4          neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x8          neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x4          neon sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x128   avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance128x64    avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance64x128    avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance64x64     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance64x32     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance32x64     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance32x32     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance32x16     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance16x32     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance16x16     avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance16x8      avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_variance8x16           neon ssse3/;
+  specialize qw/aom_sub_pixel_variance8x8            neon ssse3/;
+  specialize qw/aom_sub_pixel_variance8x4            neon ssse3/;
+  specialize qw/aom_sub_pixel_variance4x8            neon ssse3/;
+  specialize qw/aom_sub_pixel_variance4x4            neon ssse3/;
+
+  specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance128x64  avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x128  avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x64   avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x32   avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x64   avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x32   avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x16   avx2 neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x32        neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x16        neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x8         neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x16         neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x8          neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x4          neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x8          neon ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x4          neon ssse3/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     specialize qw/aom_variance4x16  neon neon_dotprod sse2/;
@@ -1448,18 +1448,18 @@ ()
     specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
     specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
 
-    specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance4x16 neon ssse3/;
+    specialize qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/;
+    specialize qw/aom_sub_pixel_variance8x32 neon ssse3/;
+    specialize qw/aom_sub_pixel_variance32x8 neon ssse3/;
+    specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/;
+    specialize qw/aom_sub_pixel_variance64x16 neon ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance32x8 neon ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance16x64 neon ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance64x16 neon ssse3/;
 
     specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  neon ssse3/;
     specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  neon ssse3/;
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index b08ec2546b..6c7fdd6eb1 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -15,40 +15,6 @@
 #include "aom_dsp/x86/convolve.h"
 
 #if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v4_sse2;
-filter8_1dfunction aom_filter_block1d16_h4_sse2;
-
-filter8_1dfunction aom_filter_block1d8_h4_sse2;
-filter8_1dfunction aom_filter_block1d8_v4_sse2;
-filter8_1dfunction aom_filter_block1d4_h4_sse2;
-filter8_1dfunction aom_filter_block1d4_v4_sse2;
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
-
 #if CONFIG_AV1_HIGHBITDEPTH
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
deleted file mode 100644
index 5c36b68727..0000000000
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
+++ /dev/null
@@ -1,569 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_ports/mem.h"
-
-void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
-                                  ptrdiff_t src_pixels_per_line,
-                                  uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                  uint32_t output_height,
-                                  const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
-      srcRegFilt32b2_2;
-  __m128i srcReg32b1, srcReg32b2;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
-    __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
-    __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
-    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
-    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // reading stride of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    ss_2 = _mm_srli_si128(srcReg32b2, 2);
-    ss_4 = _mm_srli_si128(srcReg32b2, 4);
-    ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
-    srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
-
-    ss_1 = _mm_srli_si128(srcReg32b2, 3);
-    ss_3 = _mm_srli_si128(srcReg32b2, 5);
-    ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
-    ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
-    srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
-
-    res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
-    res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
-    srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                  uint32_t output_height,
-                                  const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
-  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
-  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
-  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
-  __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
-  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
-  __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
-    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
-    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
-    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
-    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
-    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
-    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
-    resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
-    resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
-    __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
-    resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
-    __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
-    resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
-    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
-    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
-    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
-    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
-    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
-
-    src_ptr += src_stride;
-
-    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23_lo_1 = resReg45_lo_1;
-    resReg23_lo_2 = resReg45_lo_2;
-    resReg23_hi_1 = resReg45_hi_1;
-    resReg23_hi_2 = resReg45_hi_2;
-    resReg34_lo_1 = resReg56_lo_1;
-    resReg34_lo_2 = resReg56_lo_2;
-    resReg34_hi_1 = resReg56_hi_1;
-    resReg34_hi_2 = resReg56_hi_2;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
-                                 ptrdiff_t src_pixels_per_line,
-                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
-  __m128i srcReg32b1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
-    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_3, secondFilters);
-    d2 = _mm_madd_epi16(ss_5, thirdFilters);
-    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
-    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg34_lo;
-  __m128i srcReg45_lo, srcReg56_lo;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_45_lo, resReg34_56_lo;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
-    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
-    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    _

(Patch may be truncated, please check the link at the top of this post.)