From cf61d393e9c8c70b739e3a4e27c955d82e0c4e9a Mon Sep 17 00:00:00 2001
From: James Zern <[EMAIL REDACTED]>
Date: Fri, 10 May 2024 13:47:47 -0700
Subject: [PATCH] {jnt,}convolve_sse2: move load closer to first use
This generates mildly better assembly with gcc-13 and clang-16.
Change-Id: I1e8fb2a6407e292c15e44dc7dd2676bad9a69857
---
av1/common/x86/convolve_sse2.c | 2 +-
av1/common/x86/jnt_convolve_sse2.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 4787d3f1d..9272e91b5 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -201,7 +201,6 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
- src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -212,6 +211,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
+ src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 338615058..6b1227890 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -179,7 +179,6 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
if (w == 4) {
__m128i s[8], src6, res, res_shift;
- src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -190,6 +189,7 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
+ src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {