aom: Introduce av1_resize_plane_to_half() for Global Motion tool

From 9e633f0dcef31d86316c3e739fe2b9dbdfb2880d Mon Sep 17 00:00:00 2001
From: Samuthirika S <[EMAIL REDACTED]>
Date: Sun, 10 Mar 2024 11:31:57 +0530
Subject: [PATCH] Introduce av1_resize_plane_to_half() for Global Motion tool

Currently, the GM tool invokes av1_resize_plane() with a
downsample factor of exactly 2. To facilitate SIMD optimization
of this case, this CL introduces av1_resize_plane_to_half(),
which incorporates the necessary conditions from av1_resize_plane().
This is a bit-exact change with no impact on encode time.

Change-Id: I87ed23892221472477a209357cddd08919ad8edf
---
 aom_dsp/pyramid.c   | 31 +++++++++++++++++++++----
 av1/common/resize.c | 55 +++++++++++++++++++++++++++++++++++++++++++++
 av1/common/resize.h |  6 +++++
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c
index 5de001dbd..05ddbb2f5 100644
--- a/aom_dsp/pyramid.c
+++ b/aom_dsp/pyramid.c
@@ -305,6 +305,7 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
 
   // Fill in the remaining levels through progressive downsampling
   for (int level = already_filled_levels; level < n_levels; ++level) {
+    bool mem_status = false;
     PyramidLayer *prev_layer = &frame_pyr->layers[level - 1];
     uint8_t *prev_buffer = prev_layer->buffer;
     int prev_stride = prev_layer->stride;
@@ -315,6 +316,11 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
     int this_height = this_layer->height;
     int this_stride = this_layer->stride;
 
+    // The width and height of the previous layer that needs to be considered to
+    // derive the current layer frame.
+    const int input_layer_width = this_width << 1;
+    const int input_layer_height = this_height << 1;
+
     // Compute the this pyramid level by downsampling the current level.
     //
     // We downsample by a factor of exactly 2, clipping the rightmost and
@@ -329,13 +335,30 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
     // 2) Up/downsampling by a factor of 2 can be implemented much more
     //    efficiently than up/downsampling by a generic ratio.
     //    TODO(rachelbarker): Use optimized downsample-by-2 function
-    if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1,
-                          prev_stride, this_buffer, this_height, this_width,
-                          this_stride)) {
-      // If we can't allocate memory, we'll have to terminate early
+
+    // SIMD support has been added specifically for cases where the downsample
+    // factor is exactly 2. In such instances, horizontal and vertical resizing
+    // is performed utilizing the down2_symeven() function, which considers the
+    // even dimensions of the input layer.
+    if (should_resize_by_half(input_layer_height, input_layer_width,
+                              this_height, this_width)) {
+      assert(input_layer_height % 2 == 0 && input_layer_width % 2 == 0 &&
+             "Input width or height cannot be odd.");
+      mem_status = av1_resize_plane_to_half(
+          prev_buffer, input_layer_height, input_layer_width, prev_stride,
+          this_buffer, this_height, this_width, this_stride);
+    } else {
+      mem_status = av1_resize_plane(prev_buffer, input_layer_height,
+                                    input_layer_width, prev_stride, this_buffer,
+                                    this_height, this_width, this_stride);
+    }
+
+    // Terminate early in cases of memory allocation failure.
+    if (!mem_status) {
       frame_pyr->filled_levels = n_levels;
       return -1;
     }
+
     fill_border(this_buffer, this_width, this_height, this_stride);
   }
 
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 441323ab1..ef35fa227 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -524,6 +524,61 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
   }
 }
 
+static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output,
+                                   int out_stride, int height, int height2,
+                                   int width2) {
+  bool mem_status = true;  // Becomes false only if a scratch malloc fails.
+  uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
+  uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
+  if (arrbuf == NULL || arrbuf2 == NULL) {
+    mem_status = false;
+    goto Error;  // Skip the filtering; output is not written.
+  }
+
+  for (int i = 0; i < width2; ++i) {  // One intbuf column per iteration.
+    fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    down2_symeven(arrbuf, height, arrbuf2);
+    fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
+  }
+
+Error:  // Also reached on success: both scratch buffers are always freed.
+  aom_free(arrbuf);
+  aom_free(arrbuf2);
+  return mem_status;
+}
+
+static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride,
+                                   uint8_t *intbuf, int height,
+                                   int filtered_length, int width2) {
+  for (int i = 0; i < height; ++i)  // Row i of input -> row i of intbuf.
+    down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
+}
+
+bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
+                              int in_stride, uint8_t *output, int height2,
+                              int width2, int out_stride) {
+  uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height);
+  if (intbuf == NULL) {
+    return false;  // Allocation failed; output is untouched.
+  }
+
+  // Horizontal pass: filter each input row into intbuf (height x width2).
+  resize_horz_dir(input, in_stride, intbuf, height, width, width2);
+  // Vertical pass: filter intbuf columns into output; false on malloc failure.
+  bool mem_status =
+      resize_vert_dir(intbuf, output, out_stride, height, height2, width2);
+  aom_free(intbuf);
+  return mem_status;
+}
+
+// Returns true when get_down2_length(., 1) of input width and height equals
+// width2 and height2 respectively, i.e. the output is half the input size.
+bool should_resize_by_half(int height, int width, int height2, int width2) {
+  const bool is_width_by_2 = get_down2_length(width, 1) == width2;
+  const bool is_height_by_2 = get_down2_length(height, 1) == height2;
+  return (is_width_by_2 && is_height_by_2);
+}
+
 bool av1_resize_plane(const uint8_t *input, int height, int width,
                       int in_stride, uint8_t *output, int height2, int width2,
                       int out_stride) {
diff --git a/av1/common/resize.h b/av1/common/resize.h
index d573a538b..6e7d46e0d 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -93,6 +93,12 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
 void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool,
                           bool alloc_pyramid);
 
+bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
+                              int in_stride, uint8_t *output, int height2,
+                              int width2, int out_stride);
+
+bool should_resize_by_half(int height, int width, int height2, int width2);
+
 // Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
 static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
   // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling