aom: rtc: Speedup for dynamic screen content

From 2a16696ea0f1640097d0d27b75322fd39ac808f9 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <[EMAIL REDACTED]>
Date: Tue, 4 Jun 2024 14:45:27 -0700
Subject: [PATCH] rtc: Speedup for dynamic screen content

For rtc screen content mode, speed >= 11:
Detect if the content has high motion, from
source_sad and fast/coarse ME analysis in the
scene detection. If detected, adjust some speed
features and increase the base partition threshold
to make the encoder faster. Coarse ME is done on the center
superblock to avoid setting the high_motion_screen_content flag for scroll.

This also contains a fix to bsize_select for the
fixed partitioning.

This has a small effect on most clips in the rtc_screen set,
except for very high motion content, where it reduces
instruction count by ~50-60%. PSNR loss is ~0.4dB on the high
motion clip.

Change-Id: I68071da2b40731cc81ac15a8010976e6ef234776
---
 av1/encoder/aq_cyclicrefresh.c |   3 +-
 av1/encoder/encodeframe.c      |   2 +-
 av1/encoder/mcomp.c            |  12 ++--
 av1/encoder/mcomp.h            |   3 +
 av1/encoder/ratectrl.c         | 118 +++++++++++++++++++++++++++++++++
 av1/encoder/ratectrl.h         |   1 +
 av1/encoder/speed_features.c   |  16 +++--
 av1/encoder/var_based_part.c   |  35 +++-------
 8 files changed, 154 insertions(+), 36 deletions(-)

diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index e9fd77107..4d8be3112 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -439,7 +439,8 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
   // should we enable cyclic refresh on this frame.
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
-      scene_change_detected || svc->temporal_layer_id > 0 ||
+      cpi->rc.high_motion_screen_content || scene_change_detected ||
+      svc->temporal_layer_id > 0 ||
       svc->prev_number_spatial_layers != svc->number_spatial_layers ||
       p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
       (svc->number_spatial_layers > 1 &&
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 788ac8082..cac8d8151 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -545,7 +545,7 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
     BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
     if (sf->rt_sf.use_fast_fixed_part &&
         x->content_state_sb.source_sad_nonrd < kLowSad) {
-      bsize_select = BLOCK_64X64;
+      bsize_select = cm->seq_params->sb_size;
     }
     const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select;
     av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cf44db760..94fd17e4f 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -1993,8 +1993,8 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
   return best_hash_cost;
 }
 
-static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
-                        int full_search, int *sad) {
+int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl,
+                     int search_size, int full_search, int *sad) {
   int best_sad = INT_MAX;
   int this_sad;
   int d;
@@ -2174,11 +2174,11 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
 
   // Find the best match per 1-D search
   best_int_mv->as_fullmv.col =
-      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
-                   full_search, &best_sad_col);
+      av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize],
+                       search_size_width, full_search, &best_sad_col);
   best_int_mv->as_fullmv.row =
-      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
-                   full_search, &best_sad_row);
+      av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize],
+                       search_size_height, full_search, &best_sad_row);
 
   // For screen: select between horiz or vert motion.
   if (is_screen) {
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index d6dc8cba2..7dd32e0a7 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -262,6 +262,9 @@ void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
 
 int av1_init_search_range(int size);
 
+int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl,
+                     int search_size, int full_search, int *sad);
+
 unsigned int av1_int_pro_motion_estimation(
     const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
     int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index ec9ae1056..a34ce7875 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -33,6 +33,7 @@
 #include "av1/encoder/encoder_utils.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/gop_structure.h"
+#include "av1/encoder/mcomp.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 
@@ -3017,6 +3018,80 @@ static int set_block_is_active(unsigned char *const active_map_4x4, int mi_cols,
   return 0;
 }
 
+// Returns the lower SAD of the 1-D column/row searches; zeroes the other mv.
+static unsigned int estimate_scroll_motion(
+    const AV1_COMP *cpi, uint8_t *src_buf, uint8_t *last_src_buf,
+    int src_stride, int ref_stride, BLOCK_SIZE bsize, int pos_col, int pos_row,
+    int *best_intmv_col, int *best_intmv_row) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int full_search = 1;
+  // Round the border down to a multiple of 16.
+  const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+  // Make search_size_height larger to capture more common vertical scroll.
+  // Increase the search if last two frames were dropped.
+  // Values set based on screen test set.
+  int search_size_width = 96;
+  int search_size_height = cpi->rc.drop_count_consec > 1 ? 224 : 192;
+  // Fall back to the border size if the search would leave frame + border.
+  if ((pos_col - search_size_width < -border) ||
+      (pos_col + search_size_width > cm->width + border))
+    search_size_width = border;
+  if ((pos_row - search_size_height < -border) ||
+      (pos_row + search_size_height > cm->height + border))
+    search_size_height = border;
+  const uint8_t *ref_buf;
+  const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+  const int col_norm_factor = 3 + (bw >> 5);
+  const int ref_buf_width = (search_size_width << 1) + bw;
+  const int ref_buf_height = (search_size_height << 1) + bh;
+  int16_t *hbuf = (int16_t *)aom_malloc(ref_buf_width * sizeof(*hbuf));
+  int16_t *vbuf = (int16_t *)aom_malloc(ref_buf_height * sizeof(*vbuf));
+  int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+  int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+  if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
+    aom_free(hbuf);
+    aom_free(vbuf);
+    aom_free(src_hbuf);
+    aom_free(src_vbuf);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+  }
+  // Set up the reference 1-D set (hbuf) used for the column motion search.
+  ref_buf = last_src_buf - search_size_width;
+  aom_int_pro_row(hbuf, ref_buf, ref_stride, ref_buf_width, bh,
+                  row_norm_factor);
+  // Set up the reference 1-D set (vbuf) used for the row motion search.
+  ref_buf = last_src_buf - search_size_height * ref_stride;
+  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, ref_buf_height,
+                  col_norm_factor);
+  // Set up the source 1-D sets for the current block.
+  aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+  aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
+  unsigned int best_sad;
+  int best_sad_col, best_sad_row;
+  // Find the best match for each 1-D search.
+  *best_intmv_col =
+      av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize],
+                       search_size_width, full_search, &best_sad_col);
+  *best_intmv_row =
+      av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize],
+                       search_size_height, full_search, &best_sad_row);
+  if (best_sad_col < best_sad_row) {
+    *best_intmv_row = 0;
+    best_sad = best_sad_col;
+  } else {
+    *best_intmv_col = 0;
+    best_sad = best_sad_row;
+  }
+  aom_free(hbuf);
+  aom_free(vbuf);
+  aom_free(src_hbuf);
+  aom_free(src_vbuf);
+  return best_sad;
+}
+
 /*!\brief Check for scene detection, for 1 pass real-time mode.
  *
  * Compute average source sad (temporal sad: between current source and
@@ -3184,6 +3259,49 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
   if (num_samples > 0)
     rc->percent_blocks_with_motion =
         ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+  // Update the high_motion_screen_content flag on TL0. Avoid the update
+  // if too many consecutive frame drops occurred.
+  const uint64_t thresh_high_motion = 9 * 64 * 64;
+  if (cpi->svc.temporal_layer_id == 0 && rc->drop_count_consec < 3) {
+    cpi->rc.high_motion_screen_content = 0;
+    if (cpi->oxcf.speed >= 11 &&
+        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+        rc->percent_blocks_with_motion > 40 &&
+        rc->prev_avg_source_sad > thresh_high_motion &&
+        rc->avg_source_sad > thresh_high_motion &&
+        rc->avg_frame_low_motion < 60 && unscaled_src->y_width >= 1280 &&
+        unscaled_src->y_height >= 720) {
+      cpi->rc.high_motion_screen_content = 1;
+      // Compute fast coarse/global motion for 128x128 superblock centered
+      // at middle of frames, to determine if motion is scroll.
+      int pos_col = (unscaled_src->y_width >> 1) - 64;
+      int pos_row = (unscaled_src->y_height >> 1) - 64;
+      src_y = unscaled_src->y_buffer + pos_row * src_ystride + pos_col;
+      last_src_y =
+          unscaled_last_src->y_buffer + pos_row * last_src_ystride + pos_col;
+      int best_intmv_col = 0;
+      int best_intmv_row = 0;
+      unsigned int y_sad = estimate_scroll_motion(
+          cpi, src_y, last_src_y, src_ystride, last_src_ystride, BLOCK_128X128,
+          pos_col, pos_row, &best_intmv_col, &best_intmv_row);
+      if (y_sad < 100 && (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16))
+        cpi->rc.high_motion_screen_content = 0;
+    }
+    // Pass the flag value to all layer frames.
+    if (cpi->svc.number_spatial_layers > 1 ||
+        cpi->svc.number_temporal_layers > 1) {
+      SVC *svc = &cpi->svc;
+      for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+        for (int tl = 1; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          lrc->high_motion_screen_content = rc->high_motion_screen_content;
+        }
+      }
+    }
+  }
   // Scene detection is only on base SLO, and using full/orignal resolution.
   // Pass the state to the upper spatial layers.
   if (cpi->svc.number_spatial_layers > 1) {
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 5fcb65e07..0a5cfbc17 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -190,6 +190,7 @@ typedef struct {
   int sframe_due;
 
   int high_source_sad;
+  int high_motion_screen_content;
   uint64_t avg_source_sad;
   uint64_t prev_avg_source_sad;
   uint64_t frame_source_sad;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 31fe03aeb..4b6ea16d7 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1608,10 +1608,18 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
           sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
       }
     }
-    if (cpi->rc.max_block_source_sad > 20000 &&
-        cpi->rc.frame_source_sad > 100 && speed >= 6 &&
-        (cpi->rc.percent_blocks_with_motion > 1 ||
-         cpi->svc.last_layer_dropped[0])) {
+    if (speed >= 11 && cpi->rc.high_motion_screen_content) {
+      sf->rt_sf.higher_thresh_scene_detection = 1;
+      sf->rt_sf.force_only_last_ref = 1;
+      sf->rt_sf.use_nonrd_filter_search = 0;
+      sf->part_sf.fixed_partition_size = BLOCK_32X32;
+      sf->rt_sf.use_fast_fixed_part = 1;
+      sf->rt_sf.increase_source_sad_thresh = 1;
+      sf->rt_sf.selective_cdf_update = 1;
+    } else if (cpi->rc.max_block_source_sad > 20000 &&
+               cpi->rc.frame_source_sad > 100 && speed >= 6 &&
+               (cpi->rc.percent_blocks_with_motion > 1 ||
+                cpi->svc.last_layer_dropped[0])) {
       sf->mv_sf.search_method = NSTEP;
       sf->rt_sf.fullpel_search_step_param = 2;
     }
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index e5908f41c..0b449e89d 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -627,14 +627,12 @@ static AOM_INLINE void tune_thresh_based_on_resolution(
   }
 }
 
-// Increase partition thresholds for noisy content. Apply it only for
-// superblocks where sumdiff is low, as we assume the sumdiff of superblock
-// whose only change is due to noise will be low (i.e, noise will average
-// out over large block).
-static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
-                                                    int64_t threshold_base,
-                                                    int content_lowsumdiff,
-                                                    int num_pixels) {
+// Increase the base partition threshold, based on content and noise level.
+static AOM_INLINE int64_t tune_base_thresh_content(AV1_COMP *cpi,
+                                                   int64_t threshold_base,
+                                                   int content_lowsumdiff,
+                                                   int source_sad_nonrd,
+                                                   int num_pixels) {
   AV1_COMMON *const cm = &cpi->common;
   int64_t updated_thresh_base = threshold_base;
   if (cpi->noise_estimate.enabled && content_lowsumdiff &&
@@ -647,23 +645,12 @@ static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
              !cpi->sf.rt_sf.prefer_large_partition_blocks)
       updated_thresh_base = (5 * updated_thresh_base) >> 2;
   }
-  // TODO(kyslov) Enable var based partition adjusment on temporal denoising
-#if 0  // CONFIG_AV1_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
-      cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
-      updated_thresh_base =
-          av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level,
-                                content_state, cpi->svc.temporal_layer_id);
-  else
-    threshold_base =
-        scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width,
-                                  cm->height, cpi->ppi->rtc_ref.non_reference_frame);
-#else
-  // Increase base variance threshold based on content_state/sum_diff level.
   updated_thresh_base = scale_part_thresh_content(
       updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height,
       cpi->ppi->rtc_ref.non_reference_frame);
-#endif
+  if (cpi->oxcf.speed >= 11 && source_sad_nonrd > kLowSad &&
+      cpi->rc.high_motion_screen_content)
+    updated_thresh_base = updated_thresh_base << 5;
   return updated_thresh_base;
 }
 
@@ -686,8 +673,8 @@ static AOM_INLINE void set_vbp_thresholds(
     return;
   }
 
-  threshold_base = tune_thresh_noisy_content(cpi, threshold_base,
-                                             content_lowsumdiff, num_pixels);
+  threshold_base = tune_base_thresh_content(
+      cpi, threshold_base, content_lowsumdiff, source_sad_nonrd, num_pixels);
   thresholds[0] = threshold_base >> 1;
   thresholds[1] = threshold_base;
   thresholds[3] = threshold_base << threshold_left_shift;