summary | refs | log | tree | commit | diff
path: root/third_party/aom/av1
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1')
-rw-r--r-- third_party/aom/av1/av1.cmake | 10
-rw-r--r-- third_party/aom/av1/av1_cx_iface.c | 59
-rw-r--r-- third_party/aom/av1/av1_dx_iface.c | 135
-rw-r--r-- third_party/aom/av1/common/alloccommon.c | 4
-rw-r--r-- third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 844
-rw-r--r-- third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 152
-rw-r--r-- third_party/aom/av1/common/arm/convolve_neon.c | 24
-rw-r--r-- third_party/aom/av1/common/arm/intrapred_neon.c | 79
-rw-r--r-- third_party/aom/av1/common/arm/jnt_convolve_neon.c | 24
-rw-r--r-- third_party/aom/av1/common/arm/mem_neon.h | 84
-rw-r--r-- third_party/aom/av1/common/arm/selfguided_neon.c | 1506
-rw-r--r-- third_party/aom/av1/common/arm/transpose_neon.h | 38
-rw-r--r-- third_party/aom/av1/common/av1_loopfilter.c | 51
-rw-r--r-- third_party/aom/av1/common/av1_rtcd.c | 6
-rwxr-xr-x third_party/aom/av1/common/av1_rtcd_defs.pl | 52
-rw-r--r-- third_party/aom/av1/common/av1_txfm.h | 47
-rw-r--r-- third_party/aom/av1/common/blockd.h | 47
-rw-r--r-- third_party/aom/av1/common/cdef.c | 6
-rw-r--r-- third_party/aom/av1/common/cfl.c | 15
-rw-r--r-- third_party/aom/av1/common/convolve.c | 154
-rw-r--r-- third_party/aom/av1/common/convolve.h | 18
-rw-r--r-- third_party/aom/av1/common/enums.h | 2
-rw-r--r-- third_party/aom/av1/common/filter.c | 120
-rw-r--r-- third_party/aom/av1/common/filter.h | 116
-rw-r--r-- third_party/aom/av1/common/mv.h | 3
-rw-r--r-- third_party/aom/av1/common/mvref_common.h | 2
-rw-r--r-- third_party/aom/av1/common/onyxc_int.h | 76
-rw-r--r-- third_party/aom/av1/common/quant_common.c | 23
-rw-r--r-- third_party/aom/av1/common/quant_common.h | 1
-rw-r--r-- third_party/aom/av1/common/reconinter.c | 65
-rw-r--r-- third_party/aom/av1/common/reconinter.h | 10
-rw-r--r-- third_party/aom/av1/common/reconintra.c | 19
-rw-r--r-- third_party/aom/av1/common/reconintra.h | 10
-rw-r--r-- third_party/aom/av1/common/resize.c | 41
-rw-r--r-- third_party/aom/av1/common/restoration.c | 40
-rw-r--r-- third_party/aom/av1/common/restoration.h | 2
-rw-r--r-- third_party/aom/av1/common/scan.h | 7
-rw-r--r-- third_party/aom/av1/common/thread_common.c | 2
-rw-r--r-- third_party/aom/av1/common/tile_common.c | 4
-rw-r--r-- third_party/aom/av1/common/timing.c | 4
-rw-r--r-- third_party/aom/av1/common/timing.h | 10
-rw-r--r-- third_party/aom/av1/common/txb_common.h | 25
-rw-r--r-- third_party/aom/av1/common/warped_motion.c | 27
-rw-r--r-- third_party/aom/av1/common/warped_motion.h | 5
-rw-r--r-- third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c | 16
-rw-r--r-- third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 1058
-rw-r--r-- third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 141
-rw-r--r-- third_party/aom/av1/common/x86/av1_txfm_sse4.c | 11
-rw-r--r-- third_party/aom/av1/common/x86/av1_txfm_sse4.h | 11
-rw-r--r-- third_party/aom/av1/common/x86/convolve_2d_avx2.c | 16
-rw-r--r-- third_party/aom/av1/common/x86/convolve_2d_sse2.c | 16
-rw-r--r-- third_party/aom/av1/common/x86/convolve_avx2.c | 16
-rw-r--r-- third_party/aom/av1/common/x86/convolve_sse2.c | 10
-rw-r--r-- third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c | 8
-rw-r--r-- third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c | 4
-rw-r--r-- third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c | 12
-rw-r--r-- third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 12
-rw-r--r-- third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 40
-rw-r--r-- third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c | 8
-rw-r--r-- third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 32
-rw-r--r-- third_party/aom/av1/common/x86/jnt_convolve_sse2.c | 8
-rw-r--r-- third_party/aom/av1/common/x86/jnt_convolve_ssse3.c | 8
-rw-r--r-- third_party/aom/av1/common/x86/selfguided_sse4.c | 11
-rw-r--r-- third_party/aom/av1/decoder/decodeframe.c | 1726
-rw-r--r-- third_party/aom/av1/decoder/decodeframe.h | 29
-rw-r--r-- third_party/aom/av1/decoder/decodemv.c | 13
-rw-r--r-- third_party/aom/av1/decoder/decoder.c | 56
-rw-r--r-- third_party/aom/av1/decoder/decoder.h | 96
-rw-r--r-- third_party/aom/av1/decoder/decodetxb.c | 31
-rw-r--r-- third_party/aom/av1/decoder/decodetxb.h | 8
-rw-r--r-- third_party/aom/av1/decoder/dthread.c | 8
-rw-r--r-- third_party/aom/av1/decoder/dthread.h | 1
-rw-r--r-- third_party/aom/av1/decoder/obu.c | 252
-rw-r--r-- third_party/aom/av1/encoder/aq_complexity.c | 8
-rw-r--r-- third_party/aom/av1/encoder/aq_cyclicrefresh.c | 40
-rw-r--r-- third_party/aom/av1/encoder/aq_variance.c | 8
-rw-r--r-- third_party/aom/av1/encoder/av1_quantize.c | 8
-rw-r--r-- third_party/aom/av1/encoder/bitstream.c | 232
-rw-r--r-- third_party/aom/av1/encoder/block.h | 2
-rw-r--r-- third_party/aom/av1/encoder/dwt.c | 11
-rw-r--r-- third_party/aom/av1/encoder/dwt.h | 11
-rw-r--r-- third_party/aom/av1/encoder/encodeframe.c | 263
-rw-r--r-- third_party/aom/av1/encoder/encoder.c | 604
-rw-r--r-- third_party/aom/av1/encoder/encoder.h | 20
-rw-r--r-- third_party/aom/av1/encoder/encodetxb.c | 18
-rw-r--r-- third_party/aom/av1/encoder/encodetxb.h | 10
-rw-r--r-- third_party/aom/av1/encoder/ethread.c | 7
-rw-r--r-- third_party/aom/av1/encoder/firstpass.c | 493
-rw-r--r-- third_party/aom/av1/encoder/firstpass.h | 7
-rw-r--r-- third_party/aom/av1/encoder/hash_motion.c | 11
-rw-r--r-- third_party/aom/av1/encoder/partition_model_weights.h (renamed from third_party/aom/av1/encoder/ab_partition_model_weights.h) | 475
-rw-r--r-- third_party/aom/av1/encoder/pickcdef.c | 8
-rw-r--r-- third_party/aom/av1/encoder/picklpf.c | 12
-rw-r--r-- third_party/aom/av1/encoder/pickrst.c | 21
-rw-r--r-- third_party/aom/av1/encoder/pustats.h | 208
-rw-r--r-- third_party/aom/av1/encoder/rate_distortion_model_params.h | 591
-rw-r--r-- third_party/aom/av1/encoder/ratectrl.c | 186
-rw-r--r-- third_party/aom/av1/encoder/ratectrl.h | 14
-rw-r--r-- third_party/aom/av1/encoder/rd.c | 12
-rw-r--r-- third_party/aom/av1/encoder/rd.h | 3
-rw-r--r-- third_party/aom/av1/encoder/rdopt.c | 2289
-rw-r--r-- third_party/aom/av1/encoder/rdopt.h | 8
-rw-r--r-- third_party/aom/av1/encoder/speed_features.c | 43
-rw-r--r-- third_party/aom/av1/encoder/speed_features.h | 7
-rw-r--r-- third_party/aom/av1/encoder/temporal_filter.c | 4
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 11
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 2068
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 103
-rw-r--r-- third_party/aom/av1/encoder/x86/corner_match_sse4.c | 11
-rw-r--r-- third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 215
110 files changed, 11824 insertions, 3845 deletions
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 1c7f937e1a..4c4f542fe7 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -45,7 +45,6 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/entropymv.c"
"${AOM_ROOT}/av1/common/entropymv.h"
"${AOM_ROOT}/av1/common/enums.h"
- "${AOM_ROOT}/av1/common/filter.c"
"${AOM_ROOT}/av1/common/filter.h"
"${AOM_ROOT}/av1/common/frame_buffers.c"
"${AOM_ROOT}/av1/common/frame_buffers.h"
@@ -274,7 +273,10 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
- "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c")
+ "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
@@ -296,7 +298,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
"${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
"${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
- "${AOM_ROOT}/av1/common/arm/intrapred_neon.c"
+ "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
"${AOM_ROOT}/av1/common/cdef_block_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 9d5414c1e5..3bc4804c9b 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -94,6 +94,10 @@ struct av1_extracfg {
int enable_warped_motion; // sequence level
int allow_warped_motion; // frame level
int enable_superres;
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+#endif
};
static struct av1_extracfg default_extra_cfg = {
@@ -160,6 +164,10 @@ static struct av1_extracfg default_extra_cfg = {
1, // enable_warped_motion at sequence level
1, // allow_warped_motion at frame level
1, // superres
+#if CONFIG_DENOISE
+ 0, // noise_level
+ 32, // noise_block_size
+#endif
};
struct aom_codec_alg_priv {
@@ -464,7 +472,7 @@ static aom_codec_err_t set_encoder_config(
oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
oxcf->timing_info.equal_picture_interval = 0;
oxcf->decoder_model_info_present_flag = 1;
- oxcf->buffer_removal_delay_present = 1;
+ oxcf->buffer_removal_time_present = 1;
oxcf->display_model_info_present_flag = 1;
}
if (oxcf->init_framerate > 180) {
@@ -612,6 +620,10 @@ static aom_codec_err_t set_encoder_config(
oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
}
+#if CONFIG_DENOISE
+ oxcf->noise_level = extra_cfg->noise_level;
+ oxcf->noise_block_size = extra_cfg->noise_block_size;
+#endif
oxcf->large_scale_tile = cfg->large_scale_tile;
oxcf->single_tile_decoding =
(oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
@@ -710,7 +722,7 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
ctx->cfg = *cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
// On profile change, request a key frame
- force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+ force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile;
av1_change_config(ctx->cpi, &ctx->oxcf);
}
@@ -1055,6 +1067,23 @@ static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+#if CONFIG_DENOISE
+static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.noise_level =
+ ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+
static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1119,7 +1148,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
}
priv->extra_cfg = default_extra_cfg;
- once(av1_initialize_enc);
+ aom_once(av1_initialize_enc);
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -1200,6 +1229,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
volatile aom_enc_frame_flags_t flags = enc_flags;
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
if (setjmp(cpi->common.error.jmp)) {
cpi->common.error.setjmp = 0;
res = update_error_state(ctx, &cpi->common.error);
@@ -1259,7 +1291,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
if (cx_data_sz < ctx->cx_data_sz / 2) {
aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
"Compressed data buffer too small");
- return AOM_CODEC_ERROR;
}
}
@@ -1275,8 +1306,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
!img, timebase)) {
if (cpi->common.seq_params.frame_id_numbers_present_flag) {
if (cpi->common.invalid_delta_frame_id_minus_1) {
- ctx->base.err_detail = "Invalid delta_frame_id_minus_1";
- return AOM_CODEC_ERROR;
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
}
}
cpi->seq_params_locked = 1;
@@ -1305,7 +1336,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
// OBUs are preceded/succeeded by an unsigned leb128 coded integer.
if (write_uleb_obu_size(obu_header_size, obu_payload_size,
ctx->pending_cx_data) != AOM_CODEC_OK) {
- return AOM_CODEC_ERROR;
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
}
frame_size += obu_header_size + obu_payload_size + length_field_size;
@@ -1315,7 +1346,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
size_t curr_frame_size = frame_size;
if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
AOM_CODEC_OK) {
- return AOM_CODEC_ERROR;
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
}
frame_size = curr_frame_size;
@@ -1327,7 +1358,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
}
if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
AOM_CODEC_OK) {
- return AOM_CODEC_ERROR;
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
}
frame_size += length_field_size;
}
@@ -1358,7 +1389,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
}
if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
AOM_CODEC_OK) {
- return AOM_CODEC_ERROR;
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
}
ctx->pending_cx_data_sz += length_field_size;
}
@@ -1710,6 +1741,10 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
{ AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
{ AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+#if CONFIG_DENOISE
+ { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+ { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+#endif // CONFIG_DENOISE
{ AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
// Getters
@@ -1728,7 +1763,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
{
// NOLINT
0, // g_usage
- 8, // g_threads
+ 0, // g_threads
0, // g_profile
320, // g_width
@@ -1810,7 +1845,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
NULL, // aom_codec_peek_si_fn_t
NULL, // aom_codec_get_si_fn_t
NULL, // aom_codec_decode_fn_t
- NULL, // aom_codec_frame_get_fn_t
+ NULL, // aom_codec_get_frame_fn_t
NULL // aom_codec_set_fb_fn_t
},
{
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
index db338f7e3c..f425720194 100644
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -50,6 +50,7 @@ struct aom_codec_alg_priv {
int decode_tile_col;
unsigned int tile_mode;
unsigned int ext_tile_debug;
+ unsigned int row_mt;
EXTERNAL_REFERENCES ext_refs;
unsigned int is_annexb;
int operating_point;
@@ -61,7 +62,7 @@ struct aom_codec_alg_priv {
int last_submit_worker_id;
int next_output_worker_id;
int available_threads;
- aom_image_t *image_with_grain;
+ aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS];
int need_resync; // wait for key/intra-only frame
// BufferPool that holds all reference frames. Shared by all the FrameWorkers.
BufferPool *buffer_pool;
@@ -101,7 +102,7 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
// default values
priv->cfg.cfg.ext_partition = 1;
}
- priv->image_with_grain = NULL;
+ av1_zero(priv->image_with_grain);
}
return AOM_CODEC_OK;
@@ -139,7 +140,9 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
aom_free(ctx->frame_workers);
aom_free(ctx->buffer_pool);
- if (ctx->image_with_grain) aom_img_free(ctx->image_with_grain);
+ for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) {
+ if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]);
+ }
aom_free(ctx);
return AOM_CODEC_OK;
}
@@ -339,16 +342,16 @@ static int frame_worker_hook(void *arg1, void *arg2) {
const uint8_t *data = frame_worker_data->data;
(void)arg2;
- frame_worker_data->result = av1_receive_compressed_data(
- frame_worker_data->pbi, frame_worker_data->data_size, &data);
+ int result = av1_receive_compressed_data(frame_worker_data->pbi,
+ frame_worker_data->data_size, &data);
frame_worker_data->data_end = data;
- if (frame_worker_data->result != 0) {
+ if (result != 0) {
// Check decode result in serial decode.
frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
frame_worker_data->pbi->need_resync = 1;
}
- return !frame_worker_data->result;
+ return !result;
}
static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
@@ -429,6 +432,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
frame_worker_data->pbi->operating_point = ctx->operating_point;
frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+ frame_worker_data->pbi->row_mt = ctx->row_mt;
worker->hook = (AVxWorkerHook)frame_worker_hook;
if (!winterface->reset(worker)) {
@@ -489,6 +493,7 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+ frame_worker_data->pbi->row_mt = ctx->row_mt;
frame_worker_data->pbi->ext_refs = ctx->ext_refs;
frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
@@ -592,21 +597,31 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
return res;
}
-aom_image_t *add_grain_if_needed(aom_image_t *img, aom_image_t *grain_img_buf,
- aom_film_grain_t *grain_params) {
+// If grain_params->apply_grain is false, returns img. Otherwise, adds film
+// grain to img, saves the result in *grain_img_ptr (allocating *grain_img_ptr
+// if necessary), and returns *grain_img_ptr.
+static aom_image_t *add_grain_if_needed(aom_image_t *img,
+ aom_image_t **grain_img_ptr,
+ aom_film_grain_t *grain_params) {
if (!grain_params->apply_grain) return img;
- if (grain_img_buf &&
- (img->d_w != grain_img_buf->d_w || img->d_h != grain_img_buf->d_h ||
- img->fmt != grain_img_buf->fmt || !(img->d_h % 2) || !(img->d_w % 2))) {
- aom_img_free(grain_img_buf);
- grain_img_buf = NULL;
+ aom_image_t *grain_img_buf = *grain_img_ptr;
+
+ const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+ const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+
+ if (grain_img_buf) {
+ const int alloc_w = ALIGN_POWER_OF_TWO(grain_img_buf->d_w, 1);
+ const int alloc_h = ALIGN_POWER_OF_TWO(grain_img_buf->d_h, 1);
+ if (w_even != alloc_w || h_even != alloc_h ||
+ img->fmt != grain_img_buf->fmt) {
+ aom_img_free(grain_img_buf);
+ grain_img_buf = NULL;
+ }
}
if (!grain_img_buf) {
- int w_even = img->d_w % 2 ? img->d_w + 1 : img->d_w;
- int h_even = img->d_h % 2 ? img->d_h + 1 : img->d_h;
grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
- grain_img_buf->bit_depth = img->bit_depth;
+ *grain_img_ptr = grain_img_buf;
}
av1_add_film_grain(grain_params, img, grain_img_buf);
@@ -649,8 +664,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
aom_film_grain_t *grain_params;
if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
&grain_params) == 0) {
- *index += 1; // Advance the iterator to point to the next image
-
AV1Decoder *const pbi = frame_worker_data->pbi;
AV1_COMMON *const cm = &pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
@@ -659,6 +672,7 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
if (!pbi->ext_tile_debug && cm->large_scale_tile) {
+ *index += 1; // Advance the iterator to point to the next image
img = &ctx->img;
img->img_data = pbi->tile_list_output;
img->sz = pbi->tile_list_size;
@@ -688,11 +702,14 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
const int mi_col = tile_col * cm->tile_width;
const int ssx = ctx->img.x_chroma_shift;
+ const int is_hbd =
+ (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
int plane;
- ctx->img.planes[0] += mi_col * MI_SIZE;
+ ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
if (num_planes > 1) {
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+ ctx->img.planes[plane] +=
+ mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
}
}
ctx->img.d_w =
@@ -703,7 +720,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
img = &ctx->img;
img->temporal_id = cm->temporal_layer_id;
img->spatial_id = cm->spatial_layer_id;
- return add_grain_if_needed(img, ctx->image_with_grain, grain_params);
+ aom_image_t *res = add_grain_if_needed(
+ img, &ctx->image_with_grain[*index], grain_params);
+ *index += 1; // Advance the iterator to point to the next image
+ return res;
}
} else {
// Decoding failed. Release the worker thread.
@@ -999,7 +1019,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
- *bit_depth = cm->bit_depth;
+ *bit_depth = cm->seq_params.bit_depth;
return AOM_CODEC_OK;
} else {
return AOM_CODEC_ERROR;
@@ -1009,6 +1029,64 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_INVALID_PARAM;
}
+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
+ int use_highbitdepth) {
+ aom_img_fmt_t fmt = 0;
+
+ if (subsampling_x == 0 && subsampling_y == 0)
+ fmt = AOM_IMG_FMT_I444;
+ else if (subsampling_x == 1 && subsampling_y == 0)
+ fmt = AOM_IMG_FMT_I422;
+ else if (subsampling_x == 1 && subsampling_y == 1)
+ fmt = AOM_IMG_FMT_I420;
+
+ if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+ return fmt;
+}
+
+static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *);
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+ if (img_fmt) {
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+
+ *img_fmt = get_img_format(cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ unsigned int *const tile_size = va_arg(args, unsigned int *);
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+ if (tile_size) {
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ *tile_size =
+ ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+ return AOM_CODEC_INVALID_PARAM;
+}
+
static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
va_list args) {
ctx->invert_tile_order = va_arg(args, int);
@@ -1124,6 +1202,12 @@ static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_OK;
}
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->row_mt = va_arg(args, unsigned int);
+ return AOM_CODEC_OK;
+}
+
static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ AV1_COPY_REFERENCE, ctrl_copy_reference },
@@ -1145,6 +1229,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
{ AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
{ AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+ { AV1D_SET_ROW_MT, ctrl_set_row_mt },
{ AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
// Getters
@@ -1152,6 +1237,8 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
{ AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
{ AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+ { AV1D_GET_IMG_FORMAT, ctrl_get_img_format },
+ { AV1D_GET_TILE_SIZE, ctrl_get_tile_size },
{ AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
{ AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
{ AV1_GET_ACCOUNTING, ctrl_get_accounting },
@@ -1180,7 +1267,7 @@ CODEC_INTERFACE(aom_codec_av1_dx) = {
decoder_peek_si, // aom_codec_peek_si_fn_t
decoder_get_si, // aom_codec_get_si_fn_t
decoder_decode, // aom_codec_decode_fn_t
- decoder_get_frame, // aom_codec_frame_get_fn_t
+ decoder_get_frame, // aom_codec_get_frame_fn_t
decoder_set_fb_fn, // aom_codec_set_fb_fn_t
},
{
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
index 49902cc7d5..1bf81c91d4 100644
--- a/third_party/aom/av1/common/alloccommon.c
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -137,11 +137,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
// Now we need to allocate enough space to store the line buffers for the
// stripes
const int frame_w = cm->superres_upscaled_width;
- const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
for (int p = 0; p < num_planes; ++p) {
const int is_uv = p > 0;
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 0000000000..51c9914986
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+
+static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
+ const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+ TxSetType tx_set_type;
+ if (tx_size_sqr_up > TX_32X32) {
+ tx_set_type = EXT_TX_SET_DCTONLY;
+ } else if (tx_size_sqr_up == TX_32X32) {
+ tx_set_type = EXT_TX_SET_DCT_IDTX;
+ } else {
+ tx_set_type = EXT_TX_SET_ALL16;
+ }
+ return tx_set_type;
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
+ { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
+ { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
+ { av1_idct32_new, NULL, NULL },
+ { av1_idct64_new, NULL, NULL },
+};
+
+// Functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+static const transform_1d_neon
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { av1_idct4_new, av1_idct4_new, NULL, NULL },
+ { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
+ { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
+ },
+ { { av1_idct8_new, av1_idct8_new, NULL, NULL },
+ { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
+ { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
+ {
+ { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
+ { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
+ { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
+ },
+ { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new },
+ { NULL, NULL, NULL, NULL },
+ { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
+ av1_iidentity32_c } },
+ { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ // Doing memset for the rows which are not processed in row transform.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
+
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content. Called by
+ * lowbd_inv_txfm2d_add_universe_neon() for H_DCT, H_ADST and H_FLIPADST.
+ *
+ * input   : dequantized coefficients, consumed row by row with a pitch of
+ *           tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ * tx_size : transform size; the scratch buffer is sized for blocks of at
+ *           most 32x32.
+ * eob     : end-of-block position; get_eobx_eoby_scan_v_identity() turns it
+ *           into the number of coefficient rows that need a row transform.
+ */
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  // Scratch layout: temp_in | temp_out | buf. The element type is int32_t so
+  // that the int32_t aliases below match the buffer exactly (int and int32_t
+  // are distinct types on targets where int32_t is long).
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  // Loop invariant: when the log ratio is +/-1 every input row is rescaled
+  // by NewInvSqrt2 before the row transform.
+  const int scale_input = (abs(rect_type) == 1);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: only the first row_start rows (a multiple of 8) are transformed.
+  const int row_start = buf_size_nonzero_h_div8 * 8;
+  for (int i = 0; i < row_start; i++) {
+    if (scale_input) {
+      for (int j = 0; j < txfm_size_col; j++) {
+        temp_in[j] =
+            round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      }
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Rows that were not processed by the row transform are zero.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; c++) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content. Called by
+ * lowbd_inv_txfm2d_add_universe_neon() for V_DCT, V_ADST and V_FLIPADST.
+ *
+ * input   : dequantized coefficients, consumed row by row with a pitch of
+ *           tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ * tx_size : transform size; the scratch buffer is sized for blocks of at
+ *           most 32x32.
+ * eob     : end-of-block position; get_eobx_eoby_scan_h_identity() turns it
+ *           into the number of coefficient rows that need a row transform.
+ */
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  // Scratch layout: temp_in | temp_out | buf. The element type is int32_t so
+  // that the int32_t aliases below match the buffer exactly (int and int32_t
+  // are distinct types on targets where int32_t is long).
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  // Loop invariant: when the log ratio is +/-1 every input row is rescaled
+  // by NewInvSqrt2 before the row transform.
+  const int scale_input = (abs(rect_type) == 1);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: only the first row_start rows (a multiple of 8) are transformed.
+  const int row_start = buf_size_nonzero_h_div8 * 8;
+  for (int i = 0; i < row_start; i++) {
+    if (scale_input) {
+      for (int j = 0; j < txfm_size_col; j++) {
+        temp_in[j] =
+            round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      }
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Rows that were not processed by the row transform are zero.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; c++) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content, selected by
+ * av1_lowbd_inv_txfm2d_add_neon() for TX_4X4.
+ *
+ * Every row is transformed (eob is ignored), the input is not rescaled and
+ * no rounding shift is applied after the row pass; only shift[1] is applied
+ * after the column pass.
+ *
+ * input   : dequantized coefficients, row pitch tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ */
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+                                                 uint8_t *output, int stride,
+                                                 TX_TYPE tx_type,
+                                                 TX_SIZE tx_size, int eob) {
+  (void)eob;
+  // Scratch layout: temp_in | temp_out | buf. Declared as int32_t so the
+  // int32_t aliases below match the buffer's element type exactly.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 4 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass.
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; ++c) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content, selected by
+ * av1_lowbd_inv_txfm2d_add_neon() for TX_4X8.
+ *
+ * Every row is transformed (eob is ignored). Each input row is rescaled by
+ * NewInvSqrt2 (rounded with NewSqrt2Bits) before the row transform; no
+ * rounding shift follows the row pass, and shift[1] follows the column pass.
+ *
+ * input   : dequantized coefficients, row pitch tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ */
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  (void)eob;
+  // Scratch layout: temp_in | temp_out | buf. Declared as int32_t so the
+  // int32_t aliases below match the buffer's element type exactly.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass on rescaled input.
+  for (int i = 0; i < txfm_size_row; i++) {
+    for (int j = 0; j < txfm_size_col; j++) {
+      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+    }
+
+    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; ++c) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content, selected by
+ * av1_lowbd_inv_txfm2d_add_neon() for TX_8X4.
+ *
+ * Every row is transformed (eob is ignored). Each input row is rescaled by
+ * NewInvSqrt2 (rounded with NewSqrt2Bits) before the row transform; no
+ * rounding shift follows the row pass, and shift[1] follows the column pass.
+ *
+ * input   : dequantized coefficients, row pitch tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ */
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  (void)eob;
+  // Scratch layout: temp_in | temp_out | buf. Declared as int32_t so the
+  // int32_t aliases below match the buffer's element type exactly.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 4 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass on rescaled input.
+  for (int i = 0; i < txfm_size_row; i++) {
+    for (int j = 0; j < txfm_size_col; j++) {
+      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+    }
+
+    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; ++c) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content, selected by
+ * av1_lowbd_inv_txfm2d_add_neon() for TX_4X16.
+ *
+ * Every row is transformed (eob is ignored) without input rescaling;
+ * shift[0] is applied after the row pass and shift[1] after the column pass.
+ *
+ * input   : dequantized coefficients, row pitch tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ */
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  (void)eob;
+  // Scratch layout: temp_in | temp_out | buf. Declared as int32_t so the
+  // int32_t aliases below match the buffer's element type exactly.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16 + 16 + 16]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass followed by the first rounding shift.
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; ++c) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content, selected by
+ * av1_lowbd_inv_txfm2d_add_neon() for TX_16X4.
+ *
+ * Every row is transformed (eob is ignored) without input rescaling;
+ * shift[0] is applied after the row pass and shift[1] after the column pass.
+ *
+ * input   : dequantized coefficients, row pitch tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ */
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  (void)eob;
+
+  // Scratch layout: temp_in | temp_out | buf. Declared as int32_t so the
+  // int32_t aliases below match the buffer's element type exactly.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 4 + 16 + 16]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass followed by the first rounding shift.
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; ++c) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * 2D inverse transform + reconstruction for 8-bit content. This is the
+ * default path of lowbd_inv_txfm2d_add_universe_neon(), taken for every
+ * tx_type that is not IDTX, H_* or V_*.
+ *
+ * input   : dequantized coefficients, consumed row by row with a pitch of
+ *           tx_size_wide[tx_size].
+ * output  : 8-bit destination; the residual is added in place and clipped
+ *           through highbd_clip_pixel_add() with bd = 8.
+ * stride  : row pitch of output.
+ * tx_type : selects the 1D row/column kernels and the flip configuration.
+ * tx_size : transform size; the scratch buffer is sized for blocks of at
+ *           most 64x64.
+ * eob     : end-of-block position; get_eobx_eoby_scan_default() turns it
+ *           into the number of coefficient rows that need a row transform.
+ */
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  // Scratch layout: temp_in | temp_out | buf. The element type is int32_t so
+  // that the int32_t aliases below match the buffer exactly (int and int32_t
+  // are distinct types on targets where int32_t is long).
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 64 + 64 + 64]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  // Loop invariant: when the log ratio is +/-1 every input row is rescaled
+  // by NewInvSqrt2 before the row transform.
+  const int scale_input = (abs(rect_type) == 1);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: only the first row_start rows (a multiple of 8) are transformed.
+  const int row_start = (buf_size_nonzero_h_div8 << 3);
+  for (int i = 0; i < row_start; i++) {
+    if (scale_input) {
+      for (int j = 0; j < txfm_size_col; j++) {
+        temp_in[j] =
+            round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      }
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Rows that were not processed by the row transform are zero.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, round, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; c++) {
+    const int src_col = (lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_col];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (int r = 0; r < txfm_size_row; ++r) {
+      const int src_row = (ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+/*
+ * Picks the inverse-transform implementation that matches tx_type and
+ * forwards all arguments to it unchanged:
+ *   IDTX                        -> lowbd_inv_txfm2d_add_idtx_neon
+ *   H_DCT, H_ADST, H_FLIPADST   -> lowbd_inv_txfm2d_add_v_identity_neon
+ *   V_DCT, V_ADST, V_FLIPADST   -> lowbd_inv_txfm2d_add_h_identity_neon
+ *   anything else               -> lowbd_inv_txfm2d_add_no_identity_neon
+ */
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  if (tx_type == IDTX) {
+    lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+                                   eob);
+  } else if (tx_type == H_DCT || tx_type == H_ADST ||
+             tx_type == H_FLIPADST) {
+    lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+  } else if (tx_type == V_DCT || tx_type == V_ADST ||
+             tx_type == V_FLIPADST) {
+    lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+  } else {
+    lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+                                          tx_size, eob);
+  }
+}
+/*
+ * Entry point for the 8-bit inverse transform + add: dispatches on tx_size.
+ *
+ * Sizes with a 4-sample dimension go to their dedicated routines. For the
+ * 64-wide sizes the caller supplies rows of 32 coefficients; they are
+ * repacked into 64-wide rows whose upper half is zero before the generic
+ * path runs. Every other size (including TX_16X64 and TX_32X64, whose rows
+ * already have the width the generic path expects) is forwarded unchanged.
+ *
+ * input   : dequantized coefficients.
+ * output  : 8-bit destination, updated in place.
+ * stride  : row pitch of output.
+ * tx_type : transform type, forwarded to the selected routine.
+ * tx_size : transform size used for dispatch.
+ * eob     : end-of-block position, forwarded to the selected routine.
+ */
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  switch (tx_size) {
+    case TX_4X4:
+      lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X8:
+      lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_8X4:
+      lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X16:
+      lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_16X4:
+      lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_64X16:
+    case TX_64X32:
+    case TX_64X64: {
+      // At most 32 input rows are ever repacked (16 for TX_64X16), so a
+      // 64x32 staging buffer is sufficient for all three sizes.
+      const int coded_rows = (tx_size == TX_64X16) ? 16 : 32;
+      int32_t mod_input[64 * 32];
+      for (int row = 0; row < coded_rows; ++row) {
+        memcpy(mod_input + row * 64, input + row * 32,
+               32 * sizeof(*mod_input));
+        memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+      }
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    default:
+      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+  }
+}
+/*
+ * Inverse transform + add for 8-bit destinations. Lossless blocks are
+ * handed to the C implementation; everything else goes through
+ * av1_lowbd_inv_txfm2d_add_neon() with the type, size and eob taken from
+ * txfm_param.
+ */
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+
+  av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, txfm_param->tx_type,
+                                txfm_param->tx_size, txfm_param->eob);
+}
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
new file mode 100644
index 0000000000..6af2d61e7b
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
+// Signature shared by the 1D inverse-transform kernels: reads coefficients
+// from input and writes the transformed values to output. Callers in
+// av1_inv_txfm_neon.c pass a cos_bit value and a stage-range array.
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+                                  int8_t cos_bit, const int8_t *stage_range);
+
+// eob-row -> packed (eoby << 8 | eobx) tables for the default scan.
+// get_eobx_eoby_scan_default() indexes a table with (eob - 1) >> log2(width)
+// and splits the entry into eobx (low byte) and eoby (high byte).
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x16_default[16]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+// Per-tx_size table selector, indexed by TX_SIZE. NULL entries have no
+// table and must not be looked up. The pointers themselves are const so the
+// whole selector is read-only.
+// NOTE(review): entry order presumably follows the TX_SIZE enum in
+// av1/common/enums.h -- confirm before reordering either.
+DECLARE_ALIGNED(16, static const int16_t *const,
+                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+  NULL,
+  av1_eob_to_eobxy_8x8_default,
+  av1_eob_to_eobxy_16x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x16_default,
+  av1_eob_to_eobxy_16x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+
+// Maps an eobx/eoby value (0..31) to the index of the 1D kernel variant in
+// lowbd_txfm_all_1d_zeros_w8_arr: 0 for position 0, 1 for 1..7, 2 for 8..15,
+// 3 for 16..31.
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+// Rounds a position (0..31) up to the last index of its bucket: 0 stays 0,
+// 1..7 -> 7, 8..15 -> 15, 16..31 -> 31. Read-only lookup table.
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+// Derives the last non-zero column (*eobx) and row (*eoby) bounds for the
+// default scan from the end-of-block position.
+//
+// eob == 1 yields (0, 0). Otherwise the row containing the last coefficient,
+// (eob - 1) >> log2(width), indexes the per-size table whose entries pack
+// eobx in the low byte and eoby in the high byte.
+//
+// tx_size must be one for which av1_eob_to_eobxy_default has a table; the
+// NULL entries of that selector are not valid here.
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int16_t *const eob_to_eobxy = av1_eob_to_eobxy_default[tx_size];
+  // Catch lookups for sizes without a table in debug builds instead of
+  // dereferencing NULL.
+  assert(eob_to_eobxy != NULL);
+
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  const int eobxy = eob_to_eobxy[eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+// Derives *eobx and *eoby from the end-of-block position for the
+// v_identity path. With rows = min(32, block height) and last = eob - 1:
+//   *eobx = last / rows
+//   *eoby = rows - 1 when last >= rows - 1, otherwise eob_fill[last].
+// NOTE(review): indexes eob_fill with eob - 1, so this presumably expects
+// eob >= 1 -- confirm against callers.
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int rows = AOMMIN(32, tx_size_high[tx_size]);
+  const int last_row = rows - 1;
+
+  *eobx = last / rows;
+  if (last >= last_row) {
+    *eoby = last_row;
+  } else {
+    *eoby = eob_fill[last];
+  }
+}
+
+// Derives *eobx and *eoby from the end-of-block position for the
+// h_identity path. With cols = min(32, block width) and last = eob - 1:
+//   *eobx = cols - 1 when last >= cols - 1, otherwise eob_fill[last]
+//   *eoby = eob_fill[last / cols]
+// NOTE(review): indexes eob_fill with eob - 1, so this presumably expects
+// eob >= 1 -- confirm against callers.
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int cols = AOMMIN(32, tx_size_wide[tx_size]);
+  const int last_col = cols - 1;
+
+  if (last >= last_col) {
+    *eobx = last_col;
+  } else {
+    *eobx = eob_fill[last];
+  }
+
+  const int row = last / cols;
+  assert(row < 32);
+  *eoby = eob_fill[row];
+}
+
+#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index 86a25e109f..f15744c94a 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -164,8 +164,8 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32(
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
@@ -182,7 +182,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -485,8 +485,8 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int vert_offset = filter_params_y->taps / 2 - 1;
@@ -502,7 +502,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
if (w <= 4) {
uint8x8_t d01, d23;
@@ -680,8 +680,8 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int im_dst_stride;
@@ -711,7 +711,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
int16_t x_filter_tmp[8];
int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -896,7 +896,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1086,8 +1086,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
diff --git a/third_party/aom/av1/common/arm/intrapred_neon.c b/third_party/aom/av1/common/arm/intrapred_neon.c
deleted file mode 100644
index 799355553f..0000000000
--- a/third_party/aom/av1/common/arm/intrapred_neon.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride,
- int bw, const uint16_t *above,
- const uint16_t *left) {
- assert(bw >= 4);
- assert(IS_POWER_OF_TWO(bw));
- int expected_dc, sum = 0;
- const int count = bw * 2;
- uint32x4_t sum_q = vdupq_n_u32(0);
- uint32x2_t sum_d;
- uint16_t *dst_1;
- if (bw >= 8) {
- for (int i = 0; i < bw; i += 8) {
- sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
- sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
- above += 8;
- left += 8;
- }
- sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
- sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
- expected_dc = (sum + (count >> 1)) / count;
- const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
- for (int r = 0; r < bw; r++) {
- dst_1 = dst;
- for (int i = 0; i < bw; i += 8) {
- vst1q_u16(dst_1, dc);
- dst_1 += 8;
- }
- dst += stride;
- }
- } else { // 4x4
- sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
- sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
- sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
- expected_dc = (sum + (count >> 1)) / count;
- const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
- for (int r = 0; r < bw; r++) {
- vst1_u16(dst, dc);
- dst += stride;
- }
- }
-}
-
-#define intra_pred_highbd_sized(type, width) \
- void aom_highbd_##type##_predictor_##width##x##width##_neon( \
- uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
- const uint16_t *left, int bd) { \
- (void)bd; \
- highbd_##type##_predictor_neon(dst, stride, width, above, left); \
- }
-
-#define intra_pred_square(type) \
- intra_pred_highbd_sized(type, 4); \
- intra_pred_highbd_sized(type, 8); \
- intra_pred_highbd_sized(type, 16); \
- intra_pred_highbd_sized(type, 32); \
- intra_pred_highbd_sized(type, 64);
-
-intra_pred_square(dc);
-
-#undef intra_pred_square
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 992be4a9ed..4015082b4e 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -515,8 +515,8 @@ static INLINE void jnt_convolve_2d_vert_neon(
void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
assert(!(w % 4));
@@ -532,9 +532,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
const int round_0 = conv_params->round_0 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
int16_t x_filter_tmp[8];
int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -553,8 +553,8 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
@@ -679,8 +679,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
assert(!(w % 4));
@@ -705,7 +705,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const uint8_t *src_ptr = src - horiz_offset;
@@ -1013,8 +1013,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
assert(!(w % 4));
@@ -1040,7 +1040,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const uint8_t *src_ptr = src - (vert_offset * src_stride);
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 214b14bf71..4bf45a52c9 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -22,6 +22,14 @@ static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
s += p;
}
+/* load_u8_4x1(s, s0, lane): read 32 bits at s into 32-bit lane `lane` of *s0.
+ A macro (not a function) because vld1_lane_u32 needs an immediate lane. */
+#define load_u8_4x1(s, s0, lane) \
+ do { \
+ *(s0) = vreinterpret_u8_u32( \
+ vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
+ } while (0)
+
static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2, uint8x8_t *const s3,
@@ -128,6 +136,13 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
*s3 = vld1_s16(s);
}
+/* store_u8_4x1(s, s0, lane): write 32-bit lane `lane` of s0 to the address s.
+ A macro (not a function) because vst1_lane_u32 needs an immediate lane. */
+#define store_u8_4x1(s, s0, lane) \
+ do { \
+ vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
+ } while (0)
+
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
const uint8x8_t s1, const uint8x8_t s2,
const uint8x8_t s3, const uint8x8_t s4,
@@ -242,6 +257,30 @@ static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
vst1q_s16(s, s7);
}
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,  // Stores four rows of four int16_t.
+ const int16x4_t s0, const int16x4_t s1,  // s0..s3: rows 0..3, written top to bottom
+ const int16x4_t s2, const int16x4_t s3) {
+ vst1_s16(s, s0);  // row 0 at s
+ s += dst_stride;  // dst_stride is counted in int16_t elements
+ vst1_s16(s, s1);
+ s += dst_stride;
+ vst1_s16(s, s2);
+ s += dst_stride;
+ vst1_s16(s, s3);  // no pointer advance after the last row
+}
+
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,  // Stores four rows of eight int16_t.
+ const int16x8_t s0, const int16x8_t s1,  // s0..s3: rows 0..3, written top to bottom
+ const int16x8_t s2, const int16x8_t s3) {
+ vst1q_s16(s, s0);  // row 0 at s
+ s += dst_stride;  // dst_stride is counted in int16_t elements
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);  // no pointer advance after the last row
+}
+
static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1,
int16x8_t *const s2, int16x8_t *const s3,
@@ -398,4 +437,49 @@ static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
*tu1 = vsetq_lane_u64(a, *tu1, 1);
}
+static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,  // Loads four rows of four int32_t into *s1..*s4.
+ int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {  // note: outputs are numbered from 1, not 0
+ *s1 = vld1q_s32(s);  // row 0 at s
+ s += p;  // p is the row pitch in int32_t elements
+ *s2 = vld1q_s32(s);
+ s += p;
+ *s3 = vld1q_s32(s);
+ s += p;
+ *s4 = vld1q_s32(s);  // s is only read, never written through
+}
+
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,  // Stores s1..s4 as four rows of four int32_t.
+ int32x4_t s2, int32x4_t s3, int32x4_t s4) {  // note: inputs are numbered from 1, not 0
+ vst1q_s32(s, s1);  // row 0 at s
+ s += p;  // p is the row pitch in int32_t elements
+ vst1q_s32(s, s2);
+ s += p;
+ vst1q_s32(s, s3);
+ s += p;
+ vst1q_s32(s, s4);  // no pointer advance after the last row
+}
+
+static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,  // Loads four rows of four uint32_t into *s1..*s4.
+ uint32x4_t *s2, uint32x4_t *s3,  // note: outputs are numbered from 1, not 0
+ uint32x4_t *s4) {
+ *s1 = vld1q_u32(s);  // row 0 at s
+ s += p;  // p is the row pitch in uint32_t elements
+ *s2 = vld1q_u32(s);
+ s += p;
+ *s3 = vld1q_u32(s);
+ s += p;
+ *s4 = vld1q_u32(s);  // s is only read, never written through
+}
+
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,  // Stores s1..s4 as four rows of four uint32_t.
+ uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {  // note: inputs are numbered from 1, not 0
+ vst1q_u32(s, s1);  // row 0 at s
+ s += p;  // p is the row pitch in uint32_t elements
+ vst1q_u32(s, s2);
+ s += p;
+ vst1q_u32(s, s3);
+ s += p;
+ vst1q_u32(s, s4);  // no pointer advance after the last row
+}
+
#endif // AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
new file mode 100644
index 0000000000..b4808a9727
--- /dev/null
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1506 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+static INLINE void calc_ab_fast_internal_common(  // Shared 4x4-tile kernel called by calc_ab_fast_internal_lbd/_hbd.
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,  // s0..s3: four rows derived from the tile at src1
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,  // s4..s7: four rows derived from the tile at src2
+ int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,  // sr4..sr7: the src2 rows exactly as loaded by the caller
+ uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,  // const_val: upper clamp for the table index (callers pass 255)
+ uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,  // the tiles at src1, dst_A16 and src2 are all overwritten
+ const int buf_stride) {  // row pitch, in elements, of all three buffers
+ uint32x4_t q0, q1, q2, q3;
+ uint32x4_t p0, p1, p2, p3;
+ uint16x4_t d0, d1, d2, d3;
+
+ s0 = vmulq_u32(s0, const_n_val);  // s0..s3 *= const_n_val
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+
+ q0 = vmulq_u32(s4, s4);  // q0..q3 = s4..s7 squared
+ q1 = vmulq_u32(s5, s5);
+ q2 = vmulq_u32(s6, s6);
+ q3 = vmulq_u32(s7, s7);
+
+ p0 = vcleq_u32(q0, s0);  // lane mask: all ones where q <= s
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+
+ q0 = vsubq_u32(s0, q0);  // s - q; wraps in lanes where q > s
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+
+ p0 = vandq_u32(p0, q0);  // keep s - q where q <= s, zero the wrapped lanes
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+
+ p0 = vmulq_u32(p0, s_vec);  // scale by s_vec
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);  // rounding right shift
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);  // clamp before the value is used as an x_by_xplus1 index
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+
+ {
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);  // spill the indices over src1 for the scalar loop
+
+ for (int x = 0; x < 4; x++) {  // x walks rows (it is the index scaled by buf_stride)
+ for (int y = 0; y < 4; y++) {  // y walks columns
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];  // table lookup, one element at a time
+ }
+ }
+ load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);  // reload the looked-up rows as vectors
+ }
+ p0 = vsubl_u16(sgrproj_sgr, d0);  // widening subtract: sgrproj_sgr - lookup, as 32-bit lanes
+ p1 = vsubl_u16(sgrproj_sgr, d1);
+ p2 = vsubl_u16(sgrproj_sgr, d2);
+ p3 = vsubl_u16(sgrproj_sgr, d3);
+
+ s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);  // uses the as-loaded sr4..sr7, not s4..s7
+ s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+ s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+ s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+ s4 = vmulq_u32(s4, p0);  // multiply by (sgrproj_sgr - lookup)
+ s5 = vmulq_u32(s5, p1);
+ s6 = vmulq_u32(s6, p2);
+ s7 = vmulq_u32(s7, p3);
+
+ p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);  // rounding right shift
+ p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),  // the result replaces the 4x4 tile at src2
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+}
+static INLINE void calc_ab_internal_common(  // Shared 4-row x 8-column tile kernel called by calc_ab_internal_lbd/_hbd.
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,  // s0..s3: rows 0..3, columns 0..3 of the src1 tile
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,  // s4..s7: rows 0..3, columns 4..7 of the src1 tile
+ uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,  // s16_0..s16_3: 16-bit rows exactly as loaded by the caller
+ uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,  // s16_4..s16_7: the copies of those rows that get squared
+ uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,  // const_val: upper clamp for the table index (callers pass 255)
+ uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,  // the src1 tile is overwritten with the table indices
+ uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {  // dst_A16 and dst2 receive the two output tiles
+ uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
+
+ s0 = vmulq_u32(s0, const_n_val);  // s0..s7 *= const_n_val
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+ s4 = vmulq_u32(s4, const_n_val);
+ s5 = vmulq_u32(s5, const_n_val);
+ s6 = vmulq_u32(s6, const_n_val);
+ s7 = vmulq_u32(s7, const_n_val);
+
+ d0 = vget_low_u16(s16_4);  // d0..d3: columns 0..3 of each 16-bit row
+ d1 = vget_low_u16(s16_5);
+ d2 = vget_low_u16(s16_6);
+ d3 = vget_low_u16(s16_7);
+ d4 = vget_high_u16(s16_4);  // d4..d7: columns 4..7 of each 16-bit row
+ d5 = vget_high_u16(s16_5);
+ d6 = vget_high_u16(s16_6);
+ d7 = vget_high_u16(s16_7);
+
+ q0 = vmull_u16(d0, d0);  // widening squares, 32-bit lanes
+ q1 = vmull_u16(d1, d1);
+ q2 = vmull_u16(d2, d2);
+ q3 = vmull_u16(d3, d3);
+ q4 = vmull_u16(d4, d4);
+ q5 = vmull_u16(d5, d5);
+ q6 = vmull_u16(d6, d6);
+ q7 = vmull_u16(d7, d7);
+
+ p0 = vcleq_u32(q0, s0);  // lane mask: all ones where q <= s
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+ p4 = vcleq_u32(q4, s4);
+ p5 = vcleq_u32(q5, s5);
+ p6 = vcleq_u32(q6, s6);
+ p7 = vcleq_u32(q7, s7);
+
+ q0 = vsubq_u32(s0, q0);  // s - q; wraps in lanes where q > s
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+ q4 = vsubq_u32(s4, q4);
+ q5 = vsubq_u32(s5, q5);
+ q6 = vsubq_u32(s6, q6);
+ q7 = vsubq_u32(s7, q7);
+
+ p0 = vandq_u32(p0, q0);  // keep s - q where q <= s, zero the wrapped lanes
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+ p4 = vandq_u32(p4, q4);
+ p5 = vandq_u32(p5, q5);
+ p6 = vandq_u32(p6, q6);
+ p7 = vandq_u32(p7, q7);
+
+ p0 = vmulq_u32(p0, s_vec);  // scale by s_vec
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+ p4 = vmulq_u32(p4, s_vec);
+ p5 = vmulq_u32(p5, s_vec);
+ p6 = vmulq_u32(p6, s_vec);
+ p7 = vmulq_u32(p7, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);  // rounding right shift
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+ p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
+ p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
+ p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
+ p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);  // clamp before the value is used as an x_by_xplus1 index
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+ p4 = vminq_u32(p4, const_val);
+ p5 = vminq_u32(p5, const_val);
+ p6 = vminq_u32(p6, const_val);
+ p7 = vminq_u32(p7, const_val);
+
+ {
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);  // spill the indices over src1, columns 0..3
+ store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);  // columns 4..7
+
+ for (int x = 0; x < 4; x++) {  // x walks rows (it is the index scaled by buf_stride)
+ for (int y = 0; y < 8; y++) {  // y walks columns
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];  // table lookup, one element at a time
+ }
+ }
+ load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);  // s16_4..s16_7 are reused for the looked-up rows
+ }
+
+ s16_4 = vsubq_u16(sgrproj_sgr, s16_4);  // sgrproj_sgr - lookup, still 16-bit
+ s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
+ s16_6 = vsubq_u16(sgrproj_sgr, s16_6);
+ s16_7 = vsubq_u16(sgrproj_sgr, s16_7);
+
+ s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);  // widening multiply of the as-loaded rows, columns 0..3
+ s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
+ s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
+ s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
+ s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);  // columns 4..7
+ s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
+ s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
+ s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
+
+ s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));  // multiply by (sgrproj_sgr - lookup), widened to 32 bits
+ s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5)));
+ s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6)));
+ s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
+ s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
+ s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
+ s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
+ s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
+
+ p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);  // rounding right shift
+ p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
+ p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),  // output tile, columns 0..3
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+ store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),  // output tile, columns 4..7
+ vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
+ vreinterpretq_s32_u32(p7));
+}
+
+static INLINE void boxsum2_square_sum_calc(  // Squares t1..t11 and forms four overlapping 5-term sums of the squares.
+ int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,  // t1..t11: eleven consecutive inputs, four lanes each
+ int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
+ int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {  // r0..r3: the four output sums
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
+ int32x4_t r12, r34, r67, r89, r1011;
+ int32x4_t r345, r6789, r789;
+
+ d1 = vmull_s16(t1, t1);  // widening square: dK = tK * tK as 32-bit lanes
+ d2 = vmull_s16(t2, t2);
+ d3 = vmull_s16(t3, t3);
+ d4 = vmull_s16(t4, t4);
+ d5 = vmull_s16(t5, t5);
+ d6 = vmull_s16(t6, t6);
+ d7 = vmull_s16(t7, t7);
+ d8 = vmull_s16(t8, t8);
+ d9 = vmull_s16(t9, t9);
+ d10 = vmull_s16(t10, t10);
+ d11 = vmull_s16(t11, t11);
+
+ r12 = vaddq_s32(d1, d2);  // partial sums shared between the four outputs
+ r34 = vaddq_s32(d3, d4);
+ r67 = vaddq_s32(d6, d7);
+ r89 = vaddq_s32(d8, d9);
+ r1011 = vaddq_s32(d10, d11);
+ r345 = vaddq_s32(r34, d5);
+ r6789 = vaddq_s32(r67, r89);
+ r789 = vsubq_s32(r6789, d6);  // d7 + d8 + d9, obtained by removing d6
+ *r0 = vaddq_s32(r12, r345);  // d1 + d2 + d3 + d4 + d5
+ *r1 = vaddq_s32(r67, r345);  // d3 + d4 + d5 + d6 + d7
+ *r2 = vaddq_s32(d5, r6789);  // d5 + d6 + d7 + d8 + d9
+ *r3 = vaddq_s32(r789, r1011);  // d7 + d8 + d9 + d10 + d11
+}
+
+static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,  // 5-tap box sums of src and of src squared, done in two passes.
+ int32_t *dst32, int32_t *dst2, const int dst_stride,  // dst16: pass-1 sums; dst32: final sums; dst2: sums of squares (both passes)
+ const int width, const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *dst1_16_ptr, *src_ptr;
+ int32_t *dst2_ptr;
+ int h, w, count = 0;
+ const int dst_stride_2 = (dst_stride << 1);  // pitch between written rows: only every second row is produced
+ const int dst_stride_8 = (dst_stride << 3);  // four written rows
+
+ dst1_16_ptr = dst16;
+ dst2_ptr = dst2;
+ src_ptr = src;
+ w = width;
+ {  // Pass 1: sums over 5 consecutive rows, one 8-column strip per outer iteration.
+ int16x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t t8, t9, t10, t11, t12;  // t12 is loaded below but never used
+
+ int16x8_t q12345, q56789, q34567, q7891011;
+ int16x8_t q12, q34, q67, q89, q1011;
+ int16x8_t q345, q6789, q789;
+
+ int32x4_t r12345, r56789, r34567, r7891011;
+
+ do {
+ h = height;
+ dst1_16_ptr = dst16 + (count << 3);  // strip `count` starts at column 8 * count
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+
+ dst1_16_ptr += dst_stride_2;  // the first written row is row 2
+ dst2_ptr += dst_stride_2;
+ do {
+ load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
+ src_ptr += 4 * src_stride;  // net advance is 8 rows per iteration
+ load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);  // read ahead without advancing src_ptr
+
+ q12 = vaddq_s16(t1, t2);  // partial sums shared between the four outputs
+ q34 = vaddq_s16(t3, t4);
+ q67 = vaddq_s16(t6, t7);
+ q89 = vaddq_s16(t8, t9);
+ q1011 = vaddq_s16(t10, t11);
+ q345 = vaddq_s16(q34, t5);
+ q6789 = vaddq_s16(q67, q89);
+ q789 = vaddq_s16(q89, t7);
+ q12345 = vaddq_s16(q12, q345);  // t1 + ... + t5
+ q34567 = vaddq_s16(q67, q345);  // t3 + ... + t7
+ q56789 = vaddq_s16(t5, q6789);  // t5 + ... + t9
+ q7891011 = vaddq_s16(q789, q1011);  // t7 + ... + t11
+
+ store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,  // four output rows, two rows apart
+ q7891011);
+ dst1_16_ptr += dst_stride_8;
+
+ boxsum2_square_sum_calc(  // the same four sums over squared samples, columns 0..3
+ vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
+ vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
+ vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
+ vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
+
+ boxsum2_square_sum_calc(  // columns 4..7
+ vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
+ vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
+ vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
+ vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
+ r7891011);
+ dst2_ptr += (dst_stride_8);
+ h -= 8;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ {  // Pass 2: sums over 5 consecutive columns of the pass-1 rows, using 4x4 transposes.
+ int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int32x4_t q12345, q34567, q23456, q45678;
+ int32x4_t q23, q45, q67;
+ int32x4_t q2345, q4567;
+
+ int32x4_t r12345, r34567, r23456, r45678;
+ int32x4_t r23, r45, r67;
+ int32x4_t r2345, r4567;
+
+ int32_t *src2_ptr, *dst1_32_ptr;
+ int16_t *src1_ptr;
+ count = 0;
+ h = height;
+ do {
+ dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);  // band `count`: four rows, two rows apart, starting at row 2
+ dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
+ src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);  // dst2 is both read and written in this pass
+ w = width;
+
+ dst1_32_ptr += 2;  // outputs start at column 2
+ dst2_ptr += 2;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
+ transpose_s16_4x4d(&s1, &s2, &s3, &s4);  // presumably a plain 4x4 transpose, so s1..s4 become columns 0..3
+ load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
+ transpose_s32_4x4(&d1, &d2, &d3, &d4);
+ do {
+ src1_ptr += 4;
+ src2_ptr += 4;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);  // next four columns
+ transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);  // loaded before the dst2 store below overwrites part of it
+ transpose_s32_4x4(&d5, &d6, &d7, &d8);
+ q23 = vaddl_s16(s2, s3);  // widening adds: the sums are kept in 32 bits
+ q45 = vaddl_s16(s4, s5);
+ q67 = vaddl_s16(s6, s7);
+ q2345 = vaddq_s32(q23, q45);
+ q4567 = vaddq_s32(q45, q67);
+ q12345 = vaddq_s32(vmovl_s16(s1), q2345);  // s1 + ... + s5
+ q23456 = vaddq_s32(q2345, vmovl_s16(s6));  // s2 + ... + s6
+ q34567 = vaddq_s32(q4567, vmovl_s16(s3));  // s3 + ... + s7
+ q45678 = vaddq_s32(q4567, vmovl_s16(s8));  // s4 + ... + s8
+
+ transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);  // back to row order for the store
+ store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
+ q45678);
+ dst1_32_ptr += 4;
+ s1 = s5;  // slide the window right by four columns
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+
+ r23 = vaddq_s32(d2, d3);  // same scheme for the sums of squares
+ r45 = vaddq_s32(d4, d5);
+ r67 = vaddq_s32(d6, d7);
+ r2345 = vaddq_s32(r23, r45);
+ r4567 = vaddq_s32(r45, r67);
+ r12345 = vaddq_s32(d1, r2345);  // d1 + ... + d5
+ r23456 = vaddq_s32(r2345, d6);  // d2 + ... + d6
+ r34567 = vaddq_s32(r4567, d3);  // d3 + ... + d7
+ r45678 = vaddq_s32(r4567, d8);  // d4 + ... + d8
+
+ transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);  // in-place update of dst2
+ dst2_ptr += 4;
+ d1 = d5;  // slide the window right by four columns
+ d2 = d6;
+ d3 = d7;
+ d4 = d8;
+ w -= 4;
+ } while (w > 0);
+ h -= 8;  // four written rows, two apart, span eight row indices
+ count++;
+ } while (h > 0);
+ }
+}
+
+static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,  // Runs calc_ab_internal_common over 4-row x 8-column tiles, no bit-depth scaling.
+ uint16_t *B16, int32_t *B,  // A and B16 are read; A, A16 and B are written by the common kernel
+ const int buf_stride, const int width,
+ const int height, const int r,
+ const int s, const int ht_inc) {
+ int32_t *src1, *dst2, count = 0;
+ uint16_t *dst_A16, *src2;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);  // number of samples in a (2r+1) x (2r+1) window
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);  // clamp handed to the common kernel
+
+ uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
+
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ dst_A16 = A16 + (count << 2) * buf_stride;  // band `count` starts 4 * count buffer rows down
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B16 + (count << 2) * buf_stride;
+ dst2 = B + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3);  // A tile, columns 0..3
+ load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7);  // A tile, columns 4..7
+ load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+
+ s16_4 = s16_0;  // the copies that get squared are the rows themselves here
+ s16_5 = s16_1;
+ s16_6 = s16_2;
+ s16_7 = s16_3;
+
+ calc_ab_internal_common(
+ s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+ s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+ w -= 8;  // advance one 8-column tile
+ dst2 += 8;
+ src1 += 8;
+ src2 += 8;
+ dst_A16 += 8;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);  // each band accounts for 4 * ht_inc of `height`
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,  // As calc_ab_internal_lbd, but scales the inputs down by bit depth first.
+ uint16_t *B16, int32_t *B,  // A and B16 are read; A, A16 and B are written by the common kernel
+ const int buf_stride, const int width,
+ const int height, const int bit_depth,
+ const int r, const int s,
+ const int ht_inc) {
+ int32_t *src1, *dst2, count = 0;
+ uint16_t *dst_A16, *src2;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);  // number of samples in a (2r+1) x (2r+1) window
+ const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8));  // negative count: vrshl then shifts right
+ const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));  // twice that shift, for the A values
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);  // clamp handed to the common kernel
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint16x8_t s16_0, s16_1, s16_2, s16_3;
+ uint16x8_t s16_4, s16_5, s16_6, s16_7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;  // band `count` starts 4 * count buffer rows down
+ src2 = B16 + (count << 2) * buf_stride;
+ dst2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);  // A tile, columns 0..3
+ load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7);  // A tile, columns 4..7
+ load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+
+ s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);  // rounding right shift by 2 * (bit_depth - 8)
+ s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+ s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+ s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+ s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec);
+ s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec);
+ s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec);
+ s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec);
+
+ s16_4 = vrshlq_u16(s16_0, bd_min_2_vec);  // rounding right shift by bit_depth - 8; s16_0..s16_3 stay unshifted
+ s16_5 = vrshlq_u16(s16_1, bd_min_2_vec);
+ s16_6 = vrshlq_u16(s16_2, bd_min_2_vec);
+ s16_7 = vrshlq_u16(s16_3, bd_min_2_vec);
+
+ calc_ab_internal_common(
+ s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+ s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+ w -= 8;  // advance one 8-column tile
+ dst2 += 8;
+ src1 += 8;
+ src2 += 8;
+ dst_A16 += 8;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);  // each band accounts for 4 * ht_inc of `height`
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,  // Runs calc_ab_fast_internal_common over 4x4 tiles, no bit-depth scaling.
+ int32_t *B, const int buf_stride,  // A and B are read; A, A16 and B are written by the common kernel
+ const int width, const int height,
+ const int r, const int s,
+ const int ht_inc) {
+ int32_t *src1, *src2, count = 0;
+ uint16_t *dst_A16;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);  // number of samples in a (2r+1) x (2r+1) window
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);  // clamp handed to the common kernel
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;  // band `count` starts 4 * count buffer rows down
+ src2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);  // A tile
+ load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);  // B tile
+
+ s0 = vreinterpretq_u32_s32(sr0);  // same bits viewed as unsigned; no shift in this path
+ s1 = vreinterpretq_u32_s32(sr1);
+ s2 = vreinterpretq_u32_s32(sr2);
+ s3 = vreinterpretq_u32_s32(sr3);
+ s4 = vreinterpretq_u32_s32(sr4);
+ s5 = vreinterpretq_u32_s32(sr5);
+ s6 = vreinterpretq_u32_s32(sr6);
+ s7 = vreinterpretq_u32_s32(sr7);
+
+ calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+ sr6, sr7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1,
+ dst_A16, src2, buf_stride);
+
+ w -= 4;  // advance one 4-column tile
+ src1 += 4;
+ src2 += 4;
+ dst_A16 += 4;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);  // each band accounts for 4 * ht_inc of `height`
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,  // As calc_ab_fast_internal_lbd, but scales the inputs down by bit depth first.
+ int32_t *B, const int buf_stride,  // A and B are read; A, A16 and B are written by the common kernel
+ const int width, const int height,
+ const int bit_depth, const int r,
+ const int s, const int ht_inc) {
+ int32_t *src1, *src2, count = 0;
+ uint16_t *dst_A16;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);  // number of samples in a (2r+1) x (2r+1) window
+ const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8));  // negative count: vrshl then shifts right
+ const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));  // twice that shift, for the A values
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);  // clamp handed to the common kernel
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;  // band `count` starts 4 * count buffer rows down
+ src2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);  // A tile
+ load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);  // B tile
+
+ s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);  // rounding right shift by 2 * (bit_depth - 8)
+ s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+ s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+ s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+ s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec);  // rounding right shift by bit_depth - 8
+ s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec);
+ s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec);
+ s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec);
+
+ calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,  // sr4..sr7 are passed unshifted
+ sr6, sr7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1,
+ dst_A16, src2, buf_stride);
+
+ w -= 4;  // advance one 4-column tile
+ src1 += 4;
+ src2 += 4;
+ dst_A16 += 4;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);  // each band accounts for 4 * ht_inc of `height`
+ } while (h > 0);
+}
+
+static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
+ int32_t *dst2, const int dst_stride, const int width,
+ const int height) { /*
+ * Two-pass 3x3 box sum over 16-bit samples.
+ *
+ * Pass 1 (vertical): for each strip of 8 columns, writes the sum of three
+ * consecutive source rows to dst1 (16-bit lanes) and the sum of their
+ * squares to dst2 (32-bit lanes). The first result is stored two rows down
+ * in the destination.
+ * Pass 2 (horizontal): re-reads dst1/dst2 in 4x4 tiles and overwrites them
+ * in place with the sum of three consecutive columns, the first result
+ * being stored two columns in.
+ *
+ * src, src_stride: input samples and their row pitch.
+ * dst1, dst2: outputs (sum, sum of squares); both use dst_stride.
+ * width, height: extent to cover.
+ *
+ * NOTE(review): pass 1 loads 4 rows up front and 4 more per iteration while
+ * h > 0, and both passes step in whole groups of 8 or 4, so rows/columns
+ * beyond width/height are read and written. Confirm the callers' buffers
+ * provide that slack.
+ */
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *src_ptr;
+ int32_t *dst2_ptr;
+ uint16_t *dst1_ptr;
+ int h, w, count = 0;
+
+ w = width;
+ { // Pass 1: vertical sums, one 8-column strip per outer iteration.
+ int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int16x8_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r345, r456, r567, r78, r678;
+ int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
+ int32x4_t r2, r3, r5, r6, r7, r8;
+ int16x8_t q678, q78;
+
+ do {
+ dst1_ptr = dst1 + (count << 3); // count-th strip of 8 columns
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+ h = height;
+
+ load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); // s1 is loaded but not used below
+ src_ptr += 4 * src_stride;
+
+ q23 = vaddq_s16(s2, s3);
+ q234 = vaddq_s16(q23, s4);
+ q34 = vaddq_s16(s3, s4);
+ dst1_ptr += (dst_stride << 1); // first stored sum lands two rows down
+
+ r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); // squares, low 4 columns
+ r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
+ r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_low = vaddq_s32(r23, r4_low);
+ r34_low = vaddq_s32(r3, r4_low);
+
+ r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); // squares, high 4 columns
+ r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
+ r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_high = vaddq_s32(r23, r4_high);
+ r34_high = vaddq_s32(r3, r4_high);
+
+ dst2_ptr += (dst_stride << 1);
+
+ do {
+ load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); // next four rows
+ src_ptr += 4 * src_stride;
+
+ q345 = vaddq_s16(s5, q34);
+ q56 = vaddq_s16(s5, s6);
+ q456 = vaddq_s16(s4, q56);
+ q567 = vaddq_s16(s7, q56);
+ q78 = vaddq_s16(s7, s8);
+ q678 = vaddq_s16(s6, q78);
+
+ store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += (dst_stride << 2);
+
+ s4 = s8; // carry the last row and partial sums into the next iteration
+ q34 = q78;
+ q234 = q678;
+
+ r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
+ r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
+ r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
+ r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_low);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_low, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
+
+ r4_low = r8;
+ r34_low = r78;
+ r234_low = r678;
+
+ r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
+ r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
+ r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
+ r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_high);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_high, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); // high 4 columns of the strip
+ dst2_ptr += (dst_stride << 2);
+
+ r4_high = r8;
+ r34_high = r78;
+ r234_high = r678;
+
+ h -= 4;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ { // Pass 2: horizontal sums, one group of 4 rows per outer iteration.
+ int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int16x4_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
+ int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
+ int16x4_t q678, q78;
+
+ int32_t *src2_ptr;
+ uint16_t *src1_ptr;
+ count = 0;
+ h = height;
+ w = width;
+ do {
+ dst1_ptr = dst1 + (count << 2) * dst_stride;
+ dst2_ptr = dst2 + (count << 2) * dst_stride;
+ src1_ptr = dst1 + (count << 2) * dst_stride; // input of pass 2 is the output of pass 1
+ src2_ptr = dst2 + (count << 2) * dst_stride;
+ w = width;
+
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
+ transpose_s16_4x4d(&d1, &d2, &d3, &d4); // presumably leaves one column per vector; d1 is not used below
+ load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
+ transpose_s32_4x4(&r1, &r2, &r3, &r4);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q23 = vadd_s16(d2, d3);
+ q234 = vadd_s16(q23, d4);
+ q34 = vadd_s16(d3, d4);
+ dst1_ptr += 2; // first stored sum lands two columns in
+ r23 = vaddq_s32(r2, r3);
+ r234 = vaddq_s32(r23, r4);
+ r34 = vaddq_s32(r3, r4);
+ dst2_ptr += 2;
+
+ do {
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); // read ahead of the in-place stores below
+ transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+ load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
+ transpose_s32_4x4(&r5, &r6, &r7, &r8);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q345 = vadd_s16(d5, q34);
+ q56 = vadd_s16(d5, d6);
+ q456 = vadd_s16(d4, q56);
+ q567 = vadd_s16(d7, q56);
+ q78 = vadd_s16(d7, d8);
+ q678 = vadd_s16(d6, q78);
+ transpose_s16_4x4d(&q234, &q345, &q456, &q567); // back to row order before storing
+ store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += 4;
+
+ d4 = d8; // carry the last column and partial sums into the next iteration
+ q34 = q78;
+ q234 = q678;
+
+ r345 = vaddq_s32(r5, r34);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ transpose_s32_4x4(&r234, &r345, &r456, &r567);
+ store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
+ dst2_ptr += 4;
+
+ r4 = r8;
+ r34 = r78;
+ r234 = r678;
+ w -= 4;
+ } while (w > 0);
+ h -= 4;
+ count++;
+ } while (h > 0);
+ }
+}
+
+// Weighted 3x3 neighbourhood sum for four adjacent int32 positions starting at
+// buf. The centre and its up/down/left/right neighbours are weighted by 4, the
+// four diagonal neighbours by 3. Rows are buf_stride elements apart.
+static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
+  const int32_t *const row_above = buf - buf_stride;
+  const int32_t *const row_below = buf + buf_stride;
+
+  // Diagonal neighbours.
+  const int32x4_t corners =
+      vaddq_s32(vaddq_s32(vld1q_s32(row_above - 1), vld1q_s32(row_above + 1)),
+                vaddq_s32(vld1q_s32(row_below + 1), vld1q_s32(row_below - 1)));
+
+  // Centre plus the four edge neighbours.
+  int32x4_t cross = vaddq_s32(vld1q_s32(row_below), vld1q_s32(buf));
+  cross = vaddq_s32(cross, vld1q_s32(buf + 1));
+  cross = vaddq_s32(cross, vld1q_s32(row_above));
+  cross = vaddq_s32(cross, vld1q_s32(buf - 1));
+
+  // 4 * (cross + corners) - corners == 4 * cross + 3 * corners.
+  const int32x4_t everything = vaddq_s32(cross, corners);
+  return vsubq_s32(vshlq_n_s32(everything, 2), corners);
+}
+
+// 16-bit-input variant of the weighted 3x3 sum (weight 4 for the centre and
+// edge neighbours, weight 3 for the diagonals) for eight adjacent positions.
+// All weighting is done in uint16 lanes; only the two weighted partial sums
+// are widened and added. *a0 receives positions 0-3, *a1 positions 4-7.
+static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
+                                     int32x4_t *a0, int32x4_t *a1) {
+  const uint16_t *const row_above = buf - buf_stride;
+  const uint16_t *const row_below = buf + buf_stride;
+
+  // Centre and edge neighbours, times 4.
+  uint16x8_t cross = vaddq_u16(vld1q_u16(row_below), vld1q_u16(buf));
+  cross = vaddq_u16(cross, vld1q_u16(buf - 1));
+  cross = vaddq_u16(cross,
+                    vaddq_u16(vld1q_u16(row_above), vld1q_u16(buf + 1)));
+  const uint16x8_t cross_x4 = vshlq_n_u16(cross, 2);
+
+  // Diagonal neighbours, times 3 (computed as 4x - x).
+  const uint16x8_t corners = vaddq_u16(
+      vaddq_u16(vld1q_u16(row_above - 1), vld1q_u16(row_above + 1)),
+      vaddq_u16(vld1q_u16(row_below - 1), vld1q_u16(row_below + 1)));
+  const uint16x8_t corners_x3 = vsubq_u16(vshlq_n_u16(corners, 2), corners);
+
+  const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(cross_x4)),
+                                      vmovl_u16(vget_low_u16(corners_x3)));
+  const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(cross_x4)),
+                                      vmovl_u16(vget_high_u16(corners_x3)));
+  *a0 = vreinterpretq_s32_u32(sum_lo);
+  *a1 = vreinterpretq_s32_u32(sum_hi);
+}
+
+// Weighted sum over the rows directly above and below buf for four adjacent
+// int32 positions: the vertical neighbours get weight 6, the four diagonal
+// neighbours weight 5. The row containing buf itself is not read.
+static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
+  const int32_t *const row_above = buf - buf_stride;
+  const int32_t *const row_below = buf + buf_stride;
+
+  const int32x4_t vertical =
+      vaddq_s32(vld1q_s32(row_above), vld1q_s32(row_below));
+  const int32x4_t diagonal =
+      vaddq_s32(vaddq_s32(vld1q_s32(row_above - 1), vld1q_s32(row_above + 1)),
+                vaddq_s32(vld1q_s32(row_below + 1), vld1q_s32(row_below - 1)));
+
+  // 5 * (diagonal + vertical) + vertical == 5 * diagonal + 6 * vertical.
+  const int32x4_t total = vaddq_s32(diagonal, vertical);
+  const int32x4_t total_x5 = vaddq_s32(vshlq_n_s32(total, 2), total);
+  return vaddq_s32(total_x5, vertical);
+}
+
+// 16-bit-input variant of the even-row weighted sum for eight adjacent
+// positions: diagonal neighbours times 5 plus vertical neighbours times 6,
+// weighted in uint16 lanes and widened only for the final addition.
+// *a0 receives positions 0-3, *a1 positions 4-7.
+static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
+                                                 int32x4_t *a0, int32x4_t *a1) {
+  const uint16_t *const row_above = buf - buf_stride;
+  const uint16_t *const row_below = buf + buf_stride;
+
+  // Diagonal neighbours: 5x == 4x + x.
+  const uint16x8_t diagonal = vaddq_u16(
+      vaddq_u16(vld1q_u16(row_below + 1), vld1q_u16(row_below - 1)),
+      vaddq_u16(vld1q_u16(row_above + 1), vld1q_u16(row_above - 1)));
+  const uint16x8_t diagonal_x5 = vaddq_u16(vshlq_n_u16(diagonal, 2), diagonal);
+
+  // Vertical neighbours: 6x == 4x + 2x.
+  const uint16x8_t vertical =
+      vaddq_u16(vld1q_u16(row_below), vld1q_u16(row_above));
+  const uint16x8_t vertical_x6 =
+      vaddq_u16(vshlq_n_u16(vertical, 2), vshlq_n_u16(vertical, 1));
+
+  const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(diagonal_x5)),
+                                      vmovl_u16(vget_low_u16(vertical_x6)));
+  const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(diagonal_x5)),
+                                      vmovl_u16(vget_high_u16(vertical_x6)));
+  *a0 = vreinterpretq_s32_u32(sum_lo);
+  *a1 = vreinterpretq_s32_u32(sum_hi);
+}
+
+// Weighted sum along a single row for four adjacent int32 positions:
+// the centre gets weight 6, its left and right neighbours weight 5.
+static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
+  const int32x4_t centre = vld1q_s32(buf);
+  const int32x4_t sides = vaddq_s32(vld1q_s32(buf - 1), vld1q_s32(buf + 1));
+
+  // 5 * (sides + centre) + centre == 5 * sides + 6 * centre.
+  const int32x4_t total = vaddq_s32(sides, centre);
+  const int32x4_t total_x5 = vaddq_s32(vshlq_n_s32(total, 2), total);
+  return vaddq_s32(total_x5, centre);
+}
+
+// 16-bit-input variant of the odd-row weighted sum for eight adjacent
+// positions: left/right neighbours times 5 plus centre times 6, weighted in
+// uint16 lanes and widened only for the final addition.
+// *a0 receives positions 0-3, *a1 positions 4-7.
+static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
+                                                int32x4_t *a1) {
+  // Side neighbours: 5x == x + 4x.
+  const uint16x8_t sides = vaddq_u16(vld1q_u16(buf - 1), vld1q_u16(buf + 1));
+  const uint16x8_t sides_x5 = vaddq_u16(sides, vshlq_n_u16(sides, 2));
+
+  // Centre: 6x == 4x + 2x.
+  const uint16x8_t centre = vld1q_u16(buf);
+  const uint16x8_t centre_x6 =
+      vaddq_u16(vshlq_n_u16(centre, 2), vshlq_n_u16(centre, 1));
+
+  const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(sides_x5)),
+                                      vmovl_u16(vget_low_u16(centre_x6)));
+  const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(sides_x5)),
+                                      vmovl_u16(vget_high_u16(centre_x6)));
+  *a0 = vreinterpretq_s32_u32(sum_lo);
+  *a1 = vreinterpretq_s32_u32(sum_hi);
+}
+
+// Last stage of the fast self-guided filter. For every output position it
+// forms (weighted sum of A) * src + (weighted sum of B), applies a rounding
+// right shift and stores the 32-bit result in dst.
+//
+// Rows with an even index use the rows above and below (even-row cross sums,
+// shift uses NB_EVEN); rows with an odd index use only the current row
+// (odd-row cross sums, shift uses NB_ODD).
+//
+// A and B share buf_stride; src and dst have their own strides. Eight
+// positions are produced per step and at least one step/row is always
+// executed, so the final step of a row may extend past `width`.
+void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
+                                int16_t *src, const int src_stride,
+                                int32_t *dst, const int dst_stride,
+                                const int width, const int height) {
+  assert(SGRPROJ_SGR_BITS == 8);
+  assert(SGRPROJ_RST_BITS == 4);
+
+  int row = 0;
+  int rows_left = height;
+  do {
+    uint16_t *a_row = A + row * buf_stride;
+    int32_t *b_row = B + row * buf_stride;
+    int16_t *in_row = src + row * src_stride;
+    int32_t *out_row = dst + row * dst_stride;
+    int cols_left = width;
+
+    if ((row & 1) == 0) {
+      do {
+        const int16x8_t pix = vld1q_s16(in_row);
+        int32x4_t acc_lo, acc_hi;
+
+        cross_sum_fast_even_row_inp16(a_row, buf_stride, &acc_lo, &acc_hi);
+        acc_lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), acc_lo);
+        acc_hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), acc_hi);
+
+        acc_lo = vaddq_s32(acc_lo, cross_sum_fast_even_row(b_row, buf_stride));
+        acc_hi =
+            vaddq_s32(acc_hi, cross_sum_fast_even_row(b_row + 4, buf_stride));
+
+        vst1q_s32(out_row,
+                  vrshrq_n_s32(acc_lo, SGRPROJ_SGR_BITS + NB_EVEN -
+                                           SGRPROJ_RST_BITS));
+        vst1q_s32(out_row + 4,
+                  vrshrq_n_s32(acc_hi, SGRPROJ_SGR_BITS + NB_EVEN -
+                                           SGRPROJ_RST_BITS));
+
+        a_row += 8;
+        b_row += 8;
+        in_row += 8;
+        out_row += 8;
+        cols_left -= 8;
+      } while (cols_left > 0);
+    } else {
+      do {
+        const int16x8_t pix = vld1q_s16(in_row);
+        int32x4_t acc_lo, acc_hi;
+
+        cross_sum_fast_odd_row_inp16(a_row, &acc_lo, &acc_hi);
+        acc_lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), acc_lo);
+        acc_hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), acc_hi);
+
+        acc_lo = vaddq_s32(acc_lo, cross_sum_fast_odd_row(b_row));
+        acc_hi = vaddq_s32(acc_hi, cross_sum_fast_odd_row(b_row + 4));
+
+        vst1q_s32(out_row,
+                  vrshrq_n_s32(acc_lo, SGRPROJ_SGR_BITS + NB_ODD -
+                                           SGRPROJ_RST_BITS));
+        vst1q_s32(out_row + 4,
+                  vrshrq_n_s32(acc_hi, SGRPROJ_SGR_BITS + NB_ODD -
+                                           SGRPROJ_RST_BITS));
+
+        a_row += 8;
+        b_row += 8;
+        in_row += 8;
+        out_row += 8;
+        cols_left -= 8;
+      } while (cols_left > 0);
+    }
+
+    ++row;
+    --rows_left;
+  } while (rows_left > 0);
+}
+
+// Last stage of the full (non-fast) self-guided filter. For every output
+// position it forms (3x3 weighted sum of A) * src + (3x3 weighted sum of B),
+// applies a rounding right shift by SGRPROJ_SGR_BITS + NB_EVEN -
+// SGRPROJ_RST_BITS and stores the 32-bit result in dst.
+//
+// A and B share buf_stride; src and dst have their own strides. Eight
+// positions are produced per step and at least one step/row is always
+// executed, so the final step of a row may extend past `width`.
+void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
+                           int16_t *src, const int src_stride, int32_t *dst,
+                           const int dst_stride, const int width,
+                           const int height) {
+  assert(SGRPROJ_SGR_BITS == 8);
+  assert(SGRPROJ_RST_BITS == 4);
+
+  int row = 0;
+  int rows_left = height;
+  do {
+    uint16_t *a_row = A + row * buf_stride;
+    int32_t *b_row = B + row * buf_stride;
+    int16_t *in_row = src + row * src_stride;
+    int32_t *out_row = dst + row * dst_stride;
+    int cols_left = width;
+
+    do {
+      const int16x8_t pix = vld1q_s16(in_row);
+      int32x4_t acc_lo, acc_hi;
+
+      cross_sum_inp_u16(a_row, buf_stride, &acc_lo, &acc_hi);
+      acc_lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), acc_lo);
+      acc_hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), acc_hi);
+
+      acc_lo = vaddq_s32(acc_lo, cross_sum_inp_s32(b_row, buf_stride));
+      acc_hi = vaddq_s32(acc_hi, cross_sum_inp_s32(b_row + 4, buf_stride));
+
+      vst1q_s32(out_row,
+                vrshrq_n_s32(acc_lo, SGRPROJ_SGR_BITS + NB_EVEN -
+                                         SGRPROJ_RST_BITS));
+      vst1q_s32(out_row + 4,
+                vrshrq_n_s32(acc_hi, SGRPROJ_SGR_BITS + NB_EVEN -
+                                         SGRPROJ_RST_BITS));
+
+      a_row += 8;
+      b_row += 8;
+      in_row += 8;
+      out_row += 8;
+      cols_left -= 8;
+    } while (cols_left > 0);
+
+    ++row;
+    --rows_left;
+  } while (rows_left > 0);
+}
+
+// Runs the fast self-guided pass (radius index 0, r == 2) on a 16-bit block
+// and writes 32-bit filter output to dst.
+//
+// Steps: boxsum2 over the block extended by the SGRPROJ borders, a/b
+// computation on every second row (calc_ab_fast_internal_lbd for 8-bit,
+// calc_ab_fast_internal_hbd otherwise), then final_filter_fast_internal.
+// Three RESTORATION_PROC_UNIT_PELS scratch planes live on the stack.
+static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
+                                             int height, int dgd_stride,
+                                             int32_t *dst, int dst_stride,
+                                             int bit_depth, int sgr_params_idx,
+                                             int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Scratch row pitch: width_ext rounded up to a multiple of 4, plus 16.
+  const int buf_stride = ((width_ext + 3) & ~3) + 16;
+  // Offset of the first non-border element inside each scratch plane.
+  const int interior = SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  // One row up and one column left of the interior origin.
+  const int ab_origin = interior - buf_stride - 1;
+
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+
+  assert(radius_idx == 0);
+  assert(r == 2);
+
+  // The input is 16-bit. The first-stage pixel sums go to the 16-bit plane
+  // (A16_), the final pixel sums to the 32-bit plane B_, and the sums of
+  // squares to the 32-bit plane A_.
+  int16_t *const src_top_left =
+      (int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                  SGRPROJ_BORDER_HORZ);
+  boxsum2(src_top_left, dgd_stride, (int16_t *)A16_, B_, A_, buf_stride,
+          width_ext, height_ext);
+
+  // a/b computation: the "a" output lands in the 16-bit plane (the original
+  // comment gives its range as [1, 256] for all bit depths), "b" stays 32-bit.
+  if (bit_depth == 8) {
+    calc_ab_fast_internal_lbd(A_ + ab_origin, A16_ + ab_origin, B_ + ab_origin,
+                              buf_stride * 2, width + 2, height + 2, r,
+                              params->s[radius_idx], 2);
+  } else {
+    calc_ab_fast_internal_hbd(A_ + ab_origin, A16_ + ab_origin, B_ + ab_origin,
+                              buf_stride * 2, width + 2, height + 2, bit_depth,
+                              r, params->s[radius_idx], 2);
+  }
+
+  final_filter_fast_internal(A16_ + interior, B_ + interior, buf_stride,
+                             (int16_t *)dgd16, dgd_stride, dst, dst_stride,
+                             width, height);
+}
+
+// Runs the full self-guided pass (radius index 1, r == 1) on a 16-bit block
+// and writes 32-bit filter output to dst.
+//
+// Steps: boxsum1 over the block extended by the SGRPROJ borders, a/b
+// computation on every row (calc_ab_internal_lbd for 8-bit,
+// calc_ab_internal_hbd otherwise), then final_filter_internal.
+// Four RESTORATION_PROC_UNIT_PELS scratch planes live on the stack.
+static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
+                                        int dgd_stride, int32_t *dst,
+                                        int dst_stride, int bit_depth,
+                                        int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Scratch row pitch: width_ext rounded up to a multiple of 4, plus 16.
+  const int buf_stride = ((width_ext + 3) & ~3) + 16;
+  // Offset of the first non-border element inside each scratch plane.
+  const int interior = SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  // One row up and one column left of the interior origin.
+  const int ab_origin = interior - buf_stride - 1;
+
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+  uint16_t B16_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+
+  assert(radius_idx == 1);
+  assert(r == 1);
+
+  // The input is 16-bit. Pixel sums go to the 16-bit plane B16_, sums of
+  // squares to the 32-bit plane A_.
+  int16_t *const src_top_left =
+      (int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                  SGRPROJ_BORDER_HORZ);
+  boxsum1(src_top_left, dgd_stride, B16_, A_, buf_stride, width_ext,
+          height_ext);
+
+  // a/b computation: "a" is written to the 16-bit plane A16_ (the original
+  // comment gives its range as [1, 256] for all bit depths), "b" to the
+  // 32-bit plane B_.
+  if (bit_depth == 8) {
+    calc_ab_internal_lbd(A_ + ab_origin, A16_ + ab_origin, B16_ + ab_origin,
+                         B_ + ab_origin, buf_stride, width + 2, height + 2, r,
+                         params->s[radius_idx], 1);
+  } else {
+    calc_ab_internal_hbd(A_ + ab_origin, A16_ + ab_origin, B16_ + ab_origin,
+                         B_ + ab_origin, buf_stride, width + 2, height + 2,
+                         bit_depth, r, params->s[radius_idx], 1);
+  }
+
+  final_filter_internal(A16_ + interior, B_ + interior, buf_stride,
+                        (int16_t *)dgd16, dgd_stride, dst, dst_stride, width,
+                        height);
+}
+
+// Widens a width x height block of 8-bit samples to 16 bits.
+//
+// src, src_stride: 8-bit input and its row pitch.
+// dst, dst_stride: 16-bit output and its row pitch.
+// width, height:   size of the block to convert.
+//
+// Complete 8x4 tiles go through NEON; leftover columns and rows are handled
+// with scalar code. The vector path is taken only for complete tiles, so no
+// sample outside the width x height block is read or written. (The previous
+// guard `w >= 7` let a 7-wide block through the 8-wide path, and the outer
+// do-while always processed four rows even when fewer were requested.)
+static INLINE void src_convert_u8_to_u16(const uint8_t *src,
+                                         const int src_stride, uint16_t *dst,
+                                         const int dst_stride, const int width,
+                                         const int height) {
+  int rows_left = height;
+
+  // Groups of four complete rows.
+  while (rows_left >= 4) {
+    const uint8_t *src_ptr = src;
+    uint16_t *dst_ptr = dst;
+    int cols_left = width;
+
+    while (cols_left >= 8) {
+      uint8x8_t t0, t1, t2, t3;
+      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
+      store_u16_8x4(dst_ptr, dst_stride, vmovl_u8(t0), vmovl_u8(t1),
+                    vmovl_u8(t2), vmovl_u8(t3));
+      src_ptr += 8;
+      dst_ptr += 8;
+      cols_left -= 8;
+    }
+
+    // Fewer than eight columns remain in this group of rows.
+    for (int x = 0; x < cols_left; x++) {
+      dst_ptr[x] = src_ptr[x];
+      dst_ptr[x + 1 * dst_stride] = src_ptr[x + 1 * src_stride];
+      dst_ptr[x + 2 * dst_stride] = src_ptr[x + 2 * src_stride];
+      dst_ptr[x + 3 * dst_stride] = src_ptr[x + 3 * src_stride];
+    }
+
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    rows_left -= 4;
+  }
+
+  // Fewer than four rows remain.
+  for (int y = 0; y < rows_left; y++) {
+    for (int x = 0; x < width; x++) {
+      dst[x + y * dst_stride] = src[x + y * src_stride];
+    }
+  }
+}
+
+// Copies a width x height block of 16-bit samples between two strided
+// buffers.
+//
+// src, src_stride: input and its row pitch (in samples).
+// dst, dst_stride: output and its row pitch (in samples).
+// width, height:   size of the block to copy.
+//
+// Complete 8x4 tiles go through NEON; leftover columns are copied with scalar
+// code and leftover rows with memcpy. The vector path is taken only for
+// complete tiles, so no sample outside the width x height block is read or
+// written. (The previous unguarded do-while loops always copied one full 8x4
+// tile, even for blocks narrower than 8 columns or shorter than 4 rows.)
+static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
+                                        uint16_t *dst, const int dst_stride,
+                                        int width, int height) {
+  int rows_left = height;
+
+  // Groups of four complete rows.
+  while (rows_left >= 4) {
+    const uint16_t *src_ptr = src;
+    uint16_t *dst_ptr = dst;
+    int cols_left = width;
+
+    while (cols_left >= 8) {
+      uint16x8_t s0, s1, s2, s3;
+      load_u16_8x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+      store_u16_8x4(dst_ptr, dst_stride, s0, s1, s2, s3);
+      src_ptr += 8;
+      dst_ptr += 8;
+      cols_left -= 8;
+    }
+
+    // Fewer than eight columns remain in this group of rows.
+    for (int x = 0; x < cols_left; x++) {
+      dst_ptr[x] = src_ptr[x];
+      dst_ptr[x + 1 * dst_stride] = src_ptr[x + 1 * src_stride];
+      dst_ptr[x + 2 * dst_stride] = src_ptr[x + 2 * src_stride];
+      dst_ptr[x + 3 * dst_stride] = src_ptr[x + 3 * src_stride];
+    }
+
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    rows_left -= 4;
+  }
+
+  // Fewer than four rows remain.
+  for (int y = 0; y < rows_left; y++) {
+    memcpy((dst + y * dst_stride), (src + y * src_stride),
+           sizeof(uint16_t) * width);
+  }
+}
+
+// Computes the self-guided filter outputs for one block.
+//
+// dat8/stride:  source block; treated as 16-bit samples (via
+//               CONVERT_TO_SHORTPTR) when highbd is non-zero.
+// flt0, flt1:   32-bit outputs of the radius-index-0 and radius-index-1
+//               passes, both with row pitch flt_stride. A pass is skipped
+//               when its radius in sgr_params[sgr_params_idx] is 0.
+//
+// The block plus its SGRPROJ border is first copied into a 16-bit stack
+// buffer whose row pitch is width + 2 * SGRPROJ_BORDER_HORZ.
+void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+                                     int stride, int32_t *flt0, int32_t *flt1,
+                                     int flt_stride, int sgr_params_idx,
+                                     int bit_depth, int highbd) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+
+  uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+  // First sample of the block proper, i.e. past the top/left border.
+  uint16_t *const dgd16 =
+      dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+  // Top-left corner of the bordered copy (this is dgd16_ itself).
+  uint16_t *const dgd16_top_left =
+      dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ;
+
+  if (!highbd) {
+    const uint8_t *const src_top_left =
+        dat8 - SGRPROJ_BORDER_VERT * stride - SGRPROJ_BORDER_HORZ;
+    src_convert_u8_to_u16(src_top_left, stride, dgd16_top_left, dgd16_stride,
+                          width_ext, height_ext);
+  } else {
+    const uint16_t *const dat16 = CONVERT_TO_SHORTPTR(dat8);
+    const uint16_t *const src_top_left =
+        dat16 - SGRPROJ_BORDER_VERT * stride - SGRPROJ_BORDER_HORZ;
+    src_convert_hbd_copy(src_top_left, stride, dgd16_top_left, dgd16_stride,
+                         width_ext, height_ext);
+  }
+
+  if (params->r[0] > 0) {
+    restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
+                              flt_stride, bit_depth, sgr_params_idx, 0);
+  }
+  if (params->r[1] > 0) {
+    restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
+                         bit_depth, sgr_params_idx, 1);
+  }
+}
+
+void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) { /*
+ * Applies the self-guided restoration filter to one block and writes the
+ * restored pixels to dst8.
+ *
+ * dat8/stride: source block; read as 16-bit samples (CONVERT_TO_SHORTPTR)
+ * when highbd is non-zero.
+ * eps: index into sgr_params selecting the radii.
+ * xqd: weights passed to decode_xq(), which fills xq[0..1] (its semantics
+ * are defined elsewhere).
+ * dst8/dst_stride: output; written as 16-bit samples when highbd is
+ * non-zero, otherwise as 8-bit.
+ * tmpbuf: scratch; flt0 starts at tmpbuf and flt1 RESTORATION_UNITPELS_MAX
+ * elements later, both using `width` as their row pitch.
+ *
+ * NOTE(review): the output loop stores 8 pixels per step, so when width is
+ * not a multiple of 8 the last store of a row extends past `width`. Confirm
+ * that callers tolerate this.
+ */
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ if (highbd) { // copy the block plus its border into the 16-bit stack buffer
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+
+ if (params->r[0] > 0) // a radius of 0 disables the corresponding pass
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+ bit_depth, eps, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+ bit_depth, eps, 1);
+
+ decode_xq(xqd, xq, params);
+
+ { // Blend the source with the filter outputs, 8 pixels per step.
+ int16_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint16_t *dst16_ptr;
+ int16x4_t d0, d4;
+ int16x8_t r0, s0;
+ uint16x8_t r4;
+ int32x4_t u0, u4, v0, v4, f00, f10;
+ uint8x8_t t0;
+ int count = 0, w = width, h = height, rc = 0;
+
+ const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+ const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+ const int16x8_t zero = vdupq_n_s16(0);
+ const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1); // largest sample value for bit_depth
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+ dst_ptr = dst8;
+ src_ptr = (int16_t *)dgd16;
+ do {
+ w = width;
+ count = 0;
+ dst_ptr = dst8 + rc * dst_stride;
+ dst16_ptr = dst16 + rc * dst_stride;
+ do {
+ s0 = vld1q_s16(src_ptr + count);
+
+ u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS); // u = src << SGRPROJ_RST_BITS
+ u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+ v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS); // v starts as u << SGRPROJ_PRJ_BITS
+ v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) { // v += xq[0] * (flt0 - u)
+ f00 = vld1q_s32(flt0 + count);
+ f10 = vld1q_s32(flt0 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq0_vec, f00);
+ v4 = vmlaq_s32(v4, xq0_vec, f10);
+ }
+
+ if (params->r[1] > 0) { // v += xq[1] * (flt1 - u)
+ f00 = vld1q_s32(flt1 + count);
+ f10 = vld1q_s32(flt1 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq1_vec, f00);
+ v4 = vmlaq_s32(v4, xq1_vec, f10);
+ }
+
+ d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); // rounding shift back to pixel scale, saturating to 16 bits
+ d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ r0 = vcombine_s16(d0, d4);
+
+ r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); // clamp negatives to 0
+
+ if (highbd) {
+ r4 = vminq_u16(r4, max); // clamp to the bit-depth maximum
+ vst1q_u16(dst16_ptr, r4);
+ } else {
+ t0 = vqmovn_u16(r4); // saturating narrow to 8 bits
+ vst1_u8(dst_ptr, t0);
+ }
+ w -= 8;
+ count += 8;
+ dst_ptr += 8;
+ dst16_ptr += 8;
+ } while (w > 0);
+
+ src_ptr += dgd16_stride; // advance source and filter planes by one row
+ flt1 += width;
+ flt0 += width;
+ rc++;
+ h--;
+ } while (h > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index 53727bb435..fe134087b4 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -419,4 +419,42 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
*a3 = vreinterpret_s16_s32(c1.val[1]);
}
+// Regroups two int32x4_t vectors by 64-bit halves:
+//   val[0] = { low half of a0,  low half of a1 }
+//   val[1] = { high half of a0, high half of a1 }
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t halves;
+  halves.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+  halves.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  return halves;
+}
+
+// In-place transpose of a 4x4 matrix of int32 held in four row vectors.
+// With element "rc" denoting row r, column c of the input:
+//   in:  a0 = 00 01 02 03      out: a0 = 00 10 20 30
+//        a1 = 10 11 12 13           a1 = 01 11 21 31
+//        a2 = 20 21 22 23           a2 = 02 12 22 32
+//        a3 = 30 31 32 33           a3 = 03 13 23 33
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                     int32x4_t *a2, int32x4_t *a3) {
+  // Stage 1: interleave 32-bit lanes within each pair of rows.
+  //   top.val[0]    = 00 10 02 12    top.val[1]    = 01 11 03 13
+  //   bottom.val[0] = 20 30 22 32    bottom.val[1] = 21 31 23 33
+  const int32x4x2_t top = vtrnq_s32(*a0, *a1);
+  const int32x4x2_t bottom = vtrnq_s32(*a2, *a3);
+
+  // Stage 2: exchange 64-bit halves between the two pairs.
+  //   even_cols.val[0] = 00 10 20 30    even_cols.val[1] = 02 12 22 32
+  //   odd_cols.val[0]  = 01 11 21 31    odd_cols.val[1]  = 03 13 23 33
+  const int32x4x2_t even_cols =
+      aom_vtrnq_s64_to_s32(top.val[0], bottom.val[0]);
+  const int32x4x2_t odd_cols = aom_vtrnq_s64_to_s32(top.val[1], bottom.val[1]);
+
+  *a0 = even_cols.val[0];
+  *a1 = odd_cols.val[0];
+  *a2 = even_cols.val[1];
+  *a3 = odd_cols.val[1];
+}
+
#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 738290fadf..9d68b8760f 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -1308,7 +1308,7 @@ static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
end <<= MI_SIZE_LOG2;
uint8_t *ref0 = ref_buf;
uint8_t *dst0 = dst_buf;
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
for (int j = 0; j < 4; ++j) {
@@ -1404,11 +1404,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
- if (cm->use_highbitdepth)
+ if (cm->seq_params.use_highbitdepth)
highbd_filter_selectively_vert_row2(
ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
- &cm->lf_info, lfl, lfl2, (int)cm->bit_depth);
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
else
filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
mask_16x16_0, mask_8x8_0, mask_4x4_0,
@@ -1474,10 +1474,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
- if (cm->use_highbitdepth)
- highbd_filter_selectively_horiz(
- CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
- mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth);
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl,
+ (int)cm->seq_params.bit_depth);
else
filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
mask_8x8, mask_4x4, &cm->lf_info, lfl);
@@ -1652,6 +1653,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
const int dst_stride = plane_ptr->dst.stride;
const int y_range = (MAX_MIB_SIZE >> scale_vert);
const int x_range = (MAX_MIB_SIZE >> scale_horz);
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
for (int y = 0; y < y_range; y += row_step) {
uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
for (int x = 0; x < x_range;) {
@@ -1677,40 +1680,40 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
case 6: // apply 6-tap filter for chroma plane only
assert(plane != 0);
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 8-tap filtering
case 8:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 14-tap filtering
case 14:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
@@ -1737,6 +1740,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
const int dst_stride = plane_ptr->dst.stride;
const int y_range = (MAX_MIB_SIZE >> scale_vert);
const int x_range = (MAX_MIB_SIZE >> scale_horz);
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
for (int x = 0; x < x_range; x += col_step) {
uint8_t *p = dst_ptr + x * MI_SIZE;
for (int y = 0; y < y_range;) {
@@ -1762,10 +1767,10 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
@@ -1773,30 +1778,30 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
// apply 6-tap filtering
case 6:
assert(plane != 0);
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 8-tap filtering
case 8:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 14-tap filtering
case 14:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
index 38e26bee17..a77a4d2541 100644
--- a/third_party/aom/av1/common/av1_rtcd.c
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -16,7 +16,7 @@
#include "aom_ports/aom_once.h"
void av1_rtcd() {
- // TODO(JBB): Remove this once, by insuring that both the encoder and
- // decoder setup functions are protected by once();
- once(setup_rtcd_internal);
+ // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and
+ // decoder setup functions are protected by aom_once();
+ aom_once(setup_rtcd_internal);
}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index 6aa925515a..fa8b34981d 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -106,7 +106,7 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
#inv txfm
add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_inv_txfm_add ssse3 avx2/;
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -181,7 +181,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#fwd txfm
add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
- specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1/;
+ specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -241,11 +241,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_txb_init_levels sse4_1/;
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
- specialize qw/av1_wedge_sse_from_residuals sse2/;
+ specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
- specialize qw/av1_wedge_sign_from_residuals sse2/;
+ specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
- specialize qw/av1_wedge_compute_delta_squares sse2/;
+ specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
# hash
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
@@ -288,34 +288,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# LOOP_RESTORATION functions
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-specialize qw/apply_selfguided_restoration sse4_1 avx2/;
+specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
int sgr_params_idx, int bit_depth, int highbd";
-specialize qw/av1_selfguided_restoration sse4_1 avx2/;
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
# CONVOLVE_ROUND/COMPOUND_ROUND functions
-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-
- add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
- add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+
+ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index 5db3233f53..c9cc798525 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -171,53 +171,6 @@ static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
}
-static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
- switch (tx_size) {
- case TX_4X4: return TX_4X4;
- case TX_8X8: return TX_8X8;
- case TX_16X16: return TX_16X16;
- case TX_32X32: return TX_32X32;
- case TX_64X64: return TX_64X64;
- case TX_32X64: return TX_64X32;
- case TX_64X32: return TX_32X64;
- case TX_4X8: return TX_8X4;
- case TX_8X4: return TX_4X8;
- case TX_8X16: return TX_16X8;
- case TX_16X8: return TX_8X16;
- case TX_16X32: return TX_32X16;
- case TX_32X16: return TX_16X32;
- case TX_4X16: return TX_16X4;
- case TX_16X4: return TX_4X16;
- case TX_8X32: return TX_32X8;
- case TX_32X8: return TX_8X32;
- case TX_16X64: return TX_64X16;
- case TX_64X16: return TX_16X64;
- default: assert(0); return TX_INVALID;
- }
-}
-
-static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) {
- switch (tx_type) {
- case DCT_DCT: return DCT_DCT;
- case ADST_DCT: return DCT_ADST;
- case DCT_ADST: return ADST_DCT;
- case ADST_ADST: return ADST_ADST;
- case FLIPADST_DCT: return DCT_FLIPADST;
- case DCT_FLIPADST: return FLIPADST_DCT;
- case FLIPADST_FLIPADST: return FLIPADST_FLIPADST;
- case ADST_FLIPADST: return FLIPADST_ADST;
- case FLIPADST_ADST: return ADST_FLIPADST;
- case IDTX: return IDTX;
- case V_DCT: return H_DCT;
- case H_DCT: return V_DCT;
- case V_ADST: return H_ADST;
- case H_ADST: return V_ADST;
- case V_FLIPADST: return H_FLIPADST;
- case H_FLIPADST: return V_FLIPADST;
- default: assert(0); return TX_TYPES;
- }
-}
-
// Utility function that returns the log of the ratio of the col and row
// sizes.
static INLINE int get_rect_tx_log_ratio(int col, int row) {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 3e8d1d6c6b..979f13bd99 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -605,6 +605,12 @@ static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
}
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+ return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? CONVERT_TO_BYTEPTR(buf16)
+ : buf16;
+}
+
static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_4X4: return 0;
@@ -674,6 +680,15 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
};
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+ 0x0001, // 0000 0000 0000 0001
+ 0x0201, // 0000 0010 0000 0001
+ 0x020F, // 0000 0010 0000 1111
+ 0x0E0F, // 0000 1110 0000 1111
+ 0x0FFF, // 0000 1111 1111 1111
+ 0xFFFF, // 1111 1111 1111 1111
+};
+
static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
int use_reduced_set) {
const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
@@ -1145,38 +1160,6 @@ static INLINE PLANE_TYPE get_plane_type(int plane) {
return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
}
-static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
- const uint8_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_uint16(uint16_t *dst, int dst_stride,
- const uint16_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int16(int16_t *dst, int dst_stride,
- const int16_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int32(int32_t *dst, int dst_stride,
- const int32_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
return 1024;
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
index c9b9749008..e9e2b0e42c 100644
--- a/third_party/aom/av1/common/cdef.c
+++ b/third_party/aom/av1/common/cdef.c
@@ -110,7 +110,7 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
const uint8_t *src, int src_voffset, int src_hoffset,
int sstride, int vsize, int hsize) {
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
const uint16_t *base =
&CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
@@ -153,7 +153,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int mi_high_l2[3];
int xdec[3];
int ydec[3];
- int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
@@ -363,7 +363,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
}
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
cdef_filter_fb(
NULL,
&CONVERT_TO_SHORTPTR(
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
index ee19f0bcfb..ccc59b4eb7 100644
--- a/third_party/aom/av1/common/cfl.c
+++ b/third_party/aom/av1/common/cfl.c
@@ -15,21 +15,14 @@
#include "config/av1_rtcd.h"
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
- if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) &&
- !(cm->subsampling_x == 1 && cm->subsampling_y == 1) &&
- !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) {
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by "
- "CfL, %d %d subsampling is not supported.\n",
- cm->subsampling_x, cm->subsampling_y);
- }
+
memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
- cfl->subsampling_x = cm->subsampling_x;
- cfl->subsampling_y = cm->subsampling_y;
+ cfl->subsampling_x = seq_params->subsampling_x;
+ cfl->subsampling_y = seq_params->subsampling_y;
cfl->are_parameters_computed = 0;
cfl->store_y = 0;
// The DC_PRED cache is disabled by default and is only enabled in
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index d57f44f8b3..ed962c722a 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -75,8 +75,8 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -91,7 +91,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -107,7 +107,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -126,8 +126,8 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -141,7 +141,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -156,8 +156,8 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -172,7 +172,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -187,8 +187,8 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
@@ -204,8 +204,8 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -222,7 +222,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -238,7 +238,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -270,8 +270,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -289,7 +289,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -320,8 +320,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -339,7 +339,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -370,8 +370,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -412,8 +412,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
@@ -439,7 +439,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(x_filter_idx < SUBPEL_SHIFTS);
const int16_t *x_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
sum += x_filter[k] * src_x[k - fo_horiz];
@@ -461,7 +461,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(y_filter_idx < SUBPEL_SHIFTS);
const int16_t *y_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -498,8 +498,8 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
static void convolve_2d_scale_wrapper(
const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
if (conv_params->is_compound) {
@@ -520,25 +520,27 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
(void)y_step_q4;
(void)dst;
(void)dst_stride;
-
- InterpFilterParams filter_params_x, filter_params_y;
- av1_get_convolve_filter_params(interp_filters, &filter_params_x,
- &filter_params_y, w, h);
+ InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+ InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(filter_x, w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(filter_y, h);
if (scaled)
convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
- &filter_params_x, &filter_params_y, subpel_x_q4,
+ filter_params_x, filter_params_y, subpel_x_q4,
x_step_q4, subpel_y_q4, y_step_q4, conv_params);
else
sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
- src, src_stride, dst, dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
}
void av1_highbd_convolve_2d_copy_sr_c(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
@@ -554,8 +556,8 @@ void av1_highbd_convolve_2d_copy_sr_c(
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -569,7 +571,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -585,8 +587,8 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -599,7 +601,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -614,8 +616,8 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -630,7 +632,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -646,7 +648,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -666,8 +668,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
int x, y, k;
@@ -685,7 +688,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (y = 0; y < im_h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -703,7 +706,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
int16_t *src_vert = im_block + fo_vert * im_stride;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = 1 << offset_bits;
@@ -734,8 +737,9 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -753,7 +757,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
assert(bits >= 0);
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -784,8 +788,9 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -803,7 +808,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
assert(bits >= 0);
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -834,8 +839,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_2d_copy_c(
const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
- int w, int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -875,8 +880,8 @@ void av1_highbd_jnt_convolve_2d_copy_c(
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params, int bd) {
@@ -900,7 +905,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(x_filter_idx < SUBPEL_SHIFTS);
const int16_t *x_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
sum += x_filter[k] * src_x[k - fo_horiz];
@@ -922,7 +927,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(y_filter_idx < SUBPEL_SHIFTS);
const int16_t *y_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -971,9 +976,12 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
(void)dst_stride;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- InterpFilterParams filter_params_x, filter_params_y;
- av1_get_convolve_filter_params(interp_filters, &filter_params_x,
- &filter_params_y, w, h);
+ InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+ InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(filter_x, w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(filter_y, h);
if (scaled) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -981,16 +989,16 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
assert(conv_params->dst != NULL);
}
av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
- &filter_params_x, &filter_params_y,
- subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
- conv_params, bd);
+ filter_params_x, filter_params_y, subpel_x_q4,
+ x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+ bd);
} else {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
0][conv_params->is_compound](
- src, src_stride, dst, dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
}
}
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index 1b2c2d0d5a..bc2d4bccf6 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -40,27 +40,17 @@ typedef struct ConvolveParams {
typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
typedef void (*aom_highbd_convolve_fn_t)(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
- InterpFilterParams *params_x,
- InterpFilterParams *params_y,
- int w, int h) {
- InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
- InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
- *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
- *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
-}
-
struct AV1Common;
struct scale_factors;
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index a37ee9f24f..689c25f30a 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -557,6 +557,7 @@ typedef uint8_t TXFM_CONTEXT;
#define BWDREF_FRAME 5
#define ALTREF2_FRAME 6
#define ALTREF_FRAME 7
+#define EXTREF_FRAME REF_FRAMES
#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
@@ -607,6 +608,7 @@ typedef enum ATTRIBUTE_PACKED {
// In large_scale_tile coding, external references are used.
#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
#ifdef __cplusplus
} // extern "C"
diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c
deleted file mode 100644
index a7e67ea4ad..0000000000
--- a/third_party/aom/av1/common/filter.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/filter.h"
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- bilinear_filters[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
- { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
- { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
- { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
- { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
- { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
- { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
- { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_8[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
- { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
- { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
- { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
- { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
- { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
- { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
- { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
- { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
- { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
- { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
- { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
- { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
- { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
- { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
- { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
- { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
- { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
- { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
- { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
- { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
- { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
-};
-
-static const InterpFilterParams
- av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
- { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_REGULAR },
- { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH },
- { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
- MULTITAP_SHARP },
- { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
- BILINEAR }
- };
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_4[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
- { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
- { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
- { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
- { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
- { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
- { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
- { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
-};
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
- { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
- { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
- { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
- { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
- { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
- { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
- { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
-};
-
-static const InterpFilterParams av1_interp_4tap[2] = {
- { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_REGULAR },
- { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH },
-};
-
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
- const InterpFilter interp_filter, const int w) {
- if (w <= 4 &&
- (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR))
- return av1_interp_4tap[0];
- else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH)
- return av1_interp_4tap[1];
-
- return av1_interp_filter_params_list[interp_filter];
-}
-
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
- return (const int16_t *)av1_interp_filter_params_list[interp_filter]
- .filter_ptr;
-}
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 0c24ad9d04..7f8ad583a9 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -64,8 +64,8 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
}
-#define LOG_SWITCHABLE_FILTERS \
- 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
#define MAX_SUBPEL_TAPS 12
#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
@@ -79,14 +79,116 @@ typedef struct InterpFilterParams {
InterpFilter interp_filter;
} InterpFilterParams;
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
+ { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+ SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
+ { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
+ { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+ { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+ { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+ { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+ { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+ { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+ { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR },
+};
+
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+ const int w) {
+ if (w <= 4) return &av1_interp_4tap[interp_filter];
+ return &av1_interp_filter_params_list[interp_filter];
+}
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
- const InterpFilter interp_filter, const int w);
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+ const InterpFilter interp_filter) {
+ return av1_interp_filter_params_list[interp_filter].filter_ptr;
+}
static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
- const InterpFilterParams filter_params, const int subpel) {
- return filter_params.filter_ptr + filter_params.taps * subpel;
+ const InterpFilterParams *const filter_params, const int subpel) {
+ return filter_params->filter_ptr + filter_params->taps * subpel;
}
#ifdef __cplusplus
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index a6227f18f5..c2495640e6 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -294,9 +294,6 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
mv->row = clamp(mv->row, min_row, max_row);
}
-static INLINE int mv_has_subpel(const MV *mv) {
- return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
-}
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index 716b4a2473..f68c159e1f 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -44,7 +44,7 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
assert(b >= 0 && b < (1 << bits));
int diff = a - b;
- int m = 1 << (bits - 1);
+ const int m = 1 << (bits - 1);
diff = (diff & (m - 1)) - (diff & m);
return diff;
}
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index fa5f02e520..6b1bf2d74b 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -184,7 +184,10 @@ typedef struct BitstreamLevel {
uint8_t minor;
} BitstreamLevel;
-/* Initial version of sequence header structure */
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by are_seq_headers_consistent() function.
typedef struct SequenceHeader {
int num_bits_width;
int num_bits_height;
@@ -205,7 +208,6 @@ typedef struct SequenceHeader {
// 2 - adaptive
int still_picture; // Video is a single frame still picture
int reduced_still_picture_hdr; // Use reduced header for still picture
- int monochrome; // Monochorme video
int enable_filter_intra; // enables/disables filterintra
int enable_intra_edge_filter; // enables/disables corner/edge/upsampling
int enable_interintra_compound; // enables/disables interintra_compound
@@ -229,6 +231,9 @@ typedef struct SequenceHeader {
// enabled for that frame.
int enable_cdef; // To turn on/off CDEF
int enable_restoration; // To turn on/off loop restoration
+ BITSTREAM_PROFILE profile;
+
+ // Operating point info.
int operating_points_cnt_minus_1;
int operating_point_idc[MAX_NUM_OPERATING_POINTS];
int display_model_info_present_flag;
@@ -236,15 +241,26 @@ typedef struct SequenceHeader {
BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0
// or 1.
-} SequenceHeader;
-typedef struct AV1Common {
- struct aom_internal_error_info error;
+ // Color config.
+ aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
+ // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ int use_highbitdepth; // If true, we need to use 16bit frame buffers.
+ int monochrome; // Monochorme video
aom_color_primaries_t color_primaries;
aom_transfer_characteristics_t transfer_characteristics;
aom_matrix_coefficients_t matrix_coefficients;
- aom_chroma_sample_position_t chroma_sample_position;
int color_range;
+ int subsampling_x; // Chroma subsampling for x
+ int subsampling_y; // Chroma subsampling for y
+ aom_chroma_sample_position_t chroma_sample_position;
+ int separate_uv_delta_q;
+
+ int film_grain_params_present;
+} SequenceHeader;
+
+typedef struct AV1Common {
+ struct aom_internal_error_info error;
int width;
int height;
int render_width;
@@ -253,18 +269,11 @@ typedef struct AV1Common {
int last_height;
int timing_info_present;
aom_timing_info_t timing_info;
- int buffer_removal_delay_present;
+ int buffer_removal_time_present;
aom_dec_model_info_t buffer_model;
aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
- int tu_presentation_delay_flag;
- int64_t tu_presentation_delay;
-
- // TODO(jkoleszar): this implies chroma ss right now, but could vary per
- // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
- // support additional planes.
- int subsampling_x;
- int subsampling_y;
+ uint32_t frame_presentation_time;
int largest_tile_id;
size_t largest_tile_size;
@@ -273,8 +282,6 @@ typedef struct AV1Common {
// Scale of the current frame with respect to itself.
struct scale_factors sf_identity;
- // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
- int use_highbitdepth;
YV12_BUFFER_CONFIG *frame_to_show;
RefCntBuffer *prev_frame;
@@ -342,8 +349,6 @@ typedef struct AV1Common {
int u_ac_delta_q;
int v_ac_delta_q;
- int separate_uv_delta_q;
-
// The dequantizers below are true dequntizers used only in the
// dequantization process. They have the same coefficient
// shift/scale as TX.
@@ -447,10 +452,7 @@ typedef struct AV1Common {
unsigned int frame_offset;
unsigned int current_video_frame;
- BITSTREAM_PROFILE profile;
- // AOM_BITS_8 in profile 0 or 1, AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
- aom_bit_depth_t bit_depth;
aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
int error_resilient_mode;
@@ -494,9 +496,8 @@ typedef struct AV1Common {
ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
TXFM_CONTEXT **above_txfm_context;
WarpedMotionParams global_motion[REF_FRAMES];
- aom_film_grain_table_t *film_grain_table;
- int film_grain_params_present;
aom_film_grain_t film_grain_params;
+
int cdef_pri_damping;
int cdef_sec_damping;
int nb_cdef_strengths;
@@ -590,7 +591,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
if (frame_bufs[i].ref_count == 0) break;
if (i != FRAME_BUFFERS) {
- if (frame_bufs[i].buf.use_external_refernce_buffers) {
+ if (frame_bufs[i].buf.use_external_reference_buffers) {
// If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
// external reference buffers. Restore the buffer pointers to point to the
// internally allocated memory.
@@ -598,7 +599,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
ybf->y_buffer = ybf->store_buf_adr[0];
ybf->u_buffer = ybf->store_buf_adr[1];
ybf->v_buffer = ybf->store_buf_adr[2];
- ybf->use_external_refernce_buffers = 0;
+ ybf->use_external_reference_buffers = 0;
}
frame_bufs[i].ref_count = 1;
@@ -683,15 +684,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
}
}
-static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) {
- return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-}
-
-static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) {
- return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-}
-
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
static INLINE int av1_num_planes(const AV1_COMMON *cm) {
return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
@@ -734,7 +727,7 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
}
xd->mi_stride = cm->mi_stride;
xd->error_info = &cm->error;
- cfl_init(&xd->cfl, cm);
+ cfl_init(&xd->cfl, &cm->seq_params);
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1066,17 +1059,18 @@ static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
}
-static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
int mi_col_start, int mi_col_end, const int tile_row) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
const int width = mi_col_end - mi_col_start;
const int aligned_width =
- ALIGN_POWER_OF_TWO(width, cm->seq_params.mib_size_log2);
+ ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
const int offset_y = mi_col_start;
const int width_y = aligned_width;
- const int offset_uv = offset_y >> cm->subsampling_x;
- const int width_uv = width_y >> cm->subsampling_x;
+ const int offset_uv = offset_y >> seq_params->subsampling_x;
+ const int width_uv = width_y >> seq_params->subsampling_x;
av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
if (num_planes > 1) {
@@ -1084,7 +1078,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
} else {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Invalid value of planes");
}
}
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
index 84575d74b4..0e14da7a38 100644
--- a/third_party/aom/av1/common/quant_common.c
+++ b/third_party/aom/av1/common/quant_common.c
@@ -223,29 +223,6 @@ int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
return av1_ac_quant_Q3(qindex, delta, bit_depth);
}
-int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) {
- int i;
- const int16_t *tab = ac_qlookup_Q3;
- switch (bit_depth) {
- case AOM_BITS_10: {
- tab = ac_qlookup_10_Q3;
- break;
- }
- case AOM_BITS_12: {
- tab = ac_qlookup_12_Q3;
- break;
- }
- default:
- assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
- return -1;
- }
- (void)bit_depth;
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (ac_Q3 <= tab[i]) return i;
- }
- return QINDEX_RANGE - 1;
-}
-
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex) {
if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index f9681036d1..ca199e94ce 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -42,7 +42,6 @@ int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth);
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex);
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b6ac436fb5..b9f0b57f38 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -627,9 +627,7 @@ void av1_make_masked_inter_predictor(
tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
#undef INTER_PRED_BYTES_PER_PIXEL
- uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- ? CONVERT_TO_BYTEPTR(tmp_buf)
- : tmp_buf;
+ uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf);
const int tmp_buf_stride = MAX_SB_SIZE;
CONV_BUF_TYPE *org_dst = conv_params->dst;
@@ -1002,8 +1000,8 @@ void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
{ xd->plane[0].dst.stride, 0, 0 } };
if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, ctx, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, ctx, 0, bsize);
}
}
@@ -1609,10 +1607,10 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
const int ssy = xd->plane[plane].subsampling_y;
BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
- xd->mi[0]->angle_delta[PLANE_TYPE_Y] = 0;
- xd->mi[0]->angle_delta[PLANE_TYPE_UV] = 0;
- xd->mi[0]->filter_intra_mode_info.use_filter_intra = 0;
- xd->mi[0]->use_intrabc = 0;
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
+ assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
+ assert(xd->mi[0]->use_intrabc == 0);
av1_predict_intra_block(cm, xd, pd->width, pd->height,
max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
@@ -1642,42 +1640,23 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
inter_pred, inter_stride, intra_pred, intra_stride);
}
-void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, int ystride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
- av1_build_intra_predictors_for_interintra(
- cm, xd, bsize, 0, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, 0, ypred, ystride,
- CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
- return;
- }
- {
- DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, ctx,
- intrapredictor, MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor,
- MAX_SB_SIZE);
- }
-}
-
-void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *upred, int ustride,
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
BUFFER_SET *ctx, int plane,
BLOCK_SIZE bsize) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
av1_build_intra_predictors_for_interintra(
- cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(uintrapredictor),
+ cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, plane, upred, ustride,
- CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride,
+ CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
} else {
- DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
- uintrapredictor, MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor,
+ intrapredictor, MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
MAX_SB_SIZE);
}
}
@@ -1686,8 +1665,8 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *upred, uint8_t *vpred,
int ustride, int vstride,
BUFFER_SET *ctx, BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sbc(cm, xd, upred, ustride, ctx, 1, bsize);
- av1_build_interintra_predictors_sbc(cm, xd, vpred, vstride, ctx, 2, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
}
void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1695,7 +1674,7 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *vpred, int ystride, int ustride,
int vstride, BUFFER_SET *ctx,
BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sby(cm, xd, ypred, ystride, ctx, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
ctx, bsize);
}
@@ -1713,9 +1692,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
struct buf_2d *const pre_buf = &pd->pre[ref];
- const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
- uint8_t *const dst =
- (hbd ? CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x;
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
const MV mv = mi->mv[ref].as_mv;
ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index aa3aefc885..6a3def270d 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -412,12 +412,9 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
int vstride, BUFFER_SET *ctx,
BLOCK_SIZE bsize);
-void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, int ystride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize);
-
-void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *upred, int ustride,
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
BUFFER_SET *ctx, int plane,
BLOCK_SIZE bsize);
@@ -429,6 +426,7 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
void av1_build_intra_predictors_for_interintra(
const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+
void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const uint8_t *inter_pred, int inter_stride,
const uint8_t *intra_pred, int intra_stride);
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
index 21d1f60b29..71a52e73e5 100644
--- a/third_party/aom/av1/common/reconintra.c
+++ b/third_party/aom/av1/common/reconintra.c
@@ -1071,13 +1071,6 @@ static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
p_left[-1] = s;
}
-static int use_intra_edge_upsample(int bs0, int bs1, int delta, int type) {
- const int d = abs(delta);
- const int blk_wh = bs0 + bs1;
- if (d <= 0 || d >= 40) return 0;
- return type ? (blk_wh <= 8) : (blk_wh <= 16);
-}
-
void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
// interpolate half-sample positions
assert(sz <= MAX_UPSAMPLE_SZ);
@@ -1284,13 +1277,13 @@ static void build_intra_predictors_high(
}
}
upsample_above =
- use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
if (need_above && upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
}
upsample_left =
- use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
if (need_left && upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
@@ -1467,13 +1460,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
}
}
upsample_above =
- use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
if (need_above && upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
av1_upsample_intra_edge(above_row, n_px);
}
upsample_left =
- use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
if (need_left && upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
av1_upsample_intra_edge(left_col, n_px);
@@ -1642,4 +1635,6 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
dst_stride, dst, dst_stride, blk_col, blk_row, plane);
}
-void av1_init_intra_predictors(void) { once(init_intra_predictors_internal); }
+void av1_init_intra_predictors(void) {
+ aom_once(init_intra_predictors_internal);
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index a7d9e8b799..57638f24e6 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -12,6 +12,8 @@
#ifndef AV1_COMMON_RECONINTRA_H_
#define AV1_COMMON_RECONINTRA_H_
+#include <stdlib.h>
+
#include "aom/aom_integer.h"
#include "av1/common/blockd.h"
#include "av1/common/onyxc_int.h"
@@ -103,6 +105,14 @@ static INLINE int av1_get_dy(int angle) {
return 1;
}
}
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+ int type) {
+ const int d = abs(delta);
+ const int blk_wh = bs0 + bs1;
+ if (d <= 0 || d >= 40) return 0;
+ return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 17e6823b1e..93d62292a3 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -1100,7 +1100,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
int src_stride, uint8_t *dst, int dst_stride,
int plane, int rows) {
const int is_uv = (plane > 0);
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
const int upscaled_plane_width =
ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
@@ -1141,10 +1141,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
const int pad_left = (j == 0);
const int pad_right = (j == cm->tile_cols - 1);
- if (cm->use_highbitdepth)
- highbd_upscale_normative_rect(
- src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
- dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->bit_depth);
+ if (cm->seq_params.use_highbitdepth)
+ highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+ dst_ptr, rows, dst_width, dst_stride,
+ x_step_qn, x0_qn, pad_left, pad_right,
+ cm->seq_params.bit_depth);
else
upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
rows, dst_width, dst_stride, x_step_qn, x0_qn,
@@ -1175,7 +1176,7 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
const int num_planes = av1_num_planes(cm);
if (cm->width != unscaled->y_crop_width ||
cm->height != unscaled->y_crop_height) {
- av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+ av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
num_planes);
return scaled;
} else {
@@ -1232,6 +1233,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
const int num_planes = av1_num_planes(cm);
if (!av1_superres_scaled(cm)) return;
+ const SequenceHeader *const seq_params = &cm->seq_params;
YV12_BUFFER_CONFIG copy_buffer;
memset(&copy_buffer, 0, sizeof(copy_buffer));
@@ -1239,10 +1241,10 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
- if (aom_alloc_frame_buffer(&copy_buffer, aligned_width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment))
+ if (aom_alloc_frame_buffer(
+ &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate copy buffer for superres upscaling");
@@ -1269,11 +1271,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
"Failed to free current frame buffer before superres upscaling");
// aom_realloc_frame_buffer() leaves config data for frame_to_show intact
- if (aom_realloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
- cm->superres_upscaled_height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, fb, cb, cb_priv))
+ if (aom_realloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate current frame buffer for superres upscaling");
@@ -1283,10 +1285,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
// Don't use callbacks on the encoder.
// aom_alloc_frame_buffer() clears the config data for frame_to_show
- if (aom_alloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
- cm->superres_upscaled_height, cm->subsampling_x,
- cm->subsampling_y, cm->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ if (aom_alloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to reallocate current frame buffer for superres upscaling");
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 58a5275ca9..632967957b 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -42,8 +42,8 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
AV1PixelRect rect;
- int ss_x = is_uv && cm->subsampling_x;
- int ss_y = is_uv && cm->subsampling_y;
+ int ss_x = is_uv && cm->seq_params.subsampling_x;
+ int ss_y = is_uv && cm->seq_params.subsampling_y;
rect.top = 0;
rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
@@ -1146,16 +1146,17 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int optimized_lr,
int num_planes) {
- const int bit_depth = cm->bit_depth;
- const int highbd = cm->use_highbitdepth;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int bit_depth = seq_params->bit_depth;
+ const int highbd = seq_params->use_highbitdepth;
lr_ctxt->dst = &cm->rst_frame;
const int frame_width = frame->crop_widths[0];
const int frame_height = frame->crop_heights[0];
- if (aom_realloc_frame_buffer(lr_ctxt->dst, frame_width, frame_height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL) < 0)
+ if (aom_realloc_frame_buffer(
+ lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL) < 0)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate restoration dst buffer");
@@ -1180,8 +1181,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
highbd);
lr_plane_ctxt->rsi = rsi;
- lr_plane_ctxt->ss_x = is_uv && cm->subsampling_x;
- lr_plane_ctxt->ss_y = is_uv && cm->subsampling_y;
+ lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
+ lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
lr_plane_ctxt->highbd = highbd;
lr_plane_ctxt->bit_depth = bit_depth;
lr_plane_ctxt->data8 = frame->buffers[plane];
@@ -1337,7 +1338,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const RestorationInfo *rsi = &cm->rst_info[plane];
@@ -1350,7 +1351,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rcol0, int *rcol1, int *rrow0,
- int *rrow1, int *tile_tl_idx) {
+ int *rrow1) {
assert(rcol0 && rcol1 && rrow0 && rrow1);
if (bsize != cm->seq_params.sb_size) return 0;
@@ -1383,8 +1384,8 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
// The size of an MI-unit on this plane of the image
- const int ss_x = is_uv && cm->subsampling_x;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const int mi_size_x = MI_SIZE >> ss_x;
const int mi_size_y = MI_SIZE >> ss_y;
@@ -1419,9 +1420,6 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
*rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
*rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
- const int tile_idx = 0;
- *tile_tl_idx = tile_idx * rsi->units_per_tile;
-
return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
@@ -1468,7 +1466,7 @@ static void save_deblock_boundary_lines(
int upscaled_width;
int line_bytes;
if (av1_superres_scaled(cm)) {
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
line_bytes = upscaled_width << use_highbd;
if (use_highbd)
@@ -1515,7 +1513,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
// At the point where this function is called, we've already applied
// superres. So we don't need to extend the lines here, we can just
// pull directly from the topmost row of the upscaled frame.
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
const int upscaled_width = av1_superres_scaled(cm)
? (cm->superres_upscaled_width + ss_x) >> ss_x
: src_width;
@@ -1535,7 +1533,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
int use_highbd, int plane,
AV1_COMMON *cm, int after_cdef) {
const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
@@ -1600,7 +1598,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int after_cdef) {
const int num_planes = av1_num_planes(cm);
- const int use_highbd = cm->use_highbitdepth;
+ const int use_highbd = cm->seq_params.use_highbitdepth;
for (int p = 0; p < num_planes; ++p) {
save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
}
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index 0c40175346..aec37d8341 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -346,7 +346,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rcol0, int *rcol1, int *rrow0,
- int *rrow1, int *tile_tl_idx);
+ int *rrow1);
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
struct AV1Common *cm,
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index c5cebc1354..d206586b5f 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -39,13 +39,6 @@ extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
-static INLINE int get_coef_context(const int16_t *neighbors,
- const uint8_t *token_cache, int c) {
- return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
- token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >>
- 1;
-}
-
static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
TX_TYPE tx_type) {
return &av1_scan_orders[tx_size][tx_type];
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index 3fa998a918..f9b734b8c5 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -572,7 +572,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
for (int plane = 0; plane < num_planes; plane++) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
AV1PixelRect tile_rect = ctxt[plane].tile_rect;
const int unit_size = ctxt[plane].rsi->restoration_unit_size;
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 9a43ab29a7..026c904b66 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -179,8 +179,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
r.bottom = AOMMIN(r.bottom, frame_h);
// Convert to coordinates in the appropriate plane
- const int ss_x = is_uv && cm->subsampling_x;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
index 5ff538ae19..49dbde78fb 100644
--- a/third_party/aom/av1/common/timing.c
+++ b/third_party/aom/av1/common/timing.c
@@ -53,8 +53,8 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
decoder_model->encoder_decoder_buffer_delay_length = 16;
- decoder_model->buffer_removal_delay_length = 10;
- decoder_model->frame_presentation_delay_length = 10;
+ decoder_model->buffer_removal_time_length = 10;
+ decoder_model->frame_presentation_time_length = 10;
}
void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index d31f4b7fc1..1749baa57c 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -27,23 +27,23 @@ typedef struct aom_timing {
typedef struct aom_dec_model_info {
uint32_t num_units_in_decoding_tick;
int encoder_decoder_buffer_delay_length;
- int buffer_removal_delay_length;
- int frame_presentation_delay_length;
+ int buffer_removal_time_length;
+ int frame_presentation_time_length;
} aom_dec_model_info_t;
typedef struct aom_dec_model_op_parameters {
int decoder_model_param_present_flag;
int64_t bitrate;
int64_t buffer_size;
- int decoder_buffer_delay;
- int encoder_buffer_delay;
+ uint32_t decoder_buffer_delay;
+ uint32_t encoder_buffer_delay;
int low_delay_mode_flag;
int display_model_param_present_flag;
int initial_display_delay;
} aom_dec_model_op_parameters_t;
typedef struct aom_op_timing_info_t {
- int64_t buffer_removal_delay;
+ uint32_t buffer_removal_time;
} aom_op_timing_info_t;
void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index cdac90d9e9..f0ab79d0f1 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -466,31 +466,6 @@ static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels,
return mag;
}
-static INLINE int get_nz_count(const uint8_t *const levels, const int bwl,
- const TX_CLASS tx_class) {
- int count;
-
- count = (levels[1] != 0); // { 0, 1 }
- count += (levels[(1 << bwl) + TX_PAD_HOR] != 0); // { 1, 0 }
-
- for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) {
- const int row_offset =
- ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0]
- : ((tx_class == TX_CLASS_VERT)
- ? sig_ref_diff_offset_vert[idx][0]
- : sig_ref_diff_offset_horiz[idx][0]));
- const int col_offset =
- ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1]
- : ((tx_class == TX_CLASS_VERT)
- ? sig_ref_diff_offset_vert[idx][1]
- : sig_ref_diff_offset_horiz[idx][1]));
- const int nb_pos =
- (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset;
- count += (levels[nb_pos] != 0);
- }
- return count;
-}
-
#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D
#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5)
#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10)
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index ae6f076572..412d83ed84 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -92,33 +92,6 @@ static const int error_measure_lut[512] = {
};
/* clang-format on */
-void project_points_affine(const int32_t *mat, int *points, int *proj,
- const int n, const int stride_points,
- const int stride_proj, const int subsampling_x,
- const int subsampling_y) {
- for (int i = 0; i < n; ++i) {
- const int x = *(points++), y = *(points++);
- if (subsampling_x)
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- WARPEDDIFF_PREC_BITS + 1);
- else
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
- WARPEDDIFF_PREC_BITS);
- if (subsampling_y)
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- WARPEDDIFF_PREC_BITS + 1);
- else
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
- WARPEDDIFF_PREC_BITS);
- points += stride_points - 2;
- proj += stride_proj - 2;
- }
-}
-
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index f5da36bbbc..ce4032ee52 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -68,11 +68,6 @@ static const uint8_t warp_pad_right[14][16] = {
{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
};
-void project_points_affine(const int32_t *mat, int *points, int *proj,
- const int n, const int stride_points,
- const int stride_proj, const int subsampling_x,
- const int subsampling_y);
-
// Returns the error between the result of applying motion 'wm' to the frame
// described by 'ref' and the frame described by 'dst'.
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 6747cae01b..0c5286f9df 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -39,7 +39,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
// Load the filter coefficients
const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
@@ -140,7 +140,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
int x;
@@ -232,8 +232,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
}
void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
@@ -278,7 +278,7 @@ static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
// Load the filter coefficients
const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
@@ -372,7 +372,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
int x;
@@ -472,8 +472,8 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
void av1_highbd_convolve_2d_scale_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params, int bd) {
// TODO(yaowu): Move this out of stack
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index 7415c58df3..ae331b40da 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -19,49 +19,47 @@
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x1[0], x1[3]);
- btf_16_adds_subs_avx2(x1[1], x1[2]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]);
-
- btf_16_adds_subs_avx2(x1[8], x1[11]);
- btf_16_adds_subs_avx2(x1[9], x1[10]);
- btf_16_subs_adds_avx2(x1[15], x1[12]);
- btf_16_subs_adds_avx2(x1[14], x1[13]);
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
}
static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
}
static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
- btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]);
- btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]);
- btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]);
- btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]);
- btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]);
- btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]);
- btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]);
- btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]);
+ btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
}
static void idct16_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
__m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
__m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
@@ -103,29 +101,29 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output,
x1[15] = input[15];
// stage 2
- btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
- btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
- btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
// stage 3
- btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
- btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
- btf_16_adds_subs_avx2(x1[8], x1[9]);
- btf_16_subs_adds_avx2(x1[11], x1[10]);
- btf_16_adds_subs_avx2(x1[12], x1[13]);
- btf_16_subs_adds_avx2(x1[15], x1[14]);
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
// stage 4
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
- btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
- btf_16_adds_subs_avx2(x1[4], x1[5]);
- btf_16_subs_adds_avx2(x1[7], x1[6]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
-
- idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
- idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage6_avx2(x1, cospi, _r, cos_bit);
idct16_stage7_avx2(output, x1);
}
@@ -133,7 +131,7 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
@@ -159,21 +157,21 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
// stage 3
btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
- btf_16_adds_subs_avx2(x1[8], x1[9]);
- btf_16_subs_adds_avx2(x1[11], x1[10]);
- btf_16_adds_subs_avx2(x1[12], x1[13]);
- btf_16_subs_adds_avx2(x1[15], x1[14]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
// stage 4
btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
- btf_16_adds_subs_avx2(x1[4], x1[5]);
- btf_16_subs_adds_avx2(x1[7], x1[6]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
- idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
- idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage6_avx2(x1, cospi, _r, cos_bit);
idct16_stage7_avx2(output, x1);
}
@@ -212,74 +210,71 @@ static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
}
static INLINE void iadst16_stage3_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[0], x[8]);
- btf_16_adds_subs_avx2(x[1], x[9]);
- btf_16_adds_subs_avx2(x[2], x[10]);
- btf_16_adds_subs_avx2(x[3], x[11]);
- btf_16_adds_subs_avx2(x[4], x[12]);
- btf_16_adds_subs_avx2(x[5], x[13]);
- btf_16_adds_subs_avx2(x[6], x[14]);
- btf_16_adds_subs_avx2(x[7], x[15]);
+ btf_16_adds_subs_avx2(&x[0], &x[8]);
+ btf_16_adds_subs_avx2(&x[1], &x[9]);
+ btf_16_adds_subs_avx2(&x[2], &x[10]);
+ btf_16_adds_subs_avx2(&x[3], &x[11]);
+ btf_16_adds_subs_avx2(&x[4], &x[12]);
+ btf_16_adds_subs_avx2(&x[5], &x[13]);
+ btf_16_adds_subs_avx2(&x[6], &x[14]);
+ btf_16_adds_subs_avx2(&x[7], &x[15]);
}
static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
- btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
- btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
- btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
- btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
}
static INLINE void iadst16_stage5_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[0], x[4]);
- btf_16_adds_subs_avx2(x[1], x[5]);
- btf_16_adds_subs_avx2(x[2], x[6]);
- btf_16_adds_subs_avx2(x[3], x[7]);
- btf_16_adds_subs_avx2(x[8], x[12]);
- btf_16_adds_subs_avx2(x[9], x[13]);
- btf_16_adds_subs_avx2(x[10], x[14]);
- btf_16_adds_subs_avx2(x[11], x[15]);
+ btf_16_adds_subs_avx2(&x[0], &x[4]);
+ btf_16_adds_subs_avx2(&x[1], &x[5]);
+ btf_16_adds_subs_avx2(&x[2], &x[6]);
+ btf_16_adds_subs_avx2(&x[3], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[12]);
+ btf_16_adds_subs_avx2(&x[9], &x[13]);
+ btf_16_adds_subs_avx2(&x[10], &x[14]);
+ btf_16_adds_subs_avx2(&x[11], &x[15]);
}
static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
- btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
- btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
}
static INLINE void iadst16_stage7_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[0], x[2]);
- btf_16_adds_subs_avx2(x[1], x[3]);
- btf_16_adds_subs_avx2(x[4], x[6]);
- btf_16_adds_subs_avx2(x[5], x[7]);
- btf_16_adds_subs_avx2(x[8], x[10]);
- btf_16_adds_subs_avx2(x[9], x[11]);
- btf_16_adds_subs_avx2(x[12], x[14]);
- btf_16_adds_subs_avx2(x[13], x[15]);
+ btf_16_adds_subs_avx2(&x[0], &x[2]);
+ btf_16_adds_subs_avx2(&x[1], &x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[6]);
+ btf_16_adds_subs_avx2(&x[5], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[10]);
+ btf_16_adds_subs_avx2(&x[9], &x[11]);
+ btf_16_adds_subs_avx2(&x[12], &x[14]);
+ btf_16_adds_subs_avx2(&x[13], &x[15]);
}
static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
}
static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
@@ -307,7 +302,7 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
__m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
__m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
@@ -346,21 +341,21 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
x1[15] = input[14];
// stage 2
- btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]);
- btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]);
- btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]);
- btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]);
- btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]);
- btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]);
- btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]);
- btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]);
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
iadst16_stage3_avx2(x1);
- iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
iadst16_stage5_avx2(x1);
- iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
iadst16_stage7_avx2(x1);
- iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
iadst16_stage9_avx2(output, x1);
}
@@ -368,7 +363,7 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
// stage 1
__m256i x1[16];
@@ -392,11 +387,11 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
iadst16_stage3_avx2(x1);
- iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
iadst16_stage5_avx2(x1);
- iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
iadst16_stage7_avx2(x1);
- iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
iadst16_stage9_avx2(output, x1);
}
@@ -404,7 +399,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
@@ -423,7 +418,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
x1[9] = x1[1];
// stage 4
- btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
// stage 5
x1[4] = x1[0];
@@ -433,8 +428,8 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
x1[13] = x1[9];
// stage 6
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]);
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
// stage 7
x1[2] = x1[0];
@@ -446,130 +441,125 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
x1[14] = x1[12];
x1[15] = x1[13];
- iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
iadst16_stage9_avx2(output, x1);
}
static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[16], x[17]);
- btf_16_subs_adds_avx2(x[19], x[18]);
- btf_16_adds_subs_avx2(x[20], x[21]);
- btf_16_subs_adds_avx2(x[23], x[22]);
- btf_16_adds_subs_avx2(x[24], x[25]);
- btf_16_subs_adds_avx2(x[27], x[26]);
- btf_16_adds_subs_avx2(x[28], x[29]);
- btf_16_subs_adds_avx2(x[31], x[30]);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
}
static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
}
static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
- btf_16_adds_subs_avx2(x[16], x[19]);
- btf_16_adds_subs_avx2(x[17], x[18]);
- btf_16_subs_adds_avx2(x[23], x[20]);
- btf_16_subs_adds_avx2(x[22], x[21]);
- btf_16_adds_subs_avx2(x[24], x[27]);
- btf_16_adds_subs_avx2(x[25], x[26]);
- btf_16_subs_adds_avx2(x[31], x[28]);
- btf_16_subs_adds_avx2(x[30], x[29]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
}
static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[11]);
- btf_16_adds_subs_avx2(x[9], x[10]);
- btf_16_subs_adds_avx2(x[15], x[12]);
- btf_16_subs_adds_avx2(x[14], x[13]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
}
static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- btf_16_adds_subs_avx2(x[16], x[23]);
- btf_16_adds_subs_avx2(x[17], x[22]);
- btf_16_adds_subs_avx2(x[18], x[21]);
- btf_16_adds_subs_avx2(x[19], x[20]);
- btf_16_subs_adds_avx2(x[31], x[24]);
- btf_16_subs_adds_avx2(x[30], x[25]);
- btf_16_subs_adds_avx2(x[29], x[26]);
- btf_16_subs_adds_avx2(x[28], x[27]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
}
static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[15]);
- btf_16_adds_subs_avx2(x[1], x[14]);
- btf_16_adds_subs_avx2(x[2], x[13]);
- btf_16_adds_subs_avx2(x[3], x[12]);
- btf_16_adds_subs_avx2(x[4], x[11]);
- btf_16_adds_subs_avx2(x[5], x[10]);
- btf_16_adds_subs_avx2(x[6], x[9]);
- btf_16_adds_subs_avx2(x[7], x[8]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
}
static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
- btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]);
- btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]);
- btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]);
- btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]);
- btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]);
- btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]);
- btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]);
- btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]);
- btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]);
- btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]);
- btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]);
- btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]);
- btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]);
- btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]);
- btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]);
- btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]);
+ btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
}
static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
@@ -629,7 +619,7 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
// stage 1
__m256i x[32];
@@ -666,20 +656,20 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
x[10] = x[11];
x[13] = x[12];
x[14] = x[15];
- idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+ idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
x[5] = x[4];
x[6] = x[7];
- idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+ idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
// stage 6
x[3] = x[0];
x[2] = x[1];
- idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+ idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
- idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
- idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage7_avx2(x, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x, cospi, _r, cos_bit);
idct32_stage9_avx2(output, x);
}
@@ -687,7 +677,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
// stage 1
__m256i x[32];
@@ -728,25 +718,25 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
// stage 4
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[9]);
- btf_16_subs_adds_avx2(x[11], x[10]);
- btf_16_adds_subs_avx2(x[12], x[13]);
- btf_16_subs_adds_avx2(x[15], x[14]);
- idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
- btf_16_adds_subs_avx2(x[4], x[5]);
- btf_16_subs_adds_avx2(x[7], x[6]);
- idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
- btf_16_adds_subs_avx2(x[0], x[3]);
- btf_16_adds_subs_avx2(x[1], x[2]);
- idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
- idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
- idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage7_avx2(x, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x, cospi, _r, cos_bit);
idct32_stage9_avx2(output, x);
}
@@ -754,7 +744,7 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
__m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
__m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
@@ -825,51 +815,50 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output,
x1[31] = input[31];
// stage 2
- btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]);
- btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]);
- btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]);
- btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]);
- btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]);
- btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]);
- btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]);
- btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]);
+ btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);
// stage 3
- btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
- btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
- btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
idct32_high16_stage3_avx2(x1);
// stage 4
- btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
- btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
- btf_16_adds_subs_avx2(x1[8], x1[9]);
- btf_16_subs_adds_avx2(x1[11], x1[10]);
- btf_16_adds_subs_avx2(x1[12], x1[13]);
- btf_16_subs_adds_avx2(x1[15], x1[14]);
- idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);
// stage 5
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
- btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
- btf_16_adds_subs_avx2(x1[4], x1[5]);
- btf_16_subs_adds_avx2(x1[7], x1[6]);
- idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);
// stage 6
- btf_16_adds_subs_avx2(x1[0], x1[3]);
- btf_16_adds_subs_avx2(x1[1], x1[2]);
- idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);
- idct32_stage7_avx2(x1, cospi, __rounding, cos_bit);
- idct32_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ idct32_stage7_avx2(x1, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x1, cospi, _r, cos_bit);
idct32_stage9_avx2(output, x1);
}
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
@@ -883,19 +872,18 @@ static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
- btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
- btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
- btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
- btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
- btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
- btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}
static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
@@ -903,31 +891,30 @@ static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
- btf_16_adds_subs_avx2(x[32], x[35]);
- btf_16_adds_subs_avx2(x[33], x[34]);
- btf_16_subs_adds_avx2(x[39], x[36]);
- btf_16_subs_adds_avx2(x[38], x[37]);
- btf_16_adds_subs_avx2(x[40], x[43]);
- btf_16_adds_subs_avx2(x[41], x[42]);
- btf_16_subs_adds_avx2(x[47], x[44]);
- btf_16_subs_adds_avx2(x[46], x[45]);
- btf_16_adds_subs_avx2(x[48], x[51]);
- btf_16_adds_subs_avx2(x[49], x[50]);
- btf_16_subs_adds_avx2(x[55], x[52]);
- btf_16_subs_adds_avx2(x[54], x[53]);
- btf_16_adds_subs_avx2(x[56], x[59]);
- btf_16_adds_subs_avx2(x[57], x[58]);
- btf_16_subs_adds_avx2(x[63], x[60]);
- btf_16_subs_adds_avx2(x[62], x[61]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[35]);
+ btf_16_adds_subs_avx2(&x[33], &x[34]);
+ btf_16_adds_subs_avx2(&x[39], &x[36]);
+ btf_16_adds_subs_avx2(&x[38], &x[37]);
+ btf_16_adds_subs_avx2(&x[40], &x[43]);
+ btf_16_adds_subs_avx2(&x[41], &x[42]);
+ btf_16_adds_subs_avx2(&x[47], &x[44]);
+ btf_16_adds_subs_avx2(&x[46], &x[45]);
+ btf_16_adds_subs_avx2(&x[48], &x[51]);
+ btf_16_adds_subs_avx2(&x[49], &x[50]);
+ btf_16_adds_subs_avx2(&x[55], &x[52]);
+ btf_16_adds_subs_avx2(&x[54], &x[53]);
+ btf_16_adds_subs_avx2(&x[56], &x[59]);
+ btf_16_adds_subs_avx2(&x[57], &x[58]);
+ btf_16_adds_subs_avx2(&x[63], &x[60]);
+ btf_16_adds_subs_avx2(&x[62], &x[61]);
}
static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
@@ -935,185 +922,180 @@ static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}
static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
- btf_16_adds_subs_avx2(x[16], x[19]);
- btf_16_adds_subs_avx2(x[17], x[18]);
- btf_16_subs_adds_avx2(x[23], x[20]);
- btf_16_subs_adds_avx2(x[22], x[21]);
- btf_16_adds_subs_avx2(x[24], x[27]);
- btf_16_adds_subs_avx2(x[25], x[26]);
- btf_16_subs_adds_avx2(x[31], x[28]);
- btf_16_subs_adds_avx2(x[30], x[29]);
- idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+ const __m256i _r, int8_t cos_bit) {
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}
static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
- btf_16_adds_subs_avx2(x[32], x[39]);
- btf_16_adds_subs_avx2(x[33], x[38]);
- btf_16_adds_subs_avx2(x[34], x[37]);
- btf_16_adds_subs_avx2(x[35], x[36]);
- btf_16_subs_adds_avx2(x[47], x[40]);
- btf_16_subs_adds_avx2(x[46], x[41]);
- btf_16_subs_adds_avx2(x[45], x[42]);
- btf_16_subs_adds_avx2(x[44], x[43]);
- btf_16_adds_subs_avx2(x[48], x[55]);
- btf_16_adds_subs_avx2(x[49], x[54]);
- btf_16_adds_subs_avx2(x[50], x[53]);
- btf_16_adds_subs_avx2(x[51], x[52]);
- btf_16_subs_adds_avx2(x[63], x[56]);
- btf_16_subs_adds_avx2(x[62], x[57]);
- btf_16_subs_adds_avx2(x[61], x[58]);
- btf_16_subs_adds_avx2(x[60], x[59]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[39]);
+ btf_16_adds_subs_avx2(&x[33], &x[38]);
+ btf_16_adds_subs_avx2(&x[34], &x[37]);
+ btf_16_adds_subs_avx2(&x[35], &x[36]);
+ btf_16_adds_subs_avx2(&x[47], &x[40]);
+ btf_16_adds_subs_avx2(&x[46], &x[41]);
+ btf_16_adds_subs_avx2(&x[45], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[43]);
+ btf_16_adds_subs_avx2(&x[48], &x[55]);
+ btf_16_adds_subs_avx2(&x[49], &x[54]);
+ btf_16_adds_subs_avx2(&x[50], &x[53]);
+ btf_16_adds_subs_avx2(&x[51], &x[52]);
+ btf_16_adds_subs_avx2(&x[63], &x[56]);
+ btf_16_adds_subs_avx2(&x[62], &x[57]);
+ btf_16_adds_subs_avx2(&x[61], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[59]);
}
static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_adds_subs_avx2(x[16], x[23]);
- btf_16_adds_subs_avx2(x[17], x[22]);
- btf_16_adds_subs_avx2(x[18], x[21]);
- btf_16_adds_subs_avx2(x[19], x[20]);
- btf_16_subs_adds_avx2(x[31], x[24]);
- btf_16_subs_adds_avx2(x[30], x[25]);
- btf_16_subs_adds_avx2(x[29], x[26]);
- btf_16_subs_adds_avx2(x[28], x[27]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}
static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[15]);
- btf_16_adds_subs_avx2(x[1], x[14]);
- btf_16_adds_subs_avx2(x[2], x[13]);
- btf_16_adds_subs_avx2(x[3], x[12]);
- btf_16_adds_subs_avx2(x[4], x[11]);
- btf_16_adds_subs_avx2(x[5], x[10]);
- btf_16_adds_subs_avx2(x[6], x[9]);
- btf_16_adds_subs_avx2(x[7], x[8]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
- btf_16_adds_subs_avx2(x[32], x[47]);
- btf_16_adds_subs_avx2(x[33], x[46]);
- btf_16_adds_subs_avx2(x[34], x[45]);
- btf_16_adds_subs_avx2(x[35], x[44]);
- btf_16_adds_subs_avx2(x[36], x[43]);
- btf_16_adds_subs_avx2(x[37], x[42]);
- btf_16_adds_subs_avx2(x[38], x[41]);
- btf_16_adds_subs_avx2(x[39], x[40]);
- btf_16_subs_adds_avx2(x[63], x[48]);
- btf_16_subs_adds_avx2(x[62], x[49]);
- btf_16_subs_adds_avx2(x[61], x[50]);
- btf_16_subs_adds_avx2(x[60], x[51]);
- btf_16_subs_adds_avx2(x[59], x[52]);
- btf_16_subs_adds_avx2(x[58], x[53]);
- btf_16_subs_adds_avx2(x[57], x[54]);
- btf_16_subs_adds_avx2(x[56], x[55]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[47]);
+ btf_16_adds_subs_avx2(&x[33], &x[46]);
+ btf_16_adds_subs_avx2(&x[34], &x[45]);
+ btf_16_adds_subs_avx2(&x[35], &x[44]);
+ btf_16_adds_subs_avx2(&x[36], &x[43]);
+ btf_16_adds_subs_avx2(&x[37], &x[42]);
+ btf_16_adds_subs_avx2(&x[38], &x[41]);
+ btf_16_adds_subs_avx2(&x[39], &x[40]);
+ btf_16_adds_subs_avx2(&x[63], &x[48]);
+ btf_16_adds_subs_avx2(&x[62], &x[49]);
+ btf_16_adds_subs_avx2(&x[61], &x[50]);
+ btf_16_adds_subs_avx2(&x[60], &x[51]);
+ btf_16_adds_subs_avx2(&x[59], &x[52]);
+ btf_16_adds_subs_avx2(&x[58], &x[53]);
+ btf_16_adds_subs_avx2(&x[57], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[55]);
}
static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[31]);
- btf_16_adds_subs_avx2(x[1], x[30]);
- btf_16_adds_subs_avx2(x[2], x[29]);
- btf_16_adds_subs_avx2(x[3], x[28]);
- btf_16_adds_subs_avx2(x[4], x[27]);
- btf_16_adds_subs_avx2(x[5], x[26]);
- btf_16_adds_subs_avx2(x[6], x[25]);
- btf_16_adds_subs_avx2(x[7], x[24]);
- btf_16_adds_subs_avx2(x[8], x[23]);
- btf_16_adds_subs_avx2(x[9], x[22]);
- btf_16_adds_subs_avx2(x[10], x[21]);
- btf_16_adds_subs_avx2(x[11], x[20]);
- btf_16_adds_subs_avx2(x[12], x[19]);
- btf_16_adds_subs_avx2(x[13], x[18]);
- btf_16_adds_subs_avx2(x[14], x[17]);
- btf_16_adds_subs_avx2(x[15], x[16]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+ btf_16_adds_subs_avx2(&x[0], &x[31]);
+ btf_16_adds_subs_avx2(&x[1], &x[30]);
+ btf_16_adds_subs_avx2(&x[2], &x[29]);
+ btf_16_adds_subs_avx2(&x[3], &x[28]);
+ btf_16_adds_subs_avx2(&x[4], &x[27]);
+ btf_16_adds_subs_avx2(&x[5], &x[26]);
+ btf_16_adds_subs_avx2(&x[6], &x[25]);
+ btf_16_adds_subs_avx2(&x[7], &x[24]);
+ btf_16_adds_subs_avx2(&x[8], &x[23]);
+ btf_16_adds_subs_avx2(&x[9], &x[22]);
+ btf_16_adds_subs_avx2(&x[10], &x[21]);
+ btf_16_adds_subs_avx2(&x[11], &x[20]);
+ btf_16_adds_subs_avx2(&x[12], &x[19]);
+ btf_16_adds_subs_avx2(&x[13], &x[18]);
+ btf_16_adds_subs_avx2(&x[14], &x[17]);
+ btf_16_adds_subs_avx2(&x[15], &x[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}
static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
- btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]);
- btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]);
- btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]);
- btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]);
- btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]);
- btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]);
- btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]);
- btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]);
- btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]);
- btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]);
- btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]);
- btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]);
- btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]);
- btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]);
- btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]);
- btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]);
- btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]);
- btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]);
- btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]);
- btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]);
- btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]);
- btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]);
- btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]);
- btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]);
- btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]);
- btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]);
- btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]);
- btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]);
- btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]);
- btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]);
- btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]);
- btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]);
+ btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
+ btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
+ btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
+ btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
+ btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
+ btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
+ btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
+ btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
+ btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
+ btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
+ btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
+ btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
+ btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
+ btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
+ btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
+ btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
+ btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}
static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
@@ -1207,7 +1189,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
@@ -1260,16 +1242,16 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[22] = x[23];
x[25] = x[24];
x[30] = x[31];
- btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
- btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
- btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
// stage 5
x[9] = x[8];
x[14] = x[15];
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
x[35] = x[32];
x[34] = x[33];
x[36] = x[39];
@@ -1289,7 +1271,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
// stage 6
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
x[19] = x[16];
x[18] = x[17];
x[20] = x[23];
@@ -1298,7 +1280,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[26] = x[25];
x[28] = x[31];
x[29] = x[30];
- idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
// stage 7
x[3] = x[0];
@@ -1307,7 +1289,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[10] = x[9];
x[12] = x[15];
x[13] = x[14];
- idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
// stage 8
x[7] = x[0];
@@ -1315,12 +1297,12 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[5] = x[2];
x[4] = x[3];
x[9] = x[9];
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
- idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
- idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
idct64_stage11_avx2(output, x);
}
@@ -1328,7 +1310,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
@@ -1398,7 +1380,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
x[26] = x[27];
x[29] = x[28];
x[30] = x[31];
- idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
@@ -1406,37 +1388,37 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
x[10] = x[11];
x[13] = x[12];
x[14] = x[15];
- idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
// stage 6
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
x[5] = x[4];
x[6] = x[7];
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
- idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
// stage 7
x[3] = x[0];
x[2] = x[1];
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[11]);
- btf_16_adds_subs_avx2(x[9], x[10]);
- btf_16_subs_adds_avx2(x[15], x[12]);
- btf_16_subs_adds_avx2(x[14], x[13]);
- idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
// stage 8
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
-
- idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
- idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
idct64_stage11_avx2(output, x);
}
@@ -1444,7 +1426,7 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
@@ -1514,78 +1496,78 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
- btf_16_adds_subs_avx2(x[32], x[33]);
- btf_16_subs_adds_avx2(x[35], x[34]);
- btf_16_adds_subs_avx2(x[36], x[37]);
- btf_16_subs_adds_avx2(x[39], x[38]);
- btf_16_adds_subs_avx2(x[40], x[41]);
- btf_16_subs_adds_avx2(x[43], x[42]);
- btf_16_adds_subs_avx2(x[44], x[45]);
- btf_16_subs_adds_avx2(x[47], x[46]);
- btf_16_adds_subs_avx2(x[48], x[49]);
- btf_16_subs_adds_avx2(x[51], x[50]);
- btf_16_adds_subs_avx2(x[52], x[53]);
- btf_16_subs_adds_avx2(x[55], x[54]);
- btf_16_adds_subs_avx2(x[56], x[57]);
- btf_16_subs_adds_avx2(x[59], x[58]);
- btf_16_adds_subs_avx2(x[60], x[61]);
- btf_16_subs_adds_avx2(x[63], x[62]);
+ btf_16_adds_subs_avx2(&x[32], &x[33]);
+ btf_16_adds_subs_avx2(&x[35], &x[34]);
+ btf_16_adds_subs_avx2(&x[36], &x[37]);
+ btf_16_adds_subs_avx2(&x[39], &x[38]);
+ btf_16_adds_subs_avx2(&x[40], &x[41]);
+ btf_16_adds_subs_avx2(&x[43], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[45]);
+ btf_16_adds_subs_avx2(&x[47], &x[46]);
+ btf_16_adds_subs_avx2(&x[48], &x[49]);
+ btf_16_adds_subs_avx2(&x[51], &x[50]);
+ btf_16_adds_subs_avx2(&x[52], &x[53]);
+ btf_16_adds_subs_avx2(&x[55], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[57]);
+ btf_16_adds_subs_avx2(&x[59], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[61]);
+ btf_16_adds_subs_avx2(&x[63], &x[62]);
// stage 4
btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
- btf_16_adds_subs_avx2(x[16], x[17]);
- btf_16_subs_adds_avx2(x[19], x[18]);
- btf_16_adds_subs_avx2(x[20], x[21]);
- btf_16_subs_adds_avx2(x[23], x[22]);
- btf_16_adds_subs_avx2(x[24], x[25]);
- btf_16_subs_adds_avx2(x[27], x[26]);
- btf_16_adds_subs_avx2(x[28], x[29]);
- btf_16_subs_adds_avx2(x[31], x[30]);
- idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[9]);
- btf_16_subs_adds_avx2(x[11], x[10]);
- btf_16_adds_subs_avx2(x[12], x[13]);
- btf_16_subs_adds_avx2(x[15], x[14]);
- idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
// stage 6
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
- btf_16_adds_subs_avx2(x[4], x[5]);
- btf_16_subs_adds_avx2(x[7], x[6]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
- idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
// stage 7
- btf_16_adds_subs_avx2(x[0], x[3]);
- btf_16_adds_subs_avx2(x[1], x[2]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[11]);
- btf_16_adds_subs_avx2(x[9], x[10]);
- btf_16_subs_adds_avx2(x[15], x[12]);
- btf_16_subs_adds_avx2(x[14], x[13]);
- idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
// stage 8
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
// stage 9~11
- idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
- idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
idct64_stage11_avx2(output, x);
}
@@ -1667,7 +1649,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
if (lr_flip) {
for (int j = 0; j < buf_size_w_div16; ++j) {
__m256i temp[16];
- flip_buf_av2(buf0 + 16 * j, temp, 16);
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
}
@@ -1693,18 +1675,18 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
int txw_idx, int rect_type) {
const int32_t *input_row = input;
const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
- const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
- (1 << (NewSqrt2Bits - shift - 1)));
+ const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
const __m256i one = _mm256_set1_epi16(1);
- const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding);
+ const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
if (rect_type != 1 && rect_type != -1) {
for (int i = 0; i < height; ++i) {
const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
input_row += stride;
__m256i lo = _mm256_unpacklo_epi16(src, one);
__m256i hi = _mm256_unpackhi_epi16(src, one);
- lo = _mm256_madd_epi16(lo, scale_rounding);
- hi = _mm256_madd_epi16(hi, scale_rounding);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
out[i] = _mm256_packs_epi32(lo, hi);
@@ -1718,8 +1700,8 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
input_row += stride;
__m256i lo = _mm256_unpacklo_epi16(src, one);
__m256i hi = _mm256_unpackhi_epi16(src, one);
- lo = _mm256_madd_epi16(lo, scale_rounding);
- hi = _mm256_madd_epi16(hi, scale_rounding);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
out[i] = _mm256_packs_epi32(lo, hi);
@@ -1731,10 +1713,10 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
__m256i *buf, int shift, int height,
int txh_idx) {
const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
- const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
- const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1));
+ const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
const __m256i one = _mm256_set1_epi16(1);
- const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding);
+ const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
for (int h = 0; h < height; ++h) {
__m256i lo = _mm256_unpacklo_epi16(buf[h], one);
__m256i hi = _mm256_unpackhi_epi16(buf[h], one);
@@ -1742,8 +1724,8 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
hi = _mm256_madd_epi16(hi, scale_coeff);
lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
- lo = _mm256_add_epi32(lo, shift_rounding);
- hi = _mm256_add_epi32(hi, shift_rounding);
+ lo = _mm256_add_epi32(lo, shift__r);
+ hi = _mm256_add_epi32(hi, shift__r);
lo = _mm256_srai_epi32(lo, -shift);
hi = _mm256_srai_epi32(hi, -shift);
const __m256i x = _mm256_packs_epi32(lo, hi);
@@ -1856,7 +1838,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
if (lr_flip) {
for (int j = 0; j < buf_size_w_div16; ++j) {
__m256i temp[16];
- flip_buf_av2(buf0 + 16 * j, temp, 16);
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
transpose_16bit_16x16_avx2(temp,
_buf1 + 16 * (buf_size_w_div16 - 1 - j));
}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index c17f655c55..7b5b29cf8b 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -19,37 +19,12 @@
#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
#ifdef __cplusplus
extern "C" {
#endif
-#define pair_set_w16_epi16(a, b) \
- _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
-
-#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \
- { \
- __m256i t0 = _mm256_unpacklo_epi16(in0, in1); \
- __m256i t1 = _mm256_unpackhi_epi16(in0, in1); \
- __m256i u0 = _mm256_madd_epi16(t0, w0); \
- __m256i u1 = _mm256_madd_epi16(t1, w0); \
- __m256i v0 = _mm256_madd_epi16(t0, w1); \
- __m256i v1 = _mm256_madd_epi16(t1, w1); \
- \
- __m256i a0 = _mm256_add_epi32(u0, __rounding); \
- __m256i a1 = _mm256_add_epi32(u1, __rounding); \
- __m256i b0 = _mm256_add_epi32(v0, __rounding); \
- __m256i b1 = _mm256_add_epi32(v1, __rounding); \
- \
- __m256i c0 = _mm256_srai_epi32(a0, cos_bit); \
- __m256i c1 = _mm256_srai_epi32(a1, cos_bit); \
- __m256i d0 = _mm256_srai_epi32(b0, cos_bit); \
- __m256i d1 = _mm256_srai_epi32(b1, cos_bit); \
- \
- out0 = _mm256_packs_epi32(c0, c1); \
- out1 = _mm256_packs_epi32(d0, d1); \
- }
-
// half input is zero
#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \
{ \
@@ -60,111 +35,6 @@ extern "C" {
out1 = _mm256_mulhrs_epi16(_in, _w1); \
}
-#define btf_16_adds_subs_avx2(in0, in1) \
- { \
- const __m256i _in0 = in0; \
- const __m256i _in1 = in1; \
- in0 = _mm256_adds_epi16(_in0, _in1); \
- in1 = _mm256_subs_epi16(_in0, _in1); \
- }
-
-#define btf_16_subs_adds_avx2(in0, in1) \
- { \
- const __m256i _in0 = in0; \
- const __m256i _in1 = in1; \
- in1 = _mm256_subs_epi16(_in0, _in1); \
- in0 = _mm256_adds_epi16(_in0, _in1); \
- }
-
-#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \
- { \
- const __m256i _in0 = in0; \
- const __m256i _in1 = in1; \
- out0 = _mm256_adds_epi16(_in0, _in1); \
- out1 = _mm256_subs_epi16(_in0, _in1); \
- }
-
-static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
- const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
- const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
- return _mm256_permute4x64_epi64(b, 0xD8);
-}
-
-static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
- int stride, __m256i *out,
- int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
- }
-}
-
-static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
- __m256i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f
- // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f
- // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f
- // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f
- // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f
- // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f
- // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f
- // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f
- // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f
- // to:
- // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- // ...
- __m256i a[16];
- for (int i = 0; i < 16; i += 2) {
- a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
- a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
- }
- __m256i b[16];
- for (int i = 0; i < 16; i += 2) {
- b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
- b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
- }
- __m256i c[16];
- for (int i = 0; i < 16; i += 2) {
- c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
- c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
- }
- out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
- out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
- out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
- out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
-
- out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
- out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
- out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
- out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
-
- out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
- out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
- out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
- out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
-
- out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
- out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
- out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
- out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
-}
-
-static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
- if (bit < 0) {
- __m256i scale = _mm256_set1_epi16(1 << (bit + 15));
- for (int i = 0; i < size; ++i) {
- in[i] = _mm256_mulhrs_epi16(in[i], scale);
- }
- } else if (bit > 0) {
- for (int i = 0; i < size; ++i) {
- in[i] = _mm256_slli_epi16(in[i], bit);
- }
- }
-}
-
static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
int size) {
const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
@@ -173,12 +43,6 @@ static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
}
}
-static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) {
- for (int i = 0; i < size; ++i) {
- out[size - i - 1] = in[i];
- }
-}
-
static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
__m128i pred = _mm_loadu_si128((__m128i const *)(output));
__m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
@@ -197,9 +61,6 @@ static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
}
}
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
- int8_t cos_bit);
-
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, TX_SIZE tx_size,
int eob);
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
index cccc62f03a..90b9879cc4 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include "config/aom_dsp_rtcd.h"
#include "av1/common/av1_txfm.h"
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index faf7251fad..367e02096e 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#ifndef AV1_TXFM_SSE4_H_
#define AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index fd5e90a2eb..1099144fed 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -24,8 +24,8 @@
void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -46,10 +46,10 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
@@ -180,8 +180,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index fc0e654536..637f83cf7f 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -21,8 +21,8 @@
void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -46,7 +46,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -112,7 +112,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -239,8 +239,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
@@ -357,8 +357,8 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
index 6fdfb0954b..0e91ea9475 100644
--- a/third_party/aom/av1/common/x86/convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -19,8 +19,8 @@
void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int i, j;
@@ -176,8 +176,8 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int i, j;
@@ -187,10 +187,10 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i filt[4], coeffs[4];
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index 18fe9ae5a1..f66dee37d3 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -23,7 +23,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
const int subpel_q4,
__m128i *const coeffs /* [4] */) {
const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params, subpel_q4 & SUBPEL_MASK);
+ filter_params, subpel_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
@@ -78,8 +78,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -239,8 +239,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index a34c618d0f..8444ffa930 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -23,8 +23,8 @@
void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4,
const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
@@ -222,8 +222,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
void av1_highbd_convolve_2d_copy_sr_avx2(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
index bdf813fa01..15f8872c18 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -73,8 +73,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
void av1_highbd_convolve_2d_copy_sr_sse2(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index 5d2fc465e0..eb340523a0 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -24,8 +24,8 @@
void av1_highbd_jnt_convolve_2d_copy_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -169,8 +169,8 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
void av1_highbd_jnt_convolve_2d_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
@@ -207,7 +207,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -274,7 +274,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index a9cf6a4d67..33183fdeec 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -20,13 +20,11 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
int im_h = h + filter_params_y->taps - 1;
int im_stride = 8;
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 89d0ecb1ef..608bd88a43 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -25,8 +25,8 @@
void av1_highbd_jnt_convolve_2d_copy_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -224,13 +224,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
}
}
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_2d_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -459,13 +457,11 @@ void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_x_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -628,13 +624,11 @@ void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_y_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
index ccca6b07ae..1a29985b56 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -19,8 +19,8 @@
void av1_highbd_jnt_convolve_y_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -259,8 +259,8 @@ void av1_highbd_jnt_convolve_y_sse4_1(
void av1_highbd_jnt_convolve_x_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index ac1d2c9ca8..d1ea262903 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -23,8 +23,8 @@
void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -53,10 +53,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
assert(bits >= 0);
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
@@ -126,8 +126,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -389,8 +389,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -422,10 +422,10 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
@@ -581,8 +581,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
index 4df7bd42eb..87dc3242e8 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -18,8 +18,8 @@
void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -152,8 +152,8 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
index e4d51ac8d3..822772782b 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -18,8 +18,8 @@
void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -56,7 +56,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -124,7 +124,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index a42c940283..c64150b9db 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include <smmintrin.h>
#include "config/aom_config.h"
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
index e92c6b28c8..6dbc4f3eb4 100644
--- a/third_party/aom/av1/decoder/decodeframe.c
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -84,15 +84,15 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
}
// Use only_chroma = 1 to only set the chroma planes
-static void set_planes_to_neutral_grey(AV1_COMMON *const cm,
+static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
const YV12_BUFFER_CONFIG *const buf,
int only_chroma) {
- const int val = 1 << (cm->bit_depth - 1);
+ const int val = 1 << (seq_params->bit_depth - 1);
for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
const int is_uv = plane > 0;
for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
- if (cm->use_highbitdepth) {
+ if (seq_params->use_highbitdepth) {
// TODO(yaowu): replace this with aom_memset16() for speed
for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) {
uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
@@ -157,16 +157,18 @@ static void inverse_transform_block(MACROBLOCKD *xd, int plane,
memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
}
-static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
- aom_reader *const r, int plane, int row,
- int col, TX_SIZE tx_size) {
+static void read_coeffs_tx_intra_block(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size) {
MB_MODE_INFO *mbmi = xd->mi[0];
if (!mbmi->skip) {
#if TXCOEFF_TIMER
struct aom_usec_timer timer;
aom_usec_timer_start(&timer);
#endif
- av1_read_coeffs_txb_facade(cm, xd, r, row, col, plane, tx_size);
+ av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size);
#if TXCOEFF_TIMER
aom_usec_timer_mark(&timer);
const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
@@ -176,11 +178,38 @@ static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
}
}
-static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
- MACROBLOCKD *const xd,
- aom_reader *const r, int plane,
- int row, int col,
- TX_SIZE tx_size) {
+static void decode_block_void(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size) {
+ (void)cm;
+ (void)xd;
+ (void)r;
+ (void)plane;
+ (void)row;
+ (void)col;
+ (void)tx_size;
+}
+
+static void predict_inter_block_void(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ (void)cm;
+ (void)xd;
+ (void)mi_row;
+ (void)mi_col;
+ (void)bsize;
+}
+
+static void cfl_store_inter_block_void(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd) {
+ (void)cm;
+ (void)xd;
+}
+
+static void predict_and_reconstruct_intra_block(
+ const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col, const TX_SIZE tx_size) {
(void)r;
MB_MODE_INFO *mbmi = xd->mi[0];
PLANE_TYPE plane_type = get_plane_type(plane);
@@ -208,28 +237,33 @@ static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
static void inverse_transform_inter_block(const AV1_COMMON *const cm,
MACROBLOCKD *const xd,
- aom_reader *const r,
+ aom_reader *const r, const int plane,
const int blk_row, const int blk_col,
- const int plane,
const TX_SIZE tx_size) {
(void)r;
PLANE_TYPE plane_type = get_plane_type(plane);
const struct macroblockd_plane *const pd = &xd->plane[plane];
- MB_MODE_INFO *mbmi = xd->mi[0];
// tx_type will be read out in av1_read_coeffs_txb_facade
const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
tx_size, cm->reduced_tx_set_used);
- if (plane == 0)
- update_txk_array(mbmi->txk_type, mbmi->sb_type, blk_row, blk_col, tx_size,
- tx_type);
-
uint8_t *dst =
&pd->dst
.buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
cm->reduced_tx_set_used);
+#if CONFIG_MISMATCH_DEBUG
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, pixel_c,
+ pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#endif
}
static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
@@ -239,11 +273,12 @@ static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
}
-static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
+static void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td,
aom_reader *r, MB_MODE_INFO *const mbmi,
int plane, BLOCK_SIZE plane_bsize,
int blk_row, int blk_col, int block,
TX_SIZE tx_size, int *eob_total) {
+ MACROBLOCKD *const xd = &td->xd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const TX_SIZE plane_tx_size =
plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
@@ -257,30 +292,11 @@ static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
if (tx_size == plane_tx_size || plane) {
-#if TXCOEFF_TIMER
- struct aom_usec_timer timer;
- aom_usec_timer_start(&timer);
-#endif
- av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, plane, tx_size);
-#if TXCOEFF_TIMER
- aom_usec_timer_mark(&timer);
- const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
- cm->txcoeff_timer += elapsed_time;
- ++cm->txb_count;
-#endif
- inverse_transform_inter_block(cm, xd, r, blk_row, blk_col, plane, tx_size);
+ td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+ tx_size);
-#if CONFIG_MISMATCH_DEBUG
- int pixel_c, pixel_r;
- BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
- int blk_w = block_size_wide[bsize];
- int blk_h = block_size_high[bsize];
- mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
- pd->subsampling_x, pd->subsampling_y);
- mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane,
- pixel_c, pixel_r, blk_w, blk_h,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
-#endif
+ td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+ tx_size);
eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
*eob_total += eob_data->eob;
set_cb_buffer_offsets(xd, tx_size, plane);
@@ -301,7 +317,7 @@ static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
- decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
offsetc, block, sub_txs, eob_total);
block += sub_step;
}
@@ -352,6 +368,7 @@ static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
int mi_row, int mi_col, aom_reader *r,
PARTITION_TYPE partition, BLOCK_SIZE bsize) {
AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
const int bw = mi_size_wide[bsize];
const int bh = mi_size_high[bsize];
const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
@@ -363,9 +380,11 @@ static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
xd->mi[0]->partition = partition;
av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
- if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+ if (bsize >= BLOCK_8X8 &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
const BLOCK_SIZE uv_subsize =
- ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+ ss_size_lookup[bsize][seq_params->subsampling_x]
+ [seq_params->subsampling_y];
if (uv_subsize == BLOCK_INVALID)
aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Invalid block size.");
@@ -843,8 +862,8 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
{ xd->plane[0].dst.stride, 0, 0 } };
if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, ctx, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, ctx, 0, bsize);
}
}
@@ -1052,6 +1071,20 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
if (mbmi->motion_mode == OBMC_CAUSAL)
dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#if CONFIG_MISMATCH_DEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
+ pd->subsampling_y);
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+ plane, pixel_c, pixel_r, pd->width, pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
}
static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
@@ -1064,42 +1097,19 @@ static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
}
-static void decode_token_and_recon_block(AV1Decoder *const pbi,
- MACROBLOCKD *const xd, int mi_row,
- int mi_col, aom_reader *r,
- BLOCK_SIZE bsize) {
+static void decode_token_recon_block(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize) {
AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
const int num_planes = av1_num_planes(cm);
- const int bw = mi_size_wide[bsize];
- const int bh = mi_size_high[bsize];
- const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
- const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
- set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
MB_MODE_INFO *mbmi = xd->mi[0];
CFL_CTX *const cfl = &xd->cfl;
cfl->is_chroma_reference = is_chroma_reference(
mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
- if (cm->delta_q_present_flag) {
- for (int i = 0; i < MAX_SEGMENTS; i++) {
- const int current_qindex =
- av1_get_qindex(&cm->seg, i, xd->current_qindex);
- for (int j = 0; j < num_planes; ++j) {
- const int dc_delta_q =
- j == 0 ? cm->y_dc_delta_q
- : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
- const int ac_delta_q =
- j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
- xd->plane[j].seg_dequant_QTX[i][0] =
- av1_dc_quant_QTX(current_qindex, dc_delta_q, cm->bit_depth);
- xd->plane[j].seg_dequant_QTX[i][1] =
- av1_ac_quant_QTX(current_qindex, ac_delta_q, cm->bit_depth);
- }
- }
- }
- if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
-
if (!is_inter_block(mbmi)) {
int row, col;
assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
@@ -1135,10 +1145,10 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
blk_row += stepr) {
for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
blk_col += stepc) {
- read_coeffs_tx_intra_block(cm, xd, r, plane, blk_row, blk_col,
- tx_size);
- predict_and_reconstruct_intra_block(cm, xd, r, plane, blk_row,
- blk_col, tx_size);
+ td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row,
+ blk_col, tx_size);
+ td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row,
+ blk_col, tx_size);
set_cb_buffer_offsets(xd, tx_size, plane);
}
}
@@ -1146,22 +1156,7 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
}
}
} else {
- predict_inter_block(cm, xd, mi_row, mi_col, bsize);
-#if CONFIG_MISMATCH_DEBUG
- for (int plane = 0; plane < num_planes; ++plane) {
- const struct macroblockd_plane *pd = &xd->plane[plane];
- int pixel_c, pixel_r;
- mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
- pd->subsampling_x, pd->subsampling_y);
- if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
- pd->subsampling_y))
- continue;
- mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
- plane, pixel_c, pixel_r, pd->width, pd->height,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
- }
-#endif
-
+ td->predict_inter_block_visit(cm, xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip) {
int eobtotal = 0;
@@ -1213,7 +1208,7 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
blk_row += bh_var_tx) {
for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
blk_col += bw_var_tx) {
- decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize,
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
blk_row, blk_col, block, max_tx_size,
&eobtotal);
block += step;
@@ -1223,14 +1218,11 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
}
}
}
- cfl_store_inter_block(cm, xd);
+ td->cfl_store_inter_block_visit(cm, xd);
}
av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
set_color_index_map_offset);
-
- int reader_corrupted_flag = aom_reader_has_error(r);
- aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
}
static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
@@ -1338,15 +1330,17 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
}
}
-static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
- int mi_row, int mi_col, aom_reader *r,
- PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &td->xd;
decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
av1_decode_palette_tokens);
AV1_COMMON *cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
MB_MODE_INFO *mbmi = xd->mi[0];
int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
@@ -1368,7 +1362,63 @@ static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
mbmi->skip && is_inter_block(mbmi), xd);
}
- decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
+ if (cm->delta_q_present_flag) {
+ for (int i = 0; i < MAX_SEGMENTS; i++) {
+ const int current_qindex =
+ av1_get_qindex(&cm->seg, i, xd->current_qindex);
+ for (int j = 0; j < num_planes; ++j) {
+ const int dc_delta_q =
+ j == 0 ? cm->y_dc_delta_q
+ : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
+ const int ac_delta_q =
+ j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
+ xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
+ current_qindex, dc_delta_q, cm->seq_params.bit_depth);
+ xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
+ current_qindex, ac_delta_q, cm->seq_params.bit_depth);
+ }
+ }
+ }
+ if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+
+ decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+
+ int reader_corrupted_flag = aom_reader_has_error(r);
+ aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+}
+
+static void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int num_planes = av1_num_planes(cm);
+
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const TileInfo *const tile = &xd->tile;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->cfl.mi_row = mi_row;
+ xd->cfl.mi_col = mi_col;
+
+ set_plane_n4(xd, bw, bh, num_planes);
+
+ // Distance of Mb to the various image edges. These are specified to 8th pel
+ // as they are always compared to values that are in 1/8th pel units
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+}
+
+static void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+ (void)partition;
+ set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
+ decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
}
static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1401,10 +1451,11 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
}
// TODO(slavarnway): eliminate bsize and subsize in future commits
-static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+static void decode_partition(AV1Decoder *const pbi, ThreadData *const td,
int mi_row, int mi_col, aom_reader *r,
- BLOCK_SIZE bsize) {
+ BLOCK_SIZE bsize, int parse_decode_flag) {
AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
const int bw = mi_size_wide[bsize];
const int hbs = bw >> 1;
PARTITION_TYPE partition;
@@ -1416,25 +1467,36 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- const int num_planes = av1_num_planes(cm);
- for (int plane = 0; plane < num_planes; ++plane) {
- int rcol0, rcol1, rrow0, rrow1, tile_tl_idx;
- if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
- &rcol0, &rcol1, &rrow0, &rrow1,
- &tile_tl_idx)) {
- const int rstride = cm->rst_info[plane].horz_units_per_tile;
- for (int rrow = rrow0; rrow < rrow1; ++rrow) {
- for (int rcol = rcol0; rcol < rcol1; ++rcol) {
- const int runit_idx = tile_tl_idx + rcol + rrow * rstride;
- loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+ // parse_decode_flag takes the following values :
+ // 01 - do parse only
+ // 10 - do decode only
+ // 11 - do parse and decode
+ static const block_visitor_fn_t block_visit[4] = {
+ NULL, parse_decode_block, decode_block, parse_decode_block
+ };
+
+ if (parse_decode_flag & 1) {
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+ }
}
}
}
- }
- partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
- : read_partition(xd, mi_row, mi_col, r,
- has_rows, has_cols, bsize);
+ partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
+ : read_partition(xd, mi_row, mi_col, r,
+ has_rows, has_cols, bsize);
+ } else {
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ }
subsize = get_partition_subsize(bsize, partition);
// Check the bitstream is conformant: if there is subsampling on the
@@ -1442,18 +1504,19 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
const struct macroblockd_plane *const pd_u = &xd->plane[1];
if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
BLOCK_INVALID) {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Block size %dx%d invalid with this subsampling mode",
block_size_wide[subsize], block_size_high[subsize]);
}
#define DEC_BLOCK_STX_ARG
#define DEC_BLOCK_EPT_ARG partition,
-#define DEC_BLOCK(db_r, db_c, db_subsize) \
- decode_block(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
- DEC_BLOCK_EPT_ARG(db_subsize))
-#define DEC_PARTITION(db_r, db_c, db_subsize) \
- decode_partition(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize))
+#define DEC_BLOCK(db_r, db_c, db_subsize) \
+ block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
+ DEC_BLOCK_EPT_ARG(db_subsize))
+#define DEC_PARTITION(db_r, db_c, db_subsize) \
+ decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize), \
+ parse_decode_flag)
switch (partition) {
case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
@@ -1513,7 +1576,8 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
#undef DEC_BLOCK_EPT_ARG
#undef DEC_BLOCK_STX_ARG
- update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+ if (parse_decode_flag & 1)
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
}
static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1650,7 +1714,7 @@ static void decode_restoration_mode(AV1_COMMON *cm,
}
if (num_planes > 1) {
- int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
+ int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
if (s && !chroma_none) {
cm->rst_info[1].restoration_unit_size =
cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
@@ -1872,12 +1936,13 @@ static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
static void setup_quantization(AV1_COMMON *const cm,
struct aom_read_bit_buffer *rb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
cm->y_dc_delta_q = read_delta_q(rb);
if (num_planes > 1) {
int diff_uv_delta = 0;
- if (cm->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
+ if (seq_params->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
cm->u_dc_delta_q = read_delta_q(rb);
cm->u_ac_delta_q = read_delta_q(rb);
if (diff_uv_delta) {
@@ -1888,12 +1953,12 @@ static void setup_quantization(AV1_COMMON *const cm,
cm->v_ac_delta_q = cm->u_ac_delta_q;
}
}
- cm->dequant_bit_depth = cm->bit_depth;
+ cm->dequant_bit_depth = seq_params->bit_depth;
cm->using_qmatrix = aom_rb_read_bit(rb);
if (cm->using_qmatrix) {
cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
- if (!cm->separate_uv_delta_q)
+ if (!seq_params->separate_uv_delta_q)
cm->qm_v = cm->qm_u;
else
cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
@@ -1906,6 +1971,7 @@ static void setup_quantization(AV1_COMMON *const cm,
// Build y/uv dequant values based on segmentation.
static void setup_segmentation_dequant(AV1_COMMON *const cm) {
+ const int bit_depth = cm->seq_params.bit_depth;
const int using_qm = cm->using_qmatrix;
// When segmentation is disabled, only the first value is used. The
// remaining are don't cares.
@@ -1913,16 +1979,16 @@ static void setup_segmentation_dequant(AV1_COMMON *const cm) {
for (int i = 0; i < max_segments; ++i) {
const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
cm->y_dequant_QTX[i][0] =
- av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, cm->bit_depth);
- cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, cm->bit_depth);
+ av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, bit_depth);
+ cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
cm->u_dequant_QTX[i][0] =
- av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, cm->bit_depth);
+ av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, bit_depth);
cm->u_dequant_QTX[i][1] =
- av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, cm->bit_depth);
+ av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, bit_depth);
cm->v_dequant_QTX[i][0] =
- av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, cm->bit_depth);
+ av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, bit_depth);
cm->v_dequant_QTX[i][1] =
- av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, cm->bit_depth);
+ av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, bit_depth);
const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
@@ -1994,9 +2060,15 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
// Allocations in av1_alloc_context_buffers() depend on individual
// dimensions as well as the overall size.
if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
- if (av1_alloc_context_buffers(cm, width, height))
+ if (av1_alloc_context_buffers(cm, width, height)) {
+ // The cm->mi_* values have been cleared and any existing context
+ // buffers have been freed. Clear cm->width and cm->height to be
+ // consistent and to force a realloc next time.
+ cm->width = 0;
+ cm->height = 0;
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate context buffers");
+ }
} else {
av1_set_mb_mi(cm, width, height);
}
@@ -2012,21 +2084,22 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
struct aom_read_bit_buffer *rb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
int width, height;
BufferPool *const pool = cm->buffer_pool;
if (frame_size_override_flag) {
- int num_bits_width = cm->seq_params.num_bits_width;
- int num_bits_height = cm->seq_params.num_bits_height;
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
- if (width > cm->seq_params.max_frame_width ||
- height > cm->seq_params.max_frame_height) {
+ if (width > seq_params->max_frame_width ||
+ height > seq_params->max_frame_height) {
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Frame dimensions are larger than the maximum values");
}
} else {
- width = cm->seq_params.max_frame_width;
- height = cm->seq_params.max_frame_height;
+ width = seq_params->max_frame_width;
+ height = seq_params->max_frame_height;
}
setup_superres(cm, rb, &width, &height);
@@ -2035,8 +2108,9 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
- get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
- cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ get_frame_new_buffer(cm), cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
cm->byte_alignment,
&pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
@@ -2046,18 +2120,22 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
}
unlock_buffer_pool(pool);
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
- pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
- pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
+ seq_params->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
+ seq_params->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
+ (unsigned int)seq_params->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
+ seq_params->color_primaries;
pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
- cm->transfer_characteristics;
+ seq_params->transfer_characteristics;
pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
- cm->matrix_coefficients;
- pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome;
+ seq_params->matrix_coefficients;
+ pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
- cm->chroma_sample_position;
- pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+ seq_params->chroma_sample_position;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
}
@@ -2095,9 +2173,10 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
}
}
+ const SequenceHeader *const seq_params = &cm->seq_params;
if (!found) {
- int num_bits_width = cm->seq_params.num_bits_width;
- int num_bits_height = cm->seq_params.num_bits_height;
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
setup_superres(cm, rb, &width, &height);
@@ -2122,18 +2201,19 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
"Referenced frame has invalid size");
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
- if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
- ref_frame->buf->subsampling_x,
- ref_frame->buf->subsampling_y, cm->bit_depth,
- cm->subsampling_x, cm->subsampling_y))
+ if (!valid_ref_frame_img_fmt(
+ ref_frame->buf->bit_depth, ref_frame->buf->subsampling_x,
+ ref_frame->buf->subsampling_y, seq_params->bit_depth,
+ seq_params->subsampling_x, seq_params->subsampling_y))
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Referenced frame has incompatible color format");
}
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
- get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
- cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ get_frame_new_buffer(cm), cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
cm->byte_alignment,
&pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
@@ -2143,18 +2223,22 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
}
unlock_buffer_pool(pool);
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
- pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
- pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
+ seq_params->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
+ seq_params->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
+ (unsigned int)seq_params->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
+ seq_params->color_primaries;
pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
- cm->transfer_characteristics;
+ seq_params->transfer_characteristics;
pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
- cm->matrix_coefficients;
- pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome;
+ seq_params->matrix_coefficients;
+ pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
- cm->chroma_sample_position;
- pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+ seq_params->chroma_sample_position;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
}
@@ -2500,8 +2584,15 @@ static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
}
}
-static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer,
- const int num_planes) {
+static void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+ CB_BUFFER *cb_buffer_base, const int num_planes,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &pbi->common;
+ int mib_size_log2 = cm->seq_params.mib_size_log2;
+ int stride = (cm->mi_cols >> mib_size_log2) + 1;
+ int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+ CB_BUFFER *cb_buffer = cb_buffer_base + offset;
+
for (int plane = 0; plane < num_planes; ++plane) {
xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane];
xd->plane[plane].eob_data = cb_buffer->eob_data[plane];
@@ -2514,18 +2605,189 @@ static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer,
xd->color_index_map_offset[1] = 0;
}
+static void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {  // Replaces pbi->tile_data with a fresh array of n_tiles entries.
+ AV1_COMMON *const cm = &pbi->common;
+ aom_free(pbi->tile_data);  // NOTE(review): per-tile dec_row_mt_sync buffers of the old array are not released here — presumably the caller does so first; confirm.
+ CHECK_MEM_ERROR(cm, pbi->tile_data,
+ aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));  // 32-byte-aligned storage for all tiles.
+ pbi->allocated_tiles = n_tiles;  // decode_tiles() compares this with its tile count to decide whether to reallocate.
+ for (int i = 0; i < n_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_zero(tile_data->dec_row_mt_sync);  // Only the row-sync member is cleared here; no other member is initialised in this function.
+ }
+ pbi->allocated_row_mt_sync_rows = 0;  // NOTE(review): presumably marks that no row-sync rows exist for the new array yet — confirm.
+}
+
+// Returns nsync, the value dec_row_mt_alloc() stores in sync_range. The width-based table is compiled out, so the result is always 1.
+static INLINE int get_sync_range(int width) {
+// nsync numbers are picked by testing; the table below is disabled with #if 0 and kept for reference only.
+#if 0
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+#else
+ (void)width;  // Marks the parameter as used while the table is disabled.
+#endif
+ return 1;
+}
+
+// Allocate memory for decoder row synchronization: 'rows' mutexes and condition variables (CONFIG_MULTITHREAD only) plus 'rows' cur_sb_col slots.
+static void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm,
+ int rows) {
+ dec_row_mt_sync->allocated_sb_rows = rows;  // Recorded first; av1_dec_row_mt_dealloc() uses it as the bound of its destroy loops.
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
+ aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));  // One mutex per row; sync_read()/sync_write() index it by row.
+ if (dec_row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL);  // NULL attribute: default mutex.
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
+ aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));  // One condition variable per row, paired with the mutex of the same index.
+ if (dec_row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL);
+ }
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
+ aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));  // NOTE(review): entries are not initialised here — presumably reset elsewhere before the first sync_read(); confirm.
+
+ // Set up nsync (the column granularity read back by sync_read()/sync_write()).
+ dec_row_mt_sync->sync_range = get_sync_range(cm->width);
+}
+
+// Deallocate decoder row synchronization related mutex and data, then zero the structure. Passing NULL is a no-op.
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
+ if (dec_row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+ if (dec_row_mt_sync->mutex_ != NULL) {
+ for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {  // Bound is the row count recorded by dec_row_mt_alloc().
+ pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]);
+ }
+ aom_free(dec_row_mt_sync->mutex_);
+ }
+ if (dec_row_mt_sync->cond_ != NULL) {
+ for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
+ pthread_cond_destroy(&dec_row_mt_sync->cond_[i]);
+ }
+ aom_free(dec_row_mt_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(dec_row_mt_sync->cur_sb_col);
+
+ // Clear the structure, as the source of this call may be a resize, in which
+ // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*dec_row_mt_sync);
+ }
+}
+
+static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+ int c) {  // Blocks until row r - 1 has published enough progress for column c of row r to proceed.
+#if CONFIG_MULTITHREAD
+ const int nsync = dec_row_mt_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {  // Row 0 never waits; other rows check only when c has no bits set under the (nsync - 1) mask.
+ pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync) {  // Wait until cur_sb_col[r - 1] >= c + nsync; re-tested after every wake-up.
+ pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)dec_row_mt_sync;  // Without CONFIG_MULTITHREAD the function does nothing.
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+ int c, const int sb_cols) {  // Publishes progress of row r after column c and signals a thread waiting on that row in sync_read().
+#if CONFIG_MULTITHREAD
+ const int nsync = dec_row_mt_sync->sync_range;
+ int cur;
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;  // Before the last column, only columns that are multiples of nsync are published.
+ } else {
+ cur = sb_cols + nsync;  // Last column: publish a value large enough that sync_read()'s test passes for every c < sb_cols.
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
+
+ dec_row_mt_sync->cur_sb_col[r] = cur;  // Written under the same mutex sync_read() holds while testing it.
+
+ pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)dec_row_mt_sync;  // Without CONFIG_MULTITHREAD the function does nothing.
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) {  // Number of superblock rows spanned by the tile.
+ AV1_COMMON *cm = &pbi->common;
+ int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);  // Tile height in mi units, aligned (presumably rounded up — confirm macro) to the superblock size.
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;  // Convert mi units to superblock units.
+
+ return sb_rows;
+}
+
+static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) {  // Number of superblock columns spanned by the tile.
+ AV1_COMMON *cm = &pbi->common;
+ int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);  // Tile width in mi units, aligned (presumably rounded up — confirm macro) to the superblock size.
+ int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;  // Convert mi units to superblock units.
+
+ return sb_cols;
+}
+
static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
TileInfo tile_info, const int mi_row) {
AV1_COMMON *const cm = &pbi->common;
const int num_planes = av1_num_planes(cm);
- av1_zero_left_context(&td->xd);
+ TileDataDec *const tile_data =
+ pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;  // Tile entry whose dec_row_mt_sync coordinates the rows of this tile.
+ const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info);
+ const int sb_row_in_tile =
+ (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;  // Superblock row index relative to the tile's first row.
+ int sb_col_in_tile = 0;
for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
- mi_col += cm->seq_params.mib_size) {
- set_cb_buffer(&td->xd, &td->cb_buffer_base, num_planes);
+ mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
+ set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+ mi_col);  // Point xd at the frame-level buffer slot selected by mi_row/mi_col.
+
+ sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);  // Wait for the row above to be far enough ahead.
- decode_partition(pbi, &td->xd, mi_row, mi_col, td->bit_reader,
- cm->seq_params.sb_size);
+ // Decoding of the super-block (flag 0x2: reconstruction visitors only, no bit-stream parsing)
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x2);
+
+ sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
+ sb_cols_in_tile);  // Publish this column so the row below may advance.
}
}
@@ -2555,6 +2817,27 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
return 0;
}
+static void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) {  // Selects td's block visitors: bit 0x1 enables the coefficient readers, bit 0x2 the prediction/reconstruction steps.
+ td->read_coeffs_tx_intra_block_visit = decode_block_void;  // Start with every visitor set to its *_void stub (presumably a no-op — confirm).
+ td->predict_and_recon_intra_block_visit = decode_block_void;
+ td->read_coeffs_tx_inter_block_visit = decode_block_void;
+ td->inverse_tx_inter_block_visit = decode_block_void;
+ td->predict_inter_block_visit = predict_inter_block_void;
+ td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
+
+ if (parse_decode_flag & 0x1) {  // Parse bit: install the visitors that read coefficients.
+ td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
+ td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade;
+ }
+ if (parse_decode_flag & 0x2) {  // Decode bit: install the visitors that predict, inverse-transform and store CfL data.
+ td->predict_and_recon_intra_block_visit =
+ predict_and_reconstruct_intra_block;
+ td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
+ td->predict_inter_block_visit = predict_inter_block;
+ td->cfl_store_inter_block_visit = cfl_store_inter_block;
+ }
+}
+
static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
int tile_col) {
TileInfo tile_info;
@@ -2564,13 +2847,23 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
av1_tile_set_row(&tile_info, cm, tile_row);
av1_tile_set_col(&tile_info, cm, tile_col);
- av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end,
- tile_row);
+ av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+ tile_info.mi_col_end, tile_row);
+ av1_reset_loop_filter_delta(&td->xd, num_planes);
av1_reset_loop_restoration(&td->xd, num_planes);
for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
mi_row += cm->seq_params.mib_size) {
- decode_tile_sb_row(pbi, td, tile_info, mi_row);
+ av1_zero_left_context(&td->xd);
+
+ for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0);
+
+ // Bit-stream parsing and decoding of the superblock
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x3);
+ }
}
int corrupted =
@@ -2582,6 +2875,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
const uint8_t *data_end, int start_tile,
int end_tile) {
AV1_COMMON *const cm = &pbi->common;
+ ThreadData *const td = &pbi->td;
const int tile_cols = cm->tile_cols;
const int tile_rows = cm->tile_rows;
const int n_tiles = tile_cols * tile_rows;
@@ -2641,23 +2935,26 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
- aom_free(pbi->tile_data);
- CHECK_MEM_ERROR(cm, pbi->tile_data,
- aom_memalign(32, n_tiles * (sizeof(*pbi->tile_data))));
- pbi->allocated_tiles = n_tiles;
+ decoder_alloc_tile_data(pbi, n_tiles);
}
#if CONFIG_ACCOUNTING
if (pbi->acct_enabled) {
aom_accounting_reset(&pbi->accounting);
}
#endif
+
+ set_decode_func_pointers(&pbi->td, 0x3);
+
// Load all tile information into thread_data.
+ td->xd = pbi->mb;
+ td->xd.corrupted = 0;
+ td->xd.mc_buf[0] = td->mc_buf[0];
+ td->xd.mc_buf[1] = td->mc_buf[1];
for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
- ThreadData *const td = &pbi->td;
TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col;
const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
@@ -2665,13 +2962,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
row * cm->tile_cols + col > end_tile)
continue;
- td->xd = pbi->mb;
- td->xd.corrupted = 0;
- td->xd.mc_buf[0] = pbi->td.mc_buf[0];
- td->xd.mc_buf[1] = pbi->td.mc_buf[1];
td->bit_reader = &tile_data->bit_reader;
av1_zero(td->dqcoeff);
av1_tile_init(&td->xd.tile, cm, row, col);
+ td->xd.current_qindex = cm->base_qindex;
setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
&cm->error, td->bit_reader, allow_update_cdf);
#if CONFIG_ACCOUNTING
@@ -2691,7 +2985,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
td->xd.tile_ctx = &tile_data->tctx;
// decode tile
- decode_tile(pbi, &pbi->td, row, col);
+ decode_tile(pbi, td, row, col);
aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
if (pbi->mb.corrupted)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -2729,6 +3023,47 @@ static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
return cur_job_info;
}
+static void tile_worker_hook_init(AV1Decoder *const pbi,
+ DecWorkerData *const thread_data,
+ const TileBufferDec *const tile_buffer,
+ TileDataDec *const tile_data,
+ uint8_t allow_update_cdf) {  // Per-tile setup of a worker's ThreadData; both worker hooks call it before processing tile_data.
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ int tile_row = tile_data->tile_info.tile_row;
+ int tile_col = tile_data->tile_info.tile_col;
+
+ td->bit_reader = &tile_data->bit_reader;  // The worker reads symbols through the tile's own reader.
+ av1_zero(td->dqcoeff);
+ av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+ td->xd.current_qindex = cm->base_qindex;  // Each tile starts from the frame's base qindex.
+ setup_bool_decoder(tile_buffer->data, thread_data->data_end,
+ tile_buffer->size, &thread_data->error_info,
+ td->bit_reader, allow_update_cdf);  // Passes this worker's own error_info rather than the shared cm->error.
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader->accounting = &pbi->accounting;
+ td->bit_reader->accounting->last_tell_frac =
+ aom_reader_tell_frac(td->bit_reader);
+ } else {
+ td->bit_reader->accounting = NULL;
+ }
+#endif
+ av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ td->xd.error_info = &thread_data->error_info;  // Block-level errors go to the error_info whose jmp_buf the worker hook armed with setjmp().
+ av1_init_above_context(cm, &td->xd, tile_row);
+
+ // Initialise the tile context from the frame context (a by-value copy of *cm->fc)
+ tile_data->tctx = *cm->fc;
+ td->xd.tile_ctx = &tile_data->tctx;
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ tile_data->bit_reader.accounting->last_tell_frac =
+ aom_reader_tell_frac(&tile_data->bit_reader);  // Same reader as td->bit_reader above; refreshes last_tell_frac after the remaining setup.
+ }
+#endif
+}
+
static int tile_worker_hook(void *arg1, void *arg2) {
DecWorkerData *const thread_data = (DecWorkerData *)arg1;
AV1Decoder *const pbi = (AV1Decoder *)arg2;
@@ -2736,14 +3071,21 @@ static int tile_worker_hook(void *arg1, void *arg2) {
ThreadData *const td = thread_data->td;
uint8_t allow_update_cdf;
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
if (setjmp(thread_data->error_info.jmp)) {
thread_data->error_info.setjmp = 0;
thread_data->td->xd.corrupted = 1;
return 0;
}
+ thread_data->error_info.setjmp = 1;
+
allow_update_cdf = cm->large_scale_tile ? 0 : 1;
allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+ set_decode_func_pointers(td, 0x3);
+
assert(cm->tile_cols > 0);
while (1) {
TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
@@ -2751,46 +3093,248 @@ static int tile_worker_hook(void *arg1, void *arg2) {
if (cur_job_info != NULL && !td->xd.corrupted) {
const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
TileDataDec *const tile_data = cur_job_info->tile_data;
- volatile int tile_row = tile_data->tile_info.tile_row;
- volatile int tile_col = tile_data->tile_info.tile_col;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+ // decode tile
+ int tile_row = tile_data->tile_info.tile_row;
+ int tile_col = tile_data->tile_info.tile_col;
+ decode_tile(pbi, td, tile_row, tile_col);
+ } else {
+ break;
+ }
+ }
+ thread_data->error_info.setjmp = 0;
+ return !td->xd.corrupted;
+}
- td->xd = pbi->mb;
- td->xd.corrupted = 0;
- td->xd.mc_buf[0] = td->mc_buf[0];
- td->xd.mc_buf[1] = td->mc_buf[1];
- td->bit_reader = &tile_data->bit_reader;
- av1_zero(td->dqcoeff);
- av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
- setup_bool_decoder(tile_buffer->data, thread_data->data_end,
- tile_buffer->size, &cm->error, td->bit_reader,
- allow_update_cdf);
-#if CONFIG_ACCOUNTING
- if (pbi->acct_enabled) {
- td->bit_reader->accounting = &pbi->accounting;
- td->bit_reader->accounting->last_tell_frac =
- aom_reader_tell_frac(td->bit_reader);
- } else {
- td->bit_reader->accounting = NULL;
+static int get_next_job_info(AV1Decoder *const pbi,
+ AV1DecRowMTJobInfo *next_job_info,
+ int *end_of_frame) {  // Picks the next superblock row to decode. Returns 0 when the caller should wait, 1 otherwise (job filled in, or *end_of_frame set).
+ AV1_COMMON *cm = &pbi->common;  // The caller in view (row_mt_worker_hook) holds pbi->row_mt_mutex_ around this call.
+ TileDataDec *tile_data;
+ AV1DecRowMTSync *dec_row_mt_sync;
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ TileInfo tile_info;
+ const int tile_rows_start = frame_row_mt_info->tile_rows_start;
+ const int tile_rows_end = frame_row_mt_info->tile_rows_end;
+ const int tile_cols_start = frame_row_mt_info->tile_cols_start;
+ const int tile_cols_end = frame_row_mt_info->tile_cols_end;
+ const int start_tile = frame_row_mt_info->start_tile;
+ const int end_tile = frame_row_mt_info->end_tile;
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];  // Superblock size in mi units; progress counters advance by this much per row.
+ int num_mis_to_decode, num_threads_working;
+ int num_mis_waiting_for_decode;
+ int min_threads_working = INT_MAX;
+ int max_mis_to_decode = 0;
+ int tile_row_idx, tile_col_idx;
+ int tile_row = 0;
+ int tile_col = 0;
+
+ memset(next_job_info, 0, sizeof(*next_job_info));  // Early returns below hand back an all-zero job.
+
+ // Frame decode is completed or error is encountered.
+ *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
+ frame_row_mt_info->mi_rows_to_decode) ||
+ (frame_row_mt_info->row_mt_exit == 1);
+ if (*end_of_frame) {
+ return 1;
+ }
+
+ // Decoding cannot start as bit-stream parsing is not complete: every parsed row has already been handed out.
+ if (frame_row_mt_info->mi_rows_parse_done -
+ frame_row_mt_info->mi_rows_decode_started ==
+ 0)
+ return 0;
+
+ // Choose the tile to decode.
+ for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end;
+ ++tile_row_idx) {
+ for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
+ ++tile_col_idx) {
+ if (tile_row_idx * cm->tile_cols + tile_col_idx < start_tile ||
+ tile_row_idx * cm->tile_cols + tile_col_idx > end_tile)
+ continue;  // Tile index outside [start_tile, end_tile].
+
+ tile_data = pbi->tile_data + tile_row_idx * cm->tile_cols + tile_col_idx;
+ dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ num_threads_working = dec_row_mt_sync->num_threads_working;
+ num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done -
+ dec_row_mt_sync->mi_rows_decode_started) *
+ dec_row_mt_sync->mi_cols;  // Parsed but not yet handed out for decoding.
+ num_mis_to_decode =
+ (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) *
+ dec_row_mt_sync->mi_cols;  // Everything in the tile not yet handed out, parsed or not.
+
+ assert(num_mis_to_decode >= num_mis_waiting_for_decode);
+
+ // Pick the tile which has minimum number of threads working on it; ties go to the tile with the most work left.
+ if (num_mis_waiting_for_decode > 0) {
+ if (num_threads_working < min_threads_working) {
+ min_threads_working = num_threads_working;
+ max_mis_to_decode = 0;  // Reset the tie-break so this less-loaded tile is selected below.
+ }
+ if (num_threads_working == min_threads_working &&
+ num_mis_to_decode > max_mis_to_decode) {
+ max_mis_to_decode = num_mis_to_decode;
+ tile_row = tile_row_idx;
+ tile_col = tile_col_idx;
+ }
}
+ }
+ }
+
+ tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ tile_info = tile_data->tile_info;
+ dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ next_job_info->tile_row = tile_row;
+ next_job_info->tile_col = tile_col;
+ next_job_info->mi_row =
+ dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start;  // Tile-relative progress converted to a frame-level mi row.
+
+ dec_row_mt_sync->num_threads_working++;  // Claim the row: one more thread on this tile,
+ dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;  // and tile-level progress advances by one superblock row,
+ frame_row_mt_info->mi_rows_decode_started += sb_mi_size;  // as does the frame-level counter tested for end_of_frame.
+
+ return 1;
+}
+
+static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi,
+ TileDataDec *const tile_data,
+ const int sb_mi_size) {  // Records that one more superblock row of tile_data has been parsed and wakes all threads waiting for decode work.
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);  // Same mutex row_mt_worker_hook holds while calling get_next_job_info().
#endif
- av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
- av1_init_above_context(cm, &td->xd, tile_row);
+ tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size;  // Tile-level parse progress, in mi rows.
+ frame_row_mt_info->mi_rows_parse_done += sb_mi_size;  // Frame-level parse progress, compared against mi_rows_decode_started by get_next_job_info().
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);  // Broadcast: every worker blocked in pthread_cond_wait() on row_mt_cond_ re-checks for a job.
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+}
- // Initialise the tile context from the frame context
- tile_data->tctx = *cm->fc;
- td->xd.tile_ctx = &tile_data->tctx;
-#if CONFIG_ACCOUNTING
- if (pbi->acct_enabled) {
- tile_data->bit_reader.accounting->last_tell_frac =
- aom_reader_tell_frac(&tile_data->bit_reader);
- }
+// Worker hook for row-based multi-threaded decoding.
+//
+// arg1 is this worker's DecWorkerData and arg2 is the AV1Decoder. The hook
+// runs two loops: first it pulls tile jobs from pbi->tile_mt_info and parses
+// every superblock of each tile (decode_partition() with 0x1), signalling
+// after each superblock row; then it switches the function pointers to 0x2
+// and repeatedly takes superblock-row jobs from get_next_job_info(), running
+// decode_tile_sb_row() on each until end_of_frame is reported.
+//
+// Returns 1 if td->xd.corrupted is still 0 at the end, 0 otherwise. If an
+// internal error longjmp()s back here, the worker is marked corrupted,
+// row_mt_exit is raised for the whole frame, and 0 is returned.
+static int row_mt_worker_hook(void *arg1, void *arg2) {
+ DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+ AV1Decoder *const pbi = (AV1Decoder *)arg2;
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ uint8_t allow_update_cdf;
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ td->xd.corrupted = 0;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(thread_data->error_info.jmp)) {
+ thread_data->error_info.setjmp = 0;
+ thread_data->td->xd.corrupted = 1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
#endif
+ frame_row_mt_info->row_mt_exit = 1;
+#if CONFIG_MULTITHREAD
+ // Wake every worker blocked in pthread_cond_wait() in the job loop below so
+ // that it re-evaluates get_next_job_info() after row_mt_exit was raised.
+ // Without this broadcast a waiter could sleep forever once this worker
+ // stops producing the progress signals it was waiting for.
+ // NOTE(review): assumes get_next_job_info() inspects row_mt_exit; it is
+ // defined outside this block - confirm.
+ pthread_cond_broadcast(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ return 0;
+ }
+ thread_data->error_info.setjmp = 1;
+
+ const int num_planes = av1_num_planes(cm);
+ // CDF updates are allowed only outside large-scale-tile mode and when the
+ // frame has not disabled them.
+ allow_update_cdf = cm->large_scale_tile ? 0 : 1;
+ allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+ assert(cm->tile_cols > 0);
+ // First loop: take whole-tile jobs and parse them superblock by superblock.
+ while (1) {
+ TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+ if (cur_job_info != NULL && !td->xd.corrupted) {
+ const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+ TileDataDec *const tile_data = cur_job_info->tile_data;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+
+ set_decode_func_pointers(td, 0x1);
+
// decode tile
- decode_tile(pbi, td, tile_row, tile_col);
+ TileInfo tile_info = tile_data->tile_info;
+ int tile_row = tile_info.tile_row;
+
+ av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+ tile_info.mi_col_end, tile_row);
+ av1_reset_loop_filter_delta(&td->xd, num_planes);
+ av1_reset_loop_restoration(&td->xd, num_planes);
+
+ for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->seq_params.mib_size) {
+ av1_zero_left_context(&td->xd);
+
+ for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+ mi_col);
+
+ // Bit-stream parsing of the superblock
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x1);
+ }
+ // Publish that one more superblock row of this tile has been parsed.
+ signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
+ }
+
+ int corrupted =
+ (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+ aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
} else {
break;
}
}
+
+ set_decode_func_pointers(td, 0x2);
+
+ // Second loop: take superblock-row jobs until the frame is finished.
+ while (1) {
+ AV1DecRowMTJobInfo next_job_info;
+ int end_of_frame = 0;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ // Sleep on the condition variable until a job is handed out or
+ // get_next_job_info() reports success for another reason.
+ while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {
+#if CONFIG_MULTITHREAD
+ pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
+#endif
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+
+ if (end_of_frame) break;
+
+ int tile_row = next_job_info.tile_row;
+ int tile_col = next_job_info.tile_col;
+ int mi_row = next_job_info.mi_row;
+
+ TileDataDec *tile_data =
+ pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+ TileInfo tile_info = tile_data->tile_info;
+
+ av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+ av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ td->xd.error_info = &thread_data->error_info;
+
+ decode_tile_sb_row(pbi, td, tile_info, mi_row);
+
+ // This worker is done with the row; update the shared counter under the
+ // row-MT mutex.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ dec_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ }
+ thread_data->error_info.setjmp = 0;
return !td->xd.corrupted;
}
@@ -2842,8 +3386,7 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
}
-void av1_free_mc_tmp_buf(void *td, int use_highbd) {
- ThreadData *thread_data = (ThreadData *)td;
+void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) {
int ref;
for (ref = 0; ref < 2; ref++) {
if (use_highbd)
@@ -2855,10 +3398,8 @@ void av1_free_mc_tmp_buf(void *td, int use_highbd) {
thread_data->mc_buf_size = 0;
}
-static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size,
- int use_highbd) {
- ThreadData *thread_data = (ThreadData *)td;
-
+static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
+ int buf_size, int use_highbd) {
for (int ref = 0; ref < 2; ref++) {
if (use_highbd) {
uint16_t *hbd_mc_buf;
@@ -2872,11 +3413,130 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size,
thread_data->mc_buf_size = buf_size;
}
+// Prepares the first num_workers workers for a new decoding pass.
+//
+// For each worker this copies pbi->mb into the worker's xd, clears the
+// corrupted flag, points the xd mc_buf entries at the worker's own mc_buf
+// entries, calls sync() on the worker, and then installs worker_hook with the
+// worker's thread data and pbi as its two arguments. When accounting is
+// compiled in and enabled, the accounting state is reset as well.
+static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
+ int num_workers) {
+ const AVxWorkerInterface *const iface = aom_get_worker_interface();
+
+ for (int i = 0; i < num_workers; ++i) {
+ DecWorkerData *const worker_data = &pbi->thread_data[i];
+ AVxWorker *const worker = pbi->tile_workers + i;
+ ThreadData *const td = worker_data->td;
+
+ // Start from the decoder's macroblock state, then apply the per-worker
+ // overrides.
+ td->xd = pbi->mb;
+ td->xd.corrupted = 0;
+ for (int ref = 0; ref < 2; ++ref) td->xd.mc_buf[ref] = td->mc_buf[ref];
+
+ iface->sync(worker);
+
+ // Reset tile decoding hook
+ worker->hook = worker_hook;
+ worker->data1 = worker_data;
+ worker->data2 = pbi;
+ }
+
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) aom_accounting_reset(&pbi->accounting);
+#endif
+}
+
+// Starts the first num_workers workers.
+//
+// Each worker's DecWorkerData (read back from worker->data1) is given
+// data_end and the worker's had_error flag is cleared. Every worker except
+// the last is started with launch(); the last one is started with execute().
+static void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end,
+ int num_workers) {
+ const AVxWorkerInterface *const iface = aom_get_worker_interface();
+ const int last_idx = num_workers - 1;
+
+ for (int i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = pbi->tile_workers + i;
+ DecWorkerData *const worker_data = (DecWorkerData *)worker->data1;
+
+ worker_data->data_end = data_end;
+ worker->had_error = 0;
+
+ if (i != last_idx) {
+ iface->launch(worker);
+ } else {
+ iface->execute(worker);
+ }
+ }
+}
+
+// Calls sync() on workers num_workers-1 down to 0 and stores the merged
+// result in pbi->mb.corrupted: a zero return from any sync() call is merged
+// in as a corruption.
+static void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+ const AVxWorkerInterface *const iface = aom_get_worker_interface();
+ int any_corrupted = 0;
+
+ for (int i = num_workers - 1; i >= 0; --i) {
+ AVxWorker *const worker = pbi->tile_workers + i;
+ const int worker_ok = iface->sync(worker);
+ aom_merge_corrupted_flag(&any_corrupted, !worker_ok);
+ }
+
+ pbi->mb.corrupted = any_corrupted;
+}
+
+// One-time and per-frame setup for multi-threaded decoding.
+//
+// On the first call (pbi->num_workers == 0) this allocates
+// pbi->max_threads workers and DecWorkerData entries. Every worker except
+// the last gets a thread via reset() and its own zeroed ThreadData; the last
+// entry reuses pbi->td because the main thread acts as a worker. On every
+// call, the mc_buf of each non-main worker is (re)allocated when its
+// recorded size differs from the size needed for the current bit depth.
+//
+// Allocation and thread-creation failures are reported through
+// CHECK_MEM_ERROR / aom_internal_error.
+static void decode_mt_init(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int worker_idx;
+
+ // Create workers and thread_data
+ if (pbi->num_workers == 0) {
+ const int num_threads = pbi->max_threads;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ CHECK_MEM_ERROR(cm, pbi->thread_data,
+ aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+ // Zero the array so that, if thread creation or a ThreadData allocation
+ // below aborts the loop part-way, every entry that was not fully set up
+ // holds a NULL td instead of an indeterminate pointer.
+ memset(pbi->thread_data, 0, num_threads * sizeof(*pbi->thread_data));
+
+ for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ ++pbi->num_workers;
+
+ winterface->init(worker);
+ // The last worker runs on the calling thread, so no thread is created
+ // for it.
+ if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+
+ if (worker_idx < num_threads - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+ } else {
+ // Main thread acts as a worker and uses the thread data in pbi
+ thread_data->td = &pbi->td;
+ }
+ thread_data->error_info.error_code = AOM_CODEC_OK;
+ thread_data->error_info.setjmp = 0;
+ }
+ }
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ // The buffer size doubles when high bit depth is in use.
+ const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+ for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ if (thread_data->td->mc_buf_size != buf_size) {
+ av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+ allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+ }
+ }
+}
+
+// Builds the tile job queue for the current tile group.
+//
+// The job storage in pbi->tile_mt_info is reallocated when the allocated
+// tile grid differs from tile_cols x tile_rows. The requested tile range is
+// then enqueued and the queue is sorted with compare_tile_buffers.
+static void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows,
+ int tile_rows_start, int tile_rows_end,
+ int tile_cols_start, int tile_cols_end,
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ AV1DecTileMT *const mt_info = &pbi->tile_mt_info;
+ const int grid_matches = mt_info->alloc_tile_cols == tile_cols &&
+ mt_info->alloc_tile_rows == tile_rows;
+
+ if (!grid_matches) {
+ av1_dealloc_dec_jobs(mt_info);
+ alloc_dec_jobs(mt_info, cm, tile_rows, tile_cols);
+ }
+
+ enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
+ tile_cols_end, start_tile, end_tile);
+
+ qsort(mt_info->job_queue, mt_info->jobs_enqueued,
+ sizeof(mt_info->job_queue[0]), compare_tile_buffers);
+}
+
static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
const uint8_t *data_end, int start_tile,
int end_tile) {
AV1_COMMON *const cm = &pbi->common;
- const AVxWorkerInterface *const winterface = aom_get_worker_interface();
const int tile_cols = cm->tile_cols;
const int tile_rows = cm->tile_rows;
const int n_tiles = tile_cols * tile_rows;
@@ -2891,7 +3551,6 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
int tile_cols_end;
int tile_count_tg;
int num_workers;
- int worker_idx;
const uint8_t *raw_data_end = NULL;
if (cm->large_scale_tile) {
@@ -2923,48 +3582,188 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
assert(start_tile <= end_tile);
assert(start_tile >= 0 && end_tile < n_tiles);
- // Create workers and thread_data
- if (pbi->num_workers == 0) {
- const int num_threads = pbi->max_threads;
- CHECK_MEM_ERROR(cm, pbi->tile_workers,
- aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
- CHECK_MEM_ERROR(cm, pbi->thread_data,
- aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+ decode_mt_init(pbi);
- for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
- AVxWorker *const worker = &pbi->tile_workers[worker_idx];
- DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- ++pbi->num_workers;
+ // get tile size in tile group
+#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
+ if (cm->large_scale_tile)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
- winterface->init(worker);
- if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
- aom_internal_error(&cm->error, AOM_CODEC_ERROR,
- "Tile decoder thread creation failed");
- }
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
- if (worker_idx < num_threads - 1) {
- // Allocate thread data.
- CHECK_MEM_ERROR(cm, thread_data->td,
- aom_memalign(32, sizeof(*thread_data->td)));
- av1_zero(*thread_data->td);
- } else {
- // Main thread acts as a worker and uses the thread data in pbi
- thread_data->td = &pbi->td;
- }
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
}
}
- const int use_highbd = cm->use_highbitdepth ? 1 : 0;
- const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
- for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
- DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- if (thread_data->td->mc_buf_size != buf_size) {
- av1_free_mc_tmp_buf(thread_data->td, use_highbd);
- allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+ reset_dec_workers(pbi, tile_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);
+
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (cm->large_scale_tile) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
}
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+// Ensures pbi->cb_buffer_base has room for one entry per superblock.
+//
+// The required count is (mi_rows >> mib_size_log2) + 1 times
+// (mi_cols >> mib_size_log2) + 1. The buffer is freed and reallocated only
+// when the current allocation is smaller than that; it is never shrunk.
+static void dec_alloc_cb_buf(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int sb_rows = (cm->mi_rows >> cm->seq_params.mib_size_log2) + 1;
+ const int sb_cols = (cm->mi_cols >> cm->seq_params.mib_size_log2) + 1;
+ const int needed = sb_rows * sb_cols;
+
+ // Existing allocation is already large enough.
+ if (pbi->cb_buffer_alloc_size >= needed) return;
+
+ av1_dec_free_cb_buf(pbi);
+ CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
+ aom_memalign(32, sizeof(*pbi->cb_buffer_base) * needed));
+ pbi->cb_buffer_alloc_size = needed;
+}
+
+// Resets the frame-level and per-tile row-MT bookkeeping before a decode.
+//
+// The tile range and start/end tile are recorded in pbi->frame_row_mt_info
+// and its progress counters and exit flag are cleared. For every tile in the
+// given row/column range whose index lies in [start_tile, end_tile], the
+// tile's dec_row_mt_sync counters are cleared, its mi_rows/mi_cols are set to
+// the tile dimensions aligned with ALIGN_POWER_OF_TWO(.., mib_size_log2),
+// its mi_rows is added to the frame's mi_rows_to_decode, and the first
+// max_sb_rows entries of cur_sb_col are filled with -1.
+//
+// With CONFIG_MULTITHREAD, the row-MT mutex and condition variable are
+// allocated and initialised on first use.
+static void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+ int tile_rows_end, int tile_cols_start,
+ int tile_cols_end, int start_tile, int end_tile,
+ int max_sb_rows) {
+ AV1_COMMON *const cm = &pbi->common;
+ AV1DecRowMTInfo *const row_mt = &pbi->frame_row_mt_info;
+
+ // Tile range covered by this frame.
+ row_mt->tile_rows_start = tile_rows_start;
+ row_mt->tile_rows_end = tile_rows_end;
+ row_mt->tile_cols_start = tile_cols_start;
+ row_mt->tile_cols_end = tile_cols_end;
+ row_mt->start_tile = start_tile;
+ row_mt->end_tile = end_tile;
+
+ // Frame-level progress state.
+ row_mt->mi_rows_to_decode = 0;
+ row_mt->mi_rows_parse_done = 0;
+ row_mt->mi_rows_decode_started = 0;
+ row_mt->row_mt_exit = 0;
+
+ for (int r = tile_rows_start; r < tile_rows_end; ++r) {
+ for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+ const int tile_idx = r * cm->tile_cols + c;
+ // Skip tiles outside the requested [start_tile, end_tile] range.
+ if (tile_idx < start_tile || tile_idx > end_tile) continue;
+
+ TileDataDec *const tile_data = pbi->tile_data + tile_idx;
+ AV1DecRowMTSync *const sync = &tile_data->dec_row_mt_sync;
+ const TileInfo tile = tile_data->tile_info;
+
+ sync->mi_rows_parse_done = 0;
+ sync->mi_rows_decode_started = 0;
+ sync->num_threads_working = 0;
+ sync->mi_rows = ALIGN_POWER_OF_TWO(tile.mi_row_end - tile.mi_row_start,
+ cm->seq_params.mib_size_log2);
+ sync->mi_cols = ALIGN_POWER_OF_TWO(tile.mi_col_end - tile.mi_col_start,
+ cm->seq_params.mib_size_log2);
+
+ row_mt->mi_rows_to_decode += sync->mi_rows;
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(sync->cur_sb_col, -1, sizeof(*sync->cur_sb_col) * max_sb_rows);
+ }
+ }
+
+#if CONFIG_MULTITHREAD
+ // Create the synchronisation objects the first time they are needed.
+ if (pbi->row_mt_mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
+ aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
+ if (pbi->row_mt_mutex_ != NULL) {
+ pthread_mutex_init(pbi->row_mt_mutex_, NULL);
+ }
+ }
+
+ if (pbi->row_mt_cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
+ aom_malloc(sizeof(*(pbi->row_mt_cond_))));
+ if (pbi->row_mt_cond_ != NULL) {
+ pthread_cond_init(pbi->row_mt_cond_, NULL);
+ }
+ }
+#endif
+}
+
+static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers;
+ const uint8_t *raw_data_end = NULL;
+ int max_sb_rows = 0;
+
+ if (cm->large_scale_tile) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
}
+ tile_count_tg = end_tile - start_tile + 1;
+ num_workers = pbi->max_threads;
- // get tile size in tile group
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(num_workers > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ (void)tile_count_tg;
+
+ decode_mt_init(pbi);
+
+ // get tile size in tile group
#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
if (cm->large_scale_tile)
raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
else
@@ -2972,74 +3771,43 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
- aom_free(pbi->tile_data);
- CHECK_MEM_ERROR(cm, pbi->tile_data,
- aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
- pbi->allocated_tiles = n_tiles;
+ for (int i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ decoder_alloc_tile_data(pbi, n_tiles);
}
- // Reset tile decoding hook
- for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
- AVxWorker *const worker = &pbi->tile_workers[worker_idx];
- DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- winterface->sync(worker);
-
- worker->hook = tile_worker_hook;
- worker->data1 = thread_data;
- worker->data2 = pbi;
- }
-#if CONFIG_ACCOUNTING
- if (pbi->acct_enabled) {
- aom_accounting_reset(&pbi->accounting);
- }
-#endif
for (int row = 0; row < tile_rows; row++) {
for (int col = 0; col < tile_cols; col++) {
TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
av1_tile_init(&tile_data->tile_info, cm, row, col);
+
+ max_sb_rows =
+ AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info));
}
}
- if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
- pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
- av1_dealloc_dec_jobs(&pbi->tile_mt_info);
- alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
+ if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
+ for (int i = 0; i < n_tiles; ++i) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
+ }
+ pbi->allocated_row_mt_sync_rows = max_sb_rows;
}
- enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
- tile_cols_end, start_tile, end_tile);
- qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
- sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
- {
- const int base = tile_count_tg / num_workers;
- const int remain = tile_count_tg % num_workers;
- int tile_start = start_tile;
- int corrupted = 0;
-
- for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
- // compute number of tiles assign to each worker
- const int count = base + (remain + worker_idx) / num_workers;
- AVxWorker *const worker = &pbi->tile_workers[worker_idx];
- DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
-
- thread_data->data_end = data_end;
- tile_start += count;
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
- worker->had_error = 0;
- if (worker_idx == num_workers - 1) {
- winterface->execute(worker);
- } else {
- winterface->launch(worker);
- }
- }
+ dec_alloc_cb_buf(pbi);
- for (; worker_idx > 0; --worker_idx) {
- AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
- aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
- }
+ row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
+ tile_cols_end, start_tile, end_tile, max_sb_rows);
- pbi->mb.corrupted = corrupted;
- }
+ reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);
if (pbi->mb.corrupted)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -3064,17 +3832,20 @@ static void error_handler(void *data) {
}
// Reads the high_bitdepth and twelve_bit fields in color_config() and sets
-// cm->bit_depth based on the values of those fields and cm->profile. Reports
-// errors by calling rb->error_handler() or aom_internal_error().
-static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+// seq_params->bit_depth based on the values of those fields and
+// seq_params->profile. Reports errors by calling rb->error_handler() or
+// aom_internal_error().
+static void read_bitdepth(struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info) {
const int high_bitdepth = aom_rb_read_bit(rb);
- if (cm->profile == PROFILE_2 && high_bitdepth) {
+ if (seq_params->profile == PROFILE_2 && high_bitdepth) {
const int twelve_bit = aom_rb_read_bit(rb);
- cm->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
- } else if (cm->profile <= PROFILE_2) {
- cm->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
+ seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
+ } else if (seq_params->profile <= PROFILE_2) {
+ seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
} else {
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Unsupported profile/bit-depth combination");
}
}
@@ -3082,6 +3853,7 @@ static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
void av1_read_film_grain_params(AV1_COMMON *cm,
struct aom_read_bit_buffer *rb) {
aom_film_grain_t *pars = &cm->film_grain_params;
+ const SequenceHeader *const seq_params = &cm->seq_params;
pars->apply_grain = aom_rb_read_bit(rb);
if (!pars->apply_grain) {
@@ -3095,6 +3867,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
else
pars->update_parameters = 1;
+ pars->bit_depth = seq_params->bit_depth;
+
if (!pars->update_parameters) {
// inherit parameters from a previous reference frame
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
@@ -3129,11 +3903,11 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
}
- if (!cm->seq_params.monochrome)
+ if (!seq_params->monochrome)
pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
- if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
- ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+ if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
(pars->num_y_points == 0))) {
pars->num_cb_points = 0;
pars->num_cr_points = 0;
@@ -3168,7 +3942,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
}
- if ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+ if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
(((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
@@ -3222,89 +3996,93 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
}
static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
- if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) {
+ if (cm->seq_params.film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
av1_read_film_grain_params(cm, rb);
} else {
memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
}
- cm->film_grain_params.bit_depth = cm->bit_depth;
+ cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
sizeof(aom_film_grain_t));
}
-void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
- int allow_lowbitdepth) {
- av1_read_bitdepth(cm, rb);
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info) {
+ read_bitdepth(rb, seq_params, error_info);
- cm->use_highbitdepth = cm->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
+ seq_params->use_highbitdepth =
+ seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
// monochrome bit (not needed for PROFILE_1)
- const int is_monochrome = cm->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
- cm->seq_params.monochrome = is_monochrome;
+ const int is_monochrome =
+ seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
+ seq_params->monochrome = is_monochrome;
int color_description_present_flag = aom_rb_read_bit(rb);
if (color_description_present_flag) {
- cm->color_primaries = aom_rb_read_literal(rb, 8);
- cm->transfer_characteristics = aom_rb_read_literal(rb, 8);
- cm->matrix_coefficients = aom_rb_read_literal(rb, 8);
+ seq_params->color_primaries = aom_rb_read_literal(rb, 8);
+ seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
+ seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
} else {
- cm->color_primaries = AOM_CICP_CP_UNSPECIFIED;
- cm->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
- cm->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+ seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
}
if (is_monochrome) {
// [16,235] (including xvycc) vs [0,255] range
- cm->color_range = aom_rb_read_bit(rb);
- cm->subsampling_y = cm->subsampling_x = 1;
- cm->chroma_sample_position = AOM_CSP_UNKNOWN;
- cm->separate_uv_delta_q = 0;
+ seq_params->color_range = aom_rb_read_bit(rb);
+ seq_params->subsampling_y = seq_params->subsampling_x = 1;
+ seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
+ seq_params->separate_uv_delta_q = 0;
return;
}
- if (cm->color_primaries == AOM_CICP_CP_BT_709 &&
- cm->transfer_characteristics == AOM_CICP_TC_SRGB &&
- cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) { // it would be better
- // to remove this
- // dependency too
- cm->subsampling_y = cm->subsampling_x = 0;
- cm->color_range = 1; // assume full color-range
- if (!(cm->profile == PROFILE_1 ||
- (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12))) {
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ // It would be good to remove this dependency.
+ seq_params->subsampling_y = seq_params->subsampling_x = 0;
+ seq_params->color_range = 1; // assume full color-range
+ if (!(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12))) {
aom_internal_error(
- &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
"sRGB colorspace not compatible with specified profile");
}
} else {
// [16,235] (including xvycc) vs [0,255] range
- cm->color_range = aom_rb_read_bit(rb);
- if (cm->profile == PROFILE_0) {
+ seq_params->color_range = aom_rb_read_bit(rb);
+ if (seq_params->profile == PROFILE_0) {
// 420 only
- cm->subsampling_x = cm->subsampling_y = 1;
- } else if (cm->profile == PROFILE_1) {
+ seq_params->subsampling_x = seq_params->subsampling_y = 1;
+ } else if (seq_params->profile == PROFILE_1) {
// 444 only
- cm->subsampling_x = cm->subsampling_y = 0;
+ seq_params->subsampling_x = seq_params->subsampling_y = 0;
} else {
- assert(cm->profile == PROFILE_2);
- if (cm->bit_depth == AOM_BITS_12) {
- cm->subsampling_x = aom_rb_read_bit(rb);
- if (cm->subsampling_x)
- cm->subsampling_y = aom_rb_read_bit(rb); // 422 or 420
+ assert(seq_params->profile == PROFILE_2);
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ seq_params->subsampling_x = aom_rb_read_bit(rb);
+ if (seq_params->subsampling_x)
+ seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420
else
- cm->subsampling_y = 0; // 444
+ seq_params->subsampling_y = 0; // 444
} else {
// 422
- cm->subsampling_x = 1;
- cm->subsampling_y = 0;
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 0;
}
}
- if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
- (cm->subsampling_x || cm->subsampling_y)) {
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
aom_internal_error(
- &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Identity CICP Matrix incompatible with non 4:4:4 color sampling");
}
- if (cm->subsampling_x && cm->subsampling_y) {
- cm->chroma_sample_position = aom_rb_read_literal(rb, 2);
+ if (seq_params->subsampling_x && seq_params->subsampling_y) {
+ seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
}
}
- cm->separate_uv_delta_q = aom_rb_read_bit(rb);
+ seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
}
void av1_read_timing_info_header(AV1_COMMON *cm,
@@ -3338,8 +4116,8 @@ void av1_read_decoder_model_info(AV1_COMMON *cm,
aom_rb_read_literal(rb, 5) + 1;
cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal(
rb, 32); // Number of units in a decoding tick
- cm->buffer_model.buffer_removal_delay_length = aom_rb_read_literal(rb, 5) + 1;
- cm->buffer_model.frame_presentation_delay_length =
+ cm->buffer_model.buffer_removal_time_length = aom_rb_read_literal(rb, 5) + 1;
+ cm->buffer_model.frame_presentation_time_length =
aom_rb_read_literal(rb, 5) + 1;
}
@@ -3352,32 +4130,27 @@ void av1_read_op_parameters_info(AV1_COMMON *const cm,
op_num + 1);
}
- cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_literal(
+ cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_unsigned_literal(
rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
- cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_literal(
+ cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_unsigned_literal(
rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb);
}
-static void av1_read_tu_pts_info(AV1_COMMON *const cm,
- struct aom_read_bit_buffer *rb) {
- cm->tu_presentation_delay =
- aom_rb_read_literal(rb, cm->buffer_model.frame_presentation_delay_length);
-}
-
-void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
- // rb->error_handler may be triggered during aom_rb_read_bit(), raising
- // internal errors and immediate decoding termination. We use a local variable
- // to store the info. as we decode. At the end, if no errors have occurred,
- // cm->seq_params is updated.
- SequenceHeader sh = cm->seq_params;
- SequenceHeader *const seq_params = &sh;
- int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
- int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
- int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
- int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
+static void av1_read_temporal_point_info(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb) {
+ cm->frame_presentation_time = aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.frame_presentation_time_length);
+}
+
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params) {
+ const int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
+ const int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
+ const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
+ const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
seq_params->num_bits_width = num_bits_width;
seq_params->num_bits_height = num_bits_height;
@@ -3452,7 +4225,6 @@ void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
seq_params->enable_superres = aom_rb_read_bit(rb);
seq_params->enable_cdef = aom_rb_read_bit(rb);
seq_params->enable_restoration = aom_rb_read_bit(rb);
- cm->seq_params = *seq_params;
}
static int read_global_motion_params(WarpedMotionParams *params,
@@ -3640,9 +4412,12 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
*cm->fc = cm->frame_contexts[existing_frame_idx];
}
+// On success, returns 0. On failure, calls aom_internal_error and does not
+// return.
static int read_uncompressed_header(AV1Decoder *pbi,
struct aom_read_bit_buffer *rb) {
AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
MACROBLOCKD *const xd = &pbi->mb;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -3658,7 +4433,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
// NOTE: By default all coded frames to be used as a reference
cm->is_reference_frame = 1;
- if (cm->seq_params.reduced_still_picture_hdr) {
+ if (seq_params->reduced_still_picture_hdr) {
cm->show_existing_frame = 0;
cm->show_frame = 1;
cm->frame_type = KEY_FRAME;
@@ -3671,12 +4446,12 @@ static int read_uncompressed_header(AV1Decoder *pbi,
// Show an existing frame directly.
const int existing_frame_idx = aom_rb_read_literal(rb, 3);
const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
- if (cm->seq_params.decoder_model_info_present_flag &&
+ if (seq_params->decoder_model_info_present_flag &&
cm->timing_info.equal_picture_interval == 0) {
- av1_read_tu_pts_info(cm, rb);
+ av1_read_temporal_point_info(cm, rb);
}
- if (cm->seq_params.frame_id_numbers_present_flag) {
- int frame_id_length = cm->seq_params.frame_id_length;
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
/* Compare display_frame_id with ref_frame_id and check valid for
* referencing */
@@ -3719,16 +4494,16 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits
cm->show_frame = aom_rb_read_bit(rb);
- if (cm->seq_params.still_picture &&
+ if (seq_params->still_picture &&
(cm->frame_type != KEY_FRAME || !cm->show_frame)) {
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Still pictures must be coded as shown keyframes");
}
cm->showable_frame = cm->frame_type != KEY_FRAME;
if (cm->show_frame) {
- if (cm->seq_params.decoder_model_info_present_flag &&
+ if (seq_params->decoder_model_info_present_flag &&
cm->timing_info.equal_picture_interval == 0)
- av1_read_tu_pts_info(cm, rb);
+ av1_read_temporal_point_info(cm, rb);
} else {
// See if this frame can be used as show_existing_frame in future
cm->showable_frame = aom_rb_read_bit(rb);
@@ -3742,17 +4517,17 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
cm->disable_cdf_update = aom_rb_read_bit(rb);
- if (cm->seq_params.force_screen_content_tools == 2) {
+ if (seq_params->force_screen_content_tools == 2) {
cm->allow_screen_content_tools = aom_rb_read_bit(rb);
} else {
- cm->allow_screen_content_tools = cm->seq_params.force_screen_content_tools;
+ cm->allow_screen_content_tools = seq_params->force_screen_content_tools;
}
if (cm->allow_screen_content_tools) {
- if (cm->seq_params.force_integer_mv == 2) {
+ if (seq_params->force_integer_mv == 2) {
cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
} else {
- cm->cur_frame_force_integer_mv = cm->seq_params.force_integer_mv;
+ cm->cur_frame_force_integer_mv = seq_params->force_integer_mv;
}
} else {
cm->cur_frame_force_integer_mv = 0;
@@ -3763,10 +4538,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->allow_intrabc = 0;
cm->primary_ref_frame = PRIMARY_REF_NONE;
- if (!cm->seq_params.reduced_still_picture_hdr) {
- if (cm->seq_params.frame_id_numbers_present_flag) {
- int frame_id_length = cm->seq_params.frame_id_length;
- int diff_len = cm->seq_params.delta_frame_id_length;
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
int prev_frame_id = 0;
int have_prev_frame_id = !pbi->decoding_first_frame &&
!(cm->frame_type == KEY_FRAME && cm->show_frame);
@@ -3811,7 +4586,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1);
cm->frame_offset =
- aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1);
+ aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
cm->current_video_frame = cm->frame_offset;
if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
@@ -3819,27 +4594,27 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
}
- if (cm->seq_params.decoder_model_info_present_flag) {
- cm->buffer_removal_delay_present = aom_rb_read_bit(rb);
- if (cm->buffer_removal_delay_present) {
+ if (seq_params->decoder_model_info_present_flag) {
+ cm->buffer_removal_time_present = aom_rb_read_bit(rb);
+ if (cm->buffer_removal_time_present) {
for (int op_num = 0;
- op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) {
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
if (cm->op_params[op_num].decoder_model_param_present_flag) {
- if ((((cm->seq_params.operating_point_idc[op_num] >>
+ if ((((seq_params->operating_point_idc[op_num] >>
cm->temporal_layer_id) &
0x1) &&
- ((cm->seq_params.operating_point_idc[op_num] >>
+ ((seq_params->operating_point_idc[op_num] >>
(cm->spatial_layer_id + 8)) &
0x1)) ||
- cm->seq_params.operating_point_idc[op_num] == 0) {
- cm->op_frame_timing[op_num].buffer_removal_delay =
- aom_rb_read_literal(
- rb, cm->buffer_model.buffer_removal_delay_length);
+ seq_params->operating_point_idc[op_num] == 0) {
+ cm->op_frame_timing[op_num].buffer_removal_time =
+ aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.buffer_removal_time_length);
} else {
- cm->op_frame_timing[op_num].buffer_removal_delay = 0;
+ cm->op_frame_timing[op_num].buffer_removal_time = 0;
}
} else {
- cm->op_frame_timing[op_num].buffer_removal_delay = 0;
+ cm->op_frame_timing[op_num].buffer_removal_time = 0;
}
}
}
@@ -3882,11 +4657,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
// Read all ref frame order hints if error_resilient_mode == 1
- if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) {
+ if (cm->error_resilient_mode && seq_params->enable_order_hint) {
for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
// Read order hint from bit stream
unsigned int frame_offset =
- aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1);
+ aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
// Get buffer index
int buf_idx = cm->ref_frame_map[ref_idx];
assert(buf_idx < FRAME_BUFFERS);
@@ -3906,10 +4681,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
- &frame_bufs[buf_idx].buf, cm->seq_params.max_frame_width,
- cm->seq_params.max_frame_height, cm->subsampling_x,
- cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment,
+ &frame_bufs[buf_idx].buf, seq_params->max_frame_width,
+ seq_params->max_frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
&pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
unlock_buffer_pool(pool);
@@ -3917,7 +4692,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
"Failed to allocate frame buffer");
}
unlock_buffer_pool(pool);
- set_planes_to_neutral_grey(cm, &frame_bufs[buf_idx].buf, 0);
+ set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0);
cm->ref_frame_map[ref_idx] = buf_idx;
frame_bufs[buf_idx].cur_frame_offset = frame_offset;
@@ -3937,7 +4712,8 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->allow_ref_frame_mvs = 0;
if (cm->intra_only) {
- cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
setup_frame_size(cm, frame_size_override_flag, rb);
if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
cm->allow_intrabc = aom_rb_read_bit(rb);
@@ -3945,7 +4721,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
} else if (pbi->need_resync != 1) { /* Skip if need resync */
// Frame refs short signaling is off when error resilient mode is on.
- if (cm->seq_params.enable_order_hint)
+ if (seq_params->enable_order_hint)
cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
if (cm->frame_refs_short_signaling) {
@@ -3999,9 +4775,9 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
- if (cm->seq_params.frame_id_numbers_present_flag) {
- int frame_id_length = cm->seq_params.frame_id_length;
- int diff_len = cm->seq_params.delta_frame_id_length;
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
int ref_frame_id =
((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
@@ -4064,7 +4840,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
cm->cur_frame->frame_type = cm->frame_type;
- if (cm->seq_params.frame_id_numbers_present_flag) {
+ if (seq_params->frame_id_numbers_present_flag) {
/* If bitmask is set, update reference frame id values and
mark frames as valid for reference */
int refresh_frame_flags = pbi->refresh_frame_flags;
@@ -4077,7 +4853,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
const int might_bwd_adapt =
- !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+ !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
if (might_bwd_adapt) {
cm->refresh_frame_context = aom_rb_read_bit(rb)
? REFRESH_FRAME_CONTEXT_DISABLED
@@ -4086,14 +4862,16 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
}
- get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
- get_frame_new_buffer(cm)->color_primaries = cm->color_primaries;
+ get_frame_new_buffer(cm)->bit_depth = seq_params->bit_depth;
+ get_frame_new_buffer(cm)->color_primaries = seq_params->color_primaries;
get_frame_new_buffer(cm)->transfer_characteristics =
- cm->transfer_characteristics;
- get_frame_new_buffer(cm)->matrix_coefficients = cm->matrix_coefficients;
- get_frame_new_buffer(cm)->monochrome = cm->seq_params.monochrome;
- get_frame_new_buffer(cm)->chroma_sample_position = cm->chroma_sample_position;
- get_frame_new_buffer(cm)->color_range = cm->color_range;
+ seq_params->transfer_characteristics;
+ get_frame_new_buffer(cm)->matrix_coefficients =
+ seq_params->matrix_coefficients;
+ get_frame_new_buffer(cm)->monochrome = seq_params->monochrome;
+ get_frame_new_buffer(cm)->chroma_sample_position =
+ seq_params->chroma_sample_position;
+ get_frame_new_buffer(cm)->color_range = seq_params->color_range;
get_frame_new_buffer(cm)->render_width = cm->render_width;
get_frame_new_buffer(cm)->render_height = cm->render_height;
@@ -4145,7 +4923,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
read_tile_info(pbi, rb);
setup_quantization(cm, rb);
- xd->bd = (int)cm->bit_depth;
+ xd->bd = (int)seq_params->bit_depth;
if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
cm->num_allocated_above_context_mi_col < cm->mi_cols ||
@@ -4196,22 +4974,22 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->lf.filter_level[0] = 0;
cm->lf.filter_level[1] = 0;
}
- if (cm->coded_lossless || !cm->seq_params.enable_cdef) {
+ if (cm->coded_lossless || !seq_params->enable_cdef) {
cm->cdef_bits = 0;
cm->cdef_strengths[0] = 0;
cm->cdef_uv_strengths[0] = 0;
}
- if (cm->all_lossless || !cm->seq_params.enable_restoration) {
+ if (cm->all_lossless || !seq_params->enable_restoration) {
cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
}
setup_loopfilter(cm, rb);
- if (!cm->coded_lossless && cm->seq_params.enable_cdef) {
+ if (!cm->coded_lossless && seq_params->enable_cdef) {
setup_cdef(cm, rb);
}
- if (!cm->all_lossless && cm->seq_params.enable_restoration) {
+ if (!cm->all_lossless && seq_params->enable_restoration) {
decode_restoration_mode(cm, rb);
}
@@ -4236,7 +5014,8 @@ static int read_uncompressed_header(AV1Decoder *pbi,
if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
- cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
read_film_grain(cm, rb);
#if EXT_TILE_DEBUG
@@ -4282,11 +5061,11 @@ void superres_post_decode(AV1Decoder *pbi) {
unlock_buffer_pool(pool);
}
-int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
- struct aom_read_bit_buffer *rb,
- const uint8_t *data,
- const uint8_t **p_data_end,
- int trailing_bits_present) {
+uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present) {
AV1_COMMON *const cm = &pbi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &pbi->mb;
@@ -4316,7 +5095,8 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
pbi->dec_tile_col = -1;
}
- pbi->uncomp_hdr_size = aom_rb_bytes_read(rb);
+ const uint32_t uncomp_hdr_size =
+ (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header
YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
xd->cur_buf = new_fb;
if (av1_allow_intrabc(cm)) {
@@ -4327,7 +5107,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
if (cm->show_existing_frame) {
// showing a frame directly
- *p_data_end = data + aom_rb_bytes_read(rb);
+ *p_data_end = data + uncomp_hdr_size;
if (cm->reset_decoder_state) {
// Use the default frame context values.
*cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
@@ -4335,7 +5115,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Uninitialized entropy context.");
}
- return 0;
+ return uncomp_hdr_size;
}
cm->setup_mi(cm);
@@ -4344,7 +5124,8 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
av1_setup_motion_field(cm);
- av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes);
+ av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, num_planes);
if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
// use the default frame context values
*cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
@@ -4356,7 +5137,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
"Uninitialized entropy context.");
xd->corrupted = 0;
- return 0;
+ return uncomp_hdr_size;
}
// Once-per-frame initialization
@@ -4368,7 +5149,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
av1_alloc_restoration_buffers(cm);
}
- const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
if (pbi->td.mc_buf_size != buf_size) {
av1_free_mc_tmp_buf(&pbi->td, use_highbd);
@@ -4386,14 +5167,21 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
if (initialize_flag) setup_frame_info(pbi);
- if (pbi->max_threads > 1 && tile_count_tg > 1 && !cm->large_scale_tile)
+ if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
+ pbi->row_mt)
+ *p_data_end =
+ decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
+ else if (pbi->max_threads > 1 && tile_count_tg > 1 &&
+ !(cm->large_scale_tile && !pbi->ext_tile_debug))
*p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
else
*p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
const int num_planes = av1_num_planes(cm);
// If the bit stream is monochrome, set the U and V buffers to a constant.
- if (num_planes < 3) set_planes_to_neutral_grey(cm, xd->cur_buf, 1);
+ if (num_planes < 3) {
+ set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
+ }
if (end_tile != cm->tile_rows * cm->tile_cols - 1) {
return;
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
index 330cedcdce..d289b31f20 100644
--- a/third_party/aom/av1/decoder/decodeframe.h
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -18,12 +18,13 @@ extern "C" {
struct AV1Decoder;
struct aom_read_bit_buffer;
+struct ThreadData;
// Reads the middle part of the sequence header OBU (from
-// frame_width_bits_minus_1 to enable_restoration) into cm->seq_params (a
-// SequenceHeader). Reports errors by calling rb->error_handler() or
-// aom_internal_error().
-void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb);
+// frame_width_bits_minus_1 to enable_restoration) into seq_params.
+// Reports errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params);
void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
int num_bits_height, int *width, int *height);
@@ -34,11 +35,14 @@ BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
int av1_check_trailing_bits(struct AV1Decoder *pbi,
struct aom_read_bit_buffer *rb);
-int av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
- struct aom_read_bit_buffer *rb,
- const uint8_t *data,
- const uint8_t **p_data_end,
- int trailing_bits_present);
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+// TODO(wtc): Figure out and document the p_data_end parameter.
+uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present);
void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
const uint8_t *data_end,
@@ -47,8 +51,9 @@ void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
// Implements the color_config() function in the spec. Reports errors by
// calling rb->error_handler() or aom_internal_error().
-void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
- int allow_lowbitdepth);
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info);
// Implements the timing_info() function in the spec. Reports errors by calling
// rb->error_handler().
@@ -69,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer(
struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
const uint8_t *data_end);
-void av1_free_mc_tmp_buf(void *td, int use_highbd);
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd);
void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
index cc8f4d29e0..5e920b18d4 100644
--- a/third_party/aom/av1/decoder/decodemv.c
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -290,7 +290,7 @@ static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1);
if (segment_id < 0 || segment_id > seg->last_active_segid) {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Corrupted segment_ids");
}
return segment_id;
@@ -573,7 +573,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
PALETTE_SIZES, ACCT_STR) +
2;
- read_palette_colors_y(xd, cm->bit_depth, pmi, r);
+ read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r);
}
}
if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
@@ -587,7 +587,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
PALETTE_SIZES, ACCT_STR) +
2;
- read_palette_colors_uv(xd, cm->bit_depth, pmi, r);
+ read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r);
}
}
}
@@ -1299,7 +1299,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
}
if (is_compound != is_inter_compound_mode(mbmi->mode)) {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Prediction mode %d invalid with ref frame %d %d",
mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
}
@@ -1480,8 +1480,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
}
}
- xd->cfl.is_chroma_reference = is_chroma_reference(
- mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
xd->cfl.store_y = store_cfl_required(cm, xd);
#if DEC_MISMATCH_DEBUG
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
index 2e91d27d36..e978fad6ce 100644
--- a/third_party/aom/av1/decoder/decoder.c
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -71,6 +71,7 @@ static void dec_free_mi(AV1_COMMON *cm) {
cm->mip = NULL;
aom_free(cm->mi_grid_base);
cm->mi_grid_base = NULL;
+ cm->mi_alloc_size = 0;
}
AV1Decoder *av1_decoder_create(BufferPool *const pool) {
@@ -81,6 +82,9 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
av1_zero(*pbi);
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
if (setjmp(cm->error.jmp)) {
cm->error.setjmp = 0;
av1_decoder_remove(pbi);
@@ -98,7 +102,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
pbi->need_resync = 1;
- once(initialize_dec);
+ aom_once(initialize_dec);
// Initialize the references to not point to any frame buffers.
memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
@@ -108,7 +112,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
pbi->decoding_first_frame = 1;
pbi->common.buffer_pool = pool;
- cm->bit_depth = AOM_BITS_8;
+ cm->seq_params.bit_depth = AOM_BITS_8;
cm->dequant_bit_depth = AOM_BITS_8;
cm->alloc_mi = av1_dec_alloc_mi;
@@ -146,6 +150,12 @@ void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
}
}
+void av1_dec_free_cb_buf(AV1Decoder *pbi) {  // Frees pbi->cb_buffer_base and resets the fields that track it.
+ aom_free(pbi->cb_buffer_base);
+ pbi->cb_buffer_base = NULL;  // Pointer is cleared after the free so pbi no longer refers to released memory.
+ pbi->cb_buffer_alloc_size = 0;  // Recorded allocation size is reset to zero alongside the pointer.
+}
+
void av1_decoder_remove(AV1Decoder *pbi) {
int i;
@@ -161,7 +171,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
if (pbi->thread_data) {
for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0;
+ const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
av1_free_mc_tmp_buf(thread_data->td, use_highbd);
aom_free(thread_data->td);
}
@@ -172,6 +182,20 @@ void av1_decoder_remove(AV1Decoder *pbi) {
AVxWorker *const worker = &pbi->tile_workers[i];
aom_get_worker_interface()->end(worker);
}
+#if CONFIG_MULTITHREAD
+ if (pbi->row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(pbi->row_mt_mutex_);
+ aom_free(pbi->row_mt_mutex_);
+ }
+ if (pbi->row_mt_cond_ != NULL) {
+ pthread_cond_destroy(pbi->row_mt_cond_);
+ aom_free(pbi->row_mt_cond_);
+ }
+#endif
+ for (i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
aom_free(pbi->tile_data);
aom_free(pbi->tile_workers);
@@ -181,10 +205,11 @@ void av1_decoder_remove(AV1Decoder *pbi) {
av1_dealloc_dec_jobs(&pbi->tile_mt_info);
}
+ av1_dec_free_cb_buf(pbi);
#if CONFIG_ACCOUNTING
aom_accounting_clear(&pbi->accounting);
#endif
- const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0;
+ const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
av1_free_mc_tmp_buf(&pbi->td, use_highbd);
aom_free(pbi);
@@ -279,7 +304,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
ref_buf->y_buffer = sd->y_buffer;
ref_buf->u_buffer = sd->u_buffer;
ref_buf->v_buffer = sd->v_buffer;
- ref_buf->use_external_refernce_buffers = 1;
+ ref_buf->use_external_reference_buffers = 1;
}
}
@@ -414,7 +439,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
// Find a free frame buffer. Return error if can not find any.
cm->new_fb_idx = get_free_fb(cm);
- if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR;
+ if (cm->new_fb_idx == INVALID_IDX) {
+ cm->error.error_code = AOM_CODEC_MEM_ERROR;
+ return 1;
+ }
// Assign a MV array to the frame buffer.
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
@@ -423,6 +451,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
if (setjmp(cm->error.jmp)) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
int i;
@@ -474,7 +505,13 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
int frame_decoded =
aom_decode_frame_from_obus(pbi, source, source + size, psource);
- if (cm->error.error_code != AOM_CODEC_OK) return 1;
+ if (cm->error.error_code != AOM_CODEC_OK) {
+ lock_buffer_pool(pool);
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ cm->error.setjmp = 0;
+ return 1;
+ }
#if TXCOEFF_TIMER
cm->cum_txcoeff_timer += cm->txcoeff_timer;
@@ -493,7 +530,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
pbi->decoding_first_frame = 0;
}
- if (cm->error.error_code != AOM_CODEC_OK) return 1;
+ if (cm->error.error_code != AOM_CODEC_OK) {
+ cm->error.setjmp = 0;
+ return 1;
+ }
aom_clear_system_state();
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
index 42fcc12569..610b98d95e 100644
--- a/third_party/aom/av1/decoder/decoder.h
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -33,6 +33,20 @@
extern "C" {
#endif
+typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,  // Callback type used by four ThreadData members (the *_intra_block_visit and *_inter_block_visit coefficient/transform hooks).
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,  // Argument order is plane, then row, then col.
+ const int row, const int col,
+ const TX_SIZE tx_size);
+
+typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,  // Callback type of ThreadData::predict_inter_block_visit.
+ MACROBLOCKD *const xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,  // Callback type of ThreadData::cfl_store_inter_block_visit.
+ MACROBLOCKD *const xd);
+
typedef struct ThreadData {
aom_reader *bit_reader;
DECLARE_ALIGNED(32, MACROBLOCKD, xd);
@@ -41,12 +55,54 @@ typedef struct ThreadData {
CB_BUFFER cb_buffer_base;
uint8_t *mc_buf[2];
int32_t mc_buf_size;
+
+ decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
+ decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
+ decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit;
+ decode_block_visitor_fn_t inverse_tx_inter_block_visit;
+ predict_inter_block_visitor_fn_t predict_inter_block_visit;
+ cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit;
} ThreadData;
+typedef struct AV1DecRowMTJobInfo {  // NOTE(review): presumably identifies one row-multithreading job by tile and mi row - confirm against its users.
+ int tile_row;
+ int tile_col;
+ int mi_row;
+} AV1DecRowMTJobInfo;
+
+typedef struct AV1DecRowMTSyncData {  // Struct tag is AV1DecRowMTSyncData but the typedef name is AV1DecRowMTSync; TileDataDec embeds one as dec_row_mt_sync.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;  // Present only when CONFIG_MULTITHREAD is enabled.
+ pthread_cond_t *cond_;  // Present only when CONFIG_MULTITHREAD is enabled.
+#endif
+ int allocated_sb_rows;
+ int *cur_sb_col;  // NOTE(review): presumably an array with one entry per allocated SB row - confirm.
+ int sync_range;
+ int mi_rows;
+ int mi_cols;
+ int mi_rows_parse_done;
+ int mi_rows_decode_started;
+ int num_threads_working;
+} AV1DecRowMTSync;
+
+typedef struct AV1DecRowMTInfo {  // AV1Decoder holds one instance of this struct as frame_row_mt_info.
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int start_tile;
+ int end_tile;
+ int mi_rows_parse_done;  // Same field name also appears in AV1DecRowMTSync; NOTE(review): presumably this one is the frame-level count - confirm.
+ int mi_rows_decode_started;  // Same field name also appears in AV1DecRowMTSync.
+ int mi_rows_to_decode;
+ int row_mt_exit;  // NOTE(review): looks like an early-exit flag for row-MT workers - confirm.
+} AV1DecRowMTInfo;
+
typedef struct TileDataDec {
TileInfo tile_info;
aom_reader bit_reader;
DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ AV1DecRowMTSync dec_row_mt_sync;
} TileDataDec;
typedef struct TileBufferDec {
@@ -139,9 +195,8 @@ typedef struct AV1Decoder {
int acct_enabled;
Accounting accounting;
#endif
- size_t uncomp_hdr_size; // Size of the uncompressed header
- int tg_size; // Number of tiles in the current tilegroup
- int tg_start; // First tile in the current tilegroup
+ int tg_size; // Number of tiles in the current tilegroup
+ int tg_start; // First tile in the current tilegroup
int tg_size_bit_offset;
int sequence_header_ready;
#if CONFIG_INSPECTION
@@ -162,12 +217,27 @@ typedef struct AV1Decoder {
int tile_count_minus_1;
uint32_t coded_tile_data_size;
unsigned int ext_tile_debug; // for ext-tile software debug & testing
+ unsigned int row_mt;
EXTERNAL_REFERENCES ext_refs;
size_t tile_list_size;
uint8_t *tile_list_output;
size_t buffer_sz;
+
+ CB_BUFFER *cb_buffer_base;
+ int cb_buffer_alloc_size;
+
+ int allocated_row_mt_sync_rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *row_mt_mutex_;
+ pthread_cond_t *row_mt_cond_;
+#endif
+
+ AV1DecRowMTInfo frame_row_mt_info;
} AV1Decoder;
+// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
+// code and returns a nonzero value on failure.
int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
const uint8_t **dest);
@@ -192,6 +262,10 @@ struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
void av1_decoder_remove(struct AV1Decoder *pbi);
void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_jobs_sync);
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
+
+void av1_dec_free_cb_buf(AV1Decoder *pbi);
+
static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
BufferPool *const pool) {
if (idx >= 0) {
@@ -207,18 +281,6 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
}
}
-static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi,
- RefCntBuffer *frame_buf) {
- AV1_COMMON *const cm = &pbi->common;
- int i;
- for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- RefBuffer *const ref_frame = &cm->frame_refs[i];
- if (ref_frame->idx == INVALID_IDX) continue;
- if (frame_buf == &cm->buffer_pool->frame_bufs[ref_frame->idx]) break;
- }
- return (i < INTER_REFS_PER_FRAME);
-}
-
#define ACCT_STR __func__
static INLINE int av1_read_uniform(aom_reader *r, int n) {
const int l = get_unsigned_bits(n);
@@ -238,6 +300,10 @@ void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
int mi_col, aom_reader *r, BLOCK_SIZE bsize,
palette_visitor_fn_t visit);
+typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,  // Callback signature receiving the decoder, per-thread data, mi_row/mi_col, a reader, the partition type and the block size.
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
index f9a3e85780..f3ef2d55e4 100644
--- a/third_party/aom/av1/decoder/decodetxb.c
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -320,10 +320,14 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
return cul_level;
}
-uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
- MACROBLOCKD *const xd, aom_reader *const r,
- const int row, const int col,
- const int plane, const TX_SIZE tx_size) {
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col,
+ const TX_SIZE tx_size) {
+#if TXCOEFF_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
MB_MODE_INFO *const mbmi = xd->mi[0];
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -337,5 +341,22 @@ uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
const uint8_t cul_level =
av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row);
- return cul_level;
+
+ if (is_inter_block(mbmi)) {
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
+ cm->reduced_tx_set_used);
+
+ if (plane == 0)
+ update_txk_array(mbmi->txk_type, mbmi->sb_type, row, col, tx_size,
+ tx_type);
+ }
+
+#if TXCOEFF_TIMER
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ cm->txcoeff_timer += elapsed_time;
+ ++cm->txb_count;
+#endif
}
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
index d0b3d8c7ae..687bba9581 100644
--- a/third_party/aom/av1/decoder/decodetxb.h
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -25,8 +25,8 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
const TXB_CTX *const txb_ctx,
const TX_SIZE tx_size);
-uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
- MACROBLOCKD *const xd, aom_reader *const r,
- const int row, const int col,
- const int plane, const TX_SIZE tx_size);
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col,
+ const TX_SIZE tx_size);
#endif // DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c
index ff03502e62..3946c787a1 100644
--- a/third_party/aom/av1/decoder/dthread.c
+++ b/third_party/aom/av1/decoder/dthread.c
@@ -157,8 +157,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
av1_frameworker_unlock_stats(src_worker);
- dst_cm->bit_depth = src_cm->bit_depth;
- dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+ dst_cm->seq_params.bit_depth = src_cm->seq_params.bit_depth;
+ dst_cm->seq_params.use_highbitdepth = src_cm->seq_params.use_highbitdepth;
// TODO(zoeliu): To handle parallel decoding
dst_cm->prev_frame =
src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
@@ -166,8 +166,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
!src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
dst_cm->last_height =
!src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
- dst_cm->subsampling_x = src_cm->subsampling_x;
- dst_cm->subsampling_y = src_cm->subsampling_y;
+ dst_cm->seq_params.subsampling_x = src_cm->seq_params.subsampling_x;
+ dst_cm->seq_params.subsampling_y = src_cm->seq_params.subsampling_y;
dst_cm->frame_type = src_cm->frame_type;
dst_cm->last_show_frame = !src_cm->show_existing_frame
? src_cm->show_frame
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
index 33d89006e7..9f854e0158 100644
--- a/third_party/aom/av1/decoder/dthread.h
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -39,7 +39,6 @@ typedef struct FrameWorkerData {
const uint8_t *data_end;
size_t data_size;
void *user_priv;
- int result;
int worker_id;
int received_frame;
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
index 482b6415e3..715bc68377 100644
--- a/third_party/aom/av1/decoder/obu.c
+++ b/third_party/aom/av1/decoder/obu.c
@@ -161,6 +161,17 @@ static int is_obu_in_current_operating_point(AV1Decoder *pbi,
return 0;
}
+// Reads bits from 'rb' for as long as rb->bit_offset is not a multiple of 8,
+// requiring every bit read to be zero.
+// Returns 0 once rb->bit_offset is byte aligned (immediately, if it already
+// is). If any bit read is nonzero, sets cm->error.error_code to
+// AOM_CODEC_CORRUPT_FRAME and returns -1.
+// NOTE(review): relies on aom_rb_read_bit() advancing rb->bit_offset by one
+// bit per call; that helper is not defined here — confirm.
+static int byte_alignment(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *const rb) {
+ while (rb->bit_offset & 7) {
+ if (aom_rb_read_bit(rb)) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ }
+ return 0;
+}
+
static uint32_t read_temporal_delimiter_obu() { return 0; }
// Returns a boolean that indicates success.
@@ -173,6 +184,13 @@ static int read_bitstream_level(BitstreamLevel *bl,
return 1;
}
+// Returns nonzero if the two sequence headers are consistent with each other,
+// and 0 otherwise. "Consistent" currently means byte-for-byte identical: the
+// two structs are compared with memcmp() over sizeof(SequenceHeader).
+// NOTE(review): a whole-struct memcmp also compares padding bytes, if
+// SequenceHeader has any — confirm against the struct definition.
+// TODO(huisu,wtc@google.com): make sure the code matches the spec exactly.
+static int are_seq_headers_consistent(const SequenceHeader *seq_params_old,
+ const SequenceHeader *seq_params_new) {
+ return !memcmp(seq_params_old, seq_params_new, sizeof(SequenceHeader));
+}
+
// On success, sets pbi->sequence_header_ready to 1 and returns the number of
// bytes read from 'rb'.
// On failure, sets pbi->common.error.error_code and returns 0.
@@ -184,14 +202,17 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
// Verify rb has been configured to report errors.
assert(rb->error_handler);
- cm->profile = av1_read_profile(rb);
- if (cm->profile > PROFILE_2) {
+ // Use a local variable to store the information as we decode. At the end,
+ // if no errors have occurred, cm->seq_params is updated.
+ SequenceHeader sh = cm->seq_params;
+ SequenceHeader *const seq_params = &sh;
+
+ seq_params->profile = av1_read_profile(rb);
+ if (seq_params->profile > PROFILE_2) {
cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
return 0;
}
- SequenceHeader *const seq_params = &cm->seq_params;
-
// Still picture or not
seq_params->still_picture = aom_rb_read_bit(rb);
seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
@@ -252,7 +273,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
(cm->timing_info.equal_picture_interval ||
cm->op_params[i].decoder_model_param_present_flag)) {
cm->op_params[i].bitrate = max_level_bitrate(
- cm->profile, major_minor_to_seq_level_idx(seq_params->level[i]),
+ seq_params->profile,
+ major_minor_to_seq_level_idx(seq_params->level[i]),
seq_params->tier[i]);
// Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
// the check
@@ -305,30 +327,49 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
return 0;
}
- read_sequence_header(cm, rb);
+ av1_read_sequence_header(cm, rb, seq_params);
- av1_read_color_config(cm, rb, pbi->allow_lowbitdepth);
+ av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+ if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
+ "%d %d subsampling is not supported.\n",
+ seq_params->subsampling_x, seq_params->subsampling_y);
+ }
- cm->film_grain_params_present = aom_rb_read_bit(rb);
+ seq_params->film_grain_params_present = aom_rb_read_bit(rb);
if (av1_check_trailing_bits(pbi, rb) != 0) {
// cm->error.error_code is already set.
return 0;
}
+ // If a sequence header has been decoded before, we check if the new
+ // one is consistent with the old one.
+ if (pbi->sequence_header_ready) {
+ if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Inconsistent sequence headers received.");
+ }
+ }
+
+ cm->seq_params = *seq_params;
pbi->sequence_header_ready = 1;
return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
}
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
static uint32_t read_frame_header_obu(AV1Decoder *pbi,
struct aom_read_bit_buffer *rb,
const uint8_t *data,
const uint8_t **p_data_end,
int trailing_bits_present) {
- av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
- trailing_bits_present);
- return (uint32_t)(pbi->uncomp_hdr_size);
+ return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
+ trailing_bits_present);
}
static int32_t read_tile_group_header(AV1Decoder *pbi,
@@ -353,7 +394,6 @@ static int32_t read_tile_group_header(AV1Decoder *pbi,
aom_internal_error(
&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
- cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
return -1;
}
*start_tile =
@@ -371,9 +411,12 @@ static uint32_t read_one_tile_group_obu(
int start_tile, end_tile;
int32_t header_size, tg_payload_size;
+ assert((rb->bit_offset & 7) == 0);
+ assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);
+
header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
tile_start_implicit);
- if (header_size == -1) return 0;
+ if (header_size == -1 || byte_alignment(cm, rb)) return 0;
if (start_tile > end_tile) return header_size;
data += header_size;
av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
@@ -386,44 +429,22 @@ static uint32_t read_one_tile_group_obu(
return header_size + tg_payload_size;
}
-// Only called while large_scale_tile = 1.
-static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
- struct aom_read_bit_buffer *rb,
- const uint8_t *data,
- const uint8_t *data_end,
- const uint8_t **p_data_end,
- int *frame_decoding_finished) {
- AV1_COMMON *const cm = &pbi->common;
- uint32_t tile_list_payload_size = 0;
- const int num_tiles = cm->tile_cols * cm->tile_rows;
- const int start_tile = 0;
- const int end_tile = num_tiles - 1;
- int i = 0;
-
- // Process the tile list info.
- pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
- pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
- pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
- if (pbi->tile_count_minus_1 > 511) {
- cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
- return 0;
- }
-
- // Allocate output frame buffer for the tile list.
+static void alloc_tile_list_buffer(AV1Decoder *pbi) {
// TODO(yunqing): for now, copy each tile's decoded YUV data directly to the
// output buffer. This needs to be modified according to the application
// requirement.
+ AV1_COMMON *const cm = &pbi->common;
const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
- const int ssy = cm->subsampling_y;
- const int ssx = cm->subsampling_x;
+ const int ssy = cm->seq_params.subsampling_y;
+ const int ssx = cm->seq_params.subsampling_x;
const int num_planes = av1_num_planes(cm);
const size_t yplane_tile_size = tile_height_in_pixels * tile_width_in_pixels;
const size_t uvplane_tile_size =
(num_planes > 1)
? (tile_height_in_pixels >> ssy) * (tile_width_in_pixels >> ssx)
: 0;
- const size_t tile_size = (cm->use_highbitdepth ? 2 : 1) *
+ const size_t tile_size = (cm->seq_params.use_highbitdepth ? 2 : 1) *
(yplane_tile_size + 2 * uvplane_tile_size);
pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1);
@@ -437,6 +458,83 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
"Failed to allocate the tile list output buffer");
pbi->buffer_sz = pbi->tile_list_size;
}
+}
+
+// Copies one tile out of the frame returned by get_frame_new_buffer(cm) into
+// the buffer that '*output' points to. The tile is selected by
+// pbi->dec_tile_row / pbi->dec_tile_col. Planes are written one after another
+// (plane 0 first), each as 'h' consecutive rows of 'w' bytes with no padding
+// between rows. '*output' is advanced past every byte written, so on return
+// it points just after the copied tile.
+// No bounds check is performed on '*output' here; the destination must
+// already be large enough for the whole tile.
+static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
+ uint8_t **output) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+ const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+ const int ssy = cm->seq_params.subsampling_y;
+ const int ssx = cm->seq_params.subsampling_x;
+ const int num_planes = av1_num_planes(cm);
+
+ // Copy decoded tile to the tile list output buffer.
+ YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+ // Top-left corner of the tile, in the same units as cm->tile_height and
+ // cm->tile_width (scaled by MI_SIZE below to get a sample offset).
+ const int mi_row = pbi->dec_tile_row * cm->tile_height;
+ const int mi_col = pbi->dec_tile_col * cm->tile_width;
+ // 1 when the frame buffer is flagged high bit depth, else 0.
+ const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
+ int strides[MAX_MB_PLANE] = { 0, 0, 0 };
+ int plane;
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ // Subsampling shifts apply only to planes after the first.
+ int shift_x = plane > 0 ? ssx : 0;
+ int shift_y = plane > 0 ? ssy : 0;
+
+ bufs[plane] = cur_frame->buffers[plane];
+ // Every plane after the first uses strides[1]; plane 0 uses strides[0].
+ strides[plane] =
+ (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
+
+ // Move to the tile's first sample. This offset is applied before the
+ // high-bit-depth pointer conversion and stride doubling below.
+ bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
+ mi_col * (MI_SIZE >> shift_x);
+
+ if (is_hbd) {
+ // NOTE(review): presumably CONVERT_TO_SHORTPTR yields the address of the
+ // underlying 16-bit samples, and doubling the stride turns it into a
+ // byte count for the uint8_t pointer used below — confirm against the
+ // macro definition.
+ bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(bufs[plane]);
+ strides[plane] *= 2;
+ }
+
+ // w: bytes copied per row; h: number of rows. For subsampled planes the
+ // tile dimension is rounded up before shifting.
+ int w, h;
+ w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
+ : tile_width_in_pixels;
+ // Row length in bytes is doubled when is_hbd is set.
+ w *= (1 + is_hbd);
+ h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
+ : tile_height_in_pixels;
+ int j;
+
+ // Copy row by row: the source advances by its stride, the destination by
+ // exactly the number of bytes copied.
+ for (j = 0; j < h; ++j) {
+ memcpy(*output, bufs[plane], w);
+ bufs[plane] += strides[plane];
+ *output += w;
+ }
+ }
+}
+
+// Only called while large_scale_tile = 1.
+static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end,
+ int *frame_decoding_finished) {
+ AV1_COMMON *const cm = &pbi->common;
+ uint32_t tile_list_payload_size = 0;
+ const int num_tiles = cm->tile_cols * cm->tile_rows;
+ const int start_tile = 0;
+ const int end_tile = num_tiles - 1;
+ int i = 0;
+
+ // Process the tile list info.
+ pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
+ if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ // Allocate output frame buffer for the tile list.
+ alloc_tile_list_buffer(pbi);
uint32_t tile_list_info_bytes = 4;
tile_list_payload_size += tile_list_info_bytes;
@@ -485,45 +583,8 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
data = *p_data_end;
assert(data <= data_end);
- // Copy decoded tile to the tile list output buffer.
- YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
- const int mi_row = pbi->dec_tile_row * cm->tile_height;
- const int mi_col = pbi->dec_tile_col * cm->tile_width;
- const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
- uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
- int strides[MAX_MB_PLANE] = { 0, 0, 0 };
- int plane;
-
- for (plane = 0; plane < num_planes; ++plane) {
- int shift_x = plane > 0 ? ssx : 0;
- int shift_y = plane > 0 ? ssy : 0;
-
- bufs[plane] = cur_frame->buffers[plane];
- strides[plane] =
- (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
- if (is_hbd) {
- bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(cur_frame->buffers[plane]);
- strides[plane] =
- (plane > 0) ? 2 * cur_frame->strides[1] : 2 * cur_frame->strides[0];
- }
-
- bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
- mi_col * (MI_SIZE >> shift_x);
-
- int w, h;
- w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
- : tile_width_in_pixels;
- w *= (1 + is_hbd);
- h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
- : tile_height_in_pixels;
- int j;
-
- for (j = 0; j < h; ++j) {
- memcpy(output, bufs[plane], w);
- bufs[plane] += strides[plane];
- output += w;
- }
- }
+ // Copy the decoded tile to the tile list output buffer.
+ copy_decoded_tile_to_tile_list_buffer(pbi, &output);
}
*frame_decoding_finished = 1;
@@ -710,7 +771,6 @@ aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
return AOM_CODEC_OK;
}
-#define EXT_TILE_DEBUG 0
// On success, returns a boolean that indicates whether the decoding of the
// current frame is finished. On failure, sets cm->error.error_code and
// returns -1.
@@ -720,7 +780,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
AV1_COMMON *const cm = &pbi->common;
int frame_decoding_finished = 0;
int is_first_tg_obu_received = 1;
- int frame_header_size = 0;
+ uint32_t frame_header_size = 0;
int seq_header_received = 0;
size_t seq_header_size = 0;
ObuHeader obu_header;
@@ -785,7 +845,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
}
}
- av1_init_read_bit_buffer(pbi, &rb, data, data_end);
+ av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
switch (obu_header.type) {
case OBU_TEMPORAL_DELIMITER:
@@ -813,21 +873,35 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
// Only decode first frame header received
if (!pbi->seen_frame_header ||
(cm->large_scale_tile && !pbi->camera_frame_header_ready)) {
- pbi->seen_frame_header = 1;
frame_header_size = read_frame_header_obu(
pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
- if (cm->large_scale_tile) pbi->camera_frame_header_ready = 1;
+ pbi->seen_frame_header = 1;
+ if (!pbi->ext_tile_debug && cm->large_scale_tile)
+ pbi->camera_frame_header_ready = 1;
+ } else {
+ // TODO(wtc): Verify that the frame_header_obu is identical to the
+ // original frame_header_obu. For now just skip frame_header_size
+ // bytes in the bit buffer.
+ if (frame_header_size > payload_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ assert(rb.bit_offset == 0);
+ rb.bit_offset = 8 * frame_header_size;
}
decoded_payload_size = frame_header_size;
- pbi->frame_header_size = (size_t)frame_header_size;
+ pbi->frame_header_size = frame_header_size;
if (cm->show_existing_frame) {
+ if (obu_header.type == OBU_FRAME) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
frame_decoding_finished = 1;
pbi->seen_frame_header = 0;
break;
}
-#if !EXT_TILE_DEBUG
// In large scale tile coding, decode the common camera frame header
// before any tile list OBU.
if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
@@ -838,17 +912,18 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
*p_data_end = data_end;
break;
}
-#endif // EXT_TILE_DEBUG
if (obu_header.type != OBU_FRAME) break;
obu_payload_offset = frame_header_size;
+ // Byte align the reader before reading the tile group.
+ if (byte_alignment(cm, &rb)) return -1;
AOM_FALLTHROUGH_INTENDED; // fall through to read tile group.
case OBU_TILE_GROUP:
if (!pbi->seen_frame_header) {
cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
return -1;
}
- if ((size_t)(data_end - data) < obu_payload_offset) {
+ if (obu_payload_offset > payload_size) {
cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
return -1;
}
@@ -904,4 +979,3 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
return frame_decoding_finished;
}
-#undef EXT_TILE_DEBUG
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
index c5a6bc8319..b721b6d2b7 100644
--- a/third_party/aom/av1/encoder/aq_complexity.c
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -66,7 +66,8 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
int segment;
- const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+ const int aq_strength =
+ get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
// Clear down the segment map.
memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
@@ -93,7 +94,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
qindex_delta = av1_compute_qdelta_by_rate(
&cpi->rc, cm->frame_type, cm->base_qindex,
- aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
+ aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
// For AQ complexity mode, we dont allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -138,7 +139,8 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
const int target_rate = (int)(num / denom);
double logvar;
double low_var_thresh;
- const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+ const int aq_strength =
+ get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
aom_clear_system_state();
low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index a1fe37d4ac..dec2c730d5 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -137,8 +137,9 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
const RATE_CONTROL *const rc = &cpi->rc;
- int deltaq = av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q,
- rate_factor, cpi->common.bit_depth);
+ int deltaq =
+ av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, rate_factor,
+ cpi->common.seq_params.bit_depth);
if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
deltaq = -cr->max_qdelta_perc * q / 100;
}
@@ -164,15 +165,16 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
estimated_bits =
(int)((1.0 - weight_segment1 - weight_segment2) *
av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
- correction_factor, cm->bit_depth) +
- weight_segment1 *
- av1_estimate_bits_at_q(cm->frame_type,
- cm->base_qindex + cr->qindex_delta[1],
- mbs, correction_factor, cm->bit_depth) +
- weight_segment2 *
- av1_estimate_bits_at_q(cm->frame_type,
- cm->base_qindex + cr->qindex_delta[2],
- mbs, correction_factor, cm->bit_depth));
+ correction_factor,
+ cm->seq_params.bit_depth) +
+ weight_segment1 * av1_estimate_bits_at_q(
+ cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[1], mbs,
+ correction_factor, cm->seq_params.bit_depth) +
+ weight_segment2 * av1_estimate_bits_at_q(
+ cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[2], mbs,
+ correction_factor, cm->seq_params.bit_depth));
return estimated_bits;
}
@@ -197,12 +199,13 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
// Compute delta-q corresponding to qindex i.
int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
// Take segment weighted average for bits per mb.
- bits_per_mb = (int)((1.0 - weight_segment) *
- av1_rc_bits_per_mb(cm->frame_type, i,
- correction_factor, cm->bit_depth) +
- weight_segment *
- av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
- correction_factor, cm->bit_depth));
+ bits_per_mb =
+ (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cm->frame_type, i, correction_factor,
+ cm->seq_params.bit_depth) +
+ weight_segment * av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
+ correction_factor,
+ cm->seq_params.bit_depth));
return bits_per_mb;
}
@@ -507,7 +510,8 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
} else {
int qindex_delta = 0;
int qindex2;
- const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+ const double q =
+ av1_convert_qindex_to_q(cm->base_qindex, cm->seq_params.bit_depth);
aom_clear_system_state();
// Set rate threshold to some multiple (set to 2 for now) of the target
// rate (target is given by sb64_target_rate and scaled by 256).
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index 29a3114472..6cb6adc42d 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -71,7 +71,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
for (i = 0; i < MAX_SEGMENTS; ++i) {
int qindex_delta =
av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- rate_ratio[i], cm->bit_depth);
+ rate_ratio[i], cm->seq_params.bit_depth);
// We don't allow qindex 0 in a segment if the base value is not 0.
// Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -235,9 +235,9 @@ int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
const int rate_level = SEGMENT_ID(block_var_level);
const AV1_COMMON *const cm = &cpi->common;
- int qindex_delta =
- av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- rate_ratio[rate_level], cm->bit_depth);
+ int qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level],
+ cm->seq_params.bit_depth);
if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
qindex_delta = -cm->base_qindex + 1;
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index 1c5bdeb253..d0477b35be 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -613,9 +613,9 @@ void av1_init_quantizer(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
QUANTS *const quants = &cpi->quants;
Dequants *const dequants = &cpi->dequants;
- av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->u_dc_delta_q,
- cm->u_ac_delta_q, cm->v_dc_delta_q, cm->v_ac_delta_q,
- quants, dequants);
+ av1_build_quantizer(cm->seq_params.bit_depth, cm->y_dc_delta_q,
+ cm->u_dc_delta_q, cm->u_ac_delta_q, cm->v_dc_delta_q,
+ cm->v_ac_delta_q, quants, dequants);
}
void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -713,7 +713,7 @@ void av1_set_quantizer(AV1_COMMON *cm, int q) {
cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q,
cm->min_qmlevel, cm->max_qmlevel);
- if (!cm->separate_uv_delta_q)
+ if (!cm->seq_params.separate_uv_delta_q)
cm->qm_v = cm->qm_u;
else
cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q,
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index cdd7c24929..2070755cda 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -769,7 +769,7 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
aom_write_symbol(w, n - PALETTE_MIN_SIZE,
xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
PALETTE_SIZES);
- write_palette_colors_y(xd, pmi, cm->bit_depth, w);
+ write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w);
}
}
@@ -786,7 +786,7 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
aom_write_symbol(w, n - PALETTE_MIN_SIZE,
xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
PALETTE_SIZES);
- write_palette_colors_uv(xd, pmi, cm->bit_depth, w);
+ write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w);
}
}
}
@@ -1421,8 +1421,8 @@ static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x,
for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
blk_col += bkw) {
pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
- cm->bit_depth, *block, blk_row, blk_col, max_tx_size,
- token_stats);
+ cm->seq_params.bit_depth, *block, blk_row, blk_col,
+ max_tx_size, token_stats);
*block += step;
}
}
@@ -1612,14 +1612,13 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
const int num_planes = av1_num_planes(cm);
for (int plane = 0; plane < num_planes; ++plane) {
- int rcol0, rcol1, rrow0, rrow1, tile_tl_idx;
+ int rcol0, rcol1, rrow0, rrow1;
if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
- &rcol0, &rcol1, &rrow0, &rrow1,
- &tile_tl_idx)) {
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
const int rstride = cm->rst_info[plane].horz_units_per_tile;
for (int rrow = rrow0; rrow < rrow1; ++rrow) {
for (int rcol = rcol0; rcol < rcol1; ++rcol) {
- const int runit_idx = tile_tl_idx + rcol + rrow * rstride;
+ const int runit_idx = rcol + rrow * rstride;
const RestorationUnitInfo *rui =
&cm->rst_info[plane].unit_info[runit_idx];
loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
@@ -1705,7 +1704,7 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
const int mi_col_end = tile->mi_col_end;
int mi_row, mi_col;
- av1_zero_above_context(cm, mi_col_start, mi_col_end, tile->tile_row);
+ av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
av1_init_above_context(cm, xd, tile->tile_row);
if (cpi->common.delta_q_present_flag) {
@@ -1779,7 +1778,7 @@ static void encode_restoration_mode(AV1_COMMON *cm,
}
if (num_planes > 1) {
- int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
+ int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
if (s && !chroma_none) {
aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
cm->rst_info[0].restoration_unit_size);
@@ -2020,7 +2019,7 @@ static void encode_quantization(const AV1_COMMON *const cm,
if (num_planes > 1) {
int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) ||
(cm->u_ac_delta_q != cm->v_ac_delta_q);
- if (cm->separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+ if (cm->seq_params.separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
write_delta_q(wb, cm->u_dc_delta_q);
write_delta_q(wb, cm->u_ac_delta_q);
if (diff_uv_delta) {
@@ -2032,7 +2031,7 @@ static void encode_quantization(const AV1_COMMON *const cm,
if (cm->using_qmatrix) {
aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS);
aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS);
- if (!cm->separate_uv_delta_q)
+ if (!cm->seq_params.separate_uv_delta_q)
assert(cm->qm_u == cm->qm_v);
else
aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS);
@@ -2240,7 +2239,8 @@ static int get_refresh_mask_gf16(AV1_COMP *cpi) {
#endif // USE_GF16_MULTI_LAYER
static int get_refresh_mask(AV1_COMP *cpi) {
- if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
+ if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) ||
+ frame_is_sframe(&cpi->common))
return 0xFF;
int refresh_mask = 0;
@@ -2258,9 +2258,15 @@ static int get_refresh_mask(AV1_COMP *cpi) {
// LAST3_FRAME.
refresh_mask |=
(cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]);
-
+#if USE_SYMM_MULTI_LAYER
+ refresh_mask |=
+ (cpi->new_bwdref_update_rule == 1)
+ ? (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[EXTREF_FRAME - 1])
+ : (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+#else
refresh_mask |=
(cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+#endif
refresh_mask |=
(cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]);
@@ -2419,80 +2425,82 @@ static void write_profile(BITSTREAM_PROFILE profile,
aom_wb_write_literal(wb, profile, PROFILE_BITS);
}
-static void write_bitdepth(AV1_COMMON *const cm,
+static void write_bitdepth(const SequenceHeader *const seq_params,
struct aom_write_bit_buffer *wb) {
// Profile 0/1: [0] for 8 bit, [1] 10-bit
// Profile 2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit
- aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_8 ? 0 : 1);
- if (cm->profile == PROFILE_2 && cm->bit_depth != AOM_BITS_8) {
- aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1);
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
+ if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) {
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1);
}
}
-static void write_color_config(AV1_COMMON *const cm,
+static void write_color_config(const SequenceHeader *const seq_params,
struct aom_write_bit_buffer *wb) {
- write_bitdepth(cm, wb);
- const int is_monochrome = cm->seq_params.monochrome;
+ write_bitdepth(seq_params, wb);
+ const int is_monochrome = seq_params->monochrome;
// monochrome bit
- if (cm->profile != PROFILE_1)
+ if (seq_params->profile != PROFILE_1)
aom_wb_write_bit(wb, is_monochrome);
else
assert(!is_monochrome);
- if (cm->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
- cm->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
- cm->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+ if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
aom_wb_write_bit(wb, 0); // No color description present
} else {
aom_wb_write_bit(wb, 1); // Color description present
- aom_wb_write_literal(wb, cm->color_primaries, 8);
- aom_wb_write_literal(wb, cm->transfer_characteristics, 8);
- aom_wb_write_literal(wb, cm->matrix_coefficients, 8);
+ aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+ aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+ aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
}
if (is_monochrome) {
// 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
- aom_wb_write_bit(wb, cm->color_range);
+ aom_wb_write_bit(wb, seq_params->color_range);
return;
}
- if (cm->color_primaries == AOM_CICP_CP_BT_709 &&
- cm->transfer_characteristics == AOM_CICP_TC_SRGB &&
- cm->matrix_coefficients ==
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients ==
AOM_CICP_MC_IDENTITY) { // it would be better to remove this
// dependency too
- assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
- assert(cm->profile == PROFILE_1 ||
- (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12));
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ assert(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12));
} else {
// 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
- aom_wb_write_bit(wb, cm->color_range);
- if (cm->profile == PROFILE_0) {
+ aom_wb_write_bit(wb, seq_params->color_range);
+ if (seq_params->profile == PROFILE_0) {
// 420 only
- assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
- } else if (cm->profile == PROFILE_1) {
+ assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1);
+ } else if (seq_params->profile == PROFILE_1) {
// 444 only
- assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
- } else if (cm->profile == PROFILE_2) {
- if (cm->bit_depth == AOM_BITS_12) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ } else if (seq_params->profile == PROFILE_2) {
+ if (seq_params->bit_depth == AOM_BITS_12) {
// 420, 444 or 422
- aom_wb_write_bit(wb, cm->subsampling_x);
- if (cm->subsampling_x == 0) {
- assert(cm->subsampling_y == 0 &&
+ aom_wb_write_bit(wb, seq_params->subsampling_x);
+ if (seq_params->subsampling_x == 0) {
+ assert(seq_params->subsampling_y == 0 &&
"4:4:0 subsampling not allowed in AV1");
} else {
- aom_wb_write_bit(wb, cm->subsampling_y);
+ aom_wb_write_bit(wb, seq_params->subsampling_y);
}
} else {
// 422 only
- assert(cm->subsampling_x == 1 && cm->subsampling_y == 0);
+ assert(seq_params->subsampling_x == 1 &&
+ seq_params->subsampling_y == 0);
}
}
- if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
- assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
}
- if (cm->subsampling_x == 1 && cm->subsampling_y == 1) {
- aom_wb_write_literal(wb, cm->chroma_sample_position, 2);
+ if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+ aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
}
}
- aom_wb_write_bit(wb, cm->separate_uv_delta_q);
+ aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
}
static void write_timing_info_header(AV1_COMMON *const cm,
@@ -2517,8 +2525,8 @@ static void write_decoder_model_info(AV1_COMMON *const cm,
wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5);
aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick,
32); // Number of units in decoding tick
- aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_delay_length - 1, 5);
- aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_delay_length - 1,
+ aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_time_length - 1, 5);
+ aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_time_length - 1,
5);
}
@@ -2533,23 +2541,25 @@ static void write_dec_model_op_parameters(AV1_COMMON *const cm,
// aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters);
// if (!cm->op_params[op_num].has_parameters) return;
- aom_wb_write_literal(wb, cm->op_params[op_num].decoder_buffer_delay,
- cm->buffer_model.encoder_decoder_buffer_delay_length);
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_params[op_num].decoder_buffer_delay,
+ cm->buffer_model.encoder_decoder_buffer_delay_length);
- aom_wb_write_literal(wb, cm->op_params[op_num].encoder_buffer_delay,
- cm->buffer_model.encoder_decoder_buffer_delay_length);
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_params[op_num].encoder_buffer_delay,
+ cm->buffer_model.encoder_decoder_buffer_delay_length);
aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag);
- cm->op_frame_timing[op_num].buffer_removal_delay =
+ cm->op_frame_timing[op_num].buffer_removal_time =
0; // reset the decoded frame counter
}
static void write_tu_pts_info(AV1_COMMON *const cm,
struct aom_write_bit_buffer *wb) {
aom_wb_write_unsigned_literal(
- wb, (uint32_t)cm->tu_presentation_delay,
- cm->buffer_model.frame_presentation_delay_length);
+ wb, cm->frame_presentation_time,
+ cm->buffer_model.frame_presentation_time_length);
}
static void write_film_grain_params(AV1_COMP *cpi,
@@ -2601,8 +2611,8 @@ static void write_film_grain_params(AV1_COMP *cpi,
pars->chroma_scaling_from_luma = 0; // for monochrome override to 0
if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
- ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
- (pars->num_y_points == 0))) {
+ ((cm->seq_params.subsampling_x == 1) &&
+ (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
pars->num_cb_points = 0;
pars->num_cr_points = 0;
} else {
@@ -2931,18 +2941,19 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
struct aom_write_bit_buffer *saved_wb,
struct aom_write_bit_buffer *wb) {
AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
// NOTE: By default all coded frames to be used as a reference
cm->is_reference_frame = 1;
cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
- if (cm->seq_params.still_picture) {
+ if (seq_params->still_picture) {
assert(cm->show_existing_frame == 0);
assert(cm->show_frame == 1);
assert(cm->frame_type == KEY_FRAME);
}
- if (!cm->seq_params.reduced_still_picture_hdr) {
+ if (!seq_params->reduced_still_picture_hdr) {
if (cm->show_existing_frame) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
@@ -2957,12 +2968,12 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
aom_wb_write_bit(wb, 1); // show_existing_frame
aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
- if (cm->seq_params.decoder_model_info_present_flag &&
+ if (seq_params->decoder_model_info_present_flag &&
cm->timing_info.equal_picture_interval == 0) {
write_tu_pts_info(cm, wb);
}
- if (cm->seq_params.frame_id_numbers_present_flag) {
- int frame_id_len = cm->seq_params.frame_id_length;
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
aom_wb_write_literal(wb, display_frame_id, frame_id_len);
}
@@ -2983,7 +2994,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
aom_wb_write_bit(wb, cm->show_frame);
if (cm->show_frame) {
- if (cm->seq_params.decoder_model_info_present_flag &&
+ if (seq_params->decoder_model_info_present_flag &&
cm->timing_info.equal_picture_interval == 0)
write_tu_pts_info(cm, wb);
} else {
@@ -2997,18 +3008,18 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
}
aom_wb_write_bit(wb, cm->disable_cdf_update);
- if (cm->seq_params.force_screen_content_tools == 2) {
+ if (seq_params->force_screen_content_tools == 2) {
aom_wb_write_bit(wb, cm->allow_screen_content_tools);
} else {
assert(cm->allow_screen_content_tools ==
- cm->seq_params.force_screen_content_tools);
+ seq_params->force_screen_content_tools);
}
if (cm->allow_screen_content_tools) {
- if (cm->seq_params.force_integer_mv == 2) {
+ if (seq_params->force_integer_mv == 2) {
aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv);
} else {
- assert(cm->cur_frame_force_integer_mv == cm->seq_params.force_integer_mv);
+ assert(cm->cur_frame_force_integer_mv == seq_params->force_integer_mv);
}
} else {
assert(cm->cur_frame_force_integer_mv == 0);
@@ -3018,53 +3029,57 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
int frame_size_override_flag = 0;
cm->frame_refs_short_signaling = 0;
- if (cm->seq_params.reduced_still_picture_hdr) {
- assert(cm->width == cm->seq_params.max_frame_width &&
- cm->height == cm->seq_params.max_frame_height);
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(cm->width == seq_params->max_frame_width &&
+ cm->height == seq_params->max_frame_height);
} else {
- if (cm->seq_params.frame_id_numbers_present_flag) {
- int frame_id_len = cm->seq_params.frame_id_length;
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
}
- if (cm->width > cm->seq_params.max_frame_width ||
- cm->height > cm->seq_params.max_frame_height) {
+ if (cm->width > seq_params->max_frame_width ||
+ cm->height > seq_params->max_frame_height) {
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"Frame dimensions are larger than the maximum values");
}
frame_size_override_flag =
frame_is_sframe(cm) ? 1
- : (cm->width != cm->seq_params.max_frame_width ||
- cm->height != cm->seq_params.max_frame_height);
+ : (cm->width != seq_params->max_frame_width ||
+ cm->height != seq_params->max_frame_height);
if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
- if (cm->seq_params.enable_order_hint)
+ if (seq_params->enable_order_hint)
aom_wb_write_literal(wb, cm->frame_offset,
- cm->seq_params.order_hint_bits_minus_1 + 1);
+ seq_params->order_hint_bits_minus_1 + 1);
if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS);
}
}
- if (cm->seq_params.decoder_model_info_present_flag) {
- aom_wb_write_bit(wb, cm->buffer_removal_delay_present);
- if (cm->buffer_removal_delay_present) {
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(wb, cm->buffer_removal_time_present);
+ if (cm->buffer_removal_time_present) {
for (int op_num = 0;
- op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) {
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
if (cm->op_params[op_num].decoder_model_param_present_flag) {
- if (((cm->seq_params.operating_point_idc[op_num] >>
+ if (((seq_params->operating_point_idc[op_num] >>
cm->temporal_layer_id) &
0x1 &&
- (cm->seq_params.operating_point_idc[op_num] >>
+ (seq_params->operating_point_idc[op_num] >>
(cm->spatial_layer_id + 8)) &
0x1) ||
- cm->seq_params.operating_point_idc[op_num] == 0) {
- aom_wb_write_literal(
- wb, (uint32_t)cm->op_frame_timing[op_num].buffer_removal_delay,
- cm->buffer_model.buffer_removal_delay_length);
- cm->op_frame_timing[op_num].buffer_removal_delay++;
+ seq_params->operating_point_idc[op_num] == 0) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_frame_timing[op_num].buffer_removal_time,
+ cm->buffer_model.buffer_removal_time_length);
+ cm->op_frame_timing[op_num].buffer_removal_time++;
+ if (cm->op_frame_timing[op_num].buffer_removal_time == 0) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "buffer_removal_time overflowed");
+ }
}
}
}
@@ -3122,7 +3137,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
// Write all ref frame order hints if error_resilient_mode == 1
- if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) {
+ if (cm->error_resilient_mode && seq_params->enable_order_hint) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
// Get buffer index
@@ -3131,7 +3146,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
// Write order hint to bit stream
aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset,
- cm->seq_params.order_hint_bits_minus_1 + 1);
+ seq_params->order_hint_bits_minus_1 + 1);
}
}
}
@@ -3156,7 +3171,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
// automatically.
#define FRAME_REFS_SHORT_SIGNALING 0
#if FRAME_REFS_SHORT_SIGNALING
- cm->frame_refs_short_signaling = cm->seq_params.enable_order_hint;
+ cm->frame_refs_short_signaling = seq_params->enable_order_hint;
#endif // FRAME_REFS_SHORT_SIGNALING
if (cm->frame_refs_short_signaling) {
@@ -3167,7 +3182,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
check_frame_refs_short_signaling(cpi);
}
- if (cm->seq_params.enable_order_hint)
+ if (seq_params->enable_order_hint)
aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
if (cm->frame_refs_short_signaling) {
@@ -3183,10 +3198,10 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
if (!cm->frame_refs_short_signaling)
aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
REF_FRAMES_LOG2);
- if (cm->seq_params.frame_id_numbers_present_flag) {
+ if (seq_params->frame_id_numbers_present_flag) {
int i = get_ref_frame_map_idx(cpi, ref_frame);
- int frame_id_len = cm->seq_params.frame_id_length;
- int diff_len = cm->seq_params.delta_frame_id_length;
+ int frame_id_len = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
int delta_frame_id_minus_1 =
((cm->current_frame_id - cm->ref_frame_id[i] +
(1 << frame_id_len)) %
@@ -3222,7 +3237,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
}
const int might_bwd_adapt =
- !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+ !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
if (cm->large_scale_tile)
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
@@ -3282,7 +3297,8 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
- if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) {
+ if (seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
int flip_back_update_parameters_flag = 0;
if (cm->frame_type != INTER_FRAME &&
cm->film_grain_params.update_parameters == 0) {
@@ -3497,7 +3513,7 @@ static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
struct aom_write_bit_buffer wb = { dst, 0 };
uint32_t size = 0;
- write_profile(cm->profile, &wb);
+ write_profile(cm->seq_params.profile, &wb);
// Still picture or not
aom_wb_write_bit(&wb, cm->seq_params.still_picture);
@@ -3551,9 +3567,9 @@ static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
}
write_sequence_header(cpi, &wb);
- write_color_config(cm, &wb);
+ write_color_config(&cm->seq_params, &wb);
- aom_wb_write_bit(&wb, cm->film_grain_params_present);
+ aom_wb_write_bit(&wb, cm->seq_params.film_grain_params_present);
add_trailing_bits(&wb);
@@ -3960,7 +3976,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
// The TD is now written outside the frame encode loop
// write sequence header obu if KEY_FRAME, preceded by 4-byte size
- if (cm->frame_type == KEY_FRAME) {
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 13fc11c315..003e59e395 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -224,6 +224,7 @@ struct macroblock {
int sadperbit4;
int rdmult;
int mb_energy;
+ int sb_energy_level;
int *m_search_count_ptr;
int *ex_search_count_ptr;
@@ -258,7 +259,6 @@ struct macroblock {
MvLimits mv_limits;
uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
- uint8_t blk_skip_drl[MAX_MIB_SIZE * MAX_MIB_SIZE];
int skip;
int skip_chroma_rd;
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
index 0a57ebcfba..04088b25f9 100644
--- a/third_party/aom/av1/encoder/dwt.c
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include <assert.h>
#include <stdlib.h>
#include <math.h>
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
index 9a86db2f14..03318e5b70 100644
--- a/third_party/aom/av1/encoder/dwt.h
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include "av1/common/common.h"
#include "av1/common/enums.h"
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index 027b80a161..27ca537619 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -41,7 +41,6 @@
#include "av1/common/seg_common.h"
#include "av1/common/tile_common.h"
-#include "av1/encoder/ab_partition_model_weights.h"
#include "av1/encoder/aq_complexity.h"
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/aq_variance.h"
@@ -54,6 +53,7 @@
#include "av1/encoder/ethread.h"
#include "av1/encoder/extend.h"
#include "av1/encoder/ml.h"
+#include "av1/encoder/partition_model_weights.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
#include "av1/encoder/segmentation.h"
@@ -2099,7 +2099,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
// When use_square_partition_only is true, make sure at least one square
// partition is allowed by selecting the next smaller square size as
// *min_block_size.
- if (cpi->sf.use_square_partition_only) {
+ if (min_size >= cpi->sf.use_square_partition_only_threshold) {
min_size = AOMMIN(min_size, next_square_size[max_size]);
}
@@ -2363,6 +2363,7 @@ static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
pc_tree->partitioning = PARTITION_NONE;
pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+ pc_tree->none.skip = 0;
if (bsize >= BLOCK_8X8) {
BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
@@ -2876,6 +2877,168 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
}
}
+#define FEATURES 18
+#define LABELS 4
+// Use an ML model to set *partition_horz4_allowed / *partition_vert4_allowed.
+static void ml_prune_4_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int part_ctx, int64_t best_rd,
+ int64_t horz_rd[2], int64_t vert_rd[2],
+ int64_t split_rd[4],
+ int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed) {
+ if (best_rd >= 1000000000) return;  // At/above the 1e9 cutoff: flags unchanged.
+ const NN_CONFIG *nn_config = NULL;
+ switch (bsize) {
+ case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;  // No model for this bsize: flags unchanged.
+
+ aom_clear_system_state();
+
+ // Generate features: 2 context values + 8 RD ratios + 8 variance ratios.
+ float features[FEATURES];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)get_unsigned_bits(x->source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };  // [0..1] horz, [2..3] vert, [4..7] split.
+ int rd_index = 0;
+ for (int i = 0; i < 2; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 4; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD (1.0 if unusable).
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[4] = { 0 };
+ unsigned int vert_4_source_var[4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ for (int i = 0; i < 4; ++i) {
+ const uint8_t *horz_src =
+ src + i * block_size_high[horz_4_bs] * src_stride;
+ const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
+ unsigned int horz_var, vert_var, sse;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (xd->bd) {
+ case 10:
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(
+ horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(
+ vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
+ 0, &sse);
+ break;
+ case 12:
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(
+ horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(
+ vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
+ 0, &sse);
+ break;
+ case 8:
+ default:
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(
+ horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(
+ vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
+ 0, &sse);
+ break;
+ }
+ horz_4_source_var[i] =
+ ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+ vert_4_source_var[i] =
+ ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+ } else {
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(horz_src, src_stride, AV1_VAR_OFFS,
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(vert_src, src_stride, AV1_VAR_OFFS,
+ 0, &sse);
+ horz_4_source_var[i] =
+ ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+ vert_4_source_var[i] =
+ ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+ }
+ }
+ }
+
+ const float denom = (float)(x->source_variance + 1);
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < 4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < 4; ++i) {
+ // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ assert(feature_index == FEATURES);
+
+ // Calculate scores using the NN model.
+ float score[LABELS] = { 0.0f };
+ av1_nn_predict(features, nn_config, score);
+ int int_score[LABELS];
+ int max_score = -1000;
+ for (int i = 0; i < LABELS; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions: any label scoring within a margin of the best one counts.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 400; break;
+ case BLOCK_32X32: thresh -= 400; break;
+ case BLOCK_64X64: thresh -= 100; break;
+ default: break;
+ }
+ *partition_horz4_allowed = 0;
+ *partition_vert4_allowed = 0;
+ for (int i = 0; i < LABELS; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) *partition_horz4_allowed = 1;  // Label bit 0: HORZ_4.
+ if ((i >> 1) & 1) *partition_vert4_allowed = 1;  // Label bit 1: VERT_4.
+ }
+ }
+}
+#undef FEATURES
+#undef LABELS
+
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
@@ -3003,7 +3166,8 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
partition_vert_allowed &= partition_allowed || !has_cols;
do_square_split &= bsize > min_size;
}
- if (cpi->sf.use_square_partition_only) {
+
+ if (bsize > cpi->sf.use_square_partition_only_threshold) {
partition_horz_allowed &= !has_rows;
partition_vert_allowed &= !has_cols;
}
@@ -3480,13 +3644,6 @@ BEGIN_PARTITION_SEARCH:
const int ext_partition_allowed =
do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
- // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
- // PARTITION_VERT_4 for this block. This is almost the same as
- // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
- // so we require that bsize is not BLOCK_128X128.
- const int partition4_allowed =
- ext_partition_allowed && bsize != BLOCK_128X128;
-
// The standard AB partitions are allowed whenever ext-partition-types are
// allowed
int horzab_partition_allowed = ext_partition_allowed;
@@ -3642,15 +3799,34 @@ BEGIN_PARTITION_SEARCH:
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
- // PARTITION_HORZ_4
+ // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+ // PARTITION_VERT_4 for this block. This is almost the same as
+ // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
+ // so we require that bsize is not BLOCK_128X128.
+ const int partition4_allowed =
+ ext_partition_allowed && bsize != BLOCK_128X128;
int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
+ int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
if (cpi->sf.prune_ext_partition_types_search_level == 2) {
partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
pc_tree->partitioning == PARTITION_HORZ_A ||
pc_tree->partitioning == PARTITION_HORZ_B ||
pc_tree->partitioning == PARTITION_SPLIT ||
pc_tree->partitioning == PARTITION_NONE);
+ partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
}
+ if (cpi->sf.ml_prune_4_partition && partition4_allowed &&
+ partition_horz_allowed && partition_vert_allowed) {
+ ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
+ horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
+ &partition_vert4_allowed);
+ }
+
+ // PARTITION_HORZ_4
if (partition_horz4_allowed && has_rows &&
(do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
av1_init_rd_stats(&sum_rdc);
@@ -3687,14 +3863,6 @@ BEGIN_PARTITION_SEARCH:
}
// PARTITION_VERT_4
- int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
- if (cpi->sf.prune_ext_partition_types_search_level == 2) {
- partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
- pc_tree->partitioning == PARTITION_VERT_A ||
- pc_tree->partitioning == PARTITION_VERT_B ||
- pc_tree->partitioning == PARTITION_SPLIT ||
- pc_tree->partitioning == PARTITION_NONE);
- }
if (partition_vert4_allowed && has_cols &&
(do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
av1_init_rd_stats(&sum_rdc);
@@ -3857,6 +4025,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+ x->sb_energy_level = 0;
if (cm->delta_q_present_flag) {
// Delta-q modulation based on variance
av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
@@ -3865,11 +4034,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
if (DELTAQ_MODULATION == 1) {
const int block_wavelet_energy_level =
av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size);
+ x->sb_energy_level = block_wavelet_energy_level;
offset_qindex = av1_compute_deltaq_from_energy_level(
cpi, block_wavelet_energy_level);
} else {
const int block_var_level =
av1_block_energy(cpi, x, cm->seq_params.sb_size);
+ x->sb_energy_level = block_var_level;
offset_qindex =
av1_compute_deltaq_from_energy_level(cpi, block_var_level);
}
@@ -3943,6 +4114,8 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->use_cb_search_range = 0;
init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
if (cpi->sf.two_pass_partition_search &&
+ cpi->sf.use_square_partition_only_threshold <
+ cm->seq_params.sb_size &&
mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
cm->frame_type != KEY_FRAME) {
@@ -4030,7 +4203,8 @@ static void init_encode_frame_mb_context(AV1_COMP *cpi) {
// Copy data over into macro block data structures.
av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
- av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes);
+ av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, num_planes);
}
static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
@@ -4116,8 +4290,8 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
int mi_row;
- av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end,
- tile_row);
+ av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
+ tile_info->mi_col_end, tile_row);
av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
// Set up pointers to per thread motion search counters.
@@ -4128,7 +4302,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
this_tile->tctx = *cm->fc;
td->mb.e_mbd.tile_ctx = &this_tile->tctx;
- cfl_init(&td->mb.e_mbd.cfl, cm);
+ cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
@@ -4263,25 +4437,24 @@ static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
return counts * blk_h * blk_w * 10 > width * height;
}
+static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0,  // index 0: no flag
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };  // looked up by ref frame, LAST_FRAME..ALTREF_FRAME
+
// Enforce the number of references for each arbitrary frame limited to
// (INTER_REFS_PER_FRAME - 1)
static void enforce_max_ref_frames(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
- static const int flag_list[REF_FRAMES] = { 0,
- AOM_LAST_FLAG,
- AOM_LAST2_FLAG,
- AOM_LAST3_FLAG,
- AOM_GOLD_FLAG,
- AOM_BWD_FLAG,
- AOM_ALT2_FLAG,
- AOM_ALT_FLAG };
MV_REFERENCE_FRAME ref_frame;
int total_valid_refs = 0;
-
- (void)flag_list;
-
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- if (cpi->ref_frame_flags & flag_list[ref_frame]) total_valid_refs++;
+ if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])
+ total_valid_refs++;
}
// NOTE(zoeliu): When all the possible reference frames are availble, we
@@ -4617,7 +4790,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
x->txb_split_count = 0;
- av1_zero(x->blk_skip_drl);
av1_zero(rdc->global_motion_used);
av1_zero(cpi->gmparams_cost);
@@ -4672,8 +4844,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
}
compute_global_motion_feature_based(
- model, cpi->source, ref_buf[frame], cpi->common.bit_depth,
- inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS);
+ model, cpi->source, ref_buf[frame],
+ cpi->common.seq_params.bit_depth, inliers_by_motion,
+ params_by_motion, RANSAC_NUM_MOTIONS);
for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
if (inliers_by_motion[i] == 0) continue;
@@ -4734,6 +4907,15 @@ static void encode_frame_internal(AV1_COMP *cpi) {
cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
cpi->gmtype_cost[IDENTITY];
}
+ // clear disabled ref_frames
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const int ref_disabled =
+ !(cpi->ref_frame_flags & ref_frame_flag_list[frame]);
+ if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
+ cpi->gmparams_cost[frame] = 0;
+ cm->global_motion[frame] = default_warp_params;
+ }
+ }
cpi->global_motion_search_done = 1;
}
memcpy(cm->cur_frame->global_motion, cm->global_motion,
@@ -5082,8 +5264,9 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
}
if (!is_inter) {
- xd->cfl.is_chroma_reference = is_chroma_reference(
- mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
xd->cfl.store_y = store_cfl_required(cm, xd);
mbmi->skip = 1;
for (int plane = 0; plane < num_planes; ++plane) {
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 196e18d8a4..13ea32e389 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -56,6 +56,11 @@
#include "av1/encoder/grain_test_vectors.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
#include "aom_ports/aom_timer.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
@@ -290,7 +295,8 @@ static void setup_frame(AV1_COMP *cpi) {
cm->fb_of_context_type[i] = -1;
}
cm->fb_of_context_type[REGULAR_FRAME] =
- get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+ cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME)
+ : get_ref_frame_map_idx(cpi, ALTREF_FRAME);
cm->frame_context_idx = REGULAR_FRAME;
} else {
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -315,7 +321,7 @@ static void setup_frame(AV1_COMP *cpi) {
}
}
- if (cm->frame_type == KEY_FRAME) {
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
av1_zero(cpi->interp_filter_selected);
@@ -344,19 +350,20 @@ static void setup_frame(AV1_COMP *cpi) {
static void enc_setup_mi(AV1_COMMON *cm) {
int i;
+ int mi_rows_sb_aligned = calc_mi_size(cm->mi_rows);
cm->mi = cm->mip;
- memset(cm->mip, 0, cm->mi_stride * cm->mi_rows * sizeof(*cm->mip));
+ memset(cm->mip, 0, cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mip));
cm->prev_mi = cm->prev_mip;
// Clear top border row
memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
// Clear left border column
- for (i = 0; i < cm->mi_rows; ++i)
+ for (i = 0; i < mi_rows_sb_aligned; ++i)
memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
cm->mi_grid_visible = cm->mi_grid_base;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
memset(cm->mi_grid_base, 0,
- cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base));
+ cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mi_grid_base));
}
static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
@@ -441,32 +448,32 @@ static void update_film_grain_parameters(struct AV1_COMP *cpi,
AV1_COMMON *const cm = &cpi->common;
cpi->oxcf = *oxcf;
- if (cm->film_grain_table) {
- aom_film_grain_table_free(cm->film_grain_table);
- aom_free(cm->film_grain_table);
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
}
- cm->film_grain_table = 0;
if (oxcf->film_grain_test_vector) {
- cm->film_grain_params_present = 1;
+ cm->seq_params.film_grain_params_present = 1;
if (cm->frame_type == KEY_FRAME) {
memcpy(&cm->film_grain_params,
film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
sizeof(cm->film_grain_params));
- cm->film_grain_params.bit_depth = cm->bit_depth;
- if (cm->color_range == AOM_CR_FULL_RANGE) {
+ cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+ if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
cm->film_grain_params.clip_to_restricted_range = 0;
}
}
} else if (oxcf->film_grain_table_filename) {
- cm->film_grain_table = aom_malloc(sizeof(*cm->film_grain_table));
- memset(cm->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+ cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+ memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
- aom_film_grain_table_read(cm->film_grain_table,
+ aom_film_grain_table_read(cpi->film_grain_table,
oxcf->film_grain_table_filename, &cm->error);
} else {
- cm->film_grain_params_present = 0;
+ cm->seq_params.film_grain_params_present = 0;
memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
}
}
@@ -523,6 +530,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
av1_free_pc_tree(&cpi->td, num_planes);
aom_free(cpi->td.mb.palette_buffer);
+
+#if CONFIG_DENOISE
+ if (cpi->denoise_and_model) {
+ aom_denoise_and_model_free(cpi->denoise_and_model);
+ cpi->denoise_and_model = NULL;
+ }
+#endif
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
}
static void save_coding_context(AV1_COMP *cpi) {
@@ -596,8 +614,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
seg->update_map = 1;
seg->update_data = 1;
- qi_delta =
- av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth);
+ qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+ cm->seq_params.bit_depth);
av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
@@ -621,8 +639,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
seg->update_map = 0;
seg->update_data = 1;
- qi_delta =
- av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth);
+ qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+ cm->seq_params.bit_depth);
av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
@@ -705,53 +723,58 @@ static void update_reference_segmentation_map(AV1_COMP *cpi) {
static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
const AV1EncoderConfig *oxcf = &cpi->oxcf;
if (!cpi->lookahead)
- cpi->lookahead = av1_lookahead_init(
- oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, oxcf->lag_in_frames);
+ cpi->lookahead =
+ av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x,
+ seq_params->subsampling_y,
+ seq_params->use_highbitdepth, oxcf->lag_in_frames);
if (!cpi->lookahead)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate lag buffers");
// TODO(agrange) Check if ARF is enabled and skip allocation if not.
- if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL))
+ if (aom_realloc_frame_buffer(
+ &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
}
static void alloc_util_frame_buffers(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
- if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL))
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ if (aom_realloc_frame_buffer(
+ &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
if (aom_realloc_frame_buffer(
&cpi->trial_frame_rst, cm->superres_upscaled_width,
- cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
- NULL, NULL))
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate trial restored frame buffer");
- if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL))
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
- if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL))
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_last_source, cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
}
@@ -846,8 +869,6 @@ static void init_buffer_indices(AV1_COMP *cpi) {
int fb_idx;
for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
cpi->ref_fb_idx[fb_idx] = fb_idx;
- for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx)
- cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx;
cpi->rate_index = 0;
cpi->rate_size = 0;
cpi->cur_poc = -1;
@@ -941,7 +962,8 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
// Set the maximum parameters for bitrate and buffer size for this profile,
// level, and tier
cm->op_params[i].bitrate = max_level_bitrate(
- cm->profile, major_minor_to_seq_level_idx(seq->level[i]), seq->tier[i]);
+ cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]),
+ seq->tier[i]);
// Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
// check
if (cm->op_params[i].bitrate == 0)
@@ -1006,15 +1028,15 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
cpi->oxcf = *oxcf;
cpi->framerate = oxcf->init_framerate;
- cm->profile = oxcf->profile;
- cm->bit_depth = oxcf->bit_depth;
- cm->use_highbitdepth = oxcf->use_highbitdepth;
- cm->color_primaries = oxcf->color_primaries;
- cm->transfer_characteristics = oxcf->transfer_characteristics;
- cm->matrix_coefficients = oxcf->matrix_coefficients;
+ cm->seq_params.profile = oxcf->profile;
+ cm->seq_params.bit_depth = oxcf->bit_depth;
+ cm->seq_params.use_highbitdepth = oxcf->use_highbitdepth;
+ cm->seq_params.color_primaries = oxcf->color_primaries;
+ cm->seq_params.transfer_characteristics = oxcf->transfer_characteristics;
+ cm->seq_params.matrix_coefficients = oxcf->matrix_coefficients;
cm->seq_params.monochrome = oxcf->monochrome;
- cm->chroma_sample_position = oxcf->chroma_sample_position;
- cm->color_range = oxcf->color_range;
+ cm->seq_params.chroma_sample_position = oxcf->chroma_sample_position;
+ cm->seq_params.color_range = oxcf->color_range;
cm->timing_info_present = oxcf->timing_info_present;
cm->timing_info.num_units_in_display_tick =
oxcf->timing_info.num_units_in_display_tick;
@@ -1032,7 +1054,7 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
// set the decoder model parameters in schedule mode
cm->buffer_model.num_units_in_decoding_tick =
oxcf->buffer_model.num_units_in_decoding_tick;
- cm->buffer_removal_delay_present = 1;
+ cm->buffer_removal_time_present = 1;
set_aom_dec_model_info(&cm->buffer_model);
set_dec_model_op_parameters(&cm->op_params[0]);
} else if (cm->timing_info_present &&
@@ -1365,8 +1387,8 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
static void highbd_set_var_fns(AV1_COMP *const cpi) {
AV1_COMMON *const cm = &cpi->common;
- if (cm->use_highbitdepth) {
- switch (cm->bit_depth) {
+ if (cm->seq_params.use_highbitdepth) {
+ switch (cm->seq_params.bit_depth) {
case AOM_BITS_8:
HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
@@ -2226,7 +2248,7 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
default:
assert(0 &&
- "cm->bit_depth should be AOM_BITS_8, "
+ "cm->seq_params.bit_depth should be AOM_BITS_8, "
"AOM_BITS_10 or AOM_BITS_12");
}
}
@@ -2253,20 +2275,22 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) {
void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
RATE_CONTROL *const rc = &cpi->rc;
MACROBLOCK *const x = &cpi->td.mb;
- if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
- cm->bit_depth = oxcf->bit_depth;
- cm->color_primaries = oxcf->color_primaries;
- cm->transfer_characteristics = oxcf->transfer_characteristics;
- cm->matrix_coefficients = oxcf->matrix_coefficients;
- cm->seq_params.monochrome = oxcf->monochrome;
- cm->chroma_sample_position = oxcf->chroma_sample_position;
- cm->color_range = oxcf->color_range;
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->bit_depth;
+ seq_params->color_primaries = oxcf->color_primaries;
+ seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+ seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+ seq_params->monochrome = oxcf->monochrome;
+ seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+ seq_params->color_range = oxcf->color_range;
- assert(IMPLIES(cm->profile <= PROFILE_1, cm->bit_depth <= AOM_BITS_10));
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
cm->timing_info_present = oxcf->timing_info_present;
cm->timing_info.num_units_in_display_tick =
@@ -2277,20 +2301,20 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cm->timing_info.num_ticks_per_picture =
oxcf->timing_info.num_ticks_per_picture;
- cm->seq_params.display_model_info_present_flag =
+ seq_params->display_model_info_present_flag =
oxcf->display_model_info_present_flag;
- cm->seq_params.decoder_model_info_present_flag =
+ seq_params->decoder_model_info_present_flag =
oxcf->decoder_model_info_present_flag;
if (oxcf->decoder_model_info_present_flag) {
// set the decoder model parameters in schedule mode
cm->buffer_model.num_units_in_decoding_tick =
oxcf->buffer_model.num_units_in_decoding_tick;
- cm->buffer_removal_delay_present = 1;
+ cm->buffer_removal_time_present = 1;
set_aom_dec_model_info(&cm->buffer_model);
set_dec_model_op_parameters(&cm->op_params[0]);
} else if (cm->timing_info_present &&
cm->timing_info.equal_picture_interval &&
- !cm->seq_params.decoder_model_info_present_flag) {
+ !seq_params->decoder_model_info_present_flag) {
// set the decoder model parameters in resource availability mode
set_resource_availability_parameters(&cm->op_params[0]);
} else {
@@ -2302,7 +2326,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cpi->oxcf = *oxcf;
cpi->common.options = oxcf->cfg;
- x->e_mbd.bd = (int)cm->bit_depth;
+ x->e_mbd.bd = (int)seq_params->bit_depth;
x->e_mbd.global_motion = cm->global_motion;
if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
@@ -2360,15 +2384,15 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cm->width = cpi->oxcf.width;
cm->height = cpi->oxcf.height;
- int sb_size = cm->seq_params.sb_size;
+ int sb_size = seq_params->sb_size;
// Superblock size should not be updated after the first key frame.
if (!cpi->seq_params_locked) {
set_sb_size(&cm->seq_params, select_sb_size(cpi));
}
- if (cpi->initial_width || sb_size != cm->seq_params.sb_size) {
+ if (cpi->initial_width || sb_size != seq_params->sb_size) {
if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
- cm->seq_params.sb_size != sb_size) {
+ seq_params->sb_size != sb_size) {
av1_free_context_buffers(cm);
av1_free_pc_tree(&cpi->td, num_planes);
alloc_compressor_data(cpi);
@@ -2395,7 +2419,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
// Init sequence level coding tools
// This should not be called after the first key frame.
if (!cpi->seq_params_locked) {
- cm->seq_params.operating_points_cnt_minus_1 =
+ seq_params->operating_points_cnt_minus_1 =
cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0;
init_seq_coding_tools(&cm->seq_params, cm, oxcf);
}
@@ -2411,6 +2435,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
av1_zero(*cpi);
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
if (setjmp(cm->error.jmp)) {
cm->error.setjmp = 0;
av1_remove_compressor(cpi);
@@ -3082,28 +3109,52 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
const FRAME_UPDATE_TYPE next_frame_update_type =
gf_group->update_type[gf_group->index];
+#if USE_SYMM_MULTI_LAYER
+ const int which_arf = (cpi->new_bwdref_update_rule == 1)
+ ? gf_group->arf_update_idx[gf_group->index] > 0
+ : gf_group->arf_update_idx[gf_group->index];
+#else
const int which_arf = gf_group->arf_update_idx[gf_group->index];
+#endif
if (cm->show_existing_frame == 1) {
cm->show_existing_frame = 0;
} else if (cpi->rc.is_last_bipred_frame) {
- // NOTE: If the current frame is a last bi-predictive frame, it is
- // needed next to show the BWDREF_FRAME, which is pointed by
- // the last_fb_idxes[0] after reference frame buffer update
- cpi->rc.is_last_bipred_frame = 0;
- cm->show_existing_frame = 1;
- cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
+#if USE_SYMM_MULTI_LAYER
+ // NOTE: When new structure is used, every bwdref will have one overlay
+ // frame. Therefore, there is no need to find out which frame to
+ // show in advance.
+ if (cpi->new_bwdref_update_rule == 0) {
+#endif
+ // NOTE: If the current frame is a last bi-predictive frame, it is
+ // needed next to show the BWDREF_FRAME, which is pointed by
+ // the last_fb_idxes[0] after reference frame buffer update
+ cpi->rc.is_last_bipred_frame = 0;
+ cm->show_existing_frame = 1;
+ cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
} else if (cpi->is_arf_filter_off[which_arf] &&
(next_frame_update_type == OVERLAY_UPDATE ||
next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+#if USE_SYMM_MULTI_LAYER
+ const int bwdref_to_show =
+ (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+ const int bwdref_to_show = ALTREF2_FRAME;
+#endif
// Other parameters related to OVERLAY_UPDATE will be taken care of
// in av1_rc_get_second_pass_params(cpi)
cm->show_existing_frame = 1;
cpi->rc.is_src_frame_alt_ref = 1;
cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
? cpi->ref_fb_idx[ALTREF_FRAME - 1]
- : cpi->ref_fb_idx[ALTREF2_FRAME - 1];
- cpi->is_arf_filter_off[which_arf] = 0;
+ : cpi->ref_fb_idx[bwdref_to_show - 1];
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0)
+#endif
+ cpi->is_arf_filter_off[which_arf] = 0;
}
cpi->rc.is_src_frame_ext_arf = 0;
}
@@ -3288,6 +3339,48 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
}
}
+#if USE_SYMM_MULTI_LAYER
+// Shifts the virtual indices (cpi->ref_fb_idx) and the matching
+// interp_filter_selected rows of the bwd reference frames as follows:
+// BWD_REF -> ALT2_REF -> EXT_REF
+// to clear a space to store the closest bwdref. The old EXT_REF index is overwritten; the BWD_REF slot itself is not modified here.
+static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
+ // TODO(isbs): shift the scaled indices as well
+ static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+ EXTREF_FRAME - 1 };
+
+ for (int i = 2; i > 0; --i) {
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
+
+ // Row [0] is allocated to the current coded frame, i.e. bwdref, hence the "+ LAST_FRAME" offset on the rows copied here
+ memcpy(
+ cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
+ cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME]));
+ }
+}
+
+// Shifts the virtual indices (cpi->ref_fb_idx) and the matching
+// interp_filter_selected rows of the bwd reference frames as follows:
+// BWD_REF <- ALT2_REF <- EXT_REF
+// to update the bwd reference frame for coding the next frame. The old BWD_REF index is overwritten; the EXT_REF slot itself is not modified here.
+static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
+ // TODO(isbs): shift the scaled indices as well
+ static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+ EXTREF_FRAME - 1 };
+
+ for (int i = 0; i < 2; ++i) {
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
+
+ // Row [0] is allocated to the current coded frame, i.e. bwdref, hence the "+ LAST_FRAME" offset on the rows copied here
+ memcpy(
+ cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
+ cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME]));
+ }
+}
+#endif // USE_SYMM_MULTI_LAYER
+
#if USE_GF16_MULTI_LAYER
static void update_reference_frames_gf16(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
@@ -3343,7 +3436,9 @@ static void update_reference_frames(AV1_COMP *cpi) {
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
- if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) {
+ // Only update all of the reference buffers if a KEY_FRAME is also a
+ // show_frame. This ensures a fwd keyframe does not update all of the buffers
+ if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
@@ -3370,37 +3465,49 @@ static void update_reference_frames(AV1_COMP *cpi) {
cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
- // We need to modify the mapping accordingly
- cpi->arf_map[0] = cpi->ref_fb_idx[ALTREF_FRAME - 1];
// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
// cpi->interp_filter_selected[GOLDEN_FRAME]?
} else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+#if CONFIG_DEBUG
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
+#endif
+#if USE_SYMM_MULTI_LAYER
+ const int bwdref_to_show =
+ (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+ const int bwdref_to_show = ALTREF2_FRAME;
+#endif
// Deal with the special case for showing existing internal ALTREF_FRAME
// Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
// by updating the virtual indices.
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- const int which_arf = gf_group->arf_ref_idx[gf_group->index];
- assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
-
const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
shift_last_ref_frames(cpi);
- cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
- cpi->ref_fb_idx[ALTREF2_FRAME - 1] = tmp;
- // We need to modify the mapping accordingly
- cpi->arf_map[which_arf] = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
+ cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[bwdref_to_show - 1];
memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[ALTREF2_FRAME],
- sizeof(cpi->interp_filter_selected[ALTREF2_FRAME]));
+ cpi->interp_filter_selected[bwdref_to_show],
+ sizeof(cpi->interp_filter_selected[bwdref_to_show]));
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 1) {
+ lshift_bwd_ref_frames(cpi);
+ // pass outdated forward reference frame (previous LAST3) to the
+ // spared space
+ cpi->ref_fb_idx[EXTREF_FRAME - 1] = tmp;
+ } else {
+#endif
+ cpi->ref_fb_idx[bwdref_to_show - 1] = tmp;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
} else { /* For non key/golden frames */
// === ALTREF_FRAME ===
if (cpi->refresh_alt_ref_frame) {
int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
- int which_arf = 0;
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
- memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
}
@@ -3418,10 +3525,25 @@ static void update_reference_frames(AV1_COMP *cpi) {
// === BWDREF_FRAME ===
if (cpi->refresh_bwd_ref_frame) {
- ref_cnt_fb(pool->frame_bufs,
- &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
- cm->new_fb_idx);
-
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule) {
+ // We shift the backward reference frame as follows:
+ // BWDREF -> ALTREF2 -> EXTREF
+ // and assign the newly coded frame to BWDREF so that it always
+ // keeps the nearest future frame
+ int tmp = cpi->ref_fb_idx[EXTREF_FRAME - 1];
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[tmp], cm->new_fb_idx);
+
+ rshift_bwd_ref_frames(cpi);
+ cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
+ } else {
+#endif // USE_SYMM_MULTI_LAYER
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
+ cm->new_fb_idx);
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
@@ -3486,7 +3608,14 @@ static void update_reference_frames(AV1_COMP *cpi) {
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
+ // If the new structure is used, we will always have overlay frames coupled
+ // with bwdref frames. Therefore, we won't have to perform this update
+ // in advance (we do this update when the overlay frame shows up).
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) {
+#else
if (cpi->rc.is_last_bipred_frame) {
+#endif
// Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
// LAST3_FRAME by updating the virtual indices.
//
@@ -3555,13 +3684,14 @@ static void scale_references(AV1_COMP *cpi) {
if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
new_fb_ptr->buf.y_crop_height != cm->height) {
if (aom_realloc_frame_buffer(
- &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x,
- cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ &new_fb_ptr->buf, cm->width, cm->height,
+ cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
- av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
- num_planes);
+ av1_resize_and_extend_frame(
+ ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
@@ -3706,13 +3836,14 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) {
static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
int subsampling_x, int subsampling_y) {
AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
- if (!cpi->initial_width || cm->use_highbitdepth != use_highbitdepth ||
- cm->subsampling_x != subsampling_x ||
- cm->subsampling_y != subsampling_y) {
- cm->subsampling_x = subsampling_x;
- cm->subsampling_y = subsampling_y;
- cm->use_highbitdepth = use_highbitdepth;
+ if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth ||
+ seq_params->subsampling_x != subsampling_x ||
+ seq_params->subsampling_y != subsampling_y) {
+ seq_params->subsampling_x = subsampling_x;
+ seq_params->subsampling_y = subsampling_y;
+ seq_params->use_highbitdepth = use_highbitdepth;
alloc_raw_frame_buffers(cpi);
init_ref_frame_bufs(cm);
@@ -3730,8 +3861,9 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
static int set_size_literal(AV1_COMP *cpi, int width, int height) {
AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
- check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x,
- cm->subsampling_y);
+ check_initial_width(cpi, cm->seq_params.use_highbitdepth,
+ cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
if (width <= 0 || height <= 0) return 1;
@@ -3753,6 +3885,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
static void set_frame_size(AV1_COMP *cpi, int width, int height) {
AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int ref_frame;
@@ -3782,17 +3915,19 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
}
// Reset the frame pointers to the current frame size.
- if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL))
+ if (aom_realloc_frame_buffer(
+ get_frame_new_buffer(cm), cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
const int frame_width = cm->superres_upscaled_width;
const int frame_height = cm->superres_upscaled_height;
- set_restoration_unit_size(frame_width, frame_height, cm->subsampling_x,
- cm->subsampling_y, cm->rst_info);
+ set_restoration_unit_size(frame_width, frame_height,
+ seq_params->subsampling_x,
+ seq_params->subsampling_y, cm->rst_info);
for (int i = 0; i < num_planes; ++i)
cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
@@ -4038,16 +4173,16 @@ static void superres_post_encode(AV1_COMP *cpi) {
// av1_superres_upscale
if (aom_realloc_frame_buffer(
&cpi->scaled_source, cm->superres_upscaled_width,
- cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
- NULL, NULL, NULL))
+ cm->superres_upscaled_height, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to reallocate scaled source buffer for superres");
assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
- (int)cm->bit_depth, num_planes);
+ (int)cm->seq_params.bit_depth, num_planes);
cpi->source = &cpi->scaled_source;
}
}
@@ -4331,7 +4466,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
int64_t high_err_target = cpi->ambient_err;
int64_t low_err_target = cpi->ambient_err >> 1;
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
} else {
kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
@@ -4574,7 +4709,11 @@ static void set_ext_overrides(AV1_COMP *cpi) {
cpi->ext_refresh_frame_flags_pending = 0;
}
cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
- cpi->common.error_resilient_mode = cpi->ext_use_error_resilient;
+ // A keyframe is already error resilient and keyframes with
+ // error_resilient_mode interfere with the use of show_existing_frame
+ // when forward reference keyframes are enabled.
+ cpi->common.error_resilient_mode =
+ cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME;
}
static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
@@ -4725,10 +4864,17 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
}
#endif // DUMP_RECON_FRAMES
+static INLINE int is_frame_droppable(AV1_COMP *cpi) {  // returns 1 iff none of the refresh flags tested below is set, 0 otherwise
+ return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+ cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
+ cpi->refresh_last_frame);  // only these five cpi refresh flags are consulted
+}
+
static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
int skip_adapt,
unsigned int *frame_flags) {
AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
struct segmentation *const seg = &cm->seg;
@@ -4744,7 +4890,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->large_scale_tile = cpi->oxcf.large_scale_tile;
cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
- if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0;
+ if (cm->large_scale_tile) seq_params->frame_id_numbers_present_flag = 0;
cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
// cm->allow_ref_frame_mvs needs to be written into the frame header while
@@ -4756,7 +4902,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
// Reset the frame packet stamp index.
- if (cm->frame_type == KEY_FRAME) cm->current_video_frame = 0;
+ if (cm->frame_type == KEY_FRAME && cm->show_frame)
+ cm->current_video_frame = 0;
// NOTE:
// (1) Move the setup of the ref_frame_flags upfront as it would be
@@ -4770,7 +4917,11 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (cm->show_existing_frame) {
// NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
// BWDREF_FRAME in the reference frame buffer.
- cm->frame_type = INTER_FRAME;
+ if (cm->frame_type == KEY_FRAME) {
+ cm->reset_decoder_state = 1;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
cm->show_frame = 1;
cpi->frame_flags = *frame_flags;
@@ -4839,6 +4990,10 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
av1_rc_postencode_update(cpi, *size);
}
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+
++cm->current_video_frame;
return AOM_CODEC_OK;
@@ -4889,7 +5044,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
MAX_MODES * sizeof(*cpi->mode_chosen_counts));
#endif
- if (cm->seq_params.frame_id_numbers_present_flag) {
+ if (seq_params->frame_id_numbers_present_flag) {
/* Non-normative definition of current_frame_id ("frame counter" with
* wraparound) */
const int frame_id_length = FRAME_ID_LENGTH;
@@ -4935,7 +5090,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
(frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
break;
}
- cm->timing_info_present &= !cm->seq_params.reduced_still_picture_hdr;
+ cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
@@ -4957,7 +5112,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
// fixed interval. Note the reconstruction error if it is the frame before
// the force key frame
if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
- if (cm->use_highbitdepth) {
+ if (seq_params->use_highbitdepth) {
cpi->ambient_err =
aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
} else {
@@ -4966,17 +5121,19 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
}
// If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
- if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) {
+ if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
cpi->refresh_last_frame = 1;
}
cm->frame_to_show = get_frame_new_buffer(cm);
- cm->frame_to_show->color_primaries = cm->color_primaries;
- cm->frame_to_show->transfer_characteristics = cm->transfer_characteristics;
- cm->frame_to_show->matrix_coefficients = cm->matrix_coefficients;
- cm->frame_to_show->monochrome = cm->seq_params.monochrome;
- cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position;
- cm->frame_to_show->color_range = cm->color_range;
+ cm->frame_to_show->color_primaries = seq_params->color_primaries;
+ cm->frame_to_show->transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients;
+ cm->frame_to_show->monochrome = seq_params->monochrome;
+ cm->frame_to_show->chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->frame_to_show->color_range = seq_params->color_range;
cm->frame_to_show->render_width = cm->render_width;
cm->frame_to_show->render_height = cm->render_height;
@@ -5014,7 +5171,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (skip_adapt) return AOM_CODEC_OK;
- if (cm->seq_params.frame_id_numbers_present_flag) {
+ if (seq_params->frame_id_numbers_present_flag) {
int i;
// Update reference frame id values based on the value of refresh_frame_mask
for (i = 0; i < REF_FRAMES; i++) {
@@ -5085,6 +5242,19 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->seg.update_data = 0;
cm->lf.mode_ref_delta_update = 0;
+ // A droppable frame might not be shown but it always
+ // takes a space in the gf group. Therefore, even when
+ // it is not shown, we still need to update the countdown.
+
+ // TODO(weitinglin): This is a work-around to handle the condition
+ // when a frame is dropped. We should fix the cm->show_frame flag
+ // instead of checking the other condition to update the counter properly.
+ if (cm->show_frame || is_frame_droppable(cpi)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+
if (cm->show_frame) {
// TODO(zoeliu): We may only swamp mi and prev_mi for those frames that
// are
@@ -5092,6 +5262,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
swap_mi_and_prev_mi(cm);
// Don't increment frame counters if this was an altref buffer
// update not a real frame
+
++cm->current_video_frame;
}
@@ -5160,10 +5331,45 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
return AOM_CODEC_OK;
}
+#if CONFIG_DENOISE
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+ int block_size, float noise_level,
+ int64_t time_stamp, int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cpi->denoise_and_model) {
+ cpi->denoise_and_model = aom_denoise_and_model_alloc(
+ cm->seq_params.bit_depth, block_size, noise_level);
+ if (!cpi->denoise_and_model) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating denoise and model");
+ return -1;
+ }
+ }
+ if (!cpi->film_grain_table) {
+ cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+ if (!cpi->film_grain_table) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating grain table");
+ return -1;
+ }
+ memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+ }
+ if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+ &cm->film_grain_params)) {
+ if (cm->film_grain_params.apply_grain) {
+ aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+ &cm->film_grain_params);
+ }
+ }
+ return 0;
+}
+#endif
+
int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
int64_t end_time) {
AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
struct aom_usec_timer timer;
int res = 0;
const int subsampling_x = sd->subsampling_x;
@@ -5174,25 +5380,33 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
aom_usec_timer_start(&timer);
+#if CONFIG_DENOISE
+ if (cpi->oxcf.noise_level > 0)
+ if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+ cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+ res = -1;
+#endif // CONFIG_DENOISE
+
if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
use_highbitdepth, frame_flags))
res = -1;
aom_usec_timer_mark(&timer);
cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
- if ((cm->profile == PROFILE_0) && !cm->seq_params.monochrome &&
+ if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
(subsampling_x != 1 || subsampling_y != 1)) {
aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
"Non-4:2:0 color format requires profile 1 or 2");
res = -1;
}
- if ((cm->profile == PROFILE_1) &&
+ if ((seq_params->profile == PROFILE_1) &&
!(subsampling_x == 0 && subsampling_y == 0)) {
aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
"Profile 1 requires 4:4:4 color format");
res = -1;
}
- if ((cm->profile == PROFILE_2) && (cm->bit_depth <= AOM_BITS_10) &&
+ if ((seq_params->profile == PROFILE_2) &&
+ (seq_params->bit_depth <= AOM_BITS_10) &&
!(subsampling_x == 1 && subsampling_y == 0)) {
aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
"Profile 2 bit-depth < 10 requires 4:2:2 color format");
@@ -5364,9 +5578,9 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
#endif
cpi->bytes += frame_bytes;
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
in_bit_depth = cpi->oxcf.input_bit_depth;
- bit_depth = cm->bit_depth;
+ bit_depth = cm->seq_params.bit_depth;
}
if (cm->show_frame) {
const YV12_BUFFER_CONFIG *orig = cpi->source;
@@ -5387,7 +5601,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
cpi->total_samples += psnr.samples[0];
samples = psnr.samples[0];
// TODO(yaowu): unify these two versions into one.
- if (cm->use_highbitdepth)
+ if (cm->seq_params.use_highbitdepth)
frame_ssim2 =
aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
else
@@ -5412,7 +5626,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
#endif
}
if (cpi->b_calculate_blockiness) {
- if (!cm->use_highbitdepth) {
+ if (!cm->seq_params.use_highbitdepth) {
const double frame_blockiness =
av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
recon->y_stride, orig->y_width, orig->y_height);
@@ -5421,7 +5635,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
}
if (cpi->b_calculate_consistency) {
- if (!cm->use_highbitdepth) {
+ if (!cm->seq_params.use_highbitdepth) {
const double this_inconsistency = aom_get_ssim_metrics(
orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
@@ -5622,18 +5836,17 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
if (oxcf->large_scale_tile)
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
+ // default reference buffers update config
+ av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE);
- // TODO(zoeliu@gmail.com): To support forward-KEY_FRAME and set up the
- // following flag accordingly.
+ // Initialize fields related to forward keyframes
+ cpi->no_show_kf = 0;
cm->reset_decoder_state = 0;
// Don't allow a show_existing_frame to coincide with an error resilient or
- // S-Frame
+ // S-Frame. An exception can be made in the case of a keyframe, since it
+ // does not depend on any previous frames. We must make this exception here
+ // because of the use of show_existing_frame with forward coded keyframes.
struct lookahead_entry *lookahead_src = NULL;
if (cm->current_video_frame > 0)
lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
@@ -5641,7 +5854,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
((cpi->oxcf.error_resilient_mode |
((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) ||
(cpi->oxcf.s_frame_mode |
- ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0)))) {
+ ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) &&
+ !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
cm->show_existing_frame = 0;
}
@@ -5719,22 +5933,29 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
cm->showable_frame = 1;
cpi->alt_ref_source = source;
-
- if (oxcf->arnr_max_frames > 0) {
- // Produce the filtered ARF frame.
- av1_temporal_filter(cpi, arf_src_index);
- aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
- force_src_buffer = &cpi->alt_ref_buffer;
+ // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+ if (arf_src_index == rc->frames_to_key) {
+ // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_update_idx[gf_group->index];
+ cpi->is_arf_filter_off[which_arf] = 1;
+ cpi->no_show_kf = 1;
+ } else {
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
}
-
cm->show_frame = 0;
cm->intra_only = 0;
- cpi->refresh_alt_ref_frame = 1;
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- rc->is_src_frame_alt_ref = 0;
+
+ if (oxcf->pass < 2) {
+ // In the second pass, the buffer update configuration will be set
+ // in the function av1_rc_get_second_pass_params
+ av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE);
+ }
}
rc->source_alt_ref_pending = 0;
}
@@ -5771,13 +5992,12 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
cm->show_frame = 0;
cm->intra_only = 0;
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- rc->is_src_frame_alt_ref = 0;
- rc->is_src_frame_ext_arf = 0;
+
+ if (oxcf->pass < 2) {
+ // In the second pass, the buffer update configuration will be set
+ // in the function av1_rc_get_second_pass_params
+ av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE);
+ }
}
rc->source_alt_ref_pending = 0;
}
@@ -5791,13 +6011,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
cm->show_frame = 0;
cm->intra_only = 0;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- rc->is_bwd_ref_frame = 1;
+ if (oxcf->pass < 2) {
+ // In the second pass, the buffer update configuration will be set
+ // in the function av1_rc_get_second_pass_params
+ av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE);
+ }
}
}
@@ -5865,16 +6083,18 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
cm->cur_frame->buf.buf_8bit_valid = 0;
- if (cm->film_grain_table) {
- cm->film_grain_params_present = aom_film_grain_table_lookup(
- cm->film_grain_table, *time_stamp, *time_end, 0 /* erase */,
+ if (cpi->film_grain_table) {
+ cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
&cm->film_grain_params);
}
- cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params.film_grain_params_present;
// only one operating point supported now
- cpi->common.tu_presentation_delay =
- ticks_to_timebase_units(timebase, *time_stamp);
+ const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+ cpi->common.frame_presentation_time = (uint32_t)pts64;
// Start with a 0 size frame.
*size = 0;
@@ -6004,8 +6224,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
*dest = *cm->frame_to_show;
dest->y_width = cm->width;
dest->y_height = cm->height;
- dest->uv_width = cm->width >> cm->subsampling_x;
- dest->uv_height = cm->height >> cm->subsampling_y;
+ dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
+ dest->uv_height = cm->height >> cm->seq_params.subsampling_y;
ret = 0;
} else {
ret = -1;
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 5212db2b17..2b7ab711d3 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -41,6 +41,9 @@
#include "aom_dsp/ssim.h"
#endif
#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
#include "aom/internal/aom_codec_internal.h"
#include "aom_util/aom_thread.h"
@@ -277,7 +280,7 @@ typedef struct AV1EncoderConfig {
aom_timing_info_t timing_info;
int decoder_model_info_present_flag;
int display_model_info_present_flag;
- int buffer_removal_delay_present;
+ int buffer_removal_time_present;
aom_dec_model_info_t buffer_model;
aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
@@ -301,6 +304,11 @@ typedef struct AV1EncoderConfig {
int allow_warped_motion;
int enable_superres;
unsigned int save_as_annexb;
+
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+#endif
} AV1EncoderConfig;
static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -472,6 +480,7 @@ typedef struct AV1_COMP {
AV1EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
struct lookahead_entry *alt_ref_source;
+ int no_show_kf;
int optimize_speed_feature;
int optimize_seg_arr[MAX_SEGMENTS];
@@ -504,6 +513,9 @@ typedef struct AV1_COMP {
int refresh_bwd_ref_frame;
int refresh_alt2_ref_frame;
int refresh_alt_ref_frame;
+#if USE_SYMM_MULTI_LAYER
+ int new_bwdref_update_rule;
+#endif
int ext_refresh_frame_flags_pending;
int ext_refresh_last_frame;
@@ -666,7 +678,6 @@ typedef struct AV1_COMP {
int existing_fb_idx_to_show;
int is_arf_filter_off[MAX_EXT_ARFS + 1];
int num_extra_arfs;
- int arf_map[MAX_EXT_ARFS + 1];
int arf_pos_in_gf[MAX_EXT_ARFS + 1];
int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
int global_motion_search_done;
@@ -687,6 +698,11 @@ typedef struct AV1_COMP {
AV1LfSync lf_row_sync;
AV1LrSync lr_row_sync;
AV1LrStruct lr_ctxt;
+
+ aom_film_grain_table_t *film_grain_table;
+#if CONFIG_DENOISE
+ struct aom_denoise_and_model_t *denoise_and_model;
+#endif
} AV1_COMP;
void av1_initialize_enc(void);
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 4d4802b46a..81f3607336 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -792,9 +792,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
}
int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
- const int plane, const int blk_row, const int blk_col,
- const int block, const TX_SIZE tx_size,
- const TXB_CTX *const txb_ctx) {
+ const int plane, const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) {
const struct macroblock_plane *p = &x->plane[plane];
const int eob = p->eobs[block];
const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -806,8 +805,6 @@ int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
}
const MACROBLOCKD *const xd = &x->e_mbd;
- const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
- tx_size, cm->reduced_tx_set_used);
const TX_CLASS tx_class = tx_type_to_class[tx_type];
#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal) \
@@ -1583,9 +1580,14 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
const int64_t rdmult =
((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
2) >>
- (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
- ? 7 - mbmi->segment_id
- : 2));
+ (sharpness +
+ (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
+ ? 7 - mbmi->segment_id
+ : 2) +
+ (cpi->oxcf.aq_mode != VARIANCE_AQ &&
+ cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
+ ? (3 - x->sb_energy_level)
+ : 0));
uint8_t levels_buf[TX_PAD_2D];
uint8_t *const levels = set_levels(levels_buf, width);
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index aa847ad626..0442cc613a 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -50,9 +50,8 @@ typedef struct TxbInfo {
void av1_alloc_txb_buf(AV1_COMP *cpi);
void av1_free_txb_buf(AV1_COMP *cpi);
int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
- const int plane, const int blk_row, const int blk_col,
- const int block, const TX_SIZE tx_size,
- const TXB_CTX *const txb_ctx);
+ const int plane, const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type, const TXB_CTX *const txb_ctx);
void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
aom_writer *w, int blk_row, int blk_col, int plane,
TX_SIZE tx_size, const tran_low_t *tcoeff,
@@ -77,9 +76,10 @@ void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
int mi_row, int mi_col);
void hbt_destroy();
-int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
int block, TX_SIZE tx_size, TX_TYPE tx_type,
- const TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness);
#ifdef __cplusplus
}
#endif
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 404af2e7c3..637d6824c9 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -44,7 +44,7 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
- return 0;
+ return 1;
}
void av1_encode_tiles_mt(AV1_COMP *cpi) {
@@ -126,12 +126,11 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
for (i = 0; i < num_workers; i++) {
AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *thread_data;
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
worker->hook = (AVxWorkerHook)enc_worker_hook;
- worker->data1 = &cpi->tile_thr_data[i];
+ worker->data1 = thread_data;
worker->data2 = NULL;
- thread_data = (EncWorkerData *)worker->data1;
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index 113c068c19..ef0800c791 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -486,6 +486,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
int mb_row, mb_col;
MACROBLOCK *const x = &cpi->td.mb;
AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
TileInfo tile;
@@ -524,7 +525,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
double intra_factor;
double brightness_factor;
BufferPool *const pool = cm->buffer_pool;
- const int qindex = find_fp_qindex(cm->bit_depth);
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
const int mb_scale = mi_size_wide[BLOCK_16X16];
int *raw_motion_err_list;
@@ -555,11 +556,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
set_first_pass_params(cpi);
av1_set_quantizer(cm, qindex);
- av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y,
- num_planes);
+ av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
+ seq_params->subsampling_y, num_planes);
av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
- av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, new_yv12, 0, 0, 0,
+ av1_setup_dst_planes(xd->plane, seq_params->sb_size, new_yv12, 0, 0, 0,
num_planes);
if (!frame_is_intra_only(cm)) {
@@ -654,14 +655,14 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
image_data_start_row = mb_row;
}
- if (cm->use_highbitdepth) {
- switch (cm->bit_depth) {
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
case AOM_BITS_8: break;
case AOM_BITS_10: this_error >>= 4; break;
case AOM_BITS_12: this_error >>= 8; break;
default:
assert(0 &&
- "cm->bit_depth should be AOM_BITS_8, "
+ "seq_params->bit_depth should be AOM_BITS_8, "
"AOM_BITS_10 or AOM_BITS_12");
return;
}
@@ -674,7 +675,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
else
intra_factor += 1.0;
- if (cm->use_highbitdepth)
+ if (seq_params->use_highbitdepth)
level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
else
level_sample = x->plane[0].src.buf[0];
@@ -1156,10 +1157,10 @@ static int get_twopass_worst_quality(const AV1_COMP *cpi,
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
const double factor = calc_correction_factor(
av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
- FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+ FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth);
const int bits_per_mb = av1_rc_bits_per_mb(
INTER_FRAME, q, factor * speed_term * group_weight_factor,
- cpi->common.bit_depth);
+ cpi->common.seq_params.bit_depth);
if (bits_per_mb <= target_norm_bits_per_mb) break;
}
@@ -1377,7 +1378,7 @@ static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
double this_frame_mv_in_out, double max_boost) {
double frame_boost;
const double lq = av1_convert_qindex_to_q(
- cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
: cpi->common.MBs;
@@ -2130,6 +2131,319 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) {
}
#endif // USE_GF16_MULTI_LAYER
+#if USE_SYMM_MULTI_LAYER
+void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
+ int frame_nums) {
+ static const char *update_type_strings[] = {
+ "KF_UPDATE", "LF_UPDATE", "GF_UPDATE",
+ "ARF_UPDATE", "OVERLAY_UPDATE", "BRF_UPDATE",
+ "LAST_BIPRED_UPDATE", "BIPRED_UPDATE", "INTNL_OVERLAY_UPDATE",
+ "INTNL_ARF_UPDATE"
+ };
+ FILE *fid = fopen("GF_PARAMS.txt", "a");
+
+ fprintf(fid, "\n{%d}\n", gf_interval);
+ for (int i = 0; i <= frame_nums; ++i) {
+ fprintf(fid, "%s %d %d %d %d\n",
+ update_type_strings[gf_group->update_type[i]],
+ gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
+ gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+ }
+ fclose(fid);
+}
+
+static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
+ // Derive rf_level from update_type
+ switch (update_type) {
+ case LF_UPDATE: return INTER_NORMAL;
+ case ARF_UPDATE: return GF_ARF_STD;
+ case OVERLAY_UPDATE: return INTER_NORMAL;
+ case BRF_UPDATE: return GF_ARF_LOW;
+ case LAST_BIPRED_UPDATE: return INTER_NORMAL;
+ case BIPRED_UPDATE: return INTER_NORMAL;
+ case INTNL_ARF_UPDATE: return GF_ARF_LOW;
+ case INTNL_OVERLAY_UPDATE: return INTER_NORMAL;
+ default: return INTER_NORMAL;
+ }
+}
+
+static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
+ int *frame_ind, int arf_ind, int level) {
+ if (r - l == 2) {
+ // leaf node, not a look-ahead frame
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = arf_ind;
+ gf_group->pyramid_level[*frame_ind] = level;
+ ++(*frame_ind);
+ } else {
+ int m = (l + r) / 2;
+ int arf_pos_in_gf = *frame_ind;
+
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = m - l - 1;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1
+ gf_group->pyramid_level[*frame_ind] = level;
+ ++(*frame_ind);
+
+ // set parameters for frames displayed before this frame
+ set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1);
+
+ // for overlay frames, we need to record the position of its corresponding
+ // arf frames for bit allocation
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;
+ gf_group->arf_update_idx[*frame_ind] = 1;
+ gf_group->pyramid_level[*frame_ind] = 0;
+ ++(*frame_ind);
+
+ // set parameters for frames displayed after this frame
+ set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1);
+ }
+}
+
+static INLINE unsigned char get_pyramid_height(int pyramid_width) {
+ assert(pyramid_width <= 16 && pyramid_width >= 4 &&
+ "invalid gf interval for pyramid structure");
+
+ return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2);
+}
+
+static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
+ const int gf_interval) {
+ int frame_index = 0;
+ gf_group->pyramid_height = get_pyramid_height(gf_interval);
+
+ // At the beginning of each GF group it will be a key or overlay frame.
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->pyramid_level[frame_index] = 0;
+ ++frame_index;
+
+ // ALT0
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = gf_interval - 1;
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
+ ++frame_index;
+
+ // set parameters for the rest of the frames
+ set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
+ gf_group->pyramid_height - 1);
+
+ // check_frame_params(gf_group, gf_interval, frame_index);
+
+ return frame_index;
+}
+
+void define_customized_gf_group_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 ||
+ rc->baseline_gf_interval == 16);
+
+ const int gf_update_frames =
+ construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
+ int frame_index;
+
+ cpi->num_extra_arfs = 0;
+
+ for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+ // Set unused variables to default values
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+ // Special handle for the first frame for assigning update_type
+ if (frame_index == 0) {
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ if (key_frame) {
+ gf_group->update_type[frame_index] = KF_UPDATE;
+ continue;
+ }
+
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ }
+ } else {
+ if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++cpi->num_extra_arfs;
+ }
+
+ // Assign rf level based on update type
+ gf_group->rf_level[frame_index] =
+ update_type_2_rf_level(gf_group->update_type[frame_index]);
+ }
+
+ // NOTE: We need to configure the frame at the end of the sequence + 1 that
+ // will be the start frame for the next group. Otherwise prior to the
+ // call to av1_rc_get_second_pass_params() the data will be undefined.
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ // This value is only used for INTNL_OVERLAY_UPDATE
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+
+ // This parameter is useless?
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
+}
+
+// This is an example of how to define a GF structure manually. The function
+// results in exactly the same GF group structure as
+// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4.
+#if USE_MANUAL_GF4_STRUCT
+#define GF_INTERVAL_4 4
+static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
+ {
+ // gf_group->index == 0 (Frame 0)
+ // May also be a KEY frame. define_gf_group_structure_4() overrides the
+ // update_type below with OVERLAY_UPDATE or GF_UPDATE for non-key frames.
+ OVERLAY_UPDATE, // update_type (default value)
+ 0, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 1 (Frame 4)
+ ARF_UPDATE, // update_type
+ GF_INTERVAL_4 - 1, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 2 (Frame 2)
+ INTNL_ARF_UPDATE, // update_type
+ (GF_INTERVAL_4 >> 1) - 1, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 3 (Frame 1)
+ LAST_BIPRED_UPDATE, // update_type
+ 0, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+
+ {
+ // gf_group->index == 4 (Frame 2 - OVERLAY)
+ INTNL_OVERLAY_UPDATE, // update_type
+ 0, // arf_src_offset
+ 2, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 5 (Frame 3)
+ LF_UPDATE, // update_type
+ 0, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 1 // arf_update_idx
+ }
+};
+
+// Builds the GF group for a fixed interval of 4 from gf4_multi_layer_params.
+// Fills entries [0, baseline_gf_interval + 2) of the group, then configures
+// one extra trailing entry (the start frame of the next group). Returns the
+// number of entries filled inside the loop (baseline_gf_interval + 2).
+static int define_gf_group_structure_4(AV1_COMP *cpi) {
+  // Column layout of a gf4_multi_layer_params row.
+  enum {
+    GF4_UPDATE_TYPE,
+    GF4_ARF_SRC_OFFSET,
+    GF4_ARF_POS_IN_GF,
+    GF4_ARF_UPDATE_IDX
+  };
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  assert(rc->baseline_gf_interval == GF_INTERVAL_4);
+
+  const int gf_update_frames = rc->baseline_gf_interval + 2;
+  int frame_index;
+
+  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+    const unsigned char *const params = gf4_multi_layer_params[frame_index];
+
+    // These two fields are not driven by the table; reset them to 0.
+    gf_group->bidir_pred_enabled[frame_index] = 0;
+    gf_group->brf_src_offset[frame_index] = 0;
+
+    if (frame_index == 0) {
+      // For key frames the frame target rate is already set and it
+      // is also the golden frame.
+      if (key_frame) continue;
+
+      // The table's update_type for entry 0 is only a default; the value
+      // actually used is selected by rc->source_alt_ref_active.
+      if (rc->source_alt_ref_active) {
+        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      } else {
+        gf_group->update_type[frame_index] = GF_UPDATE;
+      }
+    } else {
+      gf_group->update_type[frame_index] = params[GF4_UPDATE_TYPE];
+    }
+
+    // setup other parameters
+    gf_group->rf_level[frame_index] =
+        update_type_2_rf_level(gf_group->update_type[frame_index]);
+
+    // Read every remaining field from its own named column, so entry 0 and
+    // the later entries use the same table layout.
+    gf_group->arf_src_offset[frame_index] = params[GF4_ARF_SRC_OFFSET];
+    gf_group->arf_pos_in_gf[frame_index] = params[GF4_ARF_POS_IN_GF];
+    // The fourth table column is arf_update_idx, not brf_src_offset.
+    gf_group->arf_update_idx[frame_index] = params[GF4_ARF_UPDATE_IDX];
+  }
+
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  //       will be the start frame for the next group. Otherwise prior to the
+  //       call to av1_rc_get_second_pass_params() the data will be undefined.
+  gf_group->arf_update_idx[frame_index] = 0;
+  gf_group->arf_ref_idx[frame_index] = 0;
+
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+  }
+
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+
+  // This value is only used for INTNL_OVERLAY_UPDATE
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+
+  return gf_update_frames;
+}
+#endif // USE_MANUAL_GF4_STRUCT
+#endif // USE_SYMM_MULTI_LAYER
+
static void define_gf_group_structure(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -2139,6 +2453,25 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
return;
}
#endif // USE_GF16_MULTI_LAYER
+#if USE_SYMM_MULTI_LAYER
+ const int valid_customized_gf_length = rc->baseline_gf_interval == 4 ||
+ rc->baseline_gf_interval == 8 ||
+ rc->baseline_gf_interval == 16;
+ // Use the new structure only if extra_arf is allowed
+ if (valid_customized_gf_length && rc->source_alt_ref_pending &&
+ cpi->extra_arf_allowed > 0) {
+#if USE_MANUAL_GF4_STRUCT
+ if (rc->baseline_gf_interval == 4)
+ define_gf_group_structure_4(cpi);
+ else
+#endif
+ define_customized_gf_group_structure(cpi);
+ cpi->new_bwdref_update_rule = 1;
+ return;
+ } else {
+ cpi->new_bwdref_update_rule = 0;
+ }
+#endif
TWO_PASS *const twopass = &cpi->twopass;
GF_GROUP *const gf_group = &twopass->gf_group;
@@ -2322,9 +2655,8 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
}
// NOTE: We need to configure the frame at the end of the sequence + 1 that
- // will
- // be the start frame for the next group. Otherwise prior to the call to
- // av1_rc_get_second_pass_params() the data will be undefined.
+ // will be the start frame for the next group. Otherwise prior to the
+ // call to av1_rc_get_second_pass_params() the data will be undefined.
gf_group->arf_update_idx[frame_index] = 0;
gf_group->arf_ref_idx[frame_index] = 0;
@@ -2438,6 +2770,17 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
// TODO(zoeliu): To investigate whether the allocated bits on
// BIPRED_UPDATE frames need to be further adjusted.
gf_group->bit_allocation[frame_index] = target_frame_size;
+#if USE_SYMM_MULTI_LAYER
+ } else if (cpi->new_bwdref_update_rule == 1 &&
+ gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+ gf_group->bit_allocation[frame_index] = 0;
+
+ // Tried boosting up the allocated bits on backward reference frame
+ // by (target_frame_size >> 2) as in the original setting. However it
+ // does not bring gains for pyramid structure with GF length = 16.
+ gf_group->bit_allocation[arf_pos] = target_frame_size;
+#endif
} else {
assert(gf_group->update_type[frame_index] == LF_UPDATE ||
gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
@@ -2453,10 +2796,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
}
}
- // NOTE: We need to configure the frame at the end of the sequence + 1 that
- // will be the start frame for the next group. Otherwise prior to the
- // call to av1_rc_get_second_pass_params() the data will be undefined.
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
+#else
if (rc->source_alt_ref_pending) {
+#endif
if (cpi->num_extra_arfs) {
// NOTE: For bit allocation, move the allocated bits associated with
// INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
@@ -2489,7 +2833,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i;
double boost_score = 0.0;
+#if !FIX_GF_INTERVAL_LENGTH
double old_boost_score = 0.0;
+ double mv_ratio_accumulator_thresh;
+#endif
double gf_group_err = 0.0;
#if GROUP_ADAPTIVE_MAXQ
double gf_group_raw_error = 0.0;
@@ -2509,7 +2856,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double this_frame_mv_in_out = 0.0;
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
- double mv_ratio_accumulator_thresh;
+
unsigned int allow_alt_ref = is_altref_enabled(cpi);
int f_boost = 0;
@@ -2551,18 +2898,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
gf_group_skip_pct -= this_frame->intra_skip_pct;
gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
}
-
+#if !FIX_GF_INTERVAL_LENGTH
// Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh =
(cpi->initial_height + cpi->initial_width) / 4.0;
-
+#endif
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
{
- int int_max_q = (int)(av1_convert_qindex_to_q(twopass->active_worst_quality,
- cpi->common.bit_depth));
- int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex,
- cpi->common.bit_depth));
+ int int_max_q = (int)(av1_convert_qindex_to_q(
+ twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
+ int int_lbq = (int)(av1_convert_qindex_to_q(
+ rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
if (active_min_gf_interval > rc->max_gf_interval)
@@ -2643,7 +2990,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
boost_score +=
decay_accumulator *
calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-
+#if FIX_GF_INTERVAL_LENGTH
+ if (i == (FIXED_GF_LENGTH + 1)) break;
+#else
+ // Skip breaking condition for FIX_GF_INTERVAL_LENGTH
// Break out conditions.
if (
// Break at active_max_gf_interval unless almost totally static.
@@ -2666,9 +3016,9 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
break;
}
}
-
- *this_frame = next_frame;
old_boost_score = boost_score;
+#endif // FIX_GF_INTERVAL_LENGTH
+ *this_frame = next_frame;
}
twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
@@ -2693,7 +3043,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
// Set the interval until the next gf.
- rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+ if (cpi->oxcf.fwd_kf_enabled) {
+ // Ensure the gf group before the next keyframe will contain an altref
+ if ((rc->frames_to_key - i < rc->min_gf_interval) &&
+ (rc->frames_to_key != i)) {
+ rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval,
+ rc->static_scene_max_gf_interval);
+ } else {
+ rc->baseline_gf_interval = i;
+ }
+ } else {
+ rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+ }
if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
// Disable extra altrefs and backward refs for "still" gf group:
@@ -2711,12 +3072,23 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (!cpi->extra_arf_allowed) {
cpi->num_extra_arfs = 0;
} else {
+#if USE_SYMM_MULTI_LAYER
+ if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending)
+ cpi->num_extra_arfs = 1;
+ else
+ cpi->num_extra_arfs = get_number_of_extra_arfs(
+ rc->baseline_gf_interval, rc->source_alt_ref_pending);
+#else
// Compute how many extra alt_refs we can have
cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
rc->source_alt_ref_pending);
+#endif // USE_SYMM_MULTI_LAYER
}
+
+#if !USE_SYMM_MULTI_LAYER
// Currently at maximum two extra ARFs' are allowed
assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -3393,12 +3765,66 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
case INTNL_ARF_UPDATE:
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 1) {
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_alt2_ref_frame = 0;
+ } else {
+#endif
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 1;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ default: assert(0); break;
+ }
+}
+
+void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE update_type) {
+ RATE_CONTROL *rc = &cpi->rc;
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ rc->is_bwd_ref_frame = 0;
+
+ switch (update_type) {
+ case ARF_UPDATE:
+ cpi->refresh_alt_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+
+ rc->is_src_frame_alt_ref = 0;
+ break;
+ case INTNL_ARF_UPDATE:
cpi->refresh_alt2_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
cpi->refresh_alt_ref_frame = 0;
+ rc->is_src_frame_alt_ref = 0;
+ rc->is_src_frame_ext_arf = 0;
+
break;
+ case BIPRED_UPDATE:
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
- default: assert(0); break;
+ rc->is_bwd_ref_frame = 1;
+ break;
+ default: break;
}
}
@@ -3444,7 +3870,12 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
rc->base_frame_target = target_rate;
- cm->frame_type = INTER_FRAME;
+ if (cpi->no_show_kf) {
+ assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
+ cm->frame_type = KEY_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
// Do the firstpass stats indicate that this frame is skippable for the
// partition search?
@@ -3479,7 +3910,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
twopass->baseline_active_worst_quality = tmp_q;
rc->ni_av_qi = tmp_q;
rc->last_q[INTER_FRAME] = tmp_q;
- rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->bit_depth);
+ rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index 4ff0f73b02..b0c1a21e4f 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -122,6 +122,11 @@ typedef struct {
unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if USE_SYMM_MULTI_LAYER
+ unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char pyramid_height;
+#endif
unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES];
@@ -186,6 +191,8 @@ void av1_end_first_pass(struct AV1_COMP *cpi);
void av1_init_second_pass(struct AV1_COMP *cpi);
void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
+void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE update_type);
// Post encode update of the rate control parameters for 2-pass
void av1_twopass_postencode_update(struct AV1_COMP *cpi);
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
index 5a8f8cbbaa..f2ff5b4950 100644
--- a/third_party/aom/av1/encoder/hash_motion.c
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include <assert.h>
#include "config/av1_rtcd.h"
diff --git a/third_party/aom/av1/encoder/ab_partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
index 5b918fae22..279d394957 100644
--- a/third_party/aom/av1/encoder/ab_partition_model_weights.h
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -1311,6 +1311,481 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
#undef FEATURE_SIZE
#undef LABEL_SIZE
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = {
+ 0.121894f, 0.058485f, 0.702226f, 0.015457f, -0.123380f, -0.573450f,
+ 0.319576f, 0.118808f, 0.166057f, 0.526984f, 0.015211f, -0.025050f,
+ 0.085717f, -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f,
+ 0.083573f, 0.007112f, -0.358623f, -0.264443f, -0.064819f, 0.022013f,
+ -0.040077f, -0.291967f, -0.293100f, 0.072266f, -0.270572f, -0.292253f,
+ -0.260105f, -0.294472f, -0.275752f, 0.054315f, 0.000085f, 0.105115f,
+ -0.363572f, -0.016542f, 0.185943f, -0.359903f, 0.038765f, -0.377668f,
+ 0.172692f, 0.127749f, -0.031275f, -0.242528f, -0.145880f, -0.055247f,
+ -0.000265f, -0.355224f, 0.089917f, -0.377841f, -0.209766f, 0.030899f,
+ 0.039546f, -0.375030f, -0.041605f, 0.137677f, 0.021282f, -0.150442f,
+ -0.189445f, 0.009293f, -0.316033f, 0.038745f, -0.278761f, 0.005692f,
+ -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f, 0.005435f,
+ -0.930979f, 0.115513f, 0.689958f, 0.221318f, 1.003891f, 0.359540f,
+ -0.640534f, -0.162373f, -0.118105f, 0.205587f, 0.019710f, 0.025067f,
+ -0.025344f, 0.002831f, 0.033078f, 0.040175f, -0.007502f, 0.026272f,
+ 0.083443f, -0.880884f, 0.436948f, 0.293297f, 0.051678f, -0.133328f,
+ -0.180323f, 0.667835f, 0.070733f, -0.003060f, -0.221804f, 0.146601f,
+ 0.064024f, 0.056758f, -0.077361f, 0.105587f, -0.185500f, -0.133552f,
+ 0.138269f, 0.165055f, 0.628284f, 0.846449f, 0.058825f, 0.223157f,
+ 0.277896f, -0.381303f, 0.408241f, 0.643301f, 0.067494f, 0.120822f,
+ -0.182491f, -0.111373f, -0.033374f, 0.131387f, -0.114654f, 0.114318f,
+ 0.094718f, -0.052232f, 0.385903f, 1.212304f, 0.425305f, -0.052993f,
+ 0.291474f, -0.319730f, 0.023090f, -0.317259f, 0.011181f, -0.034185f,
+ -0.100671f, 0.186185f, -0.432511f, -0.115957f, -0.067746f, -0.177810f,
+ -0.226700f, 0.004464f, 0.006809f, 0.171360f, -0.080723f, 0.099826f,
+ -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f,
+ 0.010452f, -0.341089f, -0.013566f, -0.340129f, 0.034675f, -0.036518f,
+ -0.036473f, -0.192892f, 0.650235f, 0.609437f, -0.160982f, 0.125535f,
+ -1.004575f, 0.521969f, 1.318091f, 0.614004f, -0.106622f, -0.077453f,
+ -0.037328f, -0.081940f, 0.007640f, 0.026654f, -0.080332f, -0.077356f,
+ -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f, 0.089502f,
+ -0.280502f, 0.003941f, -0.249937f, 0.244263f, 0.023269f, 0.080263f,
+ 0.073172f, -0.200036f, 0.022381f, 0.008592f, -0.339517f, -0.135073f,
+ 0.177199f, 0.208363f, 0.652360f, 0.272990f, 0.609535f, 0.145805f,
+ 0.022527f, -0.088378f, 0.205008f, 0.101021f, -0.019673f, -0.252681f,
+ 0.116034f, -0.062052f, 0.009991f, 0.138933f, -0.182428f, 0.052542f,
+ -0.350825f, -0.122654f, -0.154687f, 0.066747f, 0.021541f, -0.212169f,
+ -0.087093f, -0.087488f, 0.178129f, -0.146544f, 0.013919f, -0.273899f,
+ 0.223753f, -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f,
+ -0.135236f, 0.058918f, 0.069080f, 0.279287f, 0.369689f, 1.134526f,
+ 0.659511f, 0.250223f, 0.286040f, 0.515284f, 0.067791f, -0.156385f,
+ 0.143283f, 0.050884f, 0.089956f, -0.040850f, -0.003650f, -0.081162f,
+ 0.086004f, 0.116578f, 0.826254f, 0.504869f, -0.196022f, -0.207279f,
+ 0.200503f, -0.196801f, 0.008211f, 0.411158f, -0.075855f, -0.036690f,
+ 0.111519f, -0.057838f, -0.005846f, 0.111067f, 0.174712f, -0.078054f,
+ 0.765897f, 0.018670f, -0.306960f, -0.020034f, -0.332875f, 0.662707f,
+ -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f,
+ -0.009141f, 0.003325f, -0.011233f, -0.000819f, 0.006369f, 0.002418f,
+ -0.035906f, -0.005135f, 1.073830f, 1.020736f, -0.182611f, -1.038976f,
+ -0.226695f, -0.375663f, 0.364568f, 0.620995f, -0.018615f, 0.011347f,
+ 0.045786f, 0.041077f, 0.010886f, -0.148428f, 0.028007f, -0.022322f,
+ -0.165985f, 0.233315f, -0.277531f, -0.329683f, -0.516967f, -0.390750f,
+ 0.006948f, 0.133744f, -0.375681f, -0.116877f, -0.009441f, -0.008597f,
+ -0.160679f, 0.102150f, -0.142647f, -0.117501f, 0.035035f, 0.228687f,
+ -1.117397f, -0.005171f, -0.008708f, 0.413042f, -0.298532f, 0.614909f,
+ -0.181084f, -0.711770f, 0.344033f, 0.287220f, -0.112848f, -0.052866f,
+ -0.222466f, 0.025029f, -0.107558f, 0.137036f, -0.276661f, -0.038808f,
+ -0.057448f, 0.037563f, 0.526020f, 0.447997f, 0.288366f, 0.264815f,
+ 0.319974f, -0.193091f, 0.353830f, 0.412950f, -0.280454f, 0.092737f,
+ 0.070919f, 0.043336f, 0.041214f, -0.052147f, 0.010860f, 0.191325f,
+ 0.079783f, -0.425672f, -0.053469f, -0.005495f, 0.184526f, -0.166171f,
+ 0.084459f, -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f,
+ -0.189614f, -0.054146f, -0.261279f, 0.196347f, -0.087568f, 0.070533f,
+ -0.145492f, -0.041500f, -0.465861f, 0.077369f, 0.020645f, -0.440232f,
+ -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f,
+ 0.085513f, -0.200425f, 0.218516f, 0.049604f, -0.280952f, -0.242674f,
+ -1.969931f, 0.013374f, -0.039643f, 1.113947f, 0.018568f, 0.916330f,
+ -0.302934f, -0.225816f, 0.189529f, -0.361971f, 0.021073f, -0.050143f,
+ -0.041415f, 0.015126f, 0.018091f, -0.082401f, 0.017152f, 0.064856f,
+ 0.156170f, 0.145323f, -0.281409f, 0.213357f, -0.058966f, 0.158668f,
+ 0.033742f, 0.378820f, -0.662875f, -0.455532f, -0.702928f, 0.234325f,
+ 0.139627f, -1.360650f, 0.040921f, -0.044373f, -0.059999f, -0.048565f,
+ 0.115339f, -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f,
+ -0.356286f, -0.374928f, 0.143257f, -0.317790f, 0.079875f, -0.359345f,
+ 0.081321f, -0.219772f, -0.077213f, 0.110624f, -0.252329f, -0.266481f,
+ 0.190135f, 0.121214f, 0.661064f, -0.037820f, -0.373068f, -0.065209f,
+ -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f,
+ -0.032010f, 0.110627f, 0.054094f, -0.884309f, -1.171623f, -0.386911f,
+ -0.756058f, 0.030362f, 0.563628f, -0.334227f, -0.111213f, 1.143898f,
+ -0.940454f, 0.084510f, 0.671010f, 0.312244f, -0.052592f, -0.014376f,
+ 0.039965f, -0.010763f, -0.114936f, -0.146020f, 0.015874f, 0.027439f,
+ -1.702315f, 0.148702f, 0.153021f, 0.363147f, -0.488933f, 0.220772f,
+ 0.640310f, -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f,
+ 0.061041f, -0.013998f, 0.086539f, 0.000466f, 0.037472f, -0.010665f,
+ -0.326646f, 0.106971f, 0.405589f, 0.555345f, -0.318315f, 0.526498f,
+ 0.119246f, 0.022213f, 0.171237f, 0.214651f, 0.062904f, -0.023764f,
+ 0.011831f, 0.079644f, -0.096530f, -0.054373f, -0.306309f, -0.203709f,
+ -0.353217f, -0.350005f, -0.329549f, 0.062679f, -0.387625f, -0.237111f,
+ -0.025050f, -0.193987f, 0.002235f, -0.380821f, -0.051036f, -0.136020f,
+ 0.077989f, -0.361691f, 0.120485f, 0.157746f, 0.073394f, -0.284401f,
+ 0.113221f, 0.109808f, 0.000197f, 0.122523f, 0.081411f, -0.048544f,
+ -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f, -1.392915f,
+ -0.865248f, 0.114577f, -0.000749f, -0.060338f, -0.091176f, -0.108421f,
+ 0.221256f, 0.100176f, -0.877560f, -1.248838f, 0.643005f, 0.064580f,
+ -0.049878f, 0.267988f, -0.434340f, -0.299254f, -0.097572f, 0.009606f,
+ 0.063810f, -0.090525f, 0.027760f, 0.043484f, 0.041697f, 0.108024f,
+ -0.359586f, -0.197090f, 0.121397f, 0.152206f, -0.391126f, -0.283145f,
+ 0.008754f, -0.059022f, -0.218745f, 0.043042f, -0.056716f, 0.153051f,
+ -0.210372f, -0.029681f, -0.288354f, 0.065242f, -0.189376f, 0.115013f,
+ -0.251488f, -0.533091f, 0.037768f, -0.319107f, -0.161364f, -0.103967f,
+ 0.063271f, -0.313289f, -0.312093f, -0.045239f, 0.150607f, 0.001487f,
+ 0.019602f, -0.338031f, -0.036214f, 0.112736f, -0.367762f, 0.122367f,
+ 0.094670f, 0.175590f, 0.301041f, -0.135257f, 0.539620f, 0.328619f,
+ -0.163971f, 0.137256f, 0.238805f, 0.483722f, 0.121353f, 0.083630f,
+ -0.283568f, 0.291661f, -0.061122f, -0.195295f, 0.153459f, -0.153727f,
+ -0.238839f, -0.071736f, 0.601437f, -0.664072f, 0.230827f, 0.198753f,
+ -0.039196f, 0.206751f, 0.529020f, 0.904132f, -0.219471f, 0.186694f,
+ -0.208608f, -0.093385f, -0.161617f, 0.003930f, -0.429869f, -0.123563f,
+ 0.626098f, -0.002495f, -0.245511f, -1.069848f, 0.296115f, -0.940267f,
+ -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f,
+ -0.003030f, 0.035986f, -0.004812f, -0.009193f, -0.004644f, -0.024347f,
+ 0.068439f, -0.314339f, 0.095057f, -0.212372f, 0.197523f, -0.040878f,
+ -0.272164f, -0.243326f, -0.204955f, 0.157199f, -0.049964f, -0.091537f,
+ -0.058012f, -0.306650f, 0.098621f, -0.146778f, -0.154447f, -0.177889f,
+ -0.009698f, 0.025427f, 0.350576f, -0.448237f, -0.068823f, 1.224960f,
+ -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f, -0.056460f,
+ 0.021654f, 0.004352f, 0.041508f, -0.027179f, 0.006789f, -0.023573f,
+ 0.207775f, -0.280273f, -0.347984f, -0.129935f, 0.151512f, -0.087294f,
+ -0.494352f, -0.341424f, 0.044084f, -0.064080f, 0.073091f, -0.145574f,
+ 0.094715f, -0.258786f, -0.020419f, -0.401823f, 0.009397f, -0.138642f,
+ -0.034953f, -0.077419f, 0.636610f, 0.314980f, 1.110610f, -0.343368f,
+ 0.696647f, -0.649667f, 0.653491f, -0.096006f, -0.090469f, -0.066975f,
+ -0.105864f, -0.015666f, 0.102056f, -0.105344f, -0.273495f, -0.014686f,
+ 0.122031f, 0.139524f, -1.042029f, -0.562510f, 0.885644f, 1.088059f,
+ 0.189223f, 0.049404f, -0.167371f, 0.018703f, -0.208390f, -0.159002f,
+ -0.377130f, -0.151118f, 0.117861f, 0.026986f, -0.032433f, 0.081603f,
+ -0.106729f, -0.040134f, 0.015161f, 0.290572f, 0.241446f, 1.390085f,
+ 0.438915f, -0.358097f, -0.171799f, 0.879758f, -0.014110f, 0.029562f,
+ -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f, 0.120979f,
+ 0.064538f, -0.038841f, 0.034797f, 0.110229f, -0.239779f, -0.004558f,
+ 0.226534f, 0.111286f, -0.268198f, 0.237673f, -0.328237f, -0.090774f,
+ -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f, -0.169217f,
+ -0.300125f, 0.069031f, -0.081358f, -0.376174f, -0.349980f, 0.071443f,
+ -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f,
+ 0.079847f, -0.214580f, -0.235661f, -0.184227f, 0.111099f, -0.083945f,
+ -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f,
+ -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f, 0.081168f,
+ -0.120831f, -0.016048f, -0.232495f, 0.214329f, -0.055058f, 0.032856f,
+ 0.061753f, 0.003226f, 0.097028f, 0.084535f, -1.563199f, 0.434928f,
+ -0.403710f, 0.520696f, -0.401696f, 0.450568f, -0.074121f, 0.076622f,
+ -0.098421f, 0.167036f, -0.255250f, -0.526313f, -0.933693f, -0.558104f,
+ 0.194341f, 0.173326f, 0.071112f, -0.651961f, -1.327587f, -0.705289f,
+ -1.138889f, 0.197167f, -0.714654f, -0.113891f, 0.080158f, 0.000301f,
+ 0.057905f, 0.060718f, -0.635995f, 0.100026f, -0.038239f, -0.025530f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[48] = {
+ -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f, 0.215649f,
+ -0.337661f, -0.242379f, -0.053829f, 0.165168f, -0.076613f, -0.190579f,
+ -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f,
+ -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f, -0.503954f,
+ -0.083563f, 0.123344f, 0.011821f, 0.085445f, -0.050294f, -0.135194f,
+ 0.057815f, 0.543558f, -0.090602f, -0.104671f, -0.285075f, 0.354335f,
+ 1.037007f, -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f,
+ -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = {
+ 0.174954f, -0.239117f, 0.073252f, 0.258881f, 0.579781f, 0.441827f,
+ 0.372037f, -0.062362f, 0.068477f, 0.376811f, -0.130520f, 0.214951f,
+ -0.200674f, 0.240347f, 0.152954f, 1.360264f, 0.334630f, -0.064789f,
+ -0.270826f, 0.212699f, 0.045669f, -0.150852f, -0.412603f, 0.122481f,
+ -0.230246f, 0.005004f, 0.321417f, -0.554083f, -0.186742f, -0.197687f,
+ -0.028669f, -0.138559f, -0.117773f, 0.024953f, 0.326367f, -0.109951f,
+ -1.098959f, -0.136134f, 0.563218f, 0.191799f, 0.126191f, -0.093113f,
+ 0.185371f, 0.058468f, 0.245247f, -0.138064f, -0.471573f, -0.209372f,
+ -0.111171f, 0.222275f, -0.350556f, -0.106336f, 0.268877f, 0.090639f,
+ -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f,
+ 0.099751f, 0.353020f, -0.199079f, -0.463492f, -0.647884f, 0.166611f,
+ -0.464034f, 0.045096f, -0.312178f, -0.190972f, -0.468297f, 0.662376f,
+ -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f,
+ 0.885444f, 0.350273f, -0.003345f, 0.217260f, 0.219156f, 0.240653f,
+ 0.347840f, 0.101849f, -0.244565f, -0.166971f, 0.091056f, 0.319912f,
+ 0.268459f, 0.250726f, -0.155819f, -0.087588f, 0.010749f, -0.192344f,
+ 0.344808f, 0.223482f, -0.189563f, -0.067317f, -0.348191f, -0.085265f,
+ 0.259318f, 0.102408f, 0.096675f, -0.255564f, -0.168480f, -0.068189f,
+ -0.457704f, 0.010565f, 0.228573f, -0.124421f, 0.202488f, 0.148519f,
+ 0.002180f, 0.099099f, -0.179019f, 0.245414f, -0.038307f, 0.116897f,
+ -0.031377f, 0.368533f, -0.793891f, 0.148614f, 0.075441f, 0.102465f,
+ -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f, -0.044980f,
+ 0.092689f, -0.181058f, 0.016279f, 0.155965f, 0.545361f, -0.390699f,
+ -0.042457f, 0.110238f, 0.114640f, 0.112525f, 0.522221f, 0.533164f,
+ -0.331720f, -0.212966f, 0.140823f, 0.251311f, -0.006092f, -0.800438f,
+ 0.007981f, -0.585140f, -0.006526f, 0.541683f, -0.298498f, 0.084322f,
+ -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f,
+ 0.667915f, -0.176995f, 0.022307f, -0.169493f, 0.581377f, 0.044929f,
+ 0.044914f, -0.056290f, 0.324196f, 0.648043f, -0.089381f, -0.054971f,
+ 0.064782f, 0.629356f, -0.003760f, -0.123822f, 0.144133f, -0.378821f,
+ 1.116858f, 0.128552f, -0.668783f, 0.207194f, -0.437781f, -0.283321f,
+ -0.549404f, 0.010538f, 0.208997f, 0.231396f, -0.174347f, 0.161910f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.197883f,
+ -0.136696f,
+ 0.094115f,
+ 0.612799f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 48, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_16_layer0,
+ av1_4_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_16_layer0,
+ av1_4_partition_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+ 0.114554f, 0.043669f, 0.313291f, 0.167688f, -0.413357f, 0.088232f,
+ 0.301915f, -0.358117f, 0.267711f, -0.252716f, -0.038531f, -0.032805f,
+ -0.025382f, 0.023624f, -0.949694f, -0.065480f, -0.375721f, -0.697319f,
+ -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f,
+ 0.199717f, 0.216902f, -0.239241f, -0.096894f, -0.225046f, 0.246523f,
+ 0.002333f, -0.254385f, -0.205815f, 0.123139f, -0.476923f, 0.137557f,
+ 0.059686f, -0.124013f, 0.974675f, 0.889753f, 0.378940f, 0.526413f,
+ -0.208747f, -0.001913f, 0.094081f, 0.848010f, 0.062042f, 0.159831f,
+ 0.071016f, 0.024437f, 0.212611f, 0.039501f, -0.149922f, -0.055229f,
+ -0.229270f, 0.129004f, -0.182803f, 0.291223f, -1.197804f, -0.916991f,
+ -0.024095f, 0.738729f, -0.300326f, 0.402480f, 0.023944f, -0.022613f,
+ -0.004554f, 0.001784f, 0.035143f, -0.202237f, 0.080252f, -0.003912f,
+ -0.040345f, -0.121881f, 0.126672f, 0.093507f, -0.081305f, -0.081099f,
+ -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f, 0.245259f,
+ -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f,
+ 0.103205f, -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f,
+ -0.204005f, -0.173636f, 0.020302f, -0.109112f, 0.081203f, -0.137344f,
+ -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f, -0.268105f,
+ -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f, 0.207510f,
+ -0.279023f, -0.272472f, -0.166427f, 0.205973f, -0.345692f, -0.238400f,
+ -0.319178f, -0.327246f, -0.321756f, 0.043191f, -0.027520f, -0.029310f,
+ 0.161379f, 0.031154f, -0.605365f, -0.230926f, 0.261142f, -0.262678f,
+ -0.373351f, -0.326245f, 0.279222f, 0.684357f, -0.864302f, 0.036132f,
+ 0.239307f, 0.136262f, 0.124002f, -0.410379f, -0.172722f, -0.376670f,
+ -0.195889f, 0.037292f, -0.055295f, 1.022308f, 0.237600f, -0.618435f,
+ 0.366154f, 0.168308f, -0.473467f, -0.756558f, -0.044830f, 0.019057f,
+ -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f, 0.001007f,
+ -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f, 0.254431f,
+ 0.124238f, -0.198181f, 0.142723f, -0.112997f, -0.164224f, -0.355160f,
+ 0.135330f, -0.379557f, 0.079392f, 0.210607f, -0.354927f, -0.277678f,
+ -0.931111f, 0.056208f, -0.347710f, -0.355415f, 0.826145f, 0.390625f,
+ 0.374414f, -0.205685f, 0.562485f, 0.152288f, 0.130635f, 0.056622f,
+ 0.057972f, 0.095526f, -0.082436f, -0.085938f, -0.070570f, -0.087634f,
+ 0.335934f, 0.084860f, 0.544424f, -0.278917f, 0.476740f, 0.050927f,
+ -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f,
+ -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f, 0.060073f,
+ -0.042945f, 0.015249f, 1.241504f, 0.662613f, 0.530496f, -0.180519f,
+ -1.099086f, -0.825844f, 0.551856f, -0.025009f, -0.006619f, -0.001049f,
+ 0.014828f, -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f,
+ -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f,
+ -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f,
+ -0.468730f, -0.175170f, 0.136433f, 0.167739f, -0.377602f, 0.135772f,
+ 0.040972f, -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f,
+ 0.111125f, -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f,
+ 0.010556f, 0.058256f, -0.127352f, 0.017665f, 0.072632f, -0.171966f,
+ -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f, -0.260229f,
+ 0.025216f, -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f,
+ -0.280123f, 0.132262f, -0.308330f, -0.119036f, -0.303874f, -0.065445f,
+ -0.412137f, 0.057167f, 0.044582f, -0.330952f, -0.232572f, 0.039732f,
+ -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f, 0.058277f,
+ -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f,
+ -1.071050f, 0.086854f, -0.004932f, -0.259698f, 0.125301f, -0.742663f,
+ -0.370517f, -0.772840f, 0.193628f, 0.554676f, 0.051283f, -0.196639f,
+ 0.040344f, 0.027391f, -0.040501f, 0.038303f, 0.032972f, -0.014638f,
+ 0.097720f, -0.206897f, -0.015480f, 0.008543f, 0.034469f, 0.127234f,
+ -0.396463f, -0.390189f, 0.117538f, -0.435622f, 0.043420f, -0.241987f,
+ -0.118254f, -0.190349f, 0.190273f, -0.085625f, -0.141253f, -0.377438f,
+ -0.249211f, 0.214512f, -0.363191f, -0.754851f, 0.238045f, 1.127635f,
+ 0.173947f, -0.357620f, 0.073671f, 0.220617f, 0.072067f, -0.076214f,
+ -0.044583f, -0.018371f, 0.010952f, -0.135116f, 0.076597f, 0.034480f,
+ -0.070212f, -0.454429f, -0.135215f, 0.163851f, -0.625990f, -0.283991f,
+ 0.284051f, 0.182935f, -0.048717f, 0.002484f, -0.009086f, 0.321724f,
+ 0.125162f, -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f,
+ 0.123807f, -0.313614f, -0.103142f, 0.072125f, 0.100320f, -0.185558f,
+ -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f, -0.381231f,
+ -0.436001f, -0.374834f, 0.230104f, -0.500679f, 0.170880f, 0.029657f,
+ -0.105857f, -0.366671f, -0.268833f, 0.036885f, -0.026776f, 0.037837f,
+ -0.362095f, -0.254933f, 0.129650f, 0.007945f, -0.304715f, -0.100813f,
+ -0.342849f, -0.269223f, 0.178490f, 0.186735f, -0.353995f, 0.050381f,
+ -0.440186f, 0.025985f, 1.096969f, 1.132937f, 0.581545f, 0.271734f,
+ -0.109169f, -0.014239f, 0.688644f, 0.602702f, 0.048616f, 0.022335f,
+ 0.037545f, 0.081667f, -0.109038f, -0.088565f, -0.002506f, -0.041420f,
+ -0.132515f, 0.187312f, 0.677273f, 1.111182f, 0.199096f, -0.211551f,
+ -0.896508f, 0.257981f, 0.007803f, 0.160343f, -0.124864f, -0.097150f,
+ 0.225090f, 0.242900f, -0.195665f, 0.011310f, 0.160765f, 0.169195f,
+ -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f, 0.511419f,
+ 0.076009f, -0.165861f, 0.240487f, 0.006298f, -0.153334f, 0.041249f,
+ 0.387092f, 0.313011f, -0.032269f, 0.019024f, 0.052568f, 0.124247f,
+ 0.197640f, 0.002537f, 0.651044f, 0.829828f, -0.446444f, -0.402042f,
+ -0.469399f, -0.019842f, 0.371960f, 0.140373f, -0.044808f, 0.008283f,
+ 0.093791f, 0.052149f, 0.143123f, -0.449571f, -0.868816f, -0.265661f,
+ -0.225232f, -0.014704f, 0.543836f, -0.374498f, 0.561647f, 1.309445f,
+ 0.056789f, -0.048447f, 0.255758f, 0.644553f, -0.124802f, 0.097419f,
+ -0.149336f, 0.021596f, -0.043699f, 0.057591f, -0.000077f, 0.034488f,
+ -0.049353f, -0.007799f, 0.437914f, 0.509369f, 0.674428f, 1.858949f,
+ -0.205964f, 0.060776f, 0.184213f, 0.037177f, -0.062535f, -0.115408f,
+ 0.076498f, 0.010235f, -0.142253f, 0.009983f, 0.073436f, 0.038716f,
+ -0.369983f, -0.185959f, -0.137867f, 0.032134f, 0.213814f, -0.125571f,
+ 0.247874f, -0.166871f, -0.160890f, 0.147029f, 0.267143f, -0.298488f,
+ -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f,
+ 0.399519f, 0.143200f, -0.776419f, -0.374639f, -0.022066f, 0.582904f,
+ 0.006430f, -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f,
+ -0.398255f, -0.173231f, 0.211789f, -0.036121f, -0.266856f, 0.042956f,
+ -1.138513f, -0.070313f, 0.158803f, 0.406989f, -0.015974f, 0.651020f,
+ -0.468982f, -0.310019f, 0.416922f, 0.895162f, 0.019921f, 0.004023f,
+ 0.006962f, 0.000863f, -0.216395f, -0.074913f, -0.002613f, 0.026703f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+ 0.133615f, -0.113389f, -0.575989f, 0.589389f, -0.193574f, -0.132463f,
+ 0.000000f, 0.060317f, 0.264577f, -0.060599f, 0.540147f, -0.127782f,
+ -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f,
+ -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f,
+ -0.448931f, 0.446300f, 0.250002f, 0.223559f, -0.647723f, -0.014369f,
+ 0.084333f, -0.056270f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+ -0.069633f, -0.087239f, 0.365816f, -0.068579f, 0.231198f, -0.067856f,
+ -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f,
+ 0.778888f, 0.169624f, 0.089968f, -0.243569f, 0.353483f, 0.032296f,
+ -0.157408f, 0.286885f, -0.063537f, -0.324055f, -0.161464f, 0.430600f,
+ 0.277707f, -0.196463f, 0.154647f, 0.059804f, 0.176408f, 0.303179f,
+ -0.040156f, 0.375810f, -0.363032f, -0.186808f, -0.264561f, -0.158937f,
+ -0.007949f, -0.076394f, 0.056475f, 0.308528f, 0.695387f, 0.051336f,
+ 0.433063f, -0.229948f, -1.210712f, 0.036286f, 0.183868f, -0.117660f,
+ 0.230134f, -0.093469f, 0.237918f, 0.625986f, -0.236671f, -0.377172f,
+ 0.331091f, -0.394004f, -0.214349f, 0.243940f, -0.600348f, 0.069843f,
+ 0.088325f, 0.225775f, 0.276884f, -0.604493f, 0.769812f, 0.259574f,
+ 0.086220f, 0.511515f, -0.282584f, -0.157719f, 0.278778f, -0.332732f,
+ 0.068985f, -0.237236f, -0.006102f, -0.154883f, 0.710288f, -0.245896f,
+ -0.255895f, -0.398038f, 0.304084f, -0.317065f, 0.192609f, -0.235613f,
+ 0.461340f, 0.117194f, 0.116817f, 0.196150f, 0.421622f, -0.264495f,
+ 0.617852f, -0.351756f, -0.310016f, 0.135932f, -0.242622f, -0.073094f,
+ 0.042077f, 0.039230f, -0.482715f, 0.553187f, 0.360637f, 0.313484f,
+ -0.131540f, -0.104731f, 0.374704f, 0.222173f, 0.437657f, 0.029827f,
+ -0.545156f, -0.203176f, 0.267824f, 0.169237f, -0.057871f, 0.552197f,
+ 0.272243f, 0.025681f, -0.262192f, 0.255934f, -0.202407f, -0.483317f,
+ -0.204721f, 0.288807f, -0.030735f, -0.047161f, -0.780724f, 0.381939f,
+ -0.295318f, 0.537378f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.332518f,
+ 0.114452f,
+ 0.098949f,
+ 0.465896f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_32_layer0,
+ av1_4_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_32_layer0,
+ av1_4_partition_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.256343f, -0.021774f, -0.117102f, 0.416930f, 0.188160f, 0.148768f,
+ -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f,
+ 0.222769f, -0.199332f, 0.058667f, -0.679529f, 0.081744f, 0.044438f,
+ -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f,
+ -0.049503f, -0.309894f, -0.323991f, 0.166113f, 0.138104f, -0.263629f,
+ -0.368460f, -0.273989f, 0.147239f, 0.044566f, -0.363357f, -0.030792f,
+ 0.020734f, 0.068506f, -0.434214f, 0.581644f, -1.244146f, -0.569162f,
+ 0.179499f, -0.188900f, 0.078431f, -0.392126f, -0.006431f, 0.112146f,
+ -0.065892f, -0.051319f, 0.094607f, 0.251700f, -0.000650f, 0.011911f,
+ 0.080449f, 0.022816f, 0.322382f, 0.577070f, 0.927738f, 0.178707f,
+ -0.101237f, -0.212521f, 0.560261f, -0.206492f, -0.077591f, -0.069960f,
+ 0.025727f, 0.041122f, -0.735228f, -0.506091f, -0.600776f, -0.117829f,
+ 0.103556f, 0.141823f, 0.853448f, 0.339488f, 0.994022f, 0.121693f,
+ -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f,
+ 0.042469f, -0.005319f, -0.305784f, -0.371353f, 0.011194f, -0.018597f,
+ 0.209260f, 0.071577f, 0.242470f, -0.856593f, 0.288842f, 1.062608f,
+ -0.300472f, 0.221623f, -0.813563f, -0.250347f, -0.081455f, -0.092779f,
+ -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f,
+ -0.006546f, 0.051671f, 0.545608f, 1.101804f, 0.288086f, 1.107046f,
+ -0.200012f, 0.220182f, -0.189220f, -0.554973f, 0.040711f, -0.058029f,
+ 0.043737f, 0.016164f, -0.391790f, -0.287770f, -0.046545f, 0.045071f,
+ 0.190005f, -0.076963f, 0.836839f, 1.633266f, 0.902928f, 0.991972f,
+ -0.127932f, 0.293680f, -0.035984f, 0.476179f, -0.098024f, 0.068314f,
+ -0.058365f, 0.096221f, -0.000321f, -0.128840f, 0.136441f, -0.061853f,
+ 0.270367f, -0.184129f, -0.373670f, -0.177381f, 0.262109f, -0.378013f,
+ -0.053249f, -0.456389f, 0.222972f, -0.228067f, -0.115210f, -0.277797f,
+ 0.096913f, -0.014512f, -0.015533f, 0.026389f, -0.360536f, -0.078477f,
+ -0.203186f, 0.199574f, 0.770476f, 0.595592f, 0.360828f, 0.547721f,
+ -0.804787f, 0.389690f, -0.437645f, 0.576776f, 0.081903f, 0.082750f,
+ 0.007166f, -0.143755f, 0.114462f, 0.472432f, -0.058974f, 0.077761f,
+ -2.015181f, -0.054942f, -0.110894f, 0.529188f, -0.003300f, 0.913895f,
+ -0.324643f, 0.316135f, -0.291729f, 1.072647f, -0.029236f, 0.045592f,
+ -0.039399f, 0.043472f, -0.303244f, -0.108761f, -0.011154f, 0.009693f,
+ -0.374985f, 0.027758f, 0.302075f, -0.295758f, -0.165563f, -0.297259f,
+ -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f,
+ 0.089003f, -0.301585f, 0.022474f, 0.077477f, -0.032233f, -0.231036f,
+ 0.143206f, 0.169113f, -0.556486f, 0.346327f, -0.667790f, 0.126983f,
+ 0.179727f, 0.397307f, -0.490612f, -1.708789f, -0.040336f, -0.028547f,
+ -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f, 0.031344f,
+ -0.131692f, 0.119353f, 0.799313f, 0.443848f, -0.499919f, -1.002983f,
+ 0.375477f, 0.221096f, -0.238033f, 0.284849f, 0.021897f, 0.023338f,
+ -0.059067f, 0.117276f, 0.039540f, 0.049630f, 0.175150f, 0.014166f,
+ -0.071486f, 0.091234f, -1.007432f, -1.417378f, 0.640528f, 1.442576f,
+ -0.257183f, -0.597016f, 0.861785f, 0.276121f, -0.098017f, 0.120514f,
+ -0.133184f, 0.106529f, 0.171644f, 0.059513f, 0.215952f, -0.009441f,
+ -0.505313f, 0.063174f, 0.229148f, -0.344213f, 0.862721f, 1.549941f,
+ -0.220129f, 0.493094f, 0.264095f, 0.143641f, 0.084968f, -0.078266f,
+ 0.032335f, -0.019006f, -0.098205f, 0.119213f, -0.103465f, 0.072811f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[16] = {
+ 0.111611f, -0.067682f, 0.633594f, 0.143559f, -1.051284f, -0.266625f,
+ -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f, -0.080769f,
+ 0.235166f, 0.449468f, 0.294689f, -0.395300f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = {
+ -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f, 0.141445f,
+ 0.507161f, 0.435533f, -0.263268f, 0.585105f, 0.235301f, 0.127536f,
+ -0.688639f, -0.217993f, -0.540066f, 0.406718f, 0.018210f, -0.077349f,
+ -0.124823f, -0.488220f, -0.957026f, 0.302632f, 0.285490f, -0.411356f,
+ 0.091089f, 0.103862f, -0.549291f, 0.148628f, 0.640603f, -0.601018f,
+ 0.178024f, 0.601370f, 0.313780f, 0.051938f, 0.524083f, 0.814631f,
+ -0.415522f, -0.738849f, 0.477881f, -0.342864f, 0.105181f, 0.040010f,
+ -0.177521f, 0.400646f, 0.167093f, 0.388279f, -0.898439f, -0.111936f,
+ 0.469875f, -0.099528f, -0.217370f, 0.283742f, -0.033798f, -0.142797f,
+ -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f, -0.527150f,
+ -0.021259f, 0.194651f, -0.276294f, -0.109514f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.688947f,
+ 0.121075f,
+ 0.289597f,
+ 0.948091f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_64_layer0,
+ av1_4_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_64_layer0,
+ av1_4_partition_nn_bias_64_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
index 4f6265617d..6d154a7d22 100644
--- a/third_party/aom/av1/encoder/pickcdef.c
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -296,7 +296,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
int ydec[3];
int pli;
int cdef_count;
- int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
uint64_t best_tot_mse = (uint64_t)1 << 63;
uint64_t tot_mse;
int sb_count;
@@ -317,8 +317,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
uint16_t *in;
DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
- quantizer =
- av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+ quantizer = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth) >>
+ (cm->seq_params.bit_depth - 8);
lambda = .12 * quantizer * quantizer / 256.;
av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
@@ -361,7 +361,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
for (r = 0; r < frame_height; ++r) {
for (c = 0; c < frame_width; ++c) {
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
ref_coeff[pli][r * stride[pli] + c] =
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index 5f802a7076..461c3af832 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -82,10 +82,8 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
plane + 1, partial_frame);
#endif
- int highbd = 0;
- highbd = cm->use_highbitdepth;
-
- filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd);
+ filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane,
+ cm->seq_params.use_highbitdepth);
// Re-instate the unfiltered frame
yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
@@ -202,7 +200,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
} else if (method >= LPF_PICK_FROM_Q) {
const int min_filter_level = 0;
const int max_filter_level = av1_get_max_filter_level(cpi);
- const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth);
+ const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
// These values were determined by linear fitting the result of the
// searched level for 8 bit depth:
// Keyframes: filt_guess = q * 0.06699 - 1.60817
@@ -211,7 +209,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
// And high bit depth separately:
// filt_guess = q * 0.316206 + 3.87252
int filt_guess;
- switch (cm->bit_depth) {
+ switch (cm->seq_params.bit_depth) {
case AOM_BITS_8:
filt_guess = (cm->frame_type == KEY_FRAME)
? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
@@ -229,7 +227,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
"or AOM_BITS_12");
return;
}
- if (cm->bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
+ if (cm->seq_params.bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
filt_guess -= 4;
// TODO(chengchen): retrain the model for Y, U, V filter levels
lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index 93ea096905..28b693b085 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -163,8 +163,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
const int is_uv = plane > 0;
const RestorationInfo *rsi = &cm->rst_info[plane];
RestorationLineBuffers rlbs;
- const int bit_depth = cm->bit_depth;
- const int highbd = cm->use_highbitdepth;
+ const int bit_depth = cm->seq_params.bit_depth;
+ const int highbd = cm->seq_params.use_highbitdepth;
const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
// TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
@@ -173,7 +173,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
av1_loop_restoration_filter_unit(
limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
- is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth,
+ is_uv && cm->seq_params.subsampling_x,
+ is_uv && cm->seq_params.subsampling_y, highbd, bit_depth,
fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
@@ -540,8 +541,8 @@ static void search_sgrproj(const RestorationTileLimits *limits,
const MACROBLOCK *const x = rsc->x;
const AV1_COMMON *const cm = rsc->cm;
- const int highbd = cm->use_highbitdepth;
- const int bit_depth = cm->bit_depth;
+ const int highbd = cm->seq_params.use_highbitdepth;
+ const int bit_depth = cm->seq_params.bit_depth;
uint8_t *dgd_start =
rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
@@ -549,8 +550,8 @@ static void search_sgrproj(const RestorationTileLimits *limits,
rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
const int is_uv = rsc->plane > 0;
- const int ss_x = is_uv && cm->subsampling_x;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
@@ -1067,7 +1068,7 @@ static void search_wiener(const RestorationTileLimits *limits,
double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
const AV1_COMMON *const cm = rsc->cm;
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
limits->h_start, limits->h_end, limits->v_start,
limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
@@ -1149,7 +1150,7 @@ static void search_norestore(const RestorationTileLimits *limits,
RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
- const int highbd = rsc->cm->use_highbitdepth;
+ const int highbd = rsc->cm->seq_params.use_highbitdepth;
rusi->sse[RESTORE_NONE] = sse_restoration_unit(
limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
@@ -1280,7 +1281,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
double best_cost = 0;
RestorationType best_rtype = RESTORE_NONE;
- const int highbd = rsc.cm->use_highbitdepth;
+ const int highbd = rsc.cm->seq_params.use_highbitdepth;
extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
highbd);
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
index ef333b6d81..42a4c590bc 100644
--- a/third_party/aom/av1/encoder/pustats.h
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -18,91 +18,79 @@ extern "C" {
#include "av1/encoder/ml.h"
-#define NUM_FEATURES 20
+#define NUM_FEATURES 11
#define NUM_HIDDEN_LAYERS 2
-#define HIDDEN_LAYERS_0_NODES 10
+#define HIDDEN_LAYERS_0_NODES 12
#define HIDDEN_LAYERS_1_NODES 10
#define LOGITS_NODES 1
static const float
av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES *
HIDDEN_LAYERS_0_NODES] = {
- 13.8498f, 19.6630f, 13.3036f, 5.2448f, -18.0270f, 21.6671f,
- -0.2135f, -0.0060f, 0.1211f, -0.3549f, -0.3550f, 0.0190f,
- 0.0167f, -0.1192f, 0.2003f, 8.6663f, 32.0264f, 9.9558f,
- 9.0935f, -110.4994f, 51.8056f, 64.8041f, 58.5392f, 53.0189f,
- -61.6300f, 4.7540f, -0.0140f, 0.0185f, -15.8050f, 0.0790f,
- 0.0707f, 0.0784f, 0.0766f, -0.3030f, 0.0392f, 49.3312f,
- 63.3326f, 61.4025f, 54.2723f, -62.2769f, -147.1736f, -84.9432f,
- -82.5422f, -70.4857f, 46.7622f, -1.0285f, -0.4809f, 0.0068f,
- 1.0888f, -0.0515f, -0.0384f, -0.0232f, -0.0396f, 0.2429f,
- 0.2040f, -144.4016f, -88.0868f, -80.3134f, -70.6685f, 66.8528f,
- -53.8097f, -45.4011f, -52.8680f, -58.7226f, 99.7830f, 2.3728f,
- 0.0229f, 0.0002f, -0.3288f, -0.0563f, -0.0550f, -0.0552f,
- -0.0563f, 0.2214f, 0.0139f, -60.8965f, -45.5251f, -50.4188f,
- -51.5623f, 85.7369f, 77.3415f, 47.4930f, 53.8120f, 58.2311f,
- -45.9650f, -2.4938f, 0.1639f, -0.5270f, -75.4622f, -0.0026f,
- 0.0031f, 0.0047f, 0.0015f, 0.0092f, 0.0654f, 75.6402f,
- 54.7447f, 54.8156f, 52.6834f, -9.1246f, -34.0108f, -35.6423f,
- -34.2911f, -38.5444f, 72.1123f, 10.9750f, -0.1595f, 0.1983f,
- 22.5724f, -0.0556f, -0.0618f, -0.0571f, -0.0608f, 0.2439f,
- -0.0805f, -32.5107f, -28.9688f, -33.7284f, -48.1365f, 61.5297f,
- 39.2492f, -35.1928f, -11.5000f, 7.7038f, -94.2469f, 13.5586f,
- 0.7541f, 0.0105f, 4.4041f, 0.1799f, 0.1339f, 0.1567f,
- -0.6668f, -0.7384f, 0.2185f, 17.1700f, -26.4601f, -1.8970f,
- 38.9635f, -30.1916f, 31.8139f, 14.6157f, 10.0565f, 3.3340f,
- -40.6985f, -2.1186f, 0.0116f, 0.0962f, 0.7115f, -1.4071f,
- -1.3701f, -1.4728f, -1.3404f, -1.7286f, 5.5632f, 28.4998f,
- 5.4087f, 16.2668f, 11.8693f, -39.4153f, 106.3281f, 38.3075f,
- 39.4933f, 47.3805f, -15.0514f, -21.2421f, -0.2358f, -0.0024f,
- 0.3505f, -0.0429f, -0.0377f, -0.0322f, -0.0344f, 0.2020f,
- 0.1417f, 99.6711f, 35.3896f, 43.1117f, 59.8879f, -17.8250f,
- -16.6976f, 18.5100f, 6.3383f, 25.3020f, -55.8824f, 25.1027f,
- -0.9926f, -0.0738f, -1.4892f, 0.0269f, -0.0051f, -5.8168f,
- -0.0579f, -0.1500f, 0.7224f, 8.3066f, -3.8805f, -12.1482f,
- 14.3492f, -20.8118f,
+ 21.5067f, 22.6709f, 0.0049f, 0.9288f, -0.0100f, 0.0060f, -0.0071f,
+ -0.0085f, 0.0348f, -0.1273f, 10.1154f, 6.3405f, 7.8589f, -0.0652f,
+ -4.6352f, 0.0445f, -3.2748f, 0.1025f, -0.0385f, -0.4505f, 1.1320f,
+ 3.2634f, 23.2420f, -7.9056f, 0.0522f, -18.1555f, 0.0977f, 0.1155f,
+ -0.0138f, 0.0267f, -0.3992f, 0.2735f, 22.8063f, 35.1043f, 3.8140f,
+ -0.0295f, 0.0771f, -0.6938f, 0.0302f, -0.0266f, 0.0989f, -0.0794f,
+ 0.2981f, 33.3333f, -24.1150f, 1.4986f, -0.0975f, -15.3938f, -0.0858f,
+ -0.0845f, -0.0869f, -0.0858f, 0.3542f, 0.0155f, -18.2629f, 9.6688f,
+ -11.9643f, -0.2904f, -5.3026f, -0.1011f, -0.1202f, 0.0127f, -0.0269f,
+ 0.3434f, 0.0595f, 16.6800f, 41.4730f, 6.9269f, -0.0512f, -1.4540f,
+ 0.0468f, 0.0077f, 0.0983f, 0.1265f, -0.5234f, 0.9477f, 36.6470f,
+ -0.4838f, -0.2269f, -0.1143f, -0.3907f, -0.5005f, -0.0179f, -0.1057f,
+ 0.1233f, -0.4412f, -0.0474f, 0.1140f, -21.6813f, -0.9077f, -0.0078f,
+ -3.3306f, 0.0417f, 0.0412f, 0.0427f, 0.0418f, -0.1699f, 0.0072f,
+ -22.3335f, 16.1203f, -10.1220f, -0.0019f, 0.0005f, -0.0054f, -0.0155f,
+ -0.0302f, -0.0379f, 0.1276f, 0.1568f, 21.6175f, 12.2919f, 11.0327f,
+ -0.2000f, -8.6691f, -0.5593f, -0.5952f, -0.4203f, -0.4857f, -1.1239f,
+ 3.1404f, -13.1098f, -5.9165f, 22.2060f, -0.0312f, -3.9642f, -0.0344f,
+ -0.0656f, -0.0273f, -0.0465f, 0.1412f, -6.1974f, 9.3661f,
};
static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
{
- 17.6566f, 62.2217f, -107.2644f, -56.2255f, 68.2252f,
- -37.5662f, 9.587f, 18.5206f, 69.6873f, 4.3903f,
+ -14.3065f, 2.059f, -62.9916f, -50.1209f, 57.643f, -59.3737f,
+ -30.4737f, -0.1112f, 72.5427f, 55.402f, 24.9523f, 18.5834f,
};
static const float
av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
HIDDEN_LAYERS_1_NODES] = {
- -0.0494f, 0.3505f, -0.0461f, -1.3451f, 0.0198f, -0.0746f, -0.2217f,
- -0.9525f, 0.0633f, -0.0737f, -0.3568f, 1.8569f, -0.0189f, -1.8269f,
- 0.6281f, -1.3266f, -0.9202f, 2.8978f, -0.6437f, -0.8709f, -1.5066f,
- -1.0582f, -1.9509f, -0.0417f, -0.1315f, -0.3368f, 0.0014f, -0.5734f,
- -1.4640f, -1.6042f, 3.3911f, -1.6815f, -1.9026f, -4.8702f, -0.1012f,
- -1.4517f, -3.2156f, 0.8448f, 0.2331f, -0.1593f, 2.6627f, -0.8451f,
- -1.7382f, 0.9303f, 2.3003f, -0.0659f, 0.5772f, 0.4253f, 0.2083f,
- 0.3649f, -0.9198f, -0.2183f, -0.5381f, -1.0831f, 2.0359f, 0.0040f,
- -0.0871f, -0.1715f, 2.2453f, 0.5099f, -0.5900f, -0.6313f, -1.3028f,
- -1.7257f, 1.4130f, -0.7189f, -0.4336f, 1.9266f, 1.7495f, -0.3321f,
- 0.2827f, 0.4015f, -0.5044f, -1.0420f, -0.1258f, -0.0342f, -0.1190f,
- -3.1263f, 0.7485f, -0.3161f, -0.2224f, 2.5533f, -0.2121f, -1.3389f,
- 0.5556f, -0.9407f, -0.7456f, 1.4137f, -0.0353f, -0.0521f, 2.4382f,
- 0.1493f, -11.5631f, -1.6178f, 3.5538f, -3.6538f, -0.5972f, -3.0038f,
- -2.1640f, 0.5754f,
+ 0.3883f, -0.2784f, -0.2850f, 0.4894f, -2.2450f, 0.4511f, -0.1969f,
+ -0.0077f, -1.4924f, 0.1138f, -2.9848f, 1.0211f, -0.1712f, -0.1952f,
+ -0.4774f, 0.0761f, -0.3186f, -0.1002f, 0.8663f, 0.5026f, 1.1920f,
+ 0.9337f, 0.3911f, -0.3841f, -0.0037f, 0.7295f, -0.3183f, 0.1829f,
+ -1.3670f, -0.1046f, 0.6629f, 0.0619f, -0.1551f, 0.8174f, 2.1521f,
+ -1.3323f, -0.0527f, -0.5772f, 0.2001f, -0.6270f, -1.0625f, 0.3342f,
+ 0.6676f, 0.4605f, -2.0049f, 0.7781f, 0.0713f, -0.0824f, -0.4529f,
+ 0.1757f, -0.1338f, -0.2319f, -0.2864f, 0.1248f, 0.3887f, -0.1676f,
+ 1.8422f, 0.6435f, 1.2123f, -0.5667f, -0.2423f, -0.0314f, 0.2411f,
+ -0.5013f, 0.0422f, 0.2559f, 0.4435f, -0.1223f, 1.5167f, 0.3939f,
+ 1.0898f, 0.0795f, -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f,
+ -0.4368f, -0.0984f, 0.0837f, 3.6169f, 0.0662f, -0.1679f, -0.8090f,
+ -0.2610f, -0.5791f, 0.0642f, -0.2979f, -0.9036f, 0.2898f, 0.3265f,
+ 0.4660f, -1.6358f, -0.0347f, 0.1087f, 0.0353f, 0.5687f, -0.5242f,
+ -0.4895f, 0.7693f, -1.3829f, -0.2244f, -0.2880f, 0.0575f, 2.0563f,
+ -0.2322f, -1.1597f, 1.6125f, -0.0925f, 1.3540f, 0.1432f, 0.3993f,
+ -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f, -0.5724f, 0.0122f,
+ -1.0829f,
};
static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
{
- 69.1995f, 41.7369f, -1.4885f, -35.785f, 26.1678f,
- 58.4472f, 36.2223f, 66.327f, 50.8867f, 2.8306f,
+ -10.3717f, 37.304f, -36.7221f, -52.7572f, 44.0877f,
+ 41.1631f, 36.3299f, -48.6087f, -4.5189f, 13.0611f,
};
static const float
av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
- 1.811f, 0.9009f, 0.0694f, -0.9985f, -0.039f,
- 0.2076f, 0.5643f, 0.5408f, 0.6071f, 0.277f,
+ 0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f,
+ 1.4909f, 1.3554f, -0.8626f, -0.618f, -0.9458f,
};
static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
- 39.5529f,
+ 30.6878f,
};
static const NN_CONFIG av1_pustats_rate_nnconfig = {
@@ -125,78 +113,70 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = {
static const float
av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES *
HIDDEN_LAYERS_0_NODES] = {
- -39.0787f, -212.9998f, -174.2088f, -264.1454f, 292.7151f, -60.8750f,
- -5.9915f, 0.0712f, -60.2312f, -0.2020f, -0.2135f, -0.1663f,
- -0.0711f, 0.2267f, 0.9152f, -36.1294f, -159.9320f, -222.9809f,
- -270.2556f, 300.7162f, 159.9224f, -172.5735f, -7.6852f, 54.3985f,
- 110.6721f, 19.2907f, -15.1039f, -0.0457f, 0.3289f, 0.4529f,
- -8.2222f, 1.3213f, -0.8378f, -0.2605f, 3.9600f, 17.3407f,
- 113.1116f, 34.6326f, 11.6688f, 109.3541f, 240.8123f, 45.0615f,
- 80.7443f, 39.2500f, -21.0931f, -27.1989f, -0.4264f, -0.1345f,
- 1.6269f, -0.0716f, 0.0989f, -0.1382f, 0.0248f, 0.0913f,
- 4.3903f, 244.1014f, 32.2567f, 58.6171f, 62.2273f, -2.8647f,
- -227.5659f, 16.0031f, -70.5256f, 23.8071f, 290.7356f, 13.6094f,
- -2.1842f, 0.0104f, -2.8760f, 0.3708f, 0.8501f, -3.2964f,
- -0.2088f, -0.4474f, 1.2248f, 40.5180f, -130.7891f, -188.1583f,
- -174.0906f, 205.9622f, 0.3425f, 0.2531f, 0.2822f, 0.0488f,
- 0.1416f, -0.0433f, -0.1195f, -0.0413f, -0.0708f, -0.0787f,
- -0.0889f, -0.4022f, -0.5055f, -0.4715f, 0.2315f, 0.1021f,
- -0.3676f, -0.3499f, -0.0715f, 0.1913f, 205.7521f, 125.2265f,
- 92.0640f, 77.5566f, -164.4280f, -19.3715f, -0.1346f, -0.4060f,
- 0.5042f, -0.2395f, -0.1329f, -0.1397f, 0.2175f, 0.2895f,
- 5.5019f, 198.9799f, 114.0018f, 94.9015f, 86.8434f, -183.4237f,
- 121.5626f, 94.8945f, 65.0803f, 93.6487f, -346.5279f, -47.6168f,
- 0.0633f, 0.0135f, -0.0692f, -0.1015f, -0.1146f, -0.1341f,
- -0.1175f, 0.4186f, 0.1505f, 130.7402f, 107.8443f, 62.8497f,
- 65.3501f, -312.7407f, 282.8321f, 98.1531f, 75.6648f, 25.8733f,
- -176.9298f, -37.2695f, -0.3760f, 0.0017f, 0.1030f, -0.1483f,
- 0.0787f, -0.0962f, 0.4109f, -0.2292f, 9.1681f, 274.3607f,
- 60.9538f, 75.9405f, 68.3776f, -167.3098f, -335.1045f, -69.2583f,
- -76.3441f, -16.5793f, 218.5244f, 28.2405f, 0.9169f, -0.0026f,
- -0.8077f, -1.5756f, -0.0804f, 0.1404f, 1.2656f, 0.0272f,
- -0.2529f, -340.8659f, -112.7778f, -58.3890f, -4.1224f, 108.1709f,
- -180.7382f, -93.7114f, -77.8686f, -131.8134f, 353.3893f, 4.8233f,
- 0.0205f, 0.0000f, -1.1654f, -0.0161f, -0.0255f, -0.0358f,
- -0.0412f, 0.1103f, 0.1041f, -188.9934f, -110.1792f, -88.6301f,
- -93.7226f, 336.9746f,
+ 0.7770f, 1.0881f, 0.0177f, 0.4939f, -0.2541f, -0.2672f, -0.1705f,
+ -0.1940f, -0.6395f, 1.2928f, 3.6240f, 2.4445f, 1.6790f, 0.0265f,
+ 0.1897f, 0.1776f, 0.0422f, 0.0197f, -0.0466f, 0.0462f, -1.0827f,
+ 2.0231f, 1.8044f, 2.7022f, 0.0064f, 0.2255f, -0.0552f, -0.1010f,
+ -0.0581f, -0.0781f, 0.2614f, -3.4085f, 1.7478f, 0.1155f, -0.1458f,
+ -0.0031f, -0.1797f, -0.4378f, -0.0539f, 0.0607f, -0.1347f, -0.3142f,
+ -0.2014f, -0.4484f, -0.2808f, 1.5913f, 0.0046f, -0.0610f, -0.6479f,
+ -0.7278f, -0.5592f, -0.6695f, -0.8120f, 2.9056f, -1.1501f, 9.3618f,
+ 4.2486f, 0.0011f, -0.1499f, -0.0834f, 0.1282f, 0.0409f, 0.1670f,
+ -0.1398f, -0.4661f, 13.7700f, 8.2061f, -0.0685f, 0.0061f, -0.2951f,
+ 0.0169f, 0.0520f, 0.0040f, 0.0374f, 0.0467f, -0.0107f, 14.2664f,
+ -2.2489f, -0.2516f, -0.0061f, -0.9921f, 0.1223f, 0.1212f, 0.1199f,
+ 0.1185f, -0.4867f, 0.0325f, -5.0757f, -8.7853f, 1.0450f, 0.0169f,
+ 0.5462f, 0.0051f, 0.1330f, 0.0143f, 0.1429f, -0.0258f, 0.2769f,
+ -12.8839f, 22.3093f, 1.2761f, 0.0037f, -1.2459f, -0.0466f, 0.0003f,
+ -0.0464f, -0.0067f, 0.2361f, 0.0355f, 23.3833f, 10.9218f, 2.6811f,
+ 0.0222f, -1.1055f, 0.1825f, 0.0575f, 0.0114f, -0.1259f, 0.3148f,
+ -2.0047f, 11.9559f, 5.7375f, 0.8802f, 0.0042f, -0.2469f, -0.1040f,
+ -1.5679f, 0.1969f, -0.0184f, 0.0157f, 0.6688f, 3.4492f,
};
static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
- { -175.6918f, 43.4519f, 154.196f, -81.1015f, -0.0758f,
- 136.5695f, 110.8713f, 142.029f, -153.0901f, -145.2688f };
+ {
+ 4.5051f, -4.5858f, 1.4693f, 0.f, 3.7968f, -3.6292f,
+ -7.3112f, 10.9743f, 8.027f, -2.2692f, -8.748f, -1.3689f,
+ };
static const float
av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
HIDDEN_LAYERS_1_NODES] = {
- -0.1727f, -0.2859f, -0.3757f, -0.4260f, -0.5441f, -0.0666f, -0.3792f,
- -0.1335f, -0.1521f, -0.0821f, -3.1590f, 0.2711f, 0.5889f, 0.0878f,
- 0.4693f, 0.7773f, -9.2989f, 0.0414f, 0.4485f, 22.8958f, -3.7024f,
- -2.4672f, -43.2908f, 0.0956f, 0.4431f, 2.3429f, 1.7183f, 0.3985f,
- -0.2275f, -3.1583f, -0.3485f, 0.3280f, 0.3763f, 0.2069f, 0.4231f,
- 0.7366f, -6.9527f, 0.0713f, 0.1359f, 16.6500f, -1.7655f, -0.1651f,
- 0.1280f, -0.2678f, -0.2120f, 1.6243f, 1.8773f, -0.7543f, -0.3292f,
- -0.7627f, -0.2001f, -0.1125f, -0.8100f, -0.1866f, 0.0567f, -0.4002f,
- 3.2429f, 0.6427f, -0.3759f, -11.6518f, -2.2893f, 0.7708f, -1.8637f,
- 1.7148f, 0.3124f, -0.7129f, -0.4927f, 0.1964f, -0.2570f, -25.0783f,
- 2.5061f, 0.1457f, -1.1239f, 0.0570f, -0.2526f, -0.0669f, 0.6791f,
- 1.1531f, -0.7246f, -0.3180f, -0.0015f, -0.0061f, -0.1626f, -0.0181f,
- 0.1271f, -0.0140f, -0.6027f, 0.0736f, -0.0157f, 1.2420f, -6.4055f,
- 0.2128f, -0.0386f, 0.3446f, 0.1840f, -0.7208f, -1.6979f, -0.0442f,
- 0.3230f, -1.9745f,
+ -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f, -0.0027f, -0.2136f,
+ -1.2094f, 0.0935f, -0.1403f, -0.1477f, -0.0752f, 0.1519f, -0.4726f,
+ -0.3521f, 0.4199f, -0.0168f, -0.2927f, -0.2510f, 0.0706f, -0.2920f,
+ 0.2046f, -0.0400f, -0.2114f, 0.4240f, -0.7070f, 0.4964f, 0.4471f,
+ 0.3841f, -0.0918f, -0.6140f, 0.6056f, -0.1123f, 0.3944f, -0.0178f,
+ -1.7702f, -0.4434f, 0.0560f, 0.1565f, -0.0793f, -0.0041f, 0.0052f,
+ -0.1843f, 0.2400f, -0.0605f, 0.3196f, -0.0286f, -0.0002f, -0.0595f,
+ -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f,
+ 0.1900f, -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f,
+ 0.0212f, -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f,
+ -0.0226f, 0.0616f, -0.3453f, 0.0810f, 0.4838f, -0.3780f, -1.4486f,
+ 0.7777f, -0.0459f, -0.6568f, 0.0589f, -1.0286f, -0.6001f, 0.0826f,
+ 0.4794f, -0.0586f, -0.1759f, 0.3811f, -0.1313f, 0.3829f, -0.0968f,
+ -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f, 0.0470f,
+ -0.2432f, 0.3013f, -0.0743f, -0.3479f, 0.0749f, -5.2490f, 0.0209f,
+ -0.1653f, -0.0826f, -0.0535f, 0.3225f, -0.3786f, -0.0104f, 0.3091f,
+ 0.3652f, 0.1757f, -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f,
+ -0.5539f,
};
static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
- { 0.f, 70.3414f, 9.6036f, -118.1096f, 49.2507f,
- 95.1849f, 81.8015f, 167.0967f, -337.7945f, 169.8344f };
+ {
+ 11.9337f, -0.3681f, -6.1324f, 12.674f, 9.0956f,
+ 4.6069f, -4.4158f, -12.4848f, 10.8473f, 5.7633f,
+ };
static const float
av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
- -0.3627f, 1.2272f, 0.2201f, -1.7406f, -0.6885f,
- 0.8487f, -0.2761f, 0.7731f, -5.2096f, -0.7351f,
+ 0.3245f, 0.2979f, -0.157f, -0.1441f, 0.1413f,
+ -0.7496f, -0.1737f, -0.5322f, 0.0748f, 0.2518f,
};
static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
- 48.2331f,
+ 4.6065f,
};
static const NN_CONFIG av1_pustats_dist_nnconfig = {
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
new file mode 100644
index 0000000000..14d23f10fb
--- /dev/null
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// 22 float features +
+// 2 categorical features with 4 possible values, converted to one-hot vectors.
+// So, total 22 + 2 * 4 = 30 features.
+#define NUM_FEATURES 30
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_HIDDEN_NODES 96
+#define NUM_OUTPUTS 1
+
+//------------------------------------------------------------------------------
+// RDCost model
+
+static const float
+ av1_rdcost_model_nn_weights_layer0[NUM_FEATURES * NUM_HIDDEN_NODES] = {
+ -0.0699f, 0.2790f, 0.1915f, 0.2669f, 0.4637f, 0.4095f,
+ 0.2129f, 0.0634f, 0.2306f, -0.2232f, -0.5711f, -0.6493f,
+ -0.7406f, -0.8440f, 0.4105f, 0.1392f, 0.5218f, -0.1618f,
+ -0.1719f, 0.3409f, 0.1111f, -0.3609f, -0.2929f, 0.3869f,
+ -0.5373f, 0.0700f, 0.2572f, 0.2483f, -0.0314f, 0.5228f,
+ 0.0169f, -0.1357f, 0.0419f, -0.1722f, 0.1303f, 0.1198f,
+ -0.0013f, 0.1309f, 0.0293f, -0.1941f, 0.0668f, -0.0643f,
+ -0.0381f, 0.1249f, -0.0731f, -0.1649f, 0.0964f, 0.0270f,
+ 0.1354f, 0.0538f, -0.2064f, -0.2067f, -0.0569f, 0.0449f,
+ 0.1680f, -0.0732f, -0.0785f, 0.1884f, -0.2137f, -0.0189f,
+ 0.2976f, 0.2818f, -0.0222f, 0.2658f, 0.0488f, 0.2778f,
+ -0.1110f, 0.2069f, -0.0072f, -0.0095f, -0.1105f, -0.1365f,
+ -0.4245f, -0.4751f, -0.0736f, 0.2333f, 0.0653f, -0.0249f,
+ 0.0055f, -0.0838f, -0.0489f, -0.2597f, 0.2621f, -0.0251f,
+ -0.0545f, 0.0816f, -0.0816f, 0.3396f, -0.1047f, 0.3678f,
+ 0.1487f, -0.0270f, 0.2574f, 0.1018f, 0.2560f, -0.0598f,
+ -0.0446f, -0.1792f, 0.5336f, -0.1590f, -0.9820f, -0.6514f,
+ -0.6304f, -0.8359f, -0.0699f, 0.0295f, -0.0057f, -0.3088f,
+ -0.1466f, 0.2220f, -0.1980f, -0.3400f, -0.1228f, 0.2667f,
+ -0.4816f, 0.0155f, -0.0194f, 0.2051f, 0.0513f, 0.1575f,
+ -121.4240f, -126.6840f, -124.1106f, -127.6184f, -85.0333f, -26.6396f,
+ 2.7020f, 102.0452f, -85.5128f, 0.0076f, 122.2206f, 107.5265f,
+ 108.3773f, 93.4847f, 20.3705f, -89.6993f, -176.9070f, -41.7543f,
+ -123.0293f, -91.6437f, -205.7099f, -62.5346f, -83.2987f, 21.3830f,
+ 56.6341f, -120.8647f, -127.7562f, -121.6688f, -127.4225f, -74.8045f,
+ -15.9247f, -14.6468f, -14.7788f, -15.4498f, -18.5514f, -11.1579f,
+ -5.8164f, -3.4318f, 0.8100f, 0.0642f, 203.5111f, 189.6872f,
+ 190.4776f, 176.4784f, -4.9427f, -12.5324f, -7.6861f, 21.9182f,
+ -6.7864f, -7.1906f, -8.1292f, 21.4780f, -7.8016f, -5.2653f,
+ 61.8526f, -15.5105f, -14.6900f, -14.1459f, -15.4350f, -19.1379f,
+ -0.7876f, -1.8558f, -4.6035f, -6.8405f, -0.2904f, 2.3202f,
+ 1.8127f, -2.9397f, -0.8187f, -0.6098f, 22.6173f, 10.3668f,
+ 12.9363f, 2.4541f, 6.6700f, 0.3804f, -3.3117f, 8.5464f,
+ -25.8348f, 1.8698f, -9.5753f, 8.5558f, -16.3576f, 7.2217f,
+ 35.3115f, -1.1447f, -2.6530f, -4.7027f, -5.7024f, -0.9513f,
+ 0.8393f, 0.7085f, 0.7879f, 0.3728f, 3.0574f, 1.1360f,
+ 26.0531f, 4.1084f, -1.7340f, 0.1683f, -450.7927f, -444.5818f,
+ -442.5239f, -438.1168f, 2.4924f, -0.0147f, -0.0797f, -47.5322f,
+ -1.7638f, -0.8608f, -0.6500f, -44.4326f, -0.9027f, 2.5560f,
+ -267.6517f, 0.2642f, 0.9457f, 0.7944f, 0.3609f, 3.2742f,
+ -74.3400f, -81.6894f, -76.2162f, -69.2979f, -90.2476f, -39.7389f,
+ 2.2545f, 36.5095f, -60.1129f, -1.0383f, 87.0348f, 83.9940f,
+ 83.7199f, 80.8609f, 14.9075f, -78.7405f, -74.3549f, -4.2382f,
+ -23.9739f, -91.8469f, -67.2654f, -21.5293f, -9.9857f, 11.8391f,
+ 35.8223f, -74.2551f, -81.0729f, -73.8347f, -70.3798f, -86.8052f,
+ 0.1701f, -0.1136f, 0.0060f, -0.0496f, -0.1727f, 0.0195f,
+ -0.1040f, 0.1027f, 0.0467f, -0.2538f, -0.1322f, 0.0860f,
+ 0.0093f, -0.2801f, -0.0958f, 0.0497f, -0.0582f, -0.0311f,
+ 0.1840f, 0.0752f, 0.0282f, 0.0297f, 0.0607f, 0.0650f,
+ 0.0893f, 0.1297f, 0.0373f, 0.0040f, -0.0973f, 0.0248f,
+ -0.1419f, 0.0322f, -0.0712f, 0.0860f, -0.0426f, -0.1989f,
+ 0.1393f, -0.1183f, 0.0735f, -0.1895f, 0.1447f, -0.0056f,
+ -0.1833f, 0.0884f, 0.0949f, 0.0476f, 0.0551f, 0.2125f,
+ -0.1537f, -0.0141f, -0.2182f, 0.1567f, 0.0457f, -0.1485f,
+ -0.1177f, 0.0391f, 0.1982f, -0.1288f, 0.1165f, -0.2019f,
+ 0.4550f, 0.5179f, 0.4311f, 0.1861f, 0.6199f, 0.4542f,
+ 0.2034f, 0.1128f, 1.3489f, -0.2525f, -2.1139f, -2.2444f,
+ -2.3679f, -2.3378f, 0.5682f, 0.1348f, 0.3032f, -1.5835f,
+ 0.2883f, 0.1693f, 0.0439f, -1.4556f, 0.3818f, 0.4875f,
+ -1.8899f, 0.2510f, 0.6450f, 0.6082f, 0.5962f, 0.8131f,
+ 12.0281f, 13.3899f, 13.6249f, 15.8068f, -1.5453f, 6.7456f,
+ -6.0877f, 26.2596f, 6.2223f, -0.5922f, 134.1428f, 128.8985f,
+ 128.7538f, 123.0920f, 1.3207f, 18.3069f, 15.7436f, 46.5230f,
+ 24.7455f, 15.0688f, 19.9965f, 34.7236f, 19.7171f, 1.2018f,
+ 49.7274f, 11.8957f, 13.1578f, 14.0451f, 15.3544f, -3.5601f,
+ 1.0048f, 0.9479f, 1.1832f, 2.0635f, -2.9808f, 2.0803f,
+ -7.5815f, 8.4733f, -4.2008f, 0.1217f, 226.5257f, 210.7018f,
+ 211.6235f, 195.2605f, 0.8283f, 1.0977f, 1.4858f, 41.1242f,
+ 1.5822f, 0.8742f, 2.0440f, 33.6213f, 1.6177f, 0.9661f,
+ 65.0014f, 1.4197f, 1.0109f, 1.3153f, 1.5470f, -3.2833f,
+ 2.0858f, 2.0012f, 2.1088f, 2.5593f, -0.9422f, 1.8554f,
+ -6.5378f, 0.6780f, 2.3186f, 0.0506f, 218.3285f, 203.4055f,
+ 204.0362f, 188.7854f, 0.3701f, 2.5257f, 3.5172f, 28.8144f,
+ 2.1511f, 3.4676f, 2.6337f, 28.5113f, 2.4254f, -0.0548f,
+ 59.4511f, 2.0757f, 2.1551f, 2.2271f, 2.5300f, -1.4173f,
+ 91.9240f, 88.2142f, 83.6155f, 82.2482f, -9.2566f, 10.9654f,
+ -2.6974f, 62.6750f, -3.6298f, -0.1245f, 69.6721f, 67.1340f,
+ 66.9162f, 64.1994f, -83.6778f, 76.8107f, 69.7832f, 64.9261f,
+ 68.4901f, 76.3615f, 70.8108f, 63.5435f, 69.1973f, -83.6034f,
+ 24.8275f, 90.1923f, 87.6831f, 82.9783f, 81.8558f, -7.1010f,
+ 95.1656f, 88.3853f, 80.5835f, 79.5990f, -3.0720f, 8.1290f,
+ -0.6151f, 63.6425f, -4.5833f, -0.0063f, 70.1861f, 66.6250f,
+ 66.6148f, 63.0886f, -89.2863f, 74.7684f, 64.8897f, 60.4134f,
+ 62.5241f, 78.7076f, 61.7234f, 60.1688f, 61.9509f, -89.4098f,
+ 30.3361f, 92.9144f, 88.5954f, 79.6336f, 79.2453f, -0.4101f,
+ 0.6287f, 0.8050f, 0.4417f, 0.5419f, 0.5972f, 1.3037f,
+ 0.4316f, -0.0013f, -0.3673f, -0.4952f, 6.1773f, 5.7825f,
+ 6.1705f, 5.3848f, 1.7607f, -0.0152f, -0.2924f, 0.8199f,
+ 1.3326f, 0.7197f, -0.6332f, 1.1127f, 1.0472f, 1.8468f,
+ 3.4419f, 0.8233f, 0.7175f, 0.8514f, 0.6372f, 0.9472f,
+ -0.0813f, -0.0197f, -0.0096f, -0.2015f, 0.1133f, -0.0305f,
+ 0.0578f, 0.1375f, -0.0750f, -0.1702f, 0.1246f, -0.1782f,
+ 0.2017f, 0.0425f, -0.0602f, 0.1837f, 0.1044f, -0.1273f,
+ -0.1431f, 0.0672f, -0.1807f, -0.1045f, -0.1355f, -0.0497f,
+ -0.0561f, -0.0633f, 0.1907f, -0.0777f, 0.1203f, 0.0754f,
+ 0.4079f, 0.2001f, 0.0558f, 0.0622f, 0.2951f, 0.6541f,
+ -0.0068f, 0.1070f, 0.4469f, -0.1266f, -1.3035f, -1.3324f,
+ -1.3612f, -0.9966f, 0.7986f, 0.3192f, -0.5028f, -0.3844f,
+ -0.4079f, 0.6690f, -0.5109f, -0.2719f, -0.4958f, 1.0310f,
+ -0.8044f, 0.1447f, 0.4221f, 0.3194f, 0.3063f, 0.5520f,
+ 0.4667f, -5.7238f, -0.5602f, 12.6339f, -15.1865f, -14.9035f,
+ -3.0726f, 9.5347f, -24.6225f, -2.7086f, 89.8557f, 95.0657f,
+ 93.8693f, 99.1085f, -35.9483f, -18.0363f, -1.6298f, 25.3484f,
+ 39.3975f, -15.3199f, 5.7664f, 17.2367f, 25.2788f, -36.5648f,
+ 29.1426f, 0.3857f, -5.2117f, 0.0533f, 12.1707f, -11.1735f,
+ 0.2673f, 0.0090f, 0.1574f, 0.0904f, 0.0281f, 0.1144f,
+ 0.1123f, -0.0061f, 0.0954f, -0.0094f, -0.4387f, -0.5006f,
+ -0.2560f, -0.2326f, -0.1769f, 0.0465f, 0.1273f, -0.1627f,
+ 0.2987f, -0.3041f, 0.1131f, -0.3620f, 0.0932f, -0.0649f,
+ -0.4597f, 0.2535f, -0.0994f, 0.1390f, 0.1279f, 0.4207f,
+ -39.1159f, -42.6382f, -38.4225f, -31.2301f, -28.2382f, -28.1176f,
+ -9.5822f, 1.1886f, -1.2964f, -0.7908f, 154.9819f, 147.1914f,
+ 147.0482f, 138.7535f, -21.7014f, -35.7117f, -28.8802f, -3.8968f,
+ -21.5007f, -28.2213f, -28.4878f, -3.7558f, -26.8317f, -22.8491f,
+ 50.9464f, -37.0918f, -42.8811f, -39.3079f, -32.1904f, -26.6354f,
+ -72.5346f, -75.5751f, -72.6896f, -71.3671f, -35.3279f, -21.6077f,
+ -5.8259f, 38.7516f, -6.8012f, 0.0172f, 170.0685f, 157.4452f,
+ 158.2334f, 145.0102f, 10.0653f, -45.1775f, -56.4571f, -5.1165f,
+ -75.8980f, -46.8672f, -55.3642f, -6.5631f, -81.0258f, 10.1348f,
+ 55.9786f, -70.8124f, -75.7040f, -73.9831f, -70.8786f, -34.9723f,
+ 88.6239f, 86.5330f, 80.9333f, 79.6833f, -10.0096f, 10.6312f,
+ -4.2350f, 62.6230f, -3.2991f, -0.0843f, 75.8659f, 72.7886f,
+ 72.5301f, 68.8265f, -81.8276f, 70.3025f, 62.9511f, 62.5706f,
+ 69.1842f, 69.3637f, 65.4820f, 65.4357f, 71.5347f, -82.1064f,
+ 24.1925f, 86.2418f, 85.4985f, 80.4091f, 79.5378f, -9.3877f,
+ -7.6594f, -4.9581f, -10.6385f, -20.2307f, -44.2261f, -13.7557f,
+ -4.5344f, 18.1793f, -10.5522f, -1.5878f, 110.3187f, 102.4945f,
+ 102.3305f, 94.1324f, -25.2665f, 9.8172f, -4.4791f, 69.4972f,
+ -6.7571f, 5.8378f, -11.6101f, 70.7066f, -4.9327f, -24.0513f,
+ 41.4598f, -7.0600f, -7.0940f, -10.2478f, -18.9616f, -46.7505f,
+ 90.9365f, 86.0260f, 73.2934f, 69.3406f, 3.3863f, 3.8524f,
+ 0.6536f, 63.2150f, -10.6304f, 0.0291f, 73.0071f, 69.7660f,
+ 69.0457f, 65.5611f, -92.3379f, 74.2756f, 54.5025f, 84.3183f,
+ 53.7481f, 73.5624f, 55.3827f, 82.3242f, 53.5432f, -92.5355f,
+ 25.3457f, 89.1858f, 84.4763f, 72.9840f, 69.1889f, 4.6719f,
+ -0.0129f, 0.1995f, 0.2069f, 0.0358f, 0.1209f, -0.1185f,
+ -0.1217f, -0.1456f, 0.0125f, -0.1354f, 0.0510f, -0.0572f,
+ 0.1397f, 0.1453f, -0.0086f, 0.0107f, 0.0232f, 0.1508f,
+ 0.0884f, -0.0967f, -0.1786f, 0.1361f, -0.1399f, -0.2021f,
+ -0.0242f, -0.2169f, 0.0133f, 0.0116f, -0.1489f, -0.0093f,
+ -0.0796f, 0.1507f, 0.0906f, 0.0228f, -0.0166f, -0.1875f,
+ 0.0471f, 0.1184f, -0.0007f, -0.2732f, -0.1386f, -0.2057f,
+ -0.0213f, -0.1699f, 0.0996f, 0.1562f, 0.1850f, -0.0362f,
+ -0.2059f, 0.0258f, -0.0135f, -0.1276f, 0.0034f, 0.2023f,
+ 0.0857f, -0.0085f, -0.1955f, -0.1666f, -0.0920f, 0.0971f,
+ -0.0292f, -0.0512f, -0.0753f, -0.0739f, -0.0873f, -0.1200f,
+ 0.0220f, -0.1359f, 0.2013f, -0.0445f, 0.1143f, -0.1484f,
+ -0.1556f, -0.0003f, 0.1711f, -0.0724f, -0.0531f, 0.1126f,
+ 0.0476f, -0.0057f, 0.0088f, 0.0792f, -0.0438f, -0.1118f,
+ -0.0244f, 0.0712f, 0.0930f, -0.0203f, 0.1662f, -0.0695f,
+ -12.3872f, -18.7022f, -13.4237f, -1.4731f, -18.6843f, -14.1515f,
+ -7.5057f, 40.2090f, -2.7774f, -1.8433f, 123.6006f, 119.0557f,
+ 118.2758f, 113.6423f, -32.6216f, -19.5865f, -16.2897f, 17.2068f,
+ 6.3559f, -17.8742f, 0.7098f, 11.5970f, -10.1104f, -33.1830f,
+ 39.5617f, -10.5499f, -17.8137f, -14.7185f, -2.6172f, -14.6004f,
+ 0.3893f, 0.4443f, 0.5305f, 0.3049f, 0.8316f, 0.8679f,
+ 0.2265f, 0.2393f, 1.1970f, -0.2891f, -1.8666f, -1.8266f,
+ -1.6984f, -1.8787f, 0.8706f, 0.4208f, 0.5076f, -0.8436f,
+ -0.1623f, 0.8008f, 0.1512f, -1.0839f, -0.3002f, 0.9263f,
+ -1.3031f, 0.5964f, 0.3413f, 0.5551f, 0.2618f, 0.7018f,
+ -0.1320f, -0.1944f, -0.0209f, -0.0877f, 0.0721f, -0.0840f,
+ 0.0589f, 0.1019f, 0.1927f, -0.2011f, -0.1117f, 0.1575f,
+ 0.1080f, -0.0516f, 0.2154f, -0.1231f, 0.0426f, -0.0522f,
+ -0.1824f, -0.1923f, -0.1206f, -0.1724f, -0.0798f, 0.0401f,
+ -0.2170f, 0.0293f, -0.0853f, 0.1517f, 0.2128f, -0.1934f,
+ 0.0406f, 0.0517f, 0.0822f, -0.0150f, 0.0943f, -0.0989f,
+ -0.1802f, -0.1453f, -0.1967f, -0.1797f, 0.1545f, -0.1217f,
+ 0.1755f, -0.1604f, -0.0515f, 0.0509f, 0.0310f, -0.1220f,
+ -0.1770f, -0.0157f, 0.1989f, -0.0069f, 0.1766f, 0.1267f,
+ -0.0517f, -0.0396f, 0.0346f, 0.1946f, 0.1162f, -0.1345f,
+ -106.6179f, -110.5917f, -107.5476f, -108.0601f, -61.1687f, -22.4247f,
+ 2.6632f, 109.5208f, -66.1177f, 0.0062f, 159.9339f, 144.7755f,
+ 145.5032f, 128.9872f, 18.9180f, -75.3569f, -105.0866f, -52.0704f,
+ -119.1299f, -74.7543f, -109.9468f, -59.0682f, -104.5754f, 19.2878f,
+ 67.2573f, -104.8061f, -111.8610f, -106.6751f, -107.3537f, -56.4758f,
+ -0.6967f, -0.8495f, -0.9586f, -1.0461f, 1.4522f, -0.2762f,
+ 28.2828f, 2.9157f, -2.1062f, 0.1566f, -467.2388f, -461.0685f,
+ -459.0092f, -453.8370f, 1.5422f, -0.8186f, -0.4884f, -53.0399f,
+ -2.0255f, -1.1348f, -1.1039f, -50.2489f, -1.4821f, 1.8021f,
+ -258.0319f, -1.0865f, -0.5542f, -1.0443f, -1.2732f, 1.8413f,
+ 0.2377f, 0.1937f, -0.0116f, 0.0935f, -0.0599f, 0.0118f,
+ -0.0875f, 0.0455f, -0.1301f, -0.1081f, -0.2622f, -0.1960f,
+ 0.0393f, -0.1490f, 0.1852f, -0.0964f, -0.0741f, 0.0419f,
+ 0.1162f, -0.0274f, 0.1200f, -0.0333f, -0.1337f, 0.2141f,
+ 0.0664f, 0.1044f, -0.1744f, 0.1060f, -0.1468f, 0.0679f,
+ 0.0218f, 0.0494f, 0.1064f, 0.1363f, 0.0013f, 0.1331f,
+ -0.2095f, 0.2088f, -0.0399f, -0.1811f, 0.0678f, -0.1974f,
+ 0.1855f, -0.0968f, -0.2008f, 0.0162f, -0.0096f, -0.1493f,
+ 0.2170f, -0.1248f, -0.2055f, 0.1276f, -0.0269f, -0.1697f,
+ -0.0662f, 0.1073f, -0.0029f, -0.1051f, -0.1573f, 0.2106f,
+ -0.2020f, -0.1565f, 0.0335f, -0.1818f, -0.1665f, 0.2169f,
+ 0.1974f, -0.1470f, -0.1738f, -0.2038f, 0.0558f, -0.0441f,
+ 0.0065f, -0.1485f, -0.1366f, -0.2131f, 0.1042f, 0.0349f,
+ -0.1804f, -0.1361f, -0.0116f, -0.1012f, -0.0860f, 0.0606f,
+ -0.2077f, 0.1826f, -0.1014f, -0.0721f, -0.1517f, 0.1022f,
+ -0.1110f, -0.0186f, 0.1505f, 0.1797f, 0.0911f, 0.0340f,
+ 0.1702f, -0.1404f, -0.0566f, -0.2744f, -0.1943f, -0.1871f,
+ 0.0046f, 0.0306f, -0.0436f, 0.1625f, -0.1302f, 0.0175f,
+ 0.1570f, -0.1425f, 0.0779f, 0.1398f, 0.0929f, 0.0897f,
+ 0.0458f, -0.0936f, 0.1321f, -0.1355f, 0.0974f, 0.0457f,
+ -73.3516f, -75.0655f, -72.1062f, -72.4624f, -34.8640f, -14.3727f,
+ -4.4720f, 66.4982f, -18.8358f, 0.0397f, 174.2172f, 160.4959f,
+ 161.1034f, 147.3250f, 9.5507f, -45.0180f, -73.1609f, -1.5230f,
+ -74.8677f, -43.8559f, -68.7622f, -4.8971f, -82.1922f, 9.6490f,
+ 64.7115f, -71.8566f, -75.3879f, -72.5479f, -71.7161f, -34.8056f,
+ 0.1442f, 0.1558f, 0.1267f, -0.1261f, -0.0506f, -0.0823f,
+ -0.1807f, -0.0889f, -0.2098f, -0.1295f, -0.2046f, -0.1749f,
+ -0.1197f, -0.1380f, 0.0799f, -0.0889f, -0.1209f, 0.1919f,
+ 0.1947f, -0.2086f, -0.1042f, -0.0468f, 0.0232f, 0.1052f,
+ -0.0535f, 0.1398f, 0.1713f, -0.1522f, 0.1453f, 0.0286f,
+ -64.8503f, -67.6746f, -63.6497f, -60.4614f, -35.6091f, -20.1605f,
+ -3.6082f, 84.2801f, -37.8552f, -2.2371f, 132.4947f, 123.5057f,
+ 123.5776f, 113.9060f, -14.8772f, -40.7130f, -79.1391f, -10.7024f,
+ -65.7831f, -43.6078f, -79.6847f, -13.0743f, -69.2533f, -16.0171f,
+ 50.4868f, -64.3678f, -68.7061f, -64.0823f, -59.3413f, -28.9405f,
+ 77.1601f, 75.4899f, 69.8696f, 67.8764f, -22.7548f, 5.9814f,
+ -3.2826f, 57.9754f, -5.9500f, -0.0014f, 77.2251f, 74.0737f,
+ 73.7004f, 70.5072f, -80.9661f, 69.3065f, 55.8337f, 76.8831f,
+ 57.9902f, 63.4765f, 56.4748f, 70.0282f, 61.0874f, -81.3960f,
+ 26.2594f, 76.0367f, 74.9115f, 69.2361f, 66.9262f, -20.1637f,
+ 0.1886f, -0.1108f, 0.1262f, 0.0189f, 0.1382f, 0.0859f,
+ -0.1874f, -0.1986f, -0.0171f, -0.1400f, -0.2944f, -0.0750f,
+ -0.0395f, -0.2092f, -0.0878f, 0.1216f, -0.0870f, -0.1613f,
+ 0.2495f, 0.0754f, 0.0244f, -0.1205f, -0.0196f, -0.1729f,
+ 0.1170f, 0.1585f, 0.1482f, -0.1705f, -0.1337f, 0.0199f,
+ 13.0897f, 9.1111f, 6.7413f, 6.3907f, -28.1187f, 0.4556f,
+ -5.3116f, 30.7293f, -16.3644f, -0.0365f, 118.9118f, 111.6125f,
+ 111.3227f, 103.4680f, -30.1883f, 8.9328f, -4.1876f, 79.3936f,
+ -9.0522f, 12.7861f, -1.2736f, 78.0446f, -5.9485f, -30.5716f,
+ 27.8951f, 13.9613f, 6.7173f, 5.2345f, 8.3271f, -27.3705f,
+ 1.0488f, 1.0864f, 1.0710f, 1.7332f, -3.0561f, 1.1622f,
+ -7.6688f, 3.0491f, -1.3865f, 0.0769f, 222.5451f, 207.8170f,
+ 208.1767f, 193.1396f, 0.4447f, 2.1654f, 1.8929f, 35.1469f,
+ 1.1783f, 2.6199f, 1.1611f, 26.2989f, 3.4446f, 0.1551f,
+ 65.6529f, 1.2229f, 0.9851f, 1.0241f, 1.4373f, -3.3421f,
+ 0.1388f, 0.0756f, 0.2047f, 0.1140f, 0.0945f, 0.2038f,
+ 0.1038f, -0.2068f, -0.0626f, -0.1937f, 0.1347f, -0.0464f,
+ -0.0866f, 0.0250f, 0.0264f, -0.1556f, -0.1625f, 0.1028f,
+ -0.1255f, -0.0854f, 0.1033f, 0.0008f, -0.2133f, -0.0317f,
+ 0.1725f, -0.1054f, -0.1900f, 0.0383f, 0.0440f, -0.1900f,
+ -30.0811f, -30.9929f, -29.3194f, -26.8347f, -20.5957f, -4.1595f,
+ -1.9066f, 42.4707f, -9.0435f, 0.0064f, 175.7328f, 163.1350f,
+ 163.5085f, 151.1648f, 4.4620f, -20.6011f, -19.3402f, 1.5468f,
+ -32.0920f, -25.4581f, -12.3706f, -2.1636f, -32.4569f, 3.9365f,
+ 61.0117f, -28.4195f, -31.0837f, -30.2749f, -27.5522f, -22.8688f,
+ -0.3000f, 0.0092f, -0.3675f, -0.4113f, 0.0033f, 0.1138f,
+ 0.2182f, -0.5803f, 0.7507f, -0.2529f, -1.7724f, -1.4702f,
+ -1.5805f, -1.4294f, 0.1435f, -0.0168f, 0.2356f, -0.4373f,
+ -0.4500f, -0.4803f, -0.0041f, -0.3878f, 0.1321f, 0.2761f,
+ -1.1975f, -0.3509f, -0.0465f, -0.4050f, -0.1110f, 0.2233f,
+ 0.0950f, 0.0974f, -0.1600f, -0.1753f, -0.0328f, 0.0741f,
+ -0.0706f, 0.1839f, -0.0833f, -0.1367f, -0.1094f, -0.1739f,
+ -0.1069f, 0.0370f, -0.1404f, 0.1631f, -0.1570f, 0.2117f,
+ -0.1891f, 0.0395f, 0.1081f, 0.1760f, 0.0997f, 0.0853f,
+ -0.1018f, 0.1306f, -0.0924f, -0.2078f, 0.0801f, -0.0949f,
+ 0.5803f, 0.5578f, 0.4089f, 0.1912f, 0.6774f, 0.3145f,
+ 0.3992f, -0.1316f, 1.3142f, -0.2457f, -2.3536f, -2.4939f,
+ -2.3165f, -2.4879f, 0.2321f, 0.1901f, 0.1789f, -1.5215f,
+ 0.2645f, 0.2231f, 0.2411f, -1.2361f, 0.2971f, 0.1421f,
+ -1.6715f, 0.3158f, 0.2476f, 0.3596f, 0.3029f, 0.9297f,
+ -88.8401f, -89.5209f, -86.1926f, -87.4196f, -39.6504f, -17.9684f,
+ -4.2702f, 80.2017f, -29.1676f, -0.4190f, 150.2820f, 138.4751f,
+ 139.1087f, 126.6569f, 13.7188f, -57.0739f, -80.3383f, -18.8351f,
+ -87.4103f, -56.0072f, -82.7707f, -23.1871f, -93.6787f, 13.9287f,
+ 59.6213f, -87.4843f, -90.4227f, -86.2635f, -86.6841f, -37.9086f,
+ 0.1184f, -0.2169f, -0.1915f, 0.0543f, 0.1253f, -0.1370f,
+ 0.0836f, -0.1198f, 0.1544f, -0.2004f, -0.1118f, -0.0786f,
+ 0.1517f, -0.1000f, -0.1055f, 0.0936f, -0.1579f, 0.1098f,
+ -0.0234f, -0.0499f, 0.0951f, -0.1711f, 0.0186f, -0.2008f,
+ 0.1777f, 0.1386f, -0.1495f, -0.0684f, -0.2149f, -0.1198f,
+ -0.6205f, -0.7209f, -0.5487f, -0.9080f, 1.3400f, 0.0085f,
+ 28.2837f, 3.2217f, -1.8463f, 0.1620f, -464.3599f, -458.4327f,
+ -455.9967f, -451.0393f, 1.6619f, -0.6944f, -0.3167f, -52.3630f,
+ -1.6971f, -0.7340f, -0.8923f, -49.2771f, -1.1177f, 1.8810f,
+ -258.9386f, -1.0765f, -0.7279f, -0.5208f, -0.8839f, 1.8175f,
+ -78.8510f, -80.5740f, -77.8843f, -77.9798f, -36.5560f, -16.0818f,
+ -5.5362f, 66.4228f, -16.8150f, 0.0036f, 181.8365f, 167.7181f,
+ 168.2344f, 153.9725f, 11.2659f, -47.5786f, -92.6978f, 6.7573f,
+ -68.7704f, -48.3850f, -95.3637f, 8.8888f, -76.9497f, 11.2243f,
+ 60.9020f, -77.6515f, -80.7610f, -78.4537f, -77.4659f, -36.2872f,
+ -0.0936f, 0.1966f, -0.2121f, 0.0193f, 0.0489f, -0.1445f,
+ 0.0060f, 0.0358f, -0.0783f, -0.0985f, -0.2072f, -0.0802f,
+ -0.0185f, 0.1868f, -0.0631f, 0.1260f, -0.0675f, 0.2167f,
+ -0.2174f, -0.1085f, 0.1483f, -0.1655f, -0.1040f, 0.1605f,
+ -0.1673f, -0.0148f, -0.1856f, -0.1454f, 0.1603f, -0.1620f,
+ -0.9205f, -1.2716f, -3.6561f, -5.0834f, -0.7934f, 1.8710f,
+ 2.2999f, -2.9516f, -1.7631f, -0.3804f, 41.2998f, 26.2358f,
+ 28.9763f, 15.7315f, 5.2164f, 3.2963f, -5.4457f, 18.6310f,
+ -25.0076f, 5.4368f, -12.0085f, 17.1462f, -14.6992f, 5.6365f,
+ 48.6207f, -1.0921f, -1.8723f, -3.5354f, -5.1774f, -1.0200f,
+ -0.1065f, -0.2021f, 0.0332f, 0.1692f, -0.1239f, 0.1325f,
+ -0.0660f, -0.0567f, 0.2107f, -0.2084f, -0.0263f, 0.1411f,
+ 0.0178f, 0.0451f, 0.2024f, -0.1756f, -0.0771f, -0.1690f,
+ -0.2097f, -0.2130f, 0.0714f, 0.0172f, -0.0310f, 0.0649f,
+ -0.1550f, 0.0701f, 0.0306f, -0.1750f, -0.1988f, -0.2060f,
+ 0.0005f, -0.1325f, -0.1823f, -0.0900f, -0.1291f, -0.1817f,
+ 0.0144f, 0.0951f, -0.1954f, -0.0171f, -0.1985f, 0.0875f,
+ 0.0901f, -0.0857f, 0.1681f, 0.0465f, 0.1023f, 0.0985f,
+ -0.2152f, -0.1723f, -0.0825f, 0.0203f, -0.1206f, -0.1431f,
+ -0.1552f, 0.1344f, 0.0398f, 0.0169f, 0.2180f, -0.1530f,
+ 2.7964f, 2.7312f, 2.8831f, 3.4729f, -3.1366f, 2.4043f,
+ -7.2004f, 1.4128f, 2.8648f, 0.0578f, 225.5640f, 210.3712f,
+ 210.6907f, 195.0339f, 0.3140f, 1.8060f, 2.7355f, 33.6917f,
+ 3.3542f, 3.3682f, 1.7371f, 31.2424f, 3.4094f, -0.1192f,
+ 63.0864f, 3.0562f, 2.8633f, 2.6777f, 3.5495f, -4.2616f,
+ -1.4034f, 0.3930f, -4.6756f, -9.9870f, -27.8511f, 5.6071f,
+ -1.0862f, 34.4907f, -10.4831f, -0.0281f, 117.2617f, 104.9590f,
+ 106.1515f, 93.9707f, -16.8801f, 5.3036f, -21.7458f, 98.5306f,
+ -20.7596f, 6.4733f, -17.6440f, 98.3097f, -31.9540f, -17.0600f,
+ 27.4543f, -0.6140f, -1.6182f, -4.9167f, -8.9017f, -26.2485f,
+ -0.1952f, -0.0462f, -0.1958f, 0.1679f, -0.1592f, -0.1634f,
+ -0.0507f, -0.0542f, 0.0038f, -0.0343f, 0.0567f, -0.1983f,
+ 0.0250f, -0.0762f, 0.0902f, -0.0343f, 0.1240f, 0.1161f,
+ 0.1237f, 0.1870f, 0.0346f, 0.0340f, 0.0625f, -0.0355f,
+ 0.0278f, -0.1043f, 0.1755f, 0.0253f, 0.1750f, -0.2070f,
+ -5.5531f, -5.3122f, -4.9348f, -4.4782f, -7.5686f, -1.5478f,
+ -5.4341f, 0.5087f, -2.1382f, 0.0798f, 208.3677f, 194.0083f,
+ 194.4168f, 179.3082f, 1.4443f, -1.5038f, -1.4021f, 25.9363f,
+ -4.0635f, -2.6785f, -1.6640f, 22.2589f, -1.4910f, 1.4715f,
+ 59.1972f, -4.9638f, -5.1920f, -4.9193f, -5.2649f, -8.0556f,
+ 20.1226f, 12.0195f, 9.7385f, 10.7058f, -27.4201f, 8.4869f,
+ -5.0826f, 32.9212f, -2.0674f, -0.0290f, 120.5002f, 112.3222f,
+ 112.3287f, 104.1107f, -20.6293f, 14.8534f, -0.8748f, 103.1141f,
+ -1.1368f, 15.3716f, 2.7653f, 91.7285f, -0.5991f, -20.7338f,
+ 35.9363f, 20.5104f, 11.1988f, 9.0368f, 10.6355f, -26.5309f,
+ -0.2058f, -0.2176f, 0.1331f, -0.1415f, -0.0825f, -0.0470f,
+ -0.0615f, 0.1274f, 0.0076f, -0.0575f, -0.2065f, 0.0866f,
+ 0.2166f, -0.1942f, -0.1952f, 0.1323f, -0.1016f, 0.1803f,
+ -0.0424f, 0.1555f, 0.1118f, 0.1559f, 0.0337f, -0.0341f,
+ -0.0430f, 0.1988f, -0.0553f, -0.0255f, 0.1817f, 0.0608f,
+ 0.1431f, 0.0686f, -0.0245f, -0.2107f, 0.2001f, -0.0964f,
+ -0.0090f, 0.1151f, -0.0365f, -0.1986f, 0.1740f, -0.2098f,
+ 0.0013f, 0.1369f, 0.1910f, 0.1801f, -0.2019f, 0.0348f,
+ -0.1175f, 0.0627f, -0.1929f, -0.0099f, 0.1349f, 0.1804f,
+ -0.1071f, -0.1651f, -0.1146f, -0.0259f, 0.1626f, -0.0271f,
+ 0.1393f, 0.1304f, -0.0200f, 0.0924f, -0.0839f, -0.0031f,
+ -0.1311f, 0.0350f, -0.1330f, -0.0911f, 0.1949f, -0.0209f,
+ -0.1883f, 0.0269f, 0.2040f, 0.1552f, 0.1532f, 0.1157f,
+ -0.1102f, -0.1220f, -0.0808f, -0.1050f, 0.1716f, 0.0846f,
+ -0.0180f, -0.1037f, 0.2063f, 0.1237f, 0.1253f, -0.0496f,
+ -0.0183f, 0.0491f, 0.1703f, -0.0824f, -0.0702f, -0.1100f,
+ -0.0965f, 0.0130f, -0.1222f, -0.1081f, 0.0329f, 0.2115f,
+ -0.1438f, 0.0799f, -0.1602f, -0.0330f, 0.0501f, 0.1072f,
+ -0.0744f, -0.1783f, -0.0240f, 0.0777f, -0.1944f, 0.0438f,
+ -0.0033f, -0.1873f, 0.0984f, -0.0318f, 0.0773f, 0.1489f,
+ 0.3966f, 0.4711f, 0.3972f, 0.0623f, 0.5970f, 0.1018f,
+ 0.1375f, -0.1881f, 0.8921f, -0.1854f, -2.1138f, -2.1178f,
+ -1.8295f, -2.1703f, 0.5784f, -0.1937f, -0.0728f, -0.9953f,
+ 0.2442f, -0.4074f, -0.1591f, -1.1660f, 0.4832f, 0.2203f,
+ -1.4957f, 0.1544f, 0.1810f, 0.2275f, 0.4075f, 0.8153f,
+ 0.0715f, 0.0222f, 0.0463f, -0.0201f, 0.0396f, 0.5951f,
+ -0.2779f, -0.0306f, 0.7532f, -0.1596f, -4.1080f, -3.7925f,
+ -3.8522f, -3.2468f, 0.7728f, 0.0188f, -0.1448f, 0.4084f,
+ -0.4666f, -0.1036f, -1.1469f, 0.4243f, 0.2778f, 0.9023f,
+ -3.0216f, 0.0384f, -0.3348f, -0.0314f, -0.2788f, 0.0479f,
+ 139.0773f, 131.6164f, 115.0392f, 111.1817f, 41.7596f, 9.5379f,
+ 1.8542f, 46.9890f, -12.8221f, 0.0241f, 52.9779f, 51.5268f,
+ 50.8060f, 48.7028f, -132.9665f, 118.3478f, 101.1239f, 81.4608f,
+ 75.4251f, 121.0643f, 97.8947f, 86.8911f, 74.5576f, -133.7606f,
+ 29.2657f, 135.8916f, 131.3661f, 114.1687f, 111.0784f, 31.3790f,
+ -0.0807f, -0.0657f, -0.0027f, 0.0410f, 0.0765f, 0.1194f,
+ 0.0953f, -0.0060f, 0.1531f, -0.2339f, 0.1488f, -0.0615f,
+ -0.0579f, 0.0761f, 0.1250f, -0.0469f, 0.1480f, 0.0683f,
+ -0.0049f, 0.1558f, 0.2168f, -0.0736f, 0.1135f, -0.1244f,
+ 0.0725f, -0.1297f, -0.0215f, -0.0412f, -0.1632f, -0.0200f,
+ -0.1346f, -0.1954f, 0.0053f, 0.0151f, 0.1379f, -0.1497f,
+ -0.0102f, -0.0336f, 0.0900f, -0.1706f, -0.0932f, -0.2084f,
+ 0.1242f, -0.2027f, 0.0849f, -0.2139f, -0.2015f, 0.0944f,
+ -0.0984f, 0.2082f, 0.1625f, -0.0227f, -0.1676f, 0.1021f,
+ 0.1516f, 0.0245f, 0.0955f, -0.1488f, -0.0057f, 0.1783f,
+ -0.8568f, -0.8175f, -0.6282f, -1.3107f, 1.5712f, 0.1044f,
+ 28.2289f, 3.0885f, -1.9829f, 0.1600f, -465.9583f, -459.5893f,
+ -457.5055f, -452.7600f, 1.7229f, -0.6620f, -0.1065f, -52.8017f,
+ -2.0293f, -0.8224f, -1.0389f, -49.9049f, -1.2250f, 1.7647f,
+ -259.2465f, -1.0978f, -0.5169f, -0.8721f, -0.8197f, 1.9158f,
+ 16.2234f, 15.8523f, 13.8343f, 9.8509f, -21.4326f, 15.7650f,
+ -6.4451f, 34.8575f, 1.1387f, -0.0223f, 117.7213f, 109.8494f,
+ 109.7624f, 101.8532f, -20.3275f, 16.0812f, 4.9165f, 92.4919f,
+ 4.1615f, 13.8451f, 9.2112f, 97.1580f, -8.7037f, -20.4420f,
+ 27.1105f, 17.4922f, 13.9998f, 12.3888f, 11.4705f, -20.9568f,
+ 0.5457f, 0.5322f, 0.2823f, 0.3581f, 0.5359f, 0.1576f,
+ 0.1969f, -0.0136f, -0.2748f, -0.3168f, -0.3918f, -0.2167f,
+ -0.1797f, -0.1869f, 0.2986f, -0.2116f, -0.4226f, -0.2022f,
+ 0.9452f, 0.5474f, -0.1218f, 0.2067f, -0.1600f, 0.1937f,
+ 0.0808f, 0.4877f, 0.5106f, 0.2626f, 0.5076f, 0.6228f,
+ 0.5124f, 0.4044f, 0.4023f, 0.1222f, 2.5446f, 0.9623f,
+ 24.9875f, 4.7442f, -2.0551f, 0.1642f, -449.9478f, -444.1841f,
+ -442.0153f, -437.1498f, 2.3209f, -0.6986f, -0.3456f, -47.4074f,
+ -1.2374f, -1.0939f, -0.9112f, -41.1851f, -0.5064f, 2.4209f,
+ -263.4446f, -0.0433f, 0.3460f, 0.1475f, 0.3770f, 2.9154f,
+ 0.2032f, 0.1527f, 0.2161f, -0.1981f, 0.1893f, -0.2003f,
+ 0.1734f, 0.1713f, 0.1207f, -0.2073f, -0.1018f, 0.0770f,
+ 0.0728f, 0.1665f, 0.0689f, 0.1884f, -0.1399f, -0.1326f,
+ -0.0518f, -0.1948f, 0.1576f, -0.1835f, 0.1436f, 0.0497f,
+ 0.0883f, -0.1253f, -0.0417f, -0.0507f, -0.1555f, 0.2076f,
+ -2.4080f, 6.1616f, -0.8564f, -13.6773f, -32.7238f, -16.3144f,
+ -1.9828f, 20.5110f, -17.0191f, -1.7154f, 103.6642f, 95.3675f,
+ 95.5662f, 86.9504f, -35.5340f, 19.6681f, -2.4900f, 65.0847f,
+ -15.8119f, 13.7256f, -4.6753f, 63.4713f, -6.5992f, -34.2369f,
+ 41.3959f, -1.5528f, 3.8106f, -0.7762f, -12.3204f, -35.1734f,
+ -83.9509f, -87.4861f, -83.5925f, -81.5047f, -54.1256f, -45.7506f,
+ -13.5325f, -6.0331f, -8.5062f, 0.0261f, 189.9450f, 177.7870f,
+ 178.6945f, 164.9762f, 9.8521f, -68.0619f, -68.6145f, 6.5056f,
+ -55.9651f, -66.9540f, -65.3349f, -2.1954f, -57.2408f, 8.6577f,
+ 60.6966f, -82.1056f, -88.5245f, -83.3057f, -80.7283f, -50.5285f,
+ -0.1397f, 0.1862f, -0.0691f, -0.0906f, 0.1560f, 0.1377f,
+ -0.0066f, -0.0213f, 0.0708f, -0.0386f, -0.0015f, -0.0020f,
+ -0.2122f, 0.0747f, 0.0795f, 0.0229f, 0.1923f, -0.1661f,
+ 0.0895f, 0.1176f, 0.1398f, -0.0443f, 0.0934f, 0.0638f,
+ -0.1924f, 0.0602f, 0.0404f, 0.1597f, 0.1387f, -0.0601f,
+ -28.3967f, -21.8483f, -25.5175f, -29.9252f, 2.0161f, -3.0092f,
+ 7.7435f, 28.2367f, -35.0188f, -0.1578f, 105.0164f, 93.4495f,
+ 94.9134f, 81.0315f, 4.3602f, 8.1303f, -37.7665f, -16.6986f,
+ -40.8902f, 8.2542f, -33.3215f, -2.0457f, -69.0245f, 4.1016f,
+ 47.2770f, -25.8268f, -23.6034f, -26.4339f, -27.8305f, 8.4468f,
+ 13.8742f, 8.3874f, 4.2044f, 1.4619f, -40.2909f, -0.6358f,
+ -0.7982f, 36.1931f, -17.3147f, -0.3348f, 106.8135f, 96.5298f,
+ 97.8829f, 86.9994f, -25.8170f, 15.0652f, -0.9181f, 85.8544f,
+ 2.5475f, 9.8009f, -3.5931f, 89.2017f, -3.7252f, -25.2986f,
+ 22.5505f, 14.0434f, 7.0708f, 4.6646f, 1.5807f, -39.4024f,
+ -0.1436f, 0.0256f, 0.0274f, -0.2126f, 0.0401f, 0.0745f,
+ -0.0379f, -0.0357f, 0.0777f, -0.0709f, -0.1093f, -0.2047f,
+ -0.0713f, -0.0478f, -0.0908f, 0.1963f, 0.1282f, 0.0977f,
+ 0.1304f, 0.2058f, 0.0700f, 0.0518f, 0.0239f, 0.0686f,
+ -0.1909f, 0.0828f, -0.1243f, -0.1920f, 0.1908f, -0.0808f,
+ 90.8028f, 89.2894f, 84.5339f, 83.3491f, -13.3838f, 12.0240f,
+ -3.9443f, 63.0867f, -2.5321f, -0.0099f, 68.9140f, 66.3206f,
+ 66.0278f, 63.1498f, -83.7261f, 74.3448f, 73.4998f, 64.8477f,
+ 69.7701f, 74.5878f, 71.0331f, 63.2116f, 74.3162f, -83.9282f,
+ 20.8163f, 89.6818f, 88.6452f, 83.7338f, 82.9360f, -13.2357f,
+ 0.1299f, -0.1765f, -0.0168f, -0.1372f, -0.1183f, 0.0472f,
+ 0.1312f, 0.0267f, 0.0194f, -0.1593f, 0.0059f, 0.1775f,
+ 0.0668f, -0.1239f, -0.1982f, -0.1415f, -0.1659f, -0.1148f,
+ 0.0136f, 0.0913f, -0.1254f, -0.0357f, 0.0892f, 0.0835f,
+ -0.0554f, 0.1969f, -0.0888f, -0.0623f, -0.0236f, -0.1492f,
+ 0.4196f, 0.3218f, 0.2287f, 0.5095f, 0.7210f, 0.2279f,
+ 0.4523f, -0.1832f, 1.3095f, -0.2041f, -2.1443f, -2.1947f,
+ -1.9292f, -2.1142f, 0.5840f, 0.1018f, 0.1011f, -1.6565f,
+ 0.4325f, 0.0424f, 0.2836f, -1.7183f, 0.2595f, 0.2686f,
+ -1.8784f, 0.3891f, 0.3050f, 0.6195f, 0.2896f, 0.5905f,
+ -5.3024f, -3.2518f, -12.5192f, -29.1732f, 1.6538f, -1.8315f,
+ 9.9788f, 10.5155f, 6.3234f, -0.3460f, 76.9925f, 51.3785f,
+ 55.7120f, 29.0432f, 5.5901f, 25.6578f, -3.9565f, 13.0509f,
+ -106.0371f, 23.2124f, -18.2004f, 8.4618f, -69.3585f, 5.5651f,
+ 80.0565f, -6.4941f, -5.3742f, -14.4209f, -24.1565f, 6.6801f,
+ -22.0585f, -20.9909f, -26.7939f, -29.6890f, -14.5085f, 2.1866f,
+ -4.2608f, 17.3977f, -30.8824f, -0.4017f, 135.6957f, 126.9320f,
+ 127.0044f, 118.1835f, -1.8768f, -0.8629f, -32.0882f, 44.7862f,
+ -23.9174f, 1.6485f, -27.9940f, 51.9078f, -48.5279f, -1.7550f,
+ 49.9230f, -19.9785f, -22.4647f, -27.6911f, -27.3197f, -10.6545f,
+ -0.1922f, -0.1999f, -0.1396f, 0.1065f, 0.0085f, -0.1940f,
+ 0.0351f, 0.1285f, -0.0292f, -0.1296f, 0.1543f, -0.2082f,
+ -0.1758f, 0.0719f, 0.0764f, 0.1394f, -0.0255f, -0.0370f,
+ 0.1615f, -0.0568f, 0.1920f, -0.1631f, 0.0199f, 0.1884f,
+ 0.0693f, 0.1074f, -0.0273f, 0.1540f, 0.0098f, 0.2111f,
+ 0.1805f, -0.0555f, 0.1159f, 0.0469f, 0.1789f, -0.1711f,
+ -0.1304f, 0.1912f, -0.0737f, -0.1408f, 0.1804f, -0.2023f,
+ -0.0467f, -0.1019f, -0.0136f, 0.0691f, 0.1454f, -0.0213f,
+ 0.0929f, -0.0958f, 0.1299f, 0.1137f, 0.1175f, 0.1042f,
+ -0.2081f, -0.0737f, 0.0582f, 0.1640f, 0.2120f, -0.0646f,
+ -0.0326f, 0.1976f, 0.1182f, -0.1365f, -0.1784f, 0.2113f,
+ 0.0469f, 0.0763f, -0.0197f, -0.1902f, 0.1259f, 0.1598f,
+ -0.0180f, -0.1339f, -0.1675f, -0.1884f, -0.1973f, 0.1529f,
+ 0.1160f, 0.2154f, -0.1446f, -0.1395f, 0.0355f, 0.1513f,
+ -0.2086f, -0.1135f, -0.1502f, -0.0018f, 0.0486f, -0.0110f,
+ -0.0843f, -0.0716f, -0.1367f, 0.0753f, 0.0114f, 0.0475f,
+ -0.0632f, 0.2045f, -0.0512f, -0.0906f, -0.1071f, -0.1957f,
+ 0.1361f, 0.1821f, -0.1684f, -0.1383f, 0.1059f, 0.1579f,
+ -0.0064f, -0.1205f, -0.0718f, -0.1323f, -0.0174f, -0.1092f,
+ -0.1915f, 0.1978f, -0.1245f, 0.1297f, -0.1542f, 0.1556f,
+ -0.1752f, 0.0718f, -0.1020f, -0.1970f, 0.0518f, -0.0888f,
+ 0.0541f, -0.1922f, -0.1467f, -0.0653f, -0.1940f, -0.0800f,
+ -0.1096f, -0.0796f, -0.1310f, 0.0191f, -0.1077f, -0.0973f,
+ 0.1566f, 0.0074f, 0.0500f, -0.0415f, -0.2116f, 0.0227f,
+ 0.0895f, 0.1528f, 0.1404f, 0.0467f, 0.0462f, -0.0973f,
+ -0.1669f, 0.0551f, 0.1167f, -0.1470f, -0.0542f, -0.1006f,
+ 0.2104f, 0.1039f, -0.0211f, -0.1726f, -0.0694f, -0.0270f,
+ 0.0277f, -0.0715f, -0.2055f, -0.1502f, -0.1718f, -0.0043f,
+ 0.0174f, 0.1019f, -0.0233f, -0.1518f, -0.1331f, -0.0001f,
+ -0.1483f, -0.2115f, 0.0666f, 0.0014f, 0.1601f, -0.0690f,
+ };
+
+static const float av1_rdcost_model_nn_biases_layer0[NUM_HIDDEN_NODES] = {
+ 0.156824f, 0.f, 0.130013f, 0.084482f, -129.058197f, -15.090252f,
+ -3.859116f, 0.736356f, -81.361557f, -0.001922f, -0.000713f, 0.440181f,
+ 14.982646f, 1.282223f, 2.23122f, 94.26635f, 93.920929f, 0.614672f,
+ 0.f, 0.315858f, 4.746014f, 0.116901f, -35.661354f, -75.148285f,
+ 92.006989f, -14.112332f, 86.673157f, -0.000307f, -0.000544f, 0.f,
+ -7.851313f, 0.505186f, 0.f, 0.f, -111.681091f, -0.937782f,
+ 0.035789f, 0.f, 0.f, -0.00102f, -75.180527f, 0.f,
+ -63.821148f, 79.592392f, 0.085068f, 11.184906f, 1.25406f, 0.f,
+ -29.779242f, -0.181732f, 0.f, 0.425554f, -90.78405f, 0.f,
+ -0.828326f, -81.132179f, 0.f, -2.757063f, 0.f, 0.f,
+ 2.967951f, -4.440599f, 0.f, -5.105355f, 14.734543f, 0.f,
+ 0.f, 0.f, 0.f, 0.295342f, -0.026907f, 133.375412f,
+ -0.000855f, 0.f, -0.875029f, 15.665165f, 0.437296f, 0.321257f,
+ -0.001932f, -4.235782f, -87.187782f, 0.f, -28.84696f, 7.055514f,
+ 0.f, 95.548302f, -0.000425f, 0.38969f, -13.88008f, -27.347931f,
+ 0.f, 0.f, 0.f, -0.000026f, 0.f, 0.f,
+};
+
+static const float
+ av1_rdcost_model_nn_weights_layer1[NUM_HIDDEN_NODES * NUM_OUTPUTS] = {
+ -0.101706f, -0.14411f, -0.139118f, -0.132945f, 118.811302f,
+ 3.137232f, -32.969776f, -4.150725f, 26.263071f, 0.092841f,
+ 0.174125f, -0.028195f, 15.712872f, 17.722702f, 5.666006f,
+ -121.143929f, -131.933731f, -3.000318f, -0.032063f, -0.380065f,
+ -1.660653f, -0.164802f, 7.177527f, 87.759155f, -119.564224f,
+ -98.051651f, -110.581116f, -0.069982f, 0.023906f, 0.183792f,
+ 40.606274f, -0.080804f, -0.053744f, -0.187848f, 157.44313f,
+ -4.820149f, 0.089499f, 0.070232f, -0.043038f, 0.072996f,
+ 93.347313f, 0.225259f, 103.223228f, -110.682541f, 0.14314f,
+ -89.827538f, 6.505952f, -0.076949f, 73.816132f, -0.063416f,
+ -0.23736f, -0.066059f, 116.049599f, 0.120871f, -4.708246f,
+ 107.501671f, -0.206708f, -32.688675f, 0.047608f, -0.105907f,
+ 6.505825f, -75.461891f, -0.160341f, 6.532121f, -84.868111f,
+ -0.065622f, 0.044756f, 0.008672f, 0.017155f, 0.046108f,
+ -0.218818f, -126.507957f, 0.028271f, 0.180625f, -4.707376f,
+ -121.524307f, -0.03853f, -4.103166f, -0.018947f, -95.768463f,
+ 15.941695f, 0.147154f, -102.863029f, -72.521698f, -0.037133f,
+ -138.1492f, 0.210016f, -0.084692f, -68.693665f, -52.523472f,
+ -0.133385f, -0.17438f, 0.008654f, -0.035642f, -0.145202f,
+ 0.211135f,
+ };
+
+static const float av1_rdcost_model_nn_biases_layer1[NUM_OUTPUTS] = {
+ 0.251909f
+};
+
+static const NN_CONFIG av1_rdcost_model_nnconfig = {
+ NUM_FEATURES,
+ NUM_OUTPUTS,
+ NUM_HIDDEN_LAYERS,
+ {
+ NUM_HIDDEN_NODES,
+ },
+ {
+ av1_rdcost_model_nn_weights_layer0,
+ av1_rdcost_model_nn_weights_layer1,
+ },
+ {
+ av1_rdcost_model_nn_biases_layer0,
+ av1_rdcost_model_nn_biases_layer1,
+ },
+};
+
+//------------------------------------------------------------------------------
+
+#undef NUM_FEATURES
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_HIDDEN_NODES
+#undef NUM_OUTPUTS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index ac9392fa13..3aae0144e6 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -421,9 +421,9 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
projected_size_based_on_q =
av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
} else {
- projected_size_based_on_q =
- av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, MBs,
- rate_correction_factor, cm->bit_depth);
+ projected_size_based_on_q = av1_estimate_bits_at_q(
+ cpi->common.frame_type, cm->base_qindex, MBs, rate_correction_factor,
+ cm->seq_params.bit_depth);
}
// Work out a size correction factor.
if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
@@ -495,7 +495,7 @@ int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
(int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
} else {
bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb(
- cm->frame_type, i, correction_factor, cm->bit_depth);
+ cm->frame_type, i, correction_factor, cm->seq_params.bit_depth);
}
if (bits_per_mb_at_this_q <= target_bits_per_mb) {
@@ -643,7 +643,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
int q;
int *rtc_minq;
- ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
if (frame_is_intra_only(cm)) {
active_best_quality = rc->best_quality;
@@ -652,17 +653,17 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
// based on the ambient Q to reduce the risk of popping.
if (rc->this_key_frame_forced) {
int qindex = rc->last_boosted_qindex;
- double last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
- int delta_qindex = av1_compute_qdelta(
- rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth);
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ (last_boosted_q * 0.75), bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
} else if (cm->current_video_frame > 0) {
// not first frame of one pass and kf_boost is set
double q_adj_factor = 1.0;
double q_val;
- active_best_quality = get_kf_active_quality(
- rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
// Allow somewhat lower kf minq with small image formats.
if ((width * height) <= (352 * 288)) {
@@ -671,9 +672,9 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
// Convert the adjustment factor to a qindex delta
// on active_best_quality.
- q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
active_best_quality +=
- av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
}
} else if (!rc->is_src_frame_alt_ref &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -686,7 +687,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
} else {
q = active_worst_quality;
}
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
} else {
// Use the lower of active_worst_quality and recent/average Q.
if (cm->current_video_frame > 1) {
@@ -716,8 +717,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
!(cm->current_video_frame == 0)) {
int qdelta = 0;
aom_clear_system_state();
- qdelta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+ qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+ active_worst_quality, 2.0, bit_depth);
*top_index = active_worst_quality + qdelta;
*top_index = AOMMAX(*top_index, *bottom_index);
}
@@ -768,27 +769,27 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
int q;
int *inter_minq;
- ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
if (frame_is_intra_only(cm)) {
if (oxcf->rc_mode == AOM_Q) {
const int qindex = cq_level;
- const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
const int delta_qindex =
- av1_compute_qdelta(rc, q_val, q_val * 0.25, cm->bit_depth);
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
} else if (rc->this_key_frame_forced) {
const int qindex = rc->last_boosted_qindex;
- const double last_boosted_q =
- av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
const int delta_qindex = av1_compute_qdelta(
- rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth);
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
} else { // not first frame of one pass and kf_boost is set
double q_adj_factor = 1.0;
- active_best_quality = get_kf_active_quality(
- rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
// Allow somewhat lower kf minq with small image formats.
if ((width * height) <= (352 * 288)) {
@@ -798,9 +799,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
// Convert the adjustment factor to a qindex delta on active_best_quality.
{
const double q_val =
- av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
active_best_quality +=
- av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
}
}
} else if (!rc->is_src_frame_alt_ref &&
@@ -815,30 +816,30 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
// For constrained quality dont allow Q less than the cq level
if (oxcf->rc_mode == AOM_CQ) {
if (q < cq_level) q = cq_level;
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
// Constrained quality use slightly lower active best.
active_best_quality = active_best_quality * 15 / 16;
} else if (oxcf->rc_mode == AOM_Q) {
const int qindex = cq_level;
- const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
const int delta_qindex =
(cpi->refresh_alt_ref_frame)
- ? av1_compute_qdelta(rc, q_val, q_val * 0.40, cm->bit_depth)
- : av1_compute_qdelta(rc, q_val, q_val * 0.50, cm->bit_depth);
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
}
} else {
if (oxcf->rc_mode == AOM_Q) {
const int qindex = cq_level;
- const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
0.70, 1.0, 0.85, 1.0 };
const int delta_qindex = av1_compute_qdelta(
rc, q_val,
q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
- cm->bit_depth);
+ bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
} else {
// Use the lower of active_worst_quality and recent/average Q.
@@ -868,12 +869,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
aom_clear_system_state();
if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
!(cm->current_video_frame == 0)) {
- qdelta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+ qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+ active_worst_quality, 2.0, bit_depth);
} else if (!rc->is_src_frame_alt_ref &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
qdelta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
+ &cpi->rc, cm->frame_type, active_worst_quality, 1.75, bit_depth);
}
*top_index = active_worst_quality + qdelta;
*top_index = AOMMAX(*top_index, *bottom_index);
@@ -908,9 +909,9 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
};
const AV1_COMMON *const cm = &cpi->common;
- int qdelta =
- av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
- rate_factor_deltas[rf_level], cm->bit_depth);
+ int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
+ rate_factor_deltas[rf_level],
+ cm->seq_params.bit_depth);
return qdelta;
}
@@ -927,7 +928,15 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
int active_worst_quality = cpi->twopass.active_worst_quality;
int q;
int *inter_minq;
- ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+#if CUSTOMIZED_GF
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+#else
+ const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
+#endif // CUSTOMIZED_GF
if (frame_is_intra_only(cm)) {
// Handle the special case for key frames forced when we have reached
@@ -941,16 +950,16 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
active_best_quality = qindex;
- last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
- last_boosted_q * 1.25, cm->bit_depth);
+ last_boosted_q * 1.25, bit_depth);
active_worst_quality =
AOMMIN(qindex + delta_qindex, active_worst_quality);
} else {
qindex = rc->last_boosted_qindex;
- last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
- last_boosted_q * 0.75, cm->bit_depth);
+ last_boosted_q * 0.75, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
}
} else {
@@ -960,7 +969,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// Baseline value derived from cpi->active_worst_quality and kf boost.
active_best_quality =
- get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+ get_kf_active_quality(rc, active_worst_quality, bit_depth);
// Allow somewhat lower kf minq with small image formats.
if ((width * height) <= (352 * 288)) {
@@ -972,12 +981,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// Convert the adjustment factor to a qindex delta
// on active_best_quality.
- q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
active_best_quality +=
- av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
}
} else if (!rc->is_src_frame_alt_ref &&
- (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+ (cpi->refresh_golden_frame || is_intrl_arf_boost ||
cpi->refresh_alt_ref_frame)) {
// Use the lower of active_worst_quality and recent
// average Q as basis for GF/ARF best Q limit unless last frame was
@@ -992,24 +1001,45 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
if (oxcf->rc_mode == AOM_CQ) {
if (q < cq_level) q = cq_level;
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
// Constrained quality use slightly lower active best.
active_best_quality = active_best_quality * 15 / 16;
} else if (oxcf->rc_mode == AOM_Q) {
- if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
+ if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
active_best_quality = cq_level;
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-
- // Modify best quality for second level arfs. For mode AOM_Q this
- // becomes the baseline frame q.
- if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
- active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ ++this_height;
+ }
+ } else {
+#endif
+ // Modify best quality for second level arfs. For mode AOM_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
}
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality =
+ (active_best_quality + active_worst_quality + 1) / 2;
+ ++this_height;
+ }
+ }
+#endif
}
} else {
if (oxcf->rc_mode == AOM_Q) {
@@ -1031,7 +1061,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
if (frame_is_intra_only(cm) ||
(!rc->is_src_frame_alt_ref &&
- (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+ (cpi->refresh_golden_frame || is_intrl_arf_boost ||
cpi->refresh_alt_ref_frame))) {
active_best_quality -=
(cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
@@ -1056,7 +1086,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// Modify active_best_quality for downscaled normal frames.
if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
int qdelta = av1_compute_qdelta_by_rate(
- rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth);
+ rc, cm->frame_type, active_best_quality, 2.0, bit_depth);
active_best_quality =
AOMMAX(active_best_quality + qdelta, rc->best_quality);
}
@@ -1164,6 +1194,16 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
static void update_golden_frame_stats(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const GF_GROUP *const gf_group = &twopass->gf_group;
+ const int is_intrnl_arf = // in pass 2 taken from the GF group's update type, otherwise from the refresh flag
+ cpi->oxcf.pass == 2
+ ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+ : cpi->refresh_alt2_ref_frame;
+#else // !CUSTOMIZED_GF: must define the same name, is_intrnl_arf is read further down
+ const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif // CUSTOMIZED_GF
// Update the Golden frame usage counts.
// NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
@@ -1184,14 +1224,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
} else if (!rc->source_alt_ref_pending) {
rc->source_alt_ref_active = 0;
}
-
- // Decrement count down till next gf
- if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
-
- } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
- // Decrement count down till next gf
- if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
-
+ } else if (!cpi->refresh_alt_ref_frame && !is_intrnl_arf) {
rc->frames_since_golden++;
}
}
@@ -1199,6 +1232,17 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
const AV1_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const GF_GROUP *const gf_group = &twopass->gf_group;
+ const int is_intrnl_arf =
+ cpi->oxcf.pass == 2
+ ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+ : cpi->refresh_alt2_ref_frame;
+#else
+ const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif
+
const int qindex = cm->base_qindex;
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
@@ -1218,13 +1262,13 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
} else {
if (!rc->is_src_frame_alt_ref &&
- !(cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+ !(cpi->refresh_golden_frame || is_intrnl_arf ||
cpi->refresh_alt_ref_frame)) {
rc->last_q[INTER_FRAME] = qindex;
rc->avg_frame_qindex[INTER_FRAME] =
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
rc->ni_frames++;
- rc->tot_q += av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
rc->avg_q = rc->tot_q / rc->ni_frames;
// Calculate the average Q for normal inter frames (not key or GFU
// frames).
@@ -1240,7 +1284,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
// This is used to help set quality in forced key frames to reduce popping
if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
(!rc->constrained_gf_group &&
- (cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+ (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
(cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
rc->last_boosted_qindex = qindex;
}
@@ -1591,6 +1635,10 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
+#if FIX_GF_INTERVAL_LENGTH
+ rc->max_gf_interval = FIXED_GF_LENGTH + 1;
+#endif
+
// Clamp min to max
rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index 81157ce723..f0508da9e9 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -24,6 +24,20 @@ extern "C" {
// Bits Per MB at different Q (Multiplied by 512)
#define BPER_MB_NORMBITS 9
+#define CUSTOMIZED_GF 1 // 1: internal-ARF checks in ratectrl.c consult gf_group->update_type instead of only cpi->refresh_alt2_ref_frame
+#define FIX_GF_INTERVAL_LENGTH 0 // 1: av1_rc_set_gf_interval_range() sets rc->max_gf_interval to FIXED_GF_LENGTH + 1
+
+#if FIX_GF_INTERVAL_LENGTH
+#define FIXED_GF_LENGTH 16
+#define USE_SYMM_MULTI_LAYER 1 // gates the pyramid_level-based active_best_quality adjustment in rc_pick_q_and_bounds_two_pass()
+#else
+#define USE_SYMM_MULTI_LAYER 0
+#endif // FIX_GF_INTERVAL_LENGTH
+
+#if USE_SYMM_MULTI_LAYER
+#define USE_MANUAL_GF4_STRUCT 0
+#endif // USE_SYMM_MULTI_LAYER
+
#define MIN_GF_INTERVAL 4
#define MAX_GF_INTERVAL 16
#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index 17f23e5ec7..c4d4777bfe 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -44,9 +44,6 @@
#define RD_THRESH_POW 1.25
-// Factor to weigh the rate for switchable interp filters.
-#define SWITCHABLE_INTERP_RATE_FACTOR 1
-
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
@@ -357,9 +354,10 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
};
int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
- const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth);
+ const int64_t q =
+ av1_dc_quant_Q3(qindex, 0, cpi->common.seq_params.bit_depth);
int64_t rdmult = 0;
- switch (cpi->common.bit_depth) {
+ switch (cpi->common.seq_params.bit_depth) {
case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
@@ -394,7 +392,7 @@ static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
}
void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
- switch (cpi->common.bit_depth) {
+ switch (cpi->common.seq_params.bit_depth) {
case AOM_BITS_8:
x->sadperbit16 = sad_per_bit16lut_8[qindex];
x->sadperbit4 = sad_per_bit4lut_8[qindex];
@@ -420,7 +418,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
cm->y_dc_delta_q,
0, MAXQ);
- const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+ const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
// Threshold here seems unnecessarily harsh but fine given actual
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index 281b676b0f..692367d7a5 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -43,6 +43,9 @@ extern "C" {
#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC 1
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
// This enumerator type needs to be kept aligned with the mode order in
// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
typedef enum {
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index 6f4fced871..fef6d28755 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -58,8 +58,11 @@
#include "av1/encoder/tokenize.h"
#include "av1/encoder/tx_prune_model_weights.h"
+#define DNN_BASED_RD_INTERP_FILTER 0
+
// Set this macro as 1 to collect data about tx size selection.
#define COLLECT_TX_SIZE_DATA 0
+
#if COLLECT_TX_SIZE_DATA
static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
#endif
@@ -916,9 +919,9 @@ static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
int activity_masking = 0;
int i, j;
- DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
- DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
- DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
for (i = 0; i < bsize_h; i++) {
for (j = 0; j < bsize_w; j++) {
e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
@@ -944,9 +947,9 @@ static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
int activity_masking = 0;
- DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
- DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
- DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
int i, j;
for (i = 0; i < bsize_h; i++) {
for (j = 0; j < bsize_w; j++) {
@@ -975,8 +978,8 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
int i, j;
const MACROBLOCKD *xd = &x->e_mbd;
- DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
- DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]);
assert(bsw >= 8);
assert(bsh >= 8);
@@ -1068,8 +1071,8 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
int i, j;
const MACROBLOCKD *xd = &x->e_mbd;
- DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
- DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]);
assert(bsw >= 8);
assert(bsh >= 8);
@@ -1112,7 +1115,7 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
} else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
int coeff_shift = AOMMAX(xd->bd - 8, 0);
- DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]);
for (i = 0; i < bsh; i++) {
for (j = 0; j < bsw; j++) {
@@ -1146,11 +1149,15 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
const int bh = block_size_high[bsize];
unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
- const int f_index = bsize - BLOCK_16X16;
- if (f_index < 0) {
- const int w_shift = bw == 8 ? 1 : 2;
- const int h_shift = bh == 8 ? 1 : 2;
- if (cpi->common.use_highbitdepth) {
+ if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+ // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+ // functions for the 16 (very small) sub-blocks of this block.
+ const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+ const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+ assert(bw <= 32);
+ assert(bh <= 32);
+ assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+ if (cpi->common.seq_params.use_highbitdepth) {
const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
for (int i = 0; i < bh; ++i)
@@ -1168,43 +1175,49 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
(src[j + i * src_stride] - dst[j + i * dst_stride]);
}
}
- } else {
- cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
- cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+ const int f_index =
+ (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+ assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+ assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+ assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
&esq[1]);
- cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
&esq[2]);
- cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
dst_stride, &esq[3]);
src += bh / 4 * src_stride;
dst += bh / 4 * dst_stride;
- cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]);
- cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
&esq[5]);
- cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
&esq[6]);
- cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
dst_stride, &esq[7]);
src += bh / 4 * src_stride;
dst += bh / 4 * dst_stride;
- cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]);
- cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
&esq[9]);
- cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
&esq[10]);
- cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
dst_stride, &esq[11]);
src += bh / 4 * src_stride;
dst += bh / 4 * dst_stride;
- cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]);
- cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
&esq[13]);
- cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
&esq[14]);
- cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
dst_stride, &esq[15]);
}
@@ -1371,16 +1384,27 @@ static void get_energy_distribution_finer(const int16_t *diff, int stride,
unsigned int esq[256];
const int w_shift = bw <= 8 ? 0 : 1;
const int h_shift = bh <= 8 ? 0 : 1;
- const int esq_w = bw <= 8 ? bw : bw / 2;
- const int esq_h = bh <= 8 ? bh : bh / 2;
+ const int esq_w = bw >> w_shift;
+ const int esq_h = bh >> h_shift;
const int esq_sz = esq_w * esq_h;
int i, j;
memset(esq, 0, esq_sz * sizeof(esq[0]));
- for (i = 0; i < bh; i++) {
- unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
- const int16_t *cur_diff_row = diff + i * stride;
- for (j = 0; j < bw; j++) {
- cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j];
+ if (w_shift) {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j += 2) {
+ cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+ cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+ }
+ }
+ } else {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j++) {
+ cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+ }
}
}
@@ -1558,9 +1582,9 @@ static const float *prune_2D_adaptive_thresholds[] = {
NULL,
};
-static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
- int blk_row, int blk_col, TxSetType tx_set_type,
- TX_TYPE_PRUNE_MODE prune_mode) {
+static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int blk_row, int blk_col, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_mode) {
static const int tx_type_table_2D[16] = {
DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
@@ -1636,7 +1660,7 @@ static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
const float score_thresh =
prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
- int prune_bitmask = 0;
+ uint16_t prune_bitmask = 0;
for (int i = 0; i < 16; i++) {
if (scores_2D[i] < score_thresh && i != max_score_i)
prune_bitmask |= (1 << tx_type_table_2D[i]);
@@ -1644,9 +1668,27 @@ static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
return prune_bitmask;
}
+// Indexed by bits 0-3 of the prune value (see gen_tx_search_prune_mask); bit tx_type of an entry is ((prune >> vtx_tab[tx_type]) & 1)
+static const uint16_t prune_v_mask[] = { // 16 entries, one per 4-bit flag combination; entry[a | b] == entry[a] | entry[b]
+ 0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff,
+ 0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff,
+};
+
+// Indexed by bits 8-11 of the prune value (see gen_tx_search_prune_mask); bit tx_type of an entry is ((prune >> (htx_tab[tx_type] + 8)) & 1)
+static const uint16_t prune_h_mask[] = { // 16 entries, one per 4-bit flag combination; entry[a | b] == entry[a] | entry[b]
+ 0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff,
+ 0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff,
+};
+
+// Expands the packed 1D prune flags (vertical in bits 0-3, horizontal in
+// bits 8-11) into a 16-bit mask with one bit per 2D tx_type. A set bit means
+// the tx_type is pruned, the same convention prune_tx_2D() uses for its
+// return value.
+static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) {
+ uint8_t prune_v = tx_search_prune & 0x0F; // vertical 1D flags
+ uint8_t prune_h = (tx_search_prune >> 8) & 0x0F; // horizontal 1D flags
+ // A tx_type is pruned when EITHER of its 1D components is flagged; this is
+ // the '|' of the per-direction tests the two tables were generated from.
+ // Using '&' would prune nothing whenever only one direction has flags set.
+ return (prune_v_mask[prune_v] | prune_h_mask[prune_h]);
+}
+
static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
const MACROBLOCKD *const xd, int tx_set_type) {
- av1_zero(x->tx_search_prune);
+ x->tx_search_prune[tx_set_type] = 0;
x->tx_split_prune_flag = 0;
const MB_MODE_INFO *mbmi = xd->mi[0];
if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
@@ -1656,24 +1698,24 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
int tx_set = ext_tx_set_index[1][tx_set_type];
assert(tx_set >= 0);
const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
+ int prune = 0;
switch (cpi->sf.tx_type_search.prune_mode) {
case NO_PRUNE: return;
case PRUNE_ONE:
if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return;
- x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd);
+ prune = prune_one_for_sby(cpi, bsize, x, xd);
+ x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
break;
case PRUNE_TWO:
if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return;
- x->tx_search_prune[tx_set_type] =
- prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
- }
- if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
- x->tx_search_prune[tx_set_type] =
- prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+ prune = prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+ } else if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
+ prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+ } else {
+ prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
}
- x->tx_search_prune[tx_set_type] =
- prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+ x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
break;
case PRUNE_2D_ACCURATE:
case PRUNE_2D_FAST: break;
@@ -1681,17 +1723,6 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
}
}
-static int do_tx_type_search(TX_TYPE tx_type, int prune,
- TX_TYPE_PRUNE_MODE mode) {
- // TODO(sarahparker) implement for non ext tx
- if (mode >= PRUNE_2D_ACCURATE) {
- return !((prune >> tx_type) & 1);
- } else {
- return !(((prune >> vtx_tab[tx_type]) & 1) |
- ((prune >> (htx_tab[tx_type] + 8)) & 1));
- }
-}
-
static void model_rd_from_sse(const AV1_COMP *const cpi,
const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
int plane, int64_t sse, int *rate,
@@ -1764,9 +1795,11 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
for (plane = plane_from; plane <= plane_to; ++plane) {
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE bs =
+ const BLOCK_SIZE plane_bsize =
get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- unsigned int sse;
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ int64_t sse;
int rate;
int64_t dist;
@@ -1774,14 +1807,14 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
// TODO(geza): Write direct sse functions that do not compute
// variance as well.
- cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
- &sse);
+ sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
- if (plane == 0) x->pred_sse[ref] = sse;
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
total_sse += sse;
- model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);
+ model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
@@ -1934,7 +1967,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
int blk_row, int blk_col,
const BLOCK_SIZE plane_bsize,
- const BLOCK_SIZE tx_bsize) {
+ const BLOCK_SIZE tx_bsize,
+ int force_sse) {
int visible_rows, visible_cols;
const MACROBLOCKD *xd = &x->e_mbd;
get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -1944,13 +1978,17 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
#if CONFIG_DIST_8X8
int txb_height = block_size_high[tx_bsize];
int txb_width = block_size_wide[tx_bsize];
- if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) {
+ if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 &&
+ txb_height >= 8) {
const int src_stride = x->plane[plane].src.stride;
const int src_idx = (blk_row * src_stride + blk_col)
<< tx_size_wide_log2[0];
+ const int diff_idx = (blk_row * diff_stride + blk_col)
+ << tx_size_wide_log2[0];
const uint8_t *src = &x->plane[plane].src.buf[src_idx];
- return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
- txb_height, visible_cols, visible_rows, x->qindex);
+ return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride,
+ txb_width, txb_height, visible_cols, visible_rows,
+ x->qindex);
}
#endif
diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
@@ -2182,10 +2220,14 @@ static void get_2x2_normalized_sses_and_sads(
for (int col = 0; col < 2; ++col) {
const int16_t *const this_src_diff =
src_diff + row * half_height * diff_stride + col * half_width;
- sse_norm_arr[row * 2 + col] =
- get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
- sad_norm_arr[row * 2 + col] =
- get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+ if (sse_norm_arr) {
+ sse_norm_arr[row * 2 + col] =
+ get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ if (sad_norm_arr) {
+ sad_norm_arr[row * 2 + col] =
+ get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
}
}
} else { // use function pointers to calculate stats
@@ -2199,28 +2241,35 @@ static void get_2x2_normalized_sses_and_sads(
const uint8_t *const this_dst =
dst + row * half_height * dst_stride + col * half_width;
- unsigned int this_sse;
- cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
- dst_stride, &this_sse);
- sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+ if (sse_norm_arr) {
+ unsigned int this_sse;
+ cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+ dst_stride, &this_sse);
+ sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+ }
- const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
- this_src, src_stride, this_dst, dst_stride);
- sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+ if (sad_norm_arr) {
+ const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+ this_src, src_stride, this_dst, dst_stride);
+ sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+ }
}
}
}
}
#if CONFIG_COLLECT_RD_STATS
-// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
-// 0: Do not collect any RD stats
-// 1: Collect RD stats for transform units
-// 2: Collect RD stats for partition units
+ // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+ // 0: Do not collect any RD stats
+ // 1: Collect RD stats for transform units
+ // 2: Collect RD stats for partition units
+
+#if CONFIG_COLLECT_RD_STATS == 1
static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const RD_STATS *const rd_stats, int blk_row,
int blk_col, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, TX_TYPE tx_type) {
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int64_t rd) {
if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
// Generate small sample to restrict output size.
@@ -2304,9 +2353,12 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+ fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
fprintf(fout, "\n");
fclose(fout);
}
+#endif // CONFIG_COLLECT_RD_STATS == 1
#if CONFIG_COLLECT_RD_STATS == 2
static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2327,12 +2379,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const int plane = 0;
struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;
const int dequant_shift =
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
const int q_step = pd->dequant_Q3[1] >> dequant_shift;
- const double num_samples = bw * bh;
const double rate_norm = (double)rd_stats->rate / num_samples;
const double dist_norm = (double)rd_stats->dist / num_samples;
@@ -2343,23 +2397,28 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const uint8_t *const src = p->src.buf;
const int dst_stride = pd->dst.stride;
const uint8_t *const dst = pd->dst.buf;
- unsigned int sse;
- cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ const int16_t *const src_diff = p->src_diff;
+ const int shift = (xd->bd - 8);
+
+ int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh);
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
const double sse_norm = (double)sse / num_samples;
const unsigned int sad =
cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
- const double sad_norm = (double)sad / num_samples;
+ const double sad_norm =
+ (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
fprintf(fout, " %g %g", sse_norm, sad_norm);
- const int diff_stride = block_size_wide[plane_bsize];
- const int16_t *const src_diff = p->src_diff;
-
double sse_norm_arr[4], sad_norm_arr[4];
get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
dst_stride, src_diff, diff_stride,
sse_norm_arr, sad_norm_arr);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+ }
for (int i = 0; i < 4; ++i) {
fprintf(fout, " %g", sse_norm_arr[i]);
}
@@ -2376,7 +2435,8 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const double model_dist_norm = (double)model_dist / num_samples;
fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
- const double mean = get_mean(src_diff, diff_stride, bw, bh);
+ double mean = get_mean(src_diff, diff_stride, bw, bh);
+ mean /= (1 << shift);
double hor_corr, vert_corr;
get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
@@ -2393,20 +2453,19 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
#endif // CONFIG_COLLECT_RD_STATS == 2
#endif // CONFIG_COLLECT_RD_STATS
-static void model_rd_with_dnn(const AV1_COMP *const cpi,
- const MACROBLOCK *const x, BLOCK_SIZE bsize,
- int plane, unsigned int *rsse, int *rate,
- int64_t *dist) {
+static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane, int64_t *rsse,
+ int *rate, int64_t *dist) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
const int log_numpels = num_pels_log2_lookup[plane_bsize];
- const int num_samples = (1 << log_numpels);
const struct macroblock_plane *const p = &x->plane[plane];
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
+ int bw, bh;
+ const int diff_stride = block_size_wide[plane_bsize];
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;
const int dequant_shift =
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
const int q_step = pd->dequant_Q3[1] >> dequant_shift;
@@ -2415,55 +2474,73 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi,
const uint8_t *const src = p->src.buf;
const int dst_stride = pd->dst.stride;
const uint8_t *const dst = pd->dst.buf;
- unsigned int sse;
- cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ const int16_t *const src_diff = p->src_diff;
+ const int shift = (xd->bd - 8);
+ int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh);
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
const double sse_norm = (double)sse / num_samples;
- const int diff_stride = block_size_wide[plane_bsize];
- const int16_t *const src_diff = p->src_diff;
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ if (rsse) *rsse = sse;
+ return;
+ }
+ if (plane) {
+ int model_rate;
+ int64_t model_dist;
+ model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate,
+ &model_dist);
+ if (rate) *rate = model_rate;
+ if (dist) *dist = model_dist;
+ if (rsse) *rsse = sse;
+ return;
+ }
- double sse_norm_arr[4], sad_norm_arr[4];
+ double sse_norm_arr[4];
get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
dst_stride, src_diff, diff_stride,
- sse_norm_arr, sad_norm_arr);
- const double mean = get_mean(src_diff, diff_stride, bw, bh);
+ sse_norm_arr, NULL);
+ double mean = get_mean(src_diff, bw, bw, bh);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ mean /= (1 << shift);
+ }
const double variance = sse_norm - mean * mean;
+ assert(variance >= 0.0);
const double q_sqr = (double)(q_step * q_step);
- const double q_sqr_by_variance = q_sqr / variance;
+ const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
double hor_corr, vert_corr;
get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
- double hdist[4] = { 0 }, vdist[4] = { 0 };
- get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
- dst_stride, 1, hdist, vdist);
- float features[20];
- features[0] = (float)hdist[0];
- features[1] = (float)hdist[1];
- features[2] = (float)hdist[2];
- features[3] = (float)hdist[3];
- features[4] = (float)hor_corr;
- features[5] = (float)log_numpels;
- features[6] = (float)mean;
- features[7] = (float)q_sqr;
- features[8] = (float)q_sqr_by_variance;
- features[9] = (float)sse_norm_arr[0];
- features[10] = (float)sse_norm_arr[1];
- features[11] = (float)sse_norm_arr[2];
- features[12] = (float)sse_norm_arr[3];
- features[13] = (float)sse_norm_arr[3];
- features[14] = (float)variance;
- features[15] = (float)vdist[0];
- features[16] = (float)vdist[1];
- features[17] = (float)vdist[2];
- features[18] = (float)vdist[3];
- features[19] = (float)vert_corr;
-
- float rate_f, dist_f;
- av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_f);
+ float features[11];
+ features[0] = (float)hor_corr;
+ features[1] = (float)log_numpels;
+ features[2] = (float)q_sqr;
+ features[3] = (float)q_sqr_by_sse_norm;
+ features[4] = (float)sse_norm_arr[0];
+ features[5] = (float)sse_norm_arr[1];
+ features[6] = (float)sse_norm_arr[2];
+ features[7] = (float)sse_norm_arr[3];
+ features[8] = (float)sse_norm;
+ features[9] = (float)variance;
+ features[10] = (float)vert_corr;
+
+ float rate_f, dist_by_sse_norm_f;
+ av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
- const int rate_i = (int)(AOMMAX(0.0, rate_f * (1 << log_numpels)) + 0.5);
- const int64_t dist_i =
- (int64_t)(AOMMAX(0.0, dist_f * (1 << log_numpels)) + 0.5);
+ const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+
+ // Check if skip is better
+ if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) {
+ dist_i = sse << 4;
+ rate_i = 0;
+ } else if (rate_i == 0) {
+ dist_i = sse << 4;
+ }
+
if (rate) *rate = rate_i;
if (dist) *dist = dist_i;
if (rsse) *rsse = sse;
@@ -2488,15 +2565,18 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
x->pred_sse[ref] = 0;
for (int plane = plane_from; plane <= plane_to; ++plane) {
- unsigned int sse;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t sse;
int rate;
int64_t dist;
if (x->skip_chroma_rd && plane) continue;
- model_rd_with_dnn(cpi, x, bsize, plane, &sse, &rate, &dist);
+ model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist);
- if (plane == 0) x->pred_sse[ref] = sse;
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
total_sse += sse;
rate_sum += rate;
@@ -2586,27 +2666,16 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
int rate_cost = 0;
TX_TYPE txk_start = DCT_DCT;
TX_TYPE txk_end = TX_TYPES - 1;
- if (!(!is_inter && x->use_default_intra_tx_type) &&
- !(is_inter && x->use_default_inter_tx_type))
- if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan)
- if (plane == 0) txk_end = DCT_DCT;
+ if ((!is_inter && x->use_default_intra_tx_type) ||
+ (is_inter && x->use_default_inter_tx_type)) {
+ txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+ } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
+ if (plane == 0) txk_end = DCT_DCT;
+ }
uint8_t best_txb_ctx = 0;
const TxSetType tx_set_type =
av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
- int prune = 0;
- const int do_prune = plane == 0 && !fast_tx_search && txk_end != DCT_DCT &&
- !(!is_inter && x->use_default_intra_tx_type) &&
- !(is_inter && x->use_default_inter_tx_type) &&
- cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
- if (do_prune && is_inter) {
- if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
- prune = prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col,
- tx_set_type, cpi->sf.tx_type_search.prune_mode);
- } else {
- prune = x->tx_search_prune[tx_set_type];
- }
- }
TX_TYPE uv_tx_type = DCT_DCT;
if (plane) {
@@ -2615,39 +2684,38 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
cm->reduced_tx_set_used);
}
- if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+ const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+ ext_tx_used_flag == 0x0001) {
txk_start = txk_end = DCT_DCT;
}
-
- int8_t allowed_tx_mask[TX_TYPES] = { 0 }; // 1: allow; 0: skip.
- int allowed_tx_num = 0;
- if (fast_tx_search) {
- allowed_tx_mask[DCT_DCT] = 1;
- allowed_tx_mask[H_DCT] = 1;
- allowed_tx_mask[V_DCT] = 1;
+ uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip.
+ if (txk_start == txk_end) {
+ allowed_tx_mask = 1 << txk_start;
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else if (fast_tx_search) {
+ allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT
+ allowed_tx_mask &= ext_tx_used_flag;
} else {
- memset(allowed_tx_mask + txk_start, 1, txk_end - txk_start + 1);
- }
- for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
- if (do_prune) {
- if (!do_tx_type_search(tx_type, prune, cpi->sf.tx_type_search.prune_mode))
- allowed_tx_mask[tx_type] = 0;
- }
- if (plane == 0 && allowed_tx_mask[tx_type]) {
- if (!av1_ext_tx_used[tx_set_type][tx_type])
- allowed_tx_mask[tx_type] = 0;
- else if (!is_inter && x->use_default_intra_tx_type &&
- tx_type != get_default_tx_type(0, xd, tx_size))
- allowed_tx_mask[tx_type] = 0;
- else if (is_inter && x->use_default_inter_tx_type &&
- tx_type != get_default_tx_type(0, xd, tx_size))
- allowed_tx_mask[tx_type] = 0;
- }
- allowed_tx_num += allowed_tx_mask[tx_type];
+ assert(plane == 0);
+ allowed_tx_mask = ext_tx_used_flag;
+ // !fast_tx_search && txk_end != txk_start && plane == 0
+ const int do_prune = cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
+ if (do_prune && is_inter) {
+ if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
+ const uint16_t prune =
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ cpi->sf.tx_type_search.prune_mode);
+ allowed_tx_mask &= (~prune);
+ } else {
+ allowed_tx_mask &= (~x->tx_search_prune[tx_set_type]);
+ }
+ }
}
// Need to have at least one transform type allowed.
- if (allowed_tx_num == 0) {
- allowed_tx_mask[plane ? uv_tx_type : DCT_DCT] = 1;
+ if (allowed_tx_mask == 0) {
+ txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
+ allowed_tx_mask = (1 << txk_start);
}
int use_transform_domain_distortion =
@@ -2664,20 +2732,21 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
cpi->sf.use_transform_domain_distortion == 1 &&
use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
!x->cb_partition_scan;
- if (calc_pixel_domain_distortion_final && allowed_tx_num <= 1)
+ if (calc_pixel_domain_distortion_final &&
+ (txk_start == txk_end || allowed_tx_mask == 0x0001))
calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
const uint16_t *eobs_ptr = x->plane[plane].eobs;
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
int64_t block_sse =
- pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
+ pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1);
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
block_sse *= 16;
for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
- if (!allowed_tx_mask[tx_type]) continue;
+ if (!(allowed_tx_mask & (1 << tx_type))) continue;
if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
RD_STATS this_rd_stats;
av1_invalid_rd_stats(&this_rd_stats);
@@ -2686,8 +2755,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
- rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
- tx_size, txb_ctx, use_fast_coef_costing);
+ rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+ txb_ctx, use_fast_coef_costing);
} else {
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, tx_type, AV1_XFORM_QUANT_FP);
@@ -2696,13 +2765,18 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
// Calculate distortion quickly in transform domain.
dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
&this_rd_stats.sse);
- rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
- tx_size, txb_ctx, use_fast_coef_costing);
+
+ const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+ const int64_t dist_cost_estimate =
+ RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+ if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+
+ rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+ txb_ctx, use_fast_coef_costing);
const int64_t rd_estimate =
AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
RDCOST(x->rdmult, 0, this_rd_stats.sse));
- if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd))
- continue;
+ if (rd_estimate - (rd_estimate >> 3) > best_rd_) continue;
}
av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
&rate_cost);
@@ -2741,7 +2815,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
#if CONFIG_COLLECT_RD_STATS == 1
if (plane == 0) {
PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
- plane_bsize, tx_size, tx_type);
+ plane_bsize, tx_size, tx_type, rd);
}
#endif // CONFIG_COLLECT_RD_STATS == 1
@@ -3097,6 +3171,7 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
MACROBLOCK *x, int *r, int64_t *d, int *s,
int64_t *sse, int64_t ref_best_rd) {
RD_STATS rd_stats;
+ av1_subtract_plane(x, bs, 0);
x->rd_model = LOW_TXFM_RD;
int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs,
max_txsize_rect_lookup[bs], FTXS_NONE);
@@ -3267,7 +3342,7 @@ static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
palette_mode_cost +=
av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
- n_cache, cpi->common.bit_depth);
+ n_cache, cpi->common.seq_params.bit_depth);
palette_mode_cost +=
av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
total_rate += palette_mode_cost;
@@ -3318,8 +3393,8 @@ static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
write_uniform_cost(plt_size, color_map[0]);
uint16_t color_cache[2 * PALETTE_MAX_SIZE];
const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
- palette_mode_cost += av1_palette_color_cost_uv(pmi, color_cache, n_cache,
- cpi->common.bit_depth);
+ palette_mode_cost += av1_palette_color_cost_uv(
+ pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
palette_mode_cost +=
av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
total_rate += palette_mode_cost;
@@ -3375,6 +3450,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
}
// RD estimation.
+ av1_subtract_plane(x, bsize, 0);
model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
&this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL,
NULL, NULL);
@@ -3458,10 +3534,10 @@ static void palette_rd_y(
return;
}
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
- if (cpi->common.use_highbitdepth)
+ if (cpi->common.seq_params.use_highbitdepth)
for (int i = 0; i < k; ++i)
- pmi->palette_colors[i] =
- clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
+ pmi->palette_colors[i] = clip_pixel_highbd(
+ (int)centroids[i], cpi->common.seq_params.bit_depth);
else
for (int i = 0; i < k; ++i)
pmi->palette_colors[i] = clip_pixel(centroids[i]);
@@ -3514,6 +3590,7 @@ static int rd_pick_palette_intra_sby(
MB_MODE_INFO *const mbmi = xd->mi[0];
assert(!is_inter_block(mbmi));
assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize));
+ const SequenceHeader *const seq_params = &cpi->common.seq_params;
int colors, n;
const int src_stride = x->plane[0].src.stride;
const uint8_t *const src = x->plane[0].src.buf;
@@ -3523,9 +3600,9 @@ static int rd_pick_palette_intra_sby(
&cols);
int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
- if (cpi->common.use_highbitdepth)
+ if (seq_params->use_highbitdepth)
colors = av1_count_colors_highbd(src, src_stride, rows, cols,
- cpi->common.bit_depth, count_buf);
+ seq_params->bit_depth, count_buf);
else
colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
mbmi->filter_intra_mode_info.use_filter_intra = 0;
@@ -3537,12 +3614,12 @@ static int rd_pick_palette_intra_sby(
int centroids[PALETTE_MAX_SIZE];
int lb, ub, val;
uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
- if (cpi->common.use_highbitdepth)
+ if (seq_params->use_highbitdepth)
lb = ub = src16[0];
else
lb = ub = src[0];
- if (cpi->common.use_highbitdepth) {
+ if (seq_params->use_highbitdepth) {
for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) {
val = src16[r * src_stride + c];
@@ -3576,7 +3653,7 @@ static int rd_pick_palette_intra_sby(
int top_colors[PALETTE_MAX_SIZE] = { 0 };
for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
int max_count = 0;
- for (int j = 0; j < (1 << cpi->common.bit_depth); ++j) {
+ for (int j = 0; j < (1 << seq_params->bit_depth); ++j) {
if (count_buf[j] > max_count) {
max_count = count_buf[j];
top_colors[i] = j;
@@ -4316,6 +4393,244 @@ static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
return (int)(score * 100);
}
+typedef struct {
+ int64_t rd;
+ int txb_entropy_ctx;
+ TX_TYPE tx_type;
+} TxCandidateInfo;
+
+static void try_tx_block_no_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+ const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+ int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+ TxCandidateInfo *no_split) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+
+ no_split->rd = INT64_MAX;
+ no_split->txb_entropy_ctx = 0;
+ no_split->tx_type = TX_TYPES;
+
+ const ENTROPY_CONTEXT *const pta = ta + blk_col;
+ const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+
+ rd_stats->ref_rdcost = ref_best_rd;
+ rd_stats->zero_rate = zero_blk_rate;
+ const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+ mbmi->inter_tx_size[index] = tx_size;
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
+ ptl, rd_stats, ftxs_mode, ref_best_rd,
+ rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+ assert(rd_stats->rate < INT_MAX);
+
+ if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+ zero_blk_rate - rd_stats->rate);
+#endif // CONFIG_RD_DEBUG
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ x->blk_skip[blk_row * bw + blk_col] = 1;
+ p->eobs[block] = 0;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ } else {
+ x->blk_skip[blk_row * bw + blk_col] = 0;
+ rd_stats->skip = 0;
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+
+ no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+ const int txk_type_idx =
+ av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+ no_split->tx_type = mbmi->txk_type[txk_type_idx];
+}
+
+static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size, int depth,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+ TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+ int64_t ref_best_rd, int *is_cost_valid,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_node);
+
+static void try_tx_block_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+ RD_STATS *split_rd_stats, int64_t *split_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ struct macroblock_plane *const p = &x->plane[0];
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int sub_step = bsw * bsh;
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ int64_t tmp_rd = 0;
+#if CONFIG_DIST_8X8
+ int sub8x8_eob[4] = { 0, 0, 0, 0 };
+ struct macroblockd_plane *const pd = &xd->plane[0];
+#endif
+ split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ int blk_idx = 0;
+ for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+ for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+ const int offsetr = blk_row + r;
+ const int offsetc = blk_col + c;
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+ assert(blk_idx < 4);
+ select_tx_block(
+ cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
+ tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
+ &this_cost_valid, ftxs_mode,
+ (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+
+#if CONFIG_DIST_8X8
+ if (!x->using_dist_8x8)
+#endif
+ if (!this_cost_valid) goto LOOP_EXIT;
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8 && tx_size == TX_8X8) {
+ sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
+ }
+#endif // CONFIG_DIST_8X8
+ av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
+
+ tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+#if CONFIG_DIST_8X8
+ if (!x->using_dist_8x8)
+#endif
+ if (no_split_rd < tmp_rd) {
+ this_cost_valid = 0;
+ goto LOOP_EXIT;
+ }
+ block += sub_step;
+ }
+ }
+
+LOOP_EXIT : {}
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+
+ const uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ const uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+
+ int64_t dist_8x8;
+ const int qindex = x->qindex;
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int16_t *pred = &x->pred_luma[pred_idx];
+ int i, j;
+ int row, col;
+
+ uint8_t *pred8;
+ DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
+
+ dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8,
+ 8, 8, 8, 8, qindex) *
+ 16;
+
+#ifdef DEBUG_DIST_8X8
+ if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
+ assert(sum_rd_stats.sse == dist_8x8);
+#endif // DEBUG_DIST_8X8
+
+ split_rd_stats->sse = dist_8x8;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ pred8 = CONVERT_TO_BYTEPTR(pred8_16);
+ else
+ pred8 = (uint8_t *)pred8_16;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (row = 0; row < 2; ++row) {
+ for (col = 0; col < 2; ++col) {
+ int idx = row * 2 + col;
+ int eob = sub8x8_eob[idx];
+
+ if (eob > 0) {
+ for (j = 0; j < 4; j++)
+ for (i = 0; i < 4; i++)
+ CONVERT_TO_SHORTPTR(pred8)
+ [(row * 4 + j) * 8 + 4 * col + i] =
+ pred[(row * 4 + j) * pred_stride + 4 * col + i];
+ } else {
+ for (j = 0; j < 4; j++)
+ for (i = 0; i < 4; i++)
+ CONVERT_TO_SHORTPTR(pred8)
+ [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
+ dst)[(row * 4 + j) * dst_stride + 4 * col + i];
+ }
+ }
+ }
+ } else {
+ for (row = 0; row < 2; ++row) {
+ for (col = 0; col < 2; ++col) {
+ int idx = row * 2 + col;
+ int eob = sub8x8_eob[idx];
+
+ if (eob > 0) {
+ for (j = 0; j < 4; j++)
+ for (i = 0; i < 4; i++)
+ pred8[(row * 4 + j) * 8 + 4 * col + i] =
+ (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
+ } else {
+ for (j = 0; j < 4; j++)
+ for (i = 0; i < 4; i++)
+ pred8[(row * 4 + j) * 8 + 4 * col + i] =
+ dst[(row * 4 + j) * dst_stride + 4 * col + i];
+ }
+ }
+ }
+ }
+ dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8,
+ 8, 8, qindex) *
+ 16;
+
+#ifdef DEBUG_DIST_8X8
+ if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
+ assert(sum_rd_stats.dist == dist_8x8);
+#endif // DEBUG_DIST_8X8
+
+ split_rd_stats->dist = dist_8x8;
+ tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+ }
+#endif // CONFIG_DIST_8X8
+ if (this_cost_valid) *split_rd = tmp_rd;
+}
+
// Search for the best tx partition/type for a given luma block.
static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
int blk_col, int block, TX_SIZE tx_size, int depth,
@@ -4338,8 +4653,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
- ENTROPY_CONTEXT *pta = ta + blk_col;
- ENTROPY_CONTEXT *ptl = tl + blk_row;
MB_MODE_INFO *const mbmi = xd->mi[0];
const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
mbmi->sb_type, tx_size);
@@ -4348,64 +4661,25 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
const int try_no_split = 1;
int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
- int64_t no_split_rd = INT64_MAX;
- int no_split_txb_entropy_ctx = 0;
- TX_TYPE no_split_tx_type = TX_TYPES;
+ TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
// TX no split
if (try_no_split) {
- const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
- const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
- .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+ ftxs_mode, rd_info_node, &no_split);
- rd_stats->ref_rdcost = ref_best_rd;
- rd_stats->zero_rate = zero_blk_rate;
- const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
- mbmi->inter_tx_size[index] = tx_size;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
- ptl, rd_stats, ftxs_mode, ref_best_rd,
- rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
- assert(rd_stats->rate < INT_MAX);
-
- if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
- RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
- rd_stats->skip == 1) &&
- !xd->lossless[mbmi->segment_id]) {
-#if CONFIG_RD_DEBUG
- av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
- zero_blk_rate - rd_stats->rate);
-#endif // CONFIG_RD_DEBUG
- rd_stats->rate = zero_blk_rate;
- rd_stats->dist = rd_stats->sse;
- rd_stats->skip = 1;
- x->blk_skip[blk_row * bw + blk_col] = 1;
- p->eobs[block] = 0;
- update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
- DCT_DCT);
- } else {
- x->blk_skip[blk_row * bw + blk_col] = 0;
- rd_stats->skip = 0;
- }
-
- if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
- rd_stats->rate += x->txfm_partition_cost[ctx][0];
- no_split_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
if (cpi->sf.adaptive_txb_search_level &&
- (no_split_rd -
- (no_split_rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
+ (no_split.rd -
+ (no_split.rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
ref_best_rd) {
*is_cost_valid = 0;
return;
}
- no_split_txb_entropy_ctx = p->txb_entropy_ctx[block];
- const int txk_type_idx =
- av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
- no_split_tx_type = mbmi->txk_type[txk_type_idx];
-
- if (cpi->sf.txb_split_cap)
+ if (cpi->sf.txb_split_cap) {
if (p->eobs[block] == 0) try_split = 0;
+ }
}
if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
@@ -4427,155 +4701,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
RD_STATS split_rd_stats;
av1_init_rd_stats(&split_rd_stats);
if (try_split) {
- const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
- const int bsw = tx_size_wide_unit[sub_txs];
- const int bsh = tx_size_high_unit[sub_txs];
- const int sub_step = bsw * bsh;
- RD_STATS this_rd_stats;
- int this_cost_valid = 1;
- int64_t tmp_rd = 0;
-#if CONFIG_DIST_8X8
- int sub8x8_eob[4] = { 0, 0, 0, 0 };
- struct macroblockd_plane *const pd = &xd->plane[0];
-#endif
- split_rd_stats.rate = x->txfm_partition_cost[ctx][1];
-
- assert(tx_size < TX_SIZES_ALL);
-
- ref_best_rd = AOMMIN(no_split_rd, ref_best_rd);
-
- int blk_idx = 0;
- for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
- for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
- const int offsetr = blk_row + r;
- const int offsetc = blk_col + c;
- if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
- assert(blk_idx < 4);
- select_tx_block(
- cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize,
- ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
- &this_cost_valid, ftxs_mode,
- (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-
-#if CONFIG_DIST_8X8
- if (!x->using_dist_8x8)
-#endif
- if (!this_cost_valid) goto LOOP_EXIT;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && tx_size == TX_8X8) {
- sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
- }
-#endif // CONFIG_DIST_8X8
- av1_merge_rd_stats(&split_rd_stats, &this_rd_stats);
-
- tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist);
-#if CONFIG_DIST_8X8
- if (!x->using_dist_8x8)
-#endif
- if (no_split_rd < tmp_rd) {
- this_cost_valid = 0;
- goto LOOP_EXIT;
- }
- block += sub_step;
- }
- }
-
- LOOP_EXIT : {}
-
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
- const int src_stride = p->src.stride;
- const int dst_stride = pd->dst.stride;
-
- const uint8_t *src =
- &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
- const uint8_t *dst =
- &pd->dst
- .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-
- int64_t dist_8x8;
- const int qindex = x->qindex;
- const int pred_stride = block_size_wide[plane_bsize];
- const int pred_idx = (blk_row * pred_stride + blk_col)
- << tx_size_wide_log2[0];
- const int16_t *pred = &x->pred_luma[pred_idx];
- int i, j;
- int row, col;
-
- uint8_t *pred8;
- DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-
- dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
- BLOCK_8X8, 8, 8, 8, 8, qindex) *
- 16;
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
- assert(sum_rd_stats.sse == dist_8x8);
-#endif // DEBUG_DIST_8X8
-
- split_rd_stats.sse = dist_8x8;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- pred8 = CONVERT_TO_BYTEPTR(pred8_16);
- else
- pred8 = (uint8_t *)pred8_16;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (row = 0; row < 2; ++row) {
- for (col = 0; col < 2; ++col) {
- int idx = row * 2 + col;
- int eob = sub8x8_eob[idx];
-
- if (eob > 0) {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- CONVERT_TO_SHORTPTR(pred8)
- [(row * 4 + j) * 8 + 4 * col + i] =
- pred[(row * 4 + j) * pred_stride + 4 * col + i];
- } else {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- CONVERT_TO_SHORTPTR(pred8)
- [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
- dst)[(row * 4 + j) * dst_stride + 4 * col + i];
- }
- }
- }
- } else {
- for (row = 0; row < 2; ++row) {
- for (col = 0; col < 2; ++col) {
- int idx = row * 2 + col;
- int eob = sub8x8_eob[idx];
-
- if (eob > 0) {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- pred8[(row * 4 + j) * 8 + 4 * col + i] =
- (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
- } else {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- pred8[(row * 4 + j) * 8 + 4 * col + i] =
- dst[(row * 4 + j) * dst_stride + 4 * col + i];
- }
- }
- }
- }
- dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8,
- 8, 8, 8, qindex) *
- 16;
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
- assert(sum_rd_stats.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
-
- split_rd_stats.dist = dist_8x8;
- tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist);
- }
-#endif // CONFIG_DIST_8X8
- if (this_cost_valid) split_rd = tmp_rd;
+ try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+ AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+ rd_info_node, &split_rd_stats, &split_rd);
}
#if COLLECT_TX_SIZE_DATA
@@ -4626,9 +4755,11 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
} while (0);
#endif // COLLECT_TX_SIZE_DATA
- if (no_split_rd < split_rd) {
+ if (no_split.rd < split_rd) {
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
const TX_SIZE tx_size_selected = tx_size;
- p->txb_entropy_ctx[block] = no_split_txb_entropy_ctx;
+ p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl);
txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
tx_size);
@@ -4641,7 +4772,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
}
mbmi->tx_size = tx_size_selected;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
- no_split_tx_type);
+ no_split.tx_type);
x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip;
} else {
*rd_stats = split_rd_stats;
@@ -4707,13 +4838,19 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
}
}
}
- int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse);
- this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (zero_rd < this_rd) {
- this_rd = zero_rd;
- rd_stats->rate = rd_stats->zero_rate;
+
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ if (skip_rd <= this_rd) {
+ this_rd = skip_rd;
+ rd_stats->rate = 0;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
+ } else {
+ rd_stats->skip = 0;
}
if (this_rd > ref_best_rd) is_cost_valid = 0;
@@ -4921,11 +5058,15 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
}
}
}
- int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse);
- this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (zero_rd < this_rd) {
- this_rd = zero_rd;
- rd_stats->rate = rd_stats->zero_rate;
+
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ if (skip_rd < this_rd) {
+ this_rd = skip_rd;
+ rd_stats->rate = 0;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
}
@@ -5159,7 +5300,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
const MACROBLOCKD *xd = &x->e_mbd;
const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
- *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
+ *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1);
const int64_t mse = *dist / bw / bh;
// Normalized quantizer takes the transform upscaling factor (8 for tx size
// smaller than 32) into account.
@@ -5215,23 +5356,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
mbmi->tx_size = tx_size;
memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4);
rd_stats->skip = 1;
-
- // Rate.
- const int tx_size_ctx = get_txsize_entropy_ctx(tx_size);
- ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
- ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
- av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
- TXB_CTX txb_ctx;
- // Because plane is 0, plane_bsize equal to bsize
- get_txb_ctx(bsize, tx_size, 0, ctxa, ctxl, &txb_ctx);
- int rate = x->coeff_costs[tx_size_ctx][PLANE_TYPE_Y]
- .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
- if (tx_size > TX_4X4) {
- int ctx = txfm_partition_context(
- xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
- rate += x->txfm_partition_cost[ctx][0];
- }
- rd_stats->rate = rate;
+ rd_stats->rate = 0;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
rd_stats->dist = rd_stats->sse = (dist << 4);
@@ -5322,6 +5447,8 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
found_rd_info ? matched_rd_info : NULL);
+ assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate,
+ this_rd_stats.rate == 0));
ref_best_rd = AOMMIN(rd, ref_best_rd);
if (rd < best_rd) {
@@ -5455,6 +5582,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const BLOCK_SIZE bsize = mbmi->sb_type;
+ const SequenceHeader *const seq_params = &cpi->common.seq_params;
int this_rate;
int64_t this_rd;
int colors_u, colors_v, colors;
@@ -5470,11 +5598,11 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
mbmi->uv_mode = UV_DC_PRED;
int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
- if (cpi->common.use_highbitdepth) {
+ if (seq_params->use_highbitdepth) {
colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
- cpi->common.bit_depth, count_buf);
+ seq_params->bit_depth, count_buf);
colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
- cpi->common.bit_depth, count_buf);
+ seq_params->bit_depth, count_buf);
} else {
colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
@@ -5494,7 +5622,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
- if (cpi->common.use_highbitdepth) {
+ if (seq_params->use_highbitdepth) {
lb_u = src_u16[0];
ub_u = src_u16[0];
lb_v = src_v16[0];
@@ -5508,7 +5636,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) {
- if (cpi->common.use_highbitdepth) {
+ if (seq_params->use_highbitdepth) {
val_u = src_u16[r * src_stride + c];
val_v = src_v16[r * src_stride + c];
data[(r * cols + c) * 2] = val_u;
@@ -5557,9 +5685,9 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
pmi->palette_size[1] = n;
for (i = 1; i < 3; ++i) {
for (j = 0; j < n; ++j) {
- if (cpi->common.use_highbitdepth)
+ if (seq_params->use_highbitdepth)
pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
- (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
+ (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
else
pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
clip_pixel((int)centroids[j * 2 + i - 1]);
@@ -5907,8 +6035,9 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
*mode_uv = UV_DC_PRED;
return;
}
- xd->cfl.is_chroma_reference = is_chroma_reference(
- mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
xd->plane[AOM_PLANE_U].subsampling_y);
// Only store reconstructed luma when there's chroma RDO. When there's no
@@ -7038,7 +7167,9 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
// Choose the best wedge index and sign
static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
const BLOCK_SIZE bsize, const uint8_t *const p0,
- const uint8_t *const p1, int *const best_wedge_sign,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ int *const best_wedge_sign,
int *const best_wedge_index) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const src = &x->plane[0].src;
@@ -7056,34 +7187,22 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
- DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
-
- int64_t sign_limit;
-
+ DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0
if (hbd) {
- aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
} else {
- aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
- aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
- aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
}
- sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
- (int64_t)aom_sum_squares_i16(r1, N)) *
- (1 << WEDGE_WEIGHT_BITS) / 2;
-
+ int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+ (int64_t)aom_sum_squares_i16(residual1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+ int16_t *ds = residual0;
if (N < 64)
- av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+ av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N);
else
- av1_wedge_compute_delta_squares(ds, r0, r1, N);
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
@@ -7096,9 +7215,9 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
if (N < 64)
- sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+ sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
else
- sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
@@ -7117,12 +7236,15 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
}
// Choose the best wedge index the specified sign
-static int64_t pick_wedge_fixed_sign(
- const AV1_COMP *const cpi, const MACROBLOCK *const x,
- const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
- const int wedge_sign, int *const best_wedge_index) {
+static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ const int wedge_sign,
+ int *const best_wedge_index) {
const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const src = &x->plane[0].src;
+
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int N = bw * bh;
@@ -7135,26 +7257,12 @@ static int64_t pick_wedge_fixed_sign(
uint64_t sse;
const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-
- DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
-
- if (hbd) {
- aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
- CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
- } else {
- aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
- aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
- }
-
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
if (N < 64)
- sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+ sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
else
- sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
@@ -7166,16 +7274,14 @@ static int64_t pick_wedge_fixed_sign(
best_rd = rd;
}
}
-
return best_rd -
RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
}
-static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
- MACROBLOCK *const x,
- const BLOCK_SIZE bsize,
- const uint8_t *const p0,
- const uint8_t *const p1) {
+static int64_t pick_interinter_wedge(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
const int bw = block_size_wide[bsize];
@@ -7189,9 +7295,11 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
if (cpi->sf.fast_wedge_sign_estimate) {
wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
- rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
+ &wedge_index);
} else {
- rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
+ rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
+ &wedge_index);
}
mbmi->interinter_comp.wedge_sign = wedge_sign;
@@ -7202,10 +7310,11 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
MACROBLOCK *const x, const BLOCK_SIZE bsize,
const uint8_t *const p0,
- const uint8_t *const p1) {
+ const uint8_t *const p1,
+ const int16_t *const residual1,
+ const int16_t *const diff10) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- const struct buf_2d *const src = &x->plane[0].src;
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int N = bw * bh;
@@ -7218,23 +7327,6 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
DIFFWTD_MASK_TYPE best_mask_type = 0;
const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
- DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
-
- if (hbd) {
- aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
- CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
- } else {
- aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
- aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
- aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
- }
-
// try each mask type and its inverse
for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
// build mask and inverse
@@ -7247,7 +7339,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
bw, bh, bw);
// compute rd for mask
- sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
@@ -7279,14 +7371,26 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
const uint8_t *const p1) {
const MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
-
- int64_t rd;
- int wedge_index = -1;
-
assert(is_interintra_wedge_used(bsize));
assert(cpi->common.seq_params.enable_interintra_compound);
- rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
+ DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
+ if (get_bitdepth_data_path_index(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+ }
+ int wedge_index = -1;
+ int64_t rd =
+ pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, &wedge_index);
mbmi->interintra_wedge_sign = 0;
mbmi->interintra_wedge_index = wedge_index;
@@ -7296,11 +7400,15 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
const BLOCK_SIZE bsize,
const uint8_t *const p0,
- const uint8_t *const p1) {
+ const uint8_t *const p1,
+ const int16_t *const residual1,
+ const int16_t *const diff10) {
const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type;
switch (compound_type) {
- case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1);
- case COMPOUND_DIFFWTD: return pick_interinter_seg(cpi, x, bsize, p0, p1);
+ case COMPOUND_WEDGE:
+ return pick_interinter_wedge(cpi, x, bsize, p0, p1, residual1, diff10);
+ case COMPOUND_DIFFWTD:
+ return pick_interinter_seg(cpi, x, bsize, p0, p1, residual1, diff10);
default: assert(0); return 0;
}
}
@@ -7336,7 +7444,7 @@ static int64_t build_and_cost_compound_type(
const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv,
BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
- int *strides, int mi_row, int mi_col) {
+ int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -7348,7 +7456,8 @@ static int64_t build_and_cost_compound_type(
int64_t tmp_skip_sse_sb;
const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
- best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
+ best_rd_cur =
+ pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
*rs2 += get_interinter_compound_mask_rate(x, mbmi);
best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
@@ -7357,6 +7466,7 @@ static int64_t build_and_cost_compound_type(
*out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
this_mode, mi_row, mi_col);
av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ av1_subtract_plane(x, bsize, 0);
model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
@@ -7367,7 +7477,6 @@ static int64_t build_and_cost_compound_type(
av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
preds1, strides);
}
- av1_subtract_plane(x, bsize, 0);
rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
if (rd != INT64_MAX)
@@ -7377,7 +7486,6 @@ static int64_t build_and_cost_compound_type(
} else {
av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
preds1, strides);
- av1_subtract_plane(x, bsize, 0);
rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
if (rd != INT64_MAX)
@@ -7393,11 +7501,11 @@ typedef struct {
int above_pred_stride[MAX_MB_PLANE];
uint8_t *left_pred_buf[MAX_MB_PLANE];
int left_pred_stride[MAX_MB_PLANE];
- int_mv *single_newmv;
+ int_mv (*single_newmv)[REF_FRAMES];
// Pointer to array of motion vectors to use for each ref and their rates
// Should point to first of 2 arrays in 2D array
- int *single_newmv_rate;
- int *single_newmv_valid;
+ int (*single_newmv_rate)[REF_FRAMES];
+ int (*single_newmv_valid)[REF_FRAMES];
// Pointer to array of predicted rate-distortion
// Should point to first of 2 arrays in 2D array
int64_t (*modelled_rd)[REF_FRAMES];
@@ -7428,14 +7536,15 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
const PREDICTION_MODE this_mode = mbmi->mode;
const int refs[2] = { mbmi->ref_frame[0],
mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+ const int ref_mv_idx = mbmi->ref_mv_idx;
int i;
(void)args;
if (is_comp_pred) {
if (this_mode == NEW_NEWMV) {
- cur_mv[0].as_int = args->single_newmv[refs[0]].as_int;
- cur_mv[1].as_int = args->single_newmv[refs[1]].as_int;
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL,
@@ -7451,7 +7560,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
}
} else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
- cur_mv[1].as_int = args->single_newmv[refs[1]].as_int;
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
compound_single_motion_search_interinter(
cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
@@ -7464,7 +7573,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
} else {
assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
- cur_mv[0].as_int = args->single_newmv[refs[0]].as_int;
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
compound_single_motion_search_interinter(
cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
@@ -7480,9 +7589,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
- args->single_newmv[refs[0]] = x->best_mv;
- args->single_newmv_rate[refs[0]] = *rate_mv;
- args->single_newmv_valid[refs[0]] = 1;
+ args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv;
+ args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+ args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
cur_mv[0].as_int = x->best_mv.as_int;
@@ -7508,12 +7617,25 @@ static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
restore_dst_buf(xd, *dst_bufs[0], num_planes);
}
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+ const InterpFilters filters,
+ const int ctx[2]) {
+ int inter_filter_cost;
+ const InterpFilter filter0 = av1_extract_interp_filter(filters, 0);
+ const InterpFilter filter1 = av1_extract_interp_filter(filters, 1);
+ inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
+ inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
// calculate the rdcost of given interpolation_filter
static INLINE int64_t interpolation_filter_rd(
MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
int *const switchable_rate, int *const skip_txfm_sb,
- int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx) {
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
+ const int switchable_ctx[2], const int skip_pred, int *rate,
+ int64_t *dist) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
@@ -7523,23 +7645,136 @@ static INLINE int64_t interpolation_filter_rd(
const InterpFilters last_best = mbmi->interp_filters;
mbmi->interp_filters = filter_sets[filter_idx];
- const int tmp_rs = av1_get_switchable_rate(cm, x, xd);
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
- &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
+ const int tmp_rs =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
+ if (!skip_pred) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_subtract_plane(x, bsize, 0);
+#if DNN_BASED_RD_INTERP_FILTER
+ model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist,
+ &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
+#else
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb,
+ &tmp_skip_sse, NULL, NULL, NULL);
+#endif
+ if (num_planes > 1) {
+ int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+ if (tmp_y_rd > *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ int tmp_rate_uv, tmp_skip_sb_uv;
+ int64_t tmp_dist_uv, tmp_skip_sse_uv;
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ for (int plane = 1; plane < num_planes; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+#if DNN_BASED_RD_INTERP_FILTER
+ model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1,
+ &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv,
+ &tmp_skip_sse_uv, NULL, NULL, NULL);
+#else
+ model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv,
+ &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL,
+ NULL, NULL);
+#endif
+ tmp_rate += tmp_rate_uv;
+ tmp_skip_sb &= tmp_skip_sb_uv;
+ tmp_dist += tmp_dist_uv;
+ tmp_skip_sse += tmp_skip_sse_uv;
+ }
+ } else {
+ tmp_rate = *rate;
+ tmp_dist = *dist;
+ }
int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
if (tmp_rd < *rd) {
*rd = tmp_rd;
*switchable_rate = tmp_rs;
*skip_txfm_sb = tmp_skip_sb;
*skip_sse_sb = tmp_skip_sse;
- swap_dst_buf(xd, dst_bufs, num_planes);
+ *rate = tmp_rate;
+ *dist = tmp_dist;
+ if (!skip_pred) {
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
return 1;
}
mbmi->interp_filters = last_best;
return 0;
}
+// Find the best rd filter in horizontal direction
+static INLINE int find_best_horiz_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+ const int switchable_ctx[2], const int skip_hor, int *rate, int64_t *dist,
+ int best_dual_mode) {
+ int i;
+ const int bw = block_size_wide[bsize];
+ assert(best_dual_mode == 0);
+ if ((bw <= 4) && (!skip_hor)) {
+ int skip_pred = 1;
+ // Process the filters in reverse order to enable reusing rate and
+ // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
+ for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+ if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_pred, rate,
+ dist)) {
+ best_dual_mode = i;
+ }
+ skip_pred = 0;
+ }
+ } else {
+ for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+ if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_hor, rate,
+ dist)) {
+ best_dual_mode = i;
+ }
+ }
+ }
+ return best_dual_mode;
+}
+
+// Find the best rd filter in vertical direction
+static INLINE void find_best_vert_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+ const int switchable_ctx[2], const int skip_ver, int *rate, int64_t *dist,
+ int best_dual_mode, int filter_set_size) {
+ int i;
+ const int bh = block_size_high[bsize];
+ if ((bh <= 4) && (!skip_ver)) {
+ int skip_pred = 1;
+ // Process the filters in reverse order to enable reusing rate and
+ // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
+ assert(filter_set_size == DUAL_FILTER_SET_SIZE);
+ for (i = (filter_set_size - SWITCHABLE_FILTERS + best_dual_mode);
+ i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_pred, rate,
+ dist);
+ skip_pred = 0;
+ }
+ } else {
+ for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
+ i += SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_ver, rate,
+ dist);
+ }
+ }
+}
+
// check if there is saved result match with this search
static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
MB_MODE_INFO *const mi) {
@@ -7605,10 +7840,22 @@ static int64_t interpolation_filter_search(
if (!need_search || match_found == -1) {
set_default_interp_filters(mbmi, assign_filter);
}
- *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+ int switchable_ctx[2];
+ switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+ switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+ *switchable_rate =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ for (int plane = 0; plane < num_planes; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+#if DNN_BASED_RD_INTERP_FILTER
+ model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
+ &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL,
+ NULL);
+#else
model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
+#endif // DNN_BASED_RD_INTERP_FILTER
*rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
if (assign_filter != SWITCHABLE || match_found != -1) {
@@ -7619,6 +7866,23 @@ static int64_t interpolation_filter_search(
av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
return 0;
}
+ int skip_hor = 1;
+ int skip_ver = 1;
+ const int is_compound = has_second_ref(mbmi);
+ for (int k = 0; k < num_planes - 1; ++k) {
+ struct macroblockd_plane *const pd = &xd->plane[k];
+ const int bw = pd->width;
+ const int bh = pd->height;
+ for (int j = 0; j < 1 + is_compound; ++j) {
+ const MV mv = mbmi->mv[j].as_mv;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ skip_hor &= (sub_x == 0);
+ skip_ver &= (sub_y == 0);
+ }
+ }
// do interp_filter search
const int filter_set_size = DUAL_FILTER_SET_SIZE;
restore_dst_buf(xd, *tmp_dst, num_planes);
@@ -7629,20 +7893,16 @@ static int64_t interpolation_filter_search(
int best_dual_mode = 0;
// Find best of {R}x{R,Sm,Sh}
// EIGHTTAP_REGULAR mode is calculated beforehand
- for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
- if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i)) {
- best_dual_mode = i;
- }
- }
+ best_dual_mode = find_best_horiz_interp_filter_rd(
+ x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+ &tmp_rate, &tmp_dist, best_dual_mode);
+
// From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
- for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
- i += SWITCHABLE_FILTERS) {
- interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i);
- }
+ find_best_vert_interp_filter_rd(
+ x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+ &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size);
} else {
// EIGHTTAP_REGULAR mode is calculated beforehand
for (i = 1; i < filter_set_size; ++i) {
@@ -7653,7 +7913,8 @@ static int64_t interpolation_filter_search(
}
interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i);
+ dst_bufs, i, switchable_ctx, 0, &tmp_rate,
+ &tmp_dist);
}
}
swap_dst_buf(xd, dst_bufs, num_planes);
@@ -7848,6 +8109,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
intrapred, bw);
av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ av1_subtract_plane(x, bsize, 0);
model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
@@ -7861,7 +8123,6 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
intrapred, bw);
av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- av1_subtract_plane(x, bsize, 0);
rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
if (rd != INT64_MAX)
@@ -7908,6 +8169,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
mbmi->mv[0].as_int = tmp_mv.as_int;
av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
bsize);
+ av1_subtract_plane(x, bsize, 0);
model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL,
NULL);
@@ -7925,7 +8187,6 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
}
// Evaluate closer to true rd
- av1_subtract_plane(x, bsize, 0);
rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb,
INT64_MAX);
@@ -8323,6 +8584,148 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
return cost;
}
+static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_col, int mi_row,
+ int_mv *cur_mv, int masked_compound_used,
+ BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst,
+ int *rate_mv, int64_t *rd,
+ RD_STATS *rd_stats, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int this_mode = mbmi->mode;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int rate_sum, rs2;
+ int64_t dist_sum;
+
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = *rate_mv;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ best_compound_data.type = COMPOUND_AVERAGE;
+ DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
+ DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
+ uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
+ uint8_t *preds0[1] = { pred0 };
+ uint8_t *preds1[1] = { pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ const int num_pix = 1 << num_pels_log2_lookup[bsize];
+ const int mask_len = 2 * num_pix * sizeof(uint8_t);
+ COMPOUND_TYPE cur_type;
+ int best_compmode_interinter_cost = 0;
+ int can_use_previous = cm->allow_warped_motion;
+
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ *rd = INT64_MAX;
+ if (masked_compound_used) {
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
+ const struct buf_2d *const src = &x->plane[0].src;
+ if (get_bitdepth_data_path_index(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1),
+ bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1,
+ bw);
+ aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw);
+ }
+ }
+ const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0];
+ const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst;
+ const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst;
+ for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+ if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
+ if (!is_interinter_compound_used(cur_type, bsize)) continue;
+ tmp_rate_mv = *rate_mv;
+ int64_t best_rd_cur = INT64_MAX;
+ mbmi->interinter_comp.type = cur_type;
+ int masked_type_cost = 0;
+
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ mbmi->compound_idx = 1;
+ if (cur_type == COMPOUND_AVERAGE) {
+ mbmi->comp_group_idx = 0;
+ if (masked_compound_used) {
+ masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
+ }
+ masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+ rs2 = masked_type_cost;
+ // No need to call av1_build_inter_predictors_sby here
+ // 1. COMPOUND_AVERAGE is always the first candidate
+ // 2. av1_build_inter_predictors_sby has been called by
+ // interpolation_filter_search
+ int64_t est_rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ // use the spare buffer when trying the following compound types
+ restore_dst_buf(xd, *backup_buf, 1);
+ if (est_rd != INT64_MAX)
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ } else {
+ mbmi->comp_group_idx = 1;
+ masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
+ masked_type_cost += x->compound_type_cost[bsize][cur_type - 1];
+ rs2 = masked_type_cost;
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ *rd / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row,
+ mi_col);
+ }
+ }
+ if (best_rd_cur < *rd) {
+ *rd = best_rd_cur;
+ best_compound_data = mbmi->interinter_comp;
+ if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
+ memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len);
+ }
+ best_compmode_interinter_cost = rs2;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (use_masked_motion_search(cur_type)) {
+ best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+ } else {
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+ }
+ // reset to original mvs for next iteration
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ if (mbmi->interinter_comp.type != best_compound_data.type) {
+ mbmi->comp_group_idx =
+ (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
+ mbmi->interinter_comp = best_compound_data;
+ memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len);
+ }
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ if (use_masked_motion_search(mbmi->interinter_comp.type)) {
+ rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+ *rate_mv = best_tmp_rate_mv;
+ }
+ }
+ restore_dst_buf(xd, *best_buf, 1);
+ return best_compmode_interinter_cost;
+}
+
static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, RD_STATS *rd_stats,
RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
@@ -8344,63 +8747,24 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int rate_mv = 0;
- const int bw = block_size_wide[bsize];
DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- uint8_t *tmp_buf;
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
int64_t rd = INT64_MAX;
BUFFER_SET orig_dst, tmp_dst;
int skip_txfm_sb = 0;
int64_t skip_sse_sb = INT64_MAX;
int16_t mode_ctx;
-
- mbmi->interinter_comp.type = COMPOUND_AVERAGE;
- mbmi->comp_group_idx = 0;
- mbmi->compound_idx = 1;
- if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
-
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
- else
- tmp_buf = tmp_buf_;
- // Make sure that we didn't leave the plane destination buffers set
- // to tmp_buf at the end of the last iteration
- assert(xd->plane[0].dst.buf != tmp_buf);
-
- mbmi->num_proj_ref[0] = 0;
- mbmi->num_proj_ref[1] = 0;
-
- if (is_comp_pred) {
- for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
- const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
- if (single_mode == NEWMV &&
- args->single_newmv[mbmi->ref_frame[ref_idx]].as_int == INVALID_MV)
- return INT64_MAX;
- }
- }
-
- mbmi->motion_mode = SIMPLE_TRANSLATION;
const int masked_compound_used = is_any_masked_compound_used(bsize) &&
cm->seq_params.enable_masked_compound;
int64_t ret_val = INT64_MAX;
const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
- rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
- rd_stats->rate +=
- get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
- const RD_STATS backup_rd_stats = *rd_stats;
- const RD_STATS backup_rd_stats_y = *rd_stats_y;
- const RD_STATS backup_rd_stats_uv = *rd_stats_uv;
- const MB_MODE_INFO backup_mbmi = *mbmi;
- INTERINTER_COMPOUND_DATA best_compound_data;
- uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
int64_t best_rd = INT64_MAX;
- int64_t best_ret_val = INT64_MAX;
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
MB_MODE_INFO best_mbmi = *mbmi;
- int64_t early_terminate = 0;
+ int best_disable_skip;
+ int best_xskip;
int plane_rate[MAX_MB_PLANE] = { 0 };
int64_t plane_sse[MAX_MB_PLANE] = { 0 };
int64_t plane_dist[MAX_MB_PLANE] = { 0 };
@@ -8411,387 +8775,311 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int comp_idx;
const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
(mbmi->mode != GLOBAL_GLOBALMV);
- // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
- for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
- int rs = 0;
- int compmode_interinter_cost = 0;
- early_terminate = 0;
- *rd_stats = backup_rd_stats;
- *rd_stats_y = backup_rd_stats_y;
- *rd_stats_uv = backup_rd_stats_uv;
- *mbmi = backup_mbmi;
- mbmi->compound_idx = comp_idx;
-
- if (is_comp_pred && comp_idx == 0) {
- mbmi->comp_group_idx = 0;
- mbmi->compound_idx = 0;
- const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
- const int comp_index_ctx = get_comp_index_context(cm, xd);
- if (masked_compound_used) {
- compmode_interinter_cost +=
- x->comp_group_idx_cost[comp_group_idx_ctx][0];
+ const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+ ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 1);
+
+ // TODO(jingning): This should be deprecated shortly.
+ const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int ref_set =
+ has_drl ? AOMMIN(MAX_REF_MV_SERCH,
+ mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset)
+ : 1;
+
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
+ if (mbmi->ref_frame[0] == LAST2_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == LAST2_FRAME ||
+ mbmi->ref_frame[1] == LAST3_FRAME) {
+ if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset]
+ .weight < REF_CAT_LEVEL) {
+ continue;
+ }
}
- compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
}
- int_mv cur_mv[2];
- if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
- early_terminate = INT64_MAX;
- continue;
- }
- if (have_newmv_in_inter_mode(this_mode)) {
- if (comp_idx == 0) {
- cur_mv[0] = backup_mv[0];
- cur_mv[1] = backup_mv[1];
- rate_mv = backup_rate_mv;
- }
+ av1_init_rd_stats(rd_stats);
- // when jnt_comp_skip_mv_search flag is on, new mv will be searched once
- if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
- comp_idx == 0)) {
- newmv_ret_val =
- handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args);
-
- // Store cur_mv and rate_mv so that they can be restored in the next
- // iteration of the loop
- backup_mv[0] = cur_mv[0];
- backup_mv[1] = cur_mv[1];
- backup_rate_mv = rate_mv;
- }
-
- if (newmv_ret_val != 0) {
- early_terminate = INT64_MAX;
- continue;
- } else {
- rd_stats->rate += rate_mv;
- }
- }
- for (i = 0; i < is_comp_pred + 1; ++i) {
- mbmi->mv[i].as_int = cur_mv[i].as_int;
- }
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
- // Initialise tmp_dst and orig_dst buffers to prevent "may be used
- // uninitialized" warnings in GCC when the stream is monochrome.
- memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
- memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
- memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
- memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
+ mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
- // do first prediction into the destination buffer. Do the next
- // prediction into a temporary buffer. Then keep track of which one
- // of these currently holds the best predictor, and use the other
- // one for future predictions. In the end, copy from tmp_buf to
- // dst if necessary.
- for (i = 0; i < num_planes; i++) {
- tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
- tmp_dst.stride[i] = MAX_SB_SIZE;
- }
- for (i = 0; i < num_planes; i++) {
- orig_dst.plane[i] = xd->plane[i].dst.buf;
- orig_dst.stride[i] = xd->plane[i].dst.stride;
- }
+ mbmi->num_proj_ref[0] = 0;
+ mbmi->num_proj_ref[1] = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = ref_mv_idx;
- const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
-#if USE_DISCOUNT_NEWMV_TEST
- // We don't include the cost of the second reference here, because there
- // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
- // words if you present them in that order, the second one is always known
- // if the first is known.
- //
- // Under some circumstances we discount the cost of new mv mode to encourage
- // initiation of a motion field.
- if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
- // discount_newmv_test only applies discount on NEWMV mode.
- assert(this_mode == NEWMV);
- rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
- cost_mv_ref(x, NEARESTMV, mode_ctx));
- } else {
- rd_stats->rate += ref_mv_cost;
+ if (is_comp_pred) {
+ for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
+ const int single_mode =
+ get_single_mode(this_mode, ref_idx, is_comp_pred);
+ if (single_mode == NEWMV &&
+ args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]]
+ .as_int == INVALID_MV)
+ continue;
+ }
}
-#else
- rd_stats->rate += ref_mv_cost;
-#endif
- if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
- mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
- early_terminate = INT64_MAX;
- continue;
- }
+ rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+ rd_stats->rate +=
+ get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
- ret_val = interpolation_filter_search(
- x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
- &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
- if (ret_val != 0) {
- early_terminate = INT64_MAX;
- restore_dst_buf(xd, orig_dst, num_planes);
- continue;
- } else if (cpi->sf.model_based_post_interp_filter_breakout &&
- ref_best_rd != INT64_MAX && (rd / 6) > ref_best_rd) {
- early_terminate = INT64_MAX;
- restore_dst_buf(xd, orig_dst, num_planes);
- if ((rd >> 4) > ref_best_rd) break;
- continue;
- }
+ const RD_STATS backup_rd_stats = *rd_stats;
+ const MB_MODE_INFO backup_mbmi = *mbmi;
+ int64_t best_rd2 = INT64_MAX;
- if (is_comp_pred && comp_idx) {
- int rate_sum, rs2;
- int64_t dist_sum;
- int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
- int_mv best_mv[2];
- int best_tmp_rate_mv = rate_mv;
- int tmp_skip_txfm_sb;
- int64_t tmp_skip_sse_sb;
- DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
- uint8_t *preds0[1] = { pred0 };
- uint8_t *preds1[1] = { pred1 };
- int strides[1] = { bw };
- int tmp_rate_mv;
- const int num_pix = 1 << num_pels_log2_lookup[bsize];
- COMPOUND_TYPE cur_type;
- int best_compmode_interinter_cost = 0;
- int can_use_previous = cm->allow_warped_motion;
-
- best_mv[0].as_int = cur_mv[0].as_int;
- best_mv[1].as_int = cur_mv[1].as_int;
+ // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
+ for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+ int rs = 0;
+ int compmode_interinter_cost = 0;
+ *rd_stats = backup_rd_stats;
+ *mbmi = backup_mbmi;
+ mbmi->compound_idx = comp_idx;
- if (masked_compound_used) {
- // get inter predictors to use for masked compound modes
- av1_build_inter_predictors_for_planes_single_buf(
- xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides,
- can_use_previous);
- av1_build_inter_predictors_for_planes_single_buf(
- xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides,
- can_use_previous);
- }
-
- int best_comp_group_idx = 0;
- int best_compound_idx = 1;
- for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
- if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
- if (!is_interinter_compound_used(cur_type, bsize)) continue;
- tmp_rate_mv = rate_mv;
- best_rd_cur = INT64_MAX;
- mbmi->interinter_comp.type = cur_type;
- int masked_type_cost = 0;
+ if (is_comp_pred && comp_idx == 0) {
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 0;
const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
const int comp_index_ctx = get_comp_index_context(cm, xd);
if (masked_compound_used) {
- if (cur_type == COMPOUND_AVERAGE) {
- mbmi->comp_group_idx = 0;
- mbmi->compound_idx = 1;
-
- masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
- masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
- } else {
- mbmi->comp_group_idx = 1;
- mbmi->compound_idx = 1;
-
- masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
- masked_type_cost +=
- x->compound_type_cost[bsize][mbmi->interinter_comp.type - 1];
- }
- } else {
- mbmi->comp_group_idx = 0;
- mbmi->compound_idx = 1;
-
- masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+ compmode_interinter_cost +=
+ x->comp_group_idx_cost[comp_group_idx_ctx][0];
}
- rs2 = masked_type_cost;
+ compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
+ }
- switch (cur_type) {
- case COMPOUND_AVERAGE:
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
- bsize);
- av1_subtract_plane(x, bsize, 0);
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
- INT64_MAX);
- if (rd != INT64_MAX)
- best_rd_cur =
- RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
- break;
- case COMPOUND_WEDGE:
- if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
- best_rd_compound / 3 < ref_best_rd) {
- best_rd_cur = build_and_cost_compound_type(
- cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst,
- &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
- }
- break;
- case COMPOUND_DIFFWTD:
- if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
- best_rd_compound / 3 < ref_best_rd) {
- best_rd_cur = build_and_cost_compound_type(
- cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst,
- &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
- }
- break;
- default: assert(0); return INT64_MAX;
+ int_mv cur_mv[2];
+ if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
+ continue;
+ }
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (comp_idx == 0) {
+ cur_mv[0] = backup_mv[0];
+ cur_mv[1] = backup_mv[1];
+ rate_mv = backup_rate_mv;
}
- if (best_rd_cur < best_rd_compound) {
- best_comp_group_idx = mbmi->comp_group_idx;
- best_compound_idx = mbmi->compound_idx;
- best_rd_compound = best_rd_cur;
- best_compound_data = mbmi->interinter_comp;
- memcpy(tmp_best_mask_buf, xd->seg_mask,
- 2 * num_pix * sizeof(uint8_t));
- best_compmode_interinter_cost = rs2;
- if (have_newmv_in_inter_mode(this_mode)) {
- if (use_masked_motion_search(cur_type)) {
- best_tmp_rate_mv = tmp_rate_mv;
- best_mv[0].as_int = mbmi->mv[0].as_int;
- best_mv[1].as_int = mbmi->mv[1].as_int;
- } else {
- best_mv[0].as_int = cur_mv[0].as_int;
- best_mv[1].as_int = cur_mv[1].as_int;
- }
- }
+ // When jnt_comp_skip_mv_search is on, the new mv is searched only once
+ if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
+ comp_idx == 0)) {
+ newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
+ &rate_mv, args);
+
+ // Store cur_mv and rate_mv so that they can be restored in the next
+ // iteration of the loop
+ backup_mv[0] = cur_mv[0];
+ backup_mv[1] = cur_mv[1];
+ backup_rate_mv = rate_mv;
}
- // reset to original mvs for next iteration
- mbmi->mv[0].as_int = cur_mv[0].as_int;
- mbmi->mv[1].as_int = cur_mv[1].as_int;
- }
- mbmi->comp_group_idx = best_comp_group_idx;
- mbmi->compound_idx = best_compound_idx;
- mbmi->interinter_comp = best_compound_data;
- assert(IMPLIES(mbmi->comp_group_idx == 1,
- mbmi->interinter_comp.type != COMPOUND_AVERAGE));
- memcpy(xd->seg_mask, tmp_best_mask_buf, 2 * num_pix * sizeof(uint8_t));
- if (have_newmv_in_inter_mode(this_mode)) {
- mbmi->mv[0].as_int = best_mv[0].as_int;
- mbmi->mv[1].as_int = best_mv[1].as_int;
- if (use_masked_motion_search(mbmi->interinter_comp.type)) {
- rd_stats->rate += best_tmp_rate_mv - rate_mv;
- rate_mv = best_tmp_rate_mv;
+
+ if (newmv_ret_val != 0) {
+ continue;
+ } else {
+ rd_stats->rate += rate_mv;
}
}
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
- if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
- restore_dst_buf(xd, orig_dst, num_planes);
- early_terminate = INT64_MAX;
+ // Initialise tmp_dst and orig_dst buffers to prevent "may be used
+ // uninitialized" warnings in GCC when the stream is monochrome.
+ memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
+ memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
+ memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
+ memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
+
+ // do first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ for (i = 0; i < num_planes; i++) {
+ tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
+ tmp_dst.stride[i] = MAX_SB_SIZE;
+ }
+ for (i = 0; i < num_planes; i++) {
+ orig_dst.plane[i] = xd->plane[i].dst.buf;
+ orig_dst.stride[i] = xd->plane[i].dst.stride;
+ }
+
+ const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+#if USE_DISCOUNT_NEWMV_TEST
+ // We don't include the cost of the second reference here, because there
+ // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
+ // other words if you present them in that order, the second one is always
+ // known if the first is known.
+ //
+ // Under some circumstances we discount the cost of new mv mode to
+ // encourage initiation of a motion field.
+ if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+ // discount_newmv_test only applies discount on NEWMV mode.
+ assert(this_mode == NEWMV);
+ rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
+ cost_mv_ref(x, NEARESTMV, mode_ctx));
+ } else {
+ rd_stats->rate += ref_mv_cost;
+ }
+#else
+ rd_stats->rate += ref_mv_cost;
+#endif
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
continue;
}
- compmode_interinter_cost = best_compmode_interinter_cost;
- }
- if (is_comp_pred) {
- int tmp_rate;
- int64_t tmp_dist;
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
- &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
- plane_sse, plane_dist);
- rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
- }
-
- if (search_jnt_comp) {
- // if 1/2 model rd is larger than best_rd in jnt_comp mode,
- // use jnt_comp mode, save additional search
- if ((rd >> 1) > best_rd) {
+ ret_val = interpolation_filter_search(
+ x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+ args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) {
restore_dst_buf(xd, orig_dst, num_planes);
+ if ((rd >> 4) > ref_best_rd) break;
continue;
}
- }
- if (!is_comp_pred)
- args->single_filter[this_mode][refs[0]] =
- av1_extract_interp_filter(mbmi->interp_filters, 0);
+ if (is_comp_pred && comp_idx) {
+ int64_t best_rd_compound;
+ compmode_interinter_cost = compound_type_rd(
+ cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
+ &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats,
+ ref_best_rd);
+ if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) {
+ int tmp_rate;
+ int64_t tmp_dist;
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize);
+ for (int plane = 0; plane < num_planes; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
+ plane_sse, plane_dist);
+ rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
+ }
+ }
- if (args->modelled_rd != NULL) {
- if (is_comp_pred) {
- const int mode0 = compound_ref0_mode(this_mode);
- const int mode1 = compound_ref1_mode(this_mode);
- const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
- args->modelled_rd[mode1][refs[1]]);
- if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ if (search_jnt_comp) {
+ // if 1/2 model rd is larger than best_rd in jnt_comp mode,
+ // use jnt_comp mode, save additional search
+ if ((rd >> 1) > best_rd) {
restore_dst_buf(xd, orig_dst, num_planes);
- early_terminate = INT64_MAX;
continue;
}
- } else {
- args->modelled_rd[this_mode][refs[0]] = rd;
}
- }
- if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
- // if current pred_error modeled rd is substantially more than the best
- // so far, do not bother doing full rd
- if (rd / 2 > ref_best_rd) {
- restore_dst_buf(xd, orig_dst, num_planes);
- early_terminate = INT64_MAX;
- continue;
+ if (!is_comp_pred)
+ args->single_filter[this_mode][refs[0]] =
+ av1_extract_interp_filter(mbmi->interp_filters, 0);
+
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
+ args->modelled_rd[mode1][refs[1]]);
+ if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ } else {
+ args->modelled_rd[this_mode][refs[0]] = rd;
+ }
}
- }
- rd_stats->rate += compmode_interinter_cost;
+ if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ // if current pred_error modeled rd is substantially more than the best
+ // so far, do not bother doing full rd
+ if (rd / 2 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
- if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
- // TODO(chengchen): this speed feature introduces big loss.
- // Need better estimation of rate distortion.
- rd_stats->rate += rs;
- rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
- rd_stats_y->rate = plane_rate[0];
- rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
- rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
- rd_stats_y->sse = plane_sse[0];
- rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
- rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
- rd_stats_y->dist = plane_dist[0];
- rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
- } else {
+ rd_stats->rate += compmode_interinter_cost;
+
+ if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+ // TODO(chengchen): this speed feature introduces big loss.
+ // Need better estimation of rate distortion.
+ rd_stats->rate += rs;
+ rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
+ rd_stats_y->rate = plane_rate[0];
+ rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
+ rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
+ rd_stats_y->sse = plane_sse[0];
+ rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
+ rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
+ rd_stats_y->dist = plane_dist[0];
+ rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
+ } else {
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
- disable_skip, mi_row, mi_col, args, ref_best_rd,
- refs, rate_mv, &orig_dst, best_est_rd);
+ ret_val =
+ motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+ disable_skip, mi_row, mi_col, args, ref_best_rd,
+ refs, rate_mv, &orig_dst, best_est_rd);
#else
- ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
- disable_skip, mi_row, mi_col, args, ref_best_rd,
- refs, rate_mv, &orig_dst);
+ ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
+ rd_stats_uv, disable_skip, mi_row, mi_col,
+ args, ref_best_rd, refs, rate_mv, &orig_dst);
#endif
- }
- if (ret_val != INT64_MAX) {
- if (search_jnt_comp) {
+ }
+ if (ret_val != INT64_MAX) {
int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
if (tmp_rd < best_rd) {
best_rd_stats = *rd_stats;
best_rd_stats_y = *rd_stats_y;
best_rd_stats_uv = *rd_stats_uv;
- best_ret_val = ret_val;
best_rd = tmp_rd;
best_mbmi = *mbmi;
+ best_disable_skip = *disable_skip;
+ best_xskip = x->skip;
memcpy(best_blk_skip, x->blk_skip,
sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
}
+
+ if (tmp_rd < best_rd2) {
+ best_rd2 = tmp_rd;
+ }
+
if (tmp_rd < ref_best_rd) {
ref_best_rd = tmp_rd;
}
}
- }
- if (!search_jnt_comp && ret_val != 0) {
restore_dst_buf(xd, orig_dst, num_planes);
- return ret_val;
}
- restore_dst_buf(xd, orig_dst, num_planes);
+
+ args->modelled_rd = NULL;
}
+ if (best_rd == INT64_MAX) return INT64_MAX;
+
// re-instate status of the best choice
- if (is_comp_pred && best_ret_val != INT64_MAX) {
- *rd_stats = best_rd_stats;
- *rd_stats_y = best_rd_stats_y;
- *rd_stats_uv = best_rd_stats_uv;
- ret_val = best_ret_val;
- *mbmi = best_mbmi;
- assert(IMPLIES(mbmi->comp_group_idx == 1,
- mbmi->interinter_comp.type != COMPOUND_AVERAGE));
- memcpy(x->blk_skip, best_blk_skip,
- sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
- }
- if (early_terminate == INT64_MAX) return INT64_MAX;
- if (ret_val != 0) return ret_val;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+ *mbmi = best_mbmi;
+ *disable_skip = best_disable_skip;
+ x->skip = best_xskip;
+ assert(IMPLIES(mbmi->comp_group_idx == 1,
+ mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+
return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
}
@@ -8822,6 +9110,13 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
0);
+ if (nearestmv.as_int == INVALID_MV) {
+ nearestmv.as_int = 0;
+ }
+ if (nearmv.as_int == INVALID_MV) {
+ nearmv.as_int = 0;
+ }
+
int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
if (dv_ref.as_int == 0)
av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col);
@@ -9013,8 +9308,9 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
if (intra_yrd < best_rd) {
// Only store reconstructed luma when there's chroma RDO. When there's no
// chroma RDO, the reconstructed luma will be stored in encode_superblock().
- xd->cfl.is_chroma_reference = is_chroma_reference(
- mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
xd->cfl.store_y = store_cfl_required_rdo(cm, x);
if (xd->cfl.store_y) {
// Restore reconstructed luma values.
@@ -9081,7 +9377,7 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) {
- if (cpi->common.use_highbitdepth) {
+ if (cpi->common.seq_params.use_highbitdepth) {
data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
} else {
@@ -9760,6 +10056,8 @@ static int inter_mode_search_order_independent_skip(
if (comp_pred) {
if (!cpi->allow_comp_inter_inter) return 1;
+ if (cm->reference_mode == SINGLE_REFERENCE) return 1;
+
// Skip compound inter modes if ARF is not available.
if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
@@ -9857,7 +10155,7 @@ static int handle_intra_mode(InterModeSearchState *search_state,
av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
const int intra_cost_penalty = av1_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->base_qindex, cm->y_dc_delta_q, cm->seq_params.bit_depth);
const int rows = block_size_high[bsize];
const int cols = block_size_wide[bsize];
const int num_planes = av1_num_planes(cm);
@@ -10050,7 +10348,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
const int try_palette =
av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
- MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const struct segmentation *const seg = &cm->seg;
PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
@@ -10097,7 +10394,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
int64_t distortion2 = 0;
int skippable = 0;
int this_skip2 = 0;
- uint8_t ref_frame_type;
this_mode = av1_mode_order[mode_index].mode;
ref_frame = av1_mode_order[mode_index].ref_frame[0];
@@ -10195,7 +10491,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
mbmi->angle_delta[PLANE_TYPE_UV] = 0;
mbmi->filter_intra_mode_info.use_filter_intra = 0;
mbmi->ref_mv_idx = 0;
- ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
int64_t ref_best_rd = search_state.best_rd;
{
RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
@@ -10203,9 +10498,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
rd_stats.rate = rate2;
// Point to variables that are maintained between loop iterations
- args.single_newmv = search_state.single_newmv[0];
- args.single_newmv_rate = search_state.single_newmv_rate[0];
- args.single_newmv_valid = search_state.single_newmv_valid[0];
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
args.modelled_rd = search_state.modelled_rd;
args.single_comp_cost = real_compmode_cost;
args.ref_frame_cost = ref_frame_cost;
@@ -10218,10 +10513,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
&rd_stats_uv, &disable_skip, mi_row, mi_col,
&args, ref_best_rd);
#endif
- if (this_rd < ref_best_rd) {
- ref_best_rd = this_rd;
- }
-
rate2 = rd_stats.rate;
skippable = rd_stats.skip;
distortion2 = rd_stats.dist;
@@ -10229,108 +10520,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
rate_uv = rd_stats_uv.rate;
}
- // TODO(jingning): This needs some refactoring to improve code quality
- // and reduce redundant steps.
- if ((have_nearmv_in_inter_mode(mbmi->mode) &&
- mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
- ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
- mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
- MB_MODE_INFO backup_mbmi = *mbmi;
- int backup_skip = x->skip;
- int64_t tmp_ref_rd = this_rd;
- int ref_idx;
-
- // TODO(jingning): This should be deprecated shortly.
- int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
- int ref_set =
- AOMMIN(MAX_REF_MV_SERCH - 1,
- mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
- memcpy(x->blk_skip_drl, x->blk_skip,
- sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
- for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
- int64_t tmp_alt_rd = INT64_MAX;
- int dummy_disable_skip = 0;
- int_mv cur_mv;
- RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
-
- av1_invalid_rd_stats(&tmp_rd_stats);
-
- x->skip = 0;
-
- mbmi->ref_mv_idx = 1 + ref_idx;
-
- if (cpi->sf.reduce_inter_modes) {
- if (mbmi->ref_frame[0] == LAST2_FRAME ||
- mbmi->ref_frame[0] == LAST3_FRAME ||
- mbmi->ref_frame[1] == LAST2_FRAME ||
- mbmi->ref_frame[1] == LAST3_FRAME) {
- if (mbmi_ext
- ->ref_mv_stack[ref_frame_type]
- [mbmi->ref_mv_idx + idx_offset]
- .weight < REF_CAT_LEVEL) {
- *mbmi = backup_mbmi;
- x->skip = backup_skip;
- continue;
- }
- }
- }
-
- cur_mv =
- mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
- .this_mv;
- clamp_mv2(&cur_mv.as_mv, xd);
-
- if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
- av1_init_rd_stats(&tmp_rd_stats);
-
- args.modelled_rd = NULL;
- args.single_newmv = search_state.single_newmv[mbmi->ref_mv_idx];
- args.single_newmv_rate =
- search_state.single_newmv_rate[mbmi->ref_mv_idx];
- args.single_newmv_valid =
- search_state.single_newmv_valid[mbmi->ref_mv_idx];
- args.single_comp_cost = real_compmode_cost;
- args.ref_frame_cost = ref_frame_cost;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- tmp_alt_rd =
- handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y,
- &tmp_rd_stats_uv, &dummy_disable_skip, mi_row,
- mi_col, &args, ref_best_rd, &best_est_rd);
-#else
- tmp_alt_rd = handle_inter_mode(
- cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv,
- &dummy_disable_skip, mi_row, mi_col, &args, ref_best_rd);
-#endif
-
- // Prevent pointers from escaping local scope
- args.single_newmv = search_state.single_newmv[0];
- args.single_newmv_rate = search_state.single_newmv_rate[0];
- args.single_newmv_valid = search_state.single_newmv_valid[0];
- }
-
- if (tmp_ref_rd > tmp_alt_rd) {
- rate2 = tmp_rd_stats.rate;
- disable_skip = dummy_disable_skip;
- distortion2 = tmp_rd_stats.dist;
- skippable = tmp_rd_stats.skip;
- rate_y = tmp_rd_stats_y.rate;
- rate_uv = tmp_rd_stats_uv.rate;
- this_rd = tmp_alt_rd;
- tmp_ref_rd = tmp_alt_rd;
- backup_mbmi = *mbmi;
- backup_skip = x->skip;
- memcpy(x->blk_skip_drl, x->blk_skip,
- sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
- } else {
- *mbmi = backup_mbmi;
- x->skip = backup_skip;
- }
- }
-
- memcpy(x->blk_skip, x->blk_skip_drl,
- sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
- }
if (this_rd == INT64_MAX) continue;
this_skip2 = mbmi->skip;
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index 1fa3d68ce9..12df472c15 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -78,8 +78,8 @@ static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
}
static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
- int plane, int blk_row, int blk_col,
- int block, TX_SIZE tx_size,
+ int plane, int block, TX_SIZE tx_size,
+ const TX_TYPE tx_type,
const TXB_CTX *const txb_ctx,
int use_fast_coef_costing) {
#if TXCOEFF_COST_TIMER
@@ -87,8 +87,8 @@ static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
aom_usec_timer_start(&timer);
#endif
(void)use_fast_coef_costing;
- const int cost = av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block,
- tx_size, txb_ctx);
+ const int cost =
+ av1_cost_coeffs_txb(cm, x, plane, block, tx_size, tx_type, txb_ctx);
#if TXCOEFF_COST_TIMER
AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
aom_usec_timer_mark(&timer);
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index 49740817c3..d4b4b19c40 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -89,9 +89,27 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
SPEED_FEATURES *sf,
int speed) {
AV1_COMMON *const cm = &cpi->common;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+
+ if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ } else {
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ }
+
+ if (speed >= 1) {
+ if (is_720p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->use_square_partition_only_threshold = BLOCK_32X32;
+ }
+ }
if (speed >= 2) {
- if (AOMMIN(cm->width, cm->height) >= 720) {
+ if (is_720p_or_larger) {
sf->disable_split_mask =
cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
sf->adaptive_pred_interp_filter = 0;
@@ -106,7 +124,7 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
}
if (speed >= 3) {
- if (AOMMIN(cm->width, cm->height) >= 720) {
+ if (is_720p_or_larger) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
sf->partition_search_breakout_dist_thr = (1 << 25);
@@ -130,7 +148,7 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
}
if (speed >= 4) {
- if (AOMMIN(cm->width, cm->height) >= 720) {
+ if (is_720p_or_larger) {
sf->partition_search_breakout_dist_thr = (1 << 26);
} else {
sf->partition_search_breakout_dist_thr = (1 << 24);
@@ -149,6 +167,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->reduce_inter_modes = 1;
sf->prune_ext_partition_types_search_level = 1;
sf->ml_prune_ab_partition = 1;
+ sf->ml_prune_4_partition = 1;
sf->adaptive_txb_search_level = 1;
sf->jnt_comp_skip_mv_search = 1;
sf->model_based_prune_tx_search_level = 1;
@@ -195,7 +214,9 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
sf->partition_search_breakout_rate_thr = 80;
- sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ // Note: This speed feature is disabled as it seems to be worse in
+ // compression/quality and is also slower.
+ // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
sf->allow_partition_search_skip = 1;
sf->disable_wedge_search_var_thresh = 100;
sf->fast_wedge_sign_estimate = 1;
@@ -221,7 +242,8 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
if (speed >= 4) {
sf->tx_type_search.fast_intra_tx_type_search = 1;
sf->tx_type_search.fast_inter_tx_type_search = 1;
- sf->use_square_partition_only = !boosted;
+ sf->use_square_partition_only_threshold =
+ boosted ? BLOCK_128X128 : BLOCK_4X4;
sf->tx_size_search_method =
frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
@@ -242,7 +264,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
- sf->use_square_partition_only = 1;
+ sf->use_square_partition_only_threshold = BLOCK_4X4;
sf->tx_size_search_method = USE_LARGESTALL;
sf->mv.search_method = BIGDIA;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
@@ -363,9 +385,11 @@ static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) {
if (speed & PARTITION_SF) {
if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
has_internal_image_edge(cpi)) {
- sf->use_square_partition_only = !frame_is_boosted(cpi);
+ sf->use_square_partition_only_threshold =
+ frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4;
} else {
- sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->use_square_partition_only_threshold =
+ frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4;
}
sf->less_rectangular_check = 1;
sf->prune_ext_partition_types_search_level = 2;
@@ -438,7 +462,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->tx_type_search.skip_tx_search = 0;
sf->selective_ref_frame = 0;
sf->less_rectangular_check = 0;
- sf->use_square_partition_only = 0;
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
sf->auto_min_max_partition_size = NOT_IN_USE;
sf->rd_auto_partition_min_limit = BLOCK_4X4;
sf->default_max_partition_size = BLOCK_LARGEST;
@@ -493,6 +517,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->simple_model_rd_from_var = 0;
sf->prune_ext_partition_types_search_level = 0;
sf->ml_prune_ab_partition = 0;
+ sf->ml_prune_4_partition = 0;
sf->fast_cdef_search = 0;
// Set this at the appropriate speed levels
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index 59cb6be580..d0408ba2f0 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -400,6 +400,9 @@ typedef struct SPEED_FEATURES {
// Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
int ml_prune_ab_partition;
+ // Use a ML model to prune horz4 and vert4 partitions.
+ int ml_prune_4_partition;
+
int fast_cdef_search;
// 2-pass coding block partition search
@@ -413,8 +416,8 @@ typedef struct SPEED_FEATURES {
// rd than partition type split.
int less_rectangular_check;
- // Disable testing non square partitions. (eg 16x32)
- int use_square_partition_only;
+ // Use square partition only beyond this block size.
+ BLOCK_SIZE use_square_partition_only_threshold;
// Sets min and max partition sizes for this superblock based on the
// same superblock in last encoded frame, and the left and above neighbor.
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index 250feab81f..d7e4f4eb39 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -535,10 +535,10 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
// Adjust the strength based on active max q.
if (cpi->common.current_video_frame > 1)
q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
- cpi->common.bit_depth));
+ cpi->common.seq_params.bit_depth));
else
q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME],
- cpi->common.bit_depth));
+ cpi->common.seq_params.bit_depth));
if (q > 16) {
strength = oxcf->arnr_strength;
} else {
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index 84065d6de5..c71f2e74ce 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 0000000000..592462e20d
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,2068 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+/*
+ * fdct16x16_new_avx2() - 16-point forward DCT butterfly network (going by the
+ * "fdct" name) evaluated on __m256i vectors.
+ *
+ * input:   input[0..15] are read, and only in stage 1.
+ * output:  output[0..15] are written, and only in the final stage.
+ * cos_bit: handed to cospi_arr() to select the weight table and to every
+ *          btf_16_w16_avx2() call; must be >= 1 because 1 << (cos_bit - 1) is
+ *          evaluated below.
+ *
+ * Naming: cospi_<s><a>_<s><b> is pair_set_w16_epi16(+/-cospi[a], +/-cospi[b]),
+ * where 'p' means the table value as is and 'm' means negated.
+ *
+ * All work happens in the local array x1[16]; the stage comments list which
+ * elements each stage touches.
+ *
+ * NOTE(review): the helpers are defined outside this hunk. Going by their
+ * names, btf_16_adds_subs*_avx2() look like saturating 16-bit add/subtract
+ * butterflies and btf_16_w16_avx2() like a weighted butterfly that rounds with
+ * _r and shifts by cos_bit; each vector presumably carries 16 int16 lanes that
+ * are transformed independently - confirm against the included txfm headers.
+ */
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);  // weight table; indices 4..60 are used here
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // same value in every 32-bit lane
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+ // stage 1: butterfly input[i] with input[15 - i] into x1[i] and x1[15 - i].
+ __m256i x1[16];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+ // stage 2: fold x1[0..7]; weight x1[10..13] with cospi[32]; x1[8,9,14,15] untouched.
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3: fold x1[0..3]; weight x1[5], x1[6]; add/sub x1[8..11] and x1[12..15].
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+ // stage 4: x1[0..3] get their last update; add/sub x1[4..7]; weight x1[9,10,13,14].
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ // stage 5: x1[4..7] get their last update; add/sub adjacent pairs in x1[8..15].
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 6: x1[8..15] get their last update, pairing x1[8 + k] with x1[15 - k].
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 7: output[i] = x1[j], where j is i with its 4 bits reversed.
+ output[0] = x1[0];
+ output[1] = x1[8];
+ output[2] = x1[4];
+ output[3] = x1[12];
+ output[4] = x1[2];
+ output[5] = x1[10];
+ output[6] = x1[6];
+ output[7] = x1[14];
+ output[8] = x1[1];
+ output[9] = x1[9];
+ output[10] = x1[5];
+ output[11] = x1[13];
+ output[12] = x1[3];
+ output[13] = x1[11];
+ output[14] = x1[7];
+ output[15] = x1[15];
+}
+/*
+ * fdct16x32_new_avx2() - 32-point forward DCT butterfly network (going by the
+ * "fdct" name) evaluated on __m256i vectors.
+ *
+ * input:   input[0..31] are read, and only in stage 1.
+ * output:  output[0..31] are written, and only in the final stage.
+ * cos_bit: handed to cospi_arr() to select the weight table and to every
+ *          btf_16_w16_avx2() call; must be >= 1 because 1 << (cos_bit - 1) is
+ *          evaluated below.
+ *
+ * Naming: cospi_<s><a>_<s><b> is pair_set_w16_epi16(+/-cospi[a], +/-cospi[b]),
+ * where 'p' means the table value as is and 'm' means negated.
+ *
+ * All work happens in the local array x1[32]; the stage comments list which
+ * elements each stage touches.
+ *
+ * NOTE(review): the helpers are defined outside this hunk. Going by their
+ * names, btf_16_adds_subs*_avx2() look like saturating 16-bit add/subtract
+ * butterflies and btf_16_w16_avx2() like a weighted butterfly that rounds with
+ * _r and shifts by cos_bit; each vector presumably carries 16 int16 lanes that
+ * are transformed independently - confirm against the included txfm headers.
+ */
+static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);  // weight table; indices 2..62 are used here
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // same value in every 32-bit lane
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+ // stage 1: butterfly input[i] with input[31 - i] into x1[i] and x1[31 - i].
+ __m256i x1[32];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2: fold x1[0..15]; weight x1[20..27] with cospi[32]; x1[16..19], x1[28..31] untouched.
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3: fold x1[0..7]; weight x1[10..13]; add/sub x1[16..23] and x1[24..31].
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+ // stage 4: fold x1[0..3]; weight x1[5], x1[6]; add/sub x1[8..15]; weight x1[18..21], x1[26..29].
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5: x1[0..3] get their last update; add/sub x1[4..7]; weight x1[9,10,13,14]; add/sub x1[16..31].
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+ // stage 6: x1[4..7] get their last update; add/sub x1[8..15]; weight x1[17,18,21,22,25,26,29,30].
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7: x1[8..15] get their last update; add/sub adjacent pairs in x1[16..31].
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+ // stage 8: x1[16..31] get their last update, pairing x1[16 + k] with x1[31 - k].
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9: output[i] = x1[j], where j is i with its 5 bits reversed.
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+ __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+ __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+ __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+ __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+ __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+ __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+ __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+ __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+ __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+ __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+ __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+ __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+ __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+ __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+ __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+ __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+ __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+ __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+ __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+ __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+ __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+ __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+ __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+ __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+ __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+ __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+ __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+ __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+ __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+ __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+ __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+ __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) { /*
+ * Runs a fixed, fully unrolled 9-stage network over 32 __m256i vectors:
+ * reads input[0..31], updates the local x1[] buffer in place stage by
+ * stage, then writes output[0..31].
+ *
+ * input   - 32 vectors, read only in stage 1.
+ * output  - 32 vectors, written only in stage 9.
+ * cos_bit - selects the cosine table through cospi_arr() and defines the
+ *           rounding constant _r = 1 << (cos_bit - 1), broadcast to every
+ *           32-bit lane; both are forwarded to the btf_32_avx2_type* helpers.
+ *
+ * Statement order matters: each stage consumes the x1[] values produced by
+ * the previous one.
+ * NOTE(review): presumably the forward 32-point DCT on 32-bit lanes, going
+ * by the function and helper names; the helper bodies are not visible
+ * here, so confirm against their definitions.
+ */
+ __m256i x1[32];
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+ // stage 0: nothing to do; stage 1 reads input[] directly
+ // stage 1: pair input[i] with input[31 - i], results into x1[i] / x1[31 - i]
+ btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2: add_sub on mirrored pairs of x1[0..15]; +/-cospi[32] type0 steps
+ // on x1[20..27]; x1[16..19] and x1[28..31] pass through unchanged
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3: add_sub on x1[0..7] and x1[16..31]; +/-cospi[32] type0 steps on
+ // x1[10..13]
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+ // stage 4: add_sub on x1[0..3] and x1[8..15]; cospi[32] type0 step on
+ // (5, 6); cospi[16]/cospi[48] type0 steps pairing x1[18..21] with x1[26..29]
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5: last update of x1[0..3]; add_sub on x1[4..7] and x1[16..31];
+ // cospi[16]/cospi[48] type0 steps on (9, 14) and (10, 13)
+ btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+ // stage 6: last update of x1[4..7] (type1); add_sub on x1[8..15]; type0
+ // steps on (17, 30), (18, 29), (21, 26), (22, 25)
+ btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7: last update of x1[8..15] (type1); add_sub on x1[16..31]
+ btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+ // stage 8: last update of x1[16..31]: type1 steps on pairs (16 + i, 31 - i)
+ btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9: reorder only; output[k] = x1[k with its 5 index bits reversed]
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) { /*
+ * Runs a fixed, fully unrolled 11-stage network over 64 __m256i vectors:
+ * reads input[0..63], updates the local x1[] buffer in place stage by
+ * stage, then writes output[0..63].
+ *
+ * input   - 64 vectors, read only in stage 1.
+ * output  - 64 vectors, written only in stage 11.
+ * cos_bit - selects the cosine table through cospi_arr() and defines the
+ *           rounding constant _r = 1 << (cos_bit - 1), broadcast to every
+ *           32-bit lane; both are forwarded to the btf_32_avx2_type*_new
+ *           helpers.
+ *
+ * Statement order matters: each stage consumes the x1[] values produced by
+ * the previous one.
+ * NOTE(review): presumably the forward 64-point DCT on 32-bit lanes, going
+ * by the function and helper names; the helper bodies are not visible
+ * here, so confirm against their definitions.
+ */
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+ // Broadcast every +/-cospi[k] used below to all 32-bit lanes once, up front.
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ // stage 1: pair input[i] with input[63 - i], results into x1[i] / x1[63 - i]
+ __m256i x1[64];
+ btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2: add_sub on mirrored pairs of x1[0..31]; +/-cospi[32] type0 steps
+ // on x1[40..55]; x1[32..39] and x1[56..63] pass through unchanged
+ btf_32_add_sub_avx2(&x1[0], &x1[31]);
+ btf_32_add_sub_avx2(&x1[1], &x1[30]);
+ btf_32_add_sub_avx2(&x1[2], &x1[29]);
+ btf_32_add_sub_avx2(&x1[3], &x1[28]);
+ btf_32_add_sub_avx2(&x1[4], &x1[27]);
+ btf_32_add_sub_avx2(&x1[5], &x1[26]);
+ btf_32_add_sub_avx2(&x1[6], &x1[25]);
+ btf_32_add_sub_avx2(&x1[7], &x1[24]);
+ btf_32_add_sub_avx2(&x1[8], &x1[23]);
+ btf_32_add_sub_avx2(&x1[9], &x1[22]);
+ btf_32_add_sub_avx2(&x1[10], &x1[21]);
+ btf_32_add_sub_avx2(&x1[11], &x1[20]);
+ btf_32_add_sub_avx2(&x1[12], &x1[19]);
+ btf_32_add_sub_avx2(&x1[13], &x1[18]);
+ btf_32_add_sub_avx2(&x1[14], &x1[17]);
+ btf_32_add_sub_avx2(&x1[15], &x1[16]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3: add_sub on x1[0..15] and x1[32..63]; +/-cospi[32] type0 steps on
+ // x1[20..27]
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[47]);
+ btf_32_add_sub_avx2(&x1[33], &x1[46]);
+ btf_32_add_sub_avx2(&x1[34], &x1[45]);
+ btf_32_add_sub_avx2(&x1[35], &x1[44]);
+ btf_32_add_sub_avx2(&x1[36], &x1[43]);
+ btf_32_add_sub_avx2(&x1[37], &x1[42]);
+ btf_32_add_sub_avx2(&x1[38], &x1[41]);
+ btf_32_add_sub_avx2(&x1[39], &x1[40]);
+ btf_32_add_sub_avx2(&x1[63], &x1[48]);
+ btf_32_add_sub_avx2(&x1[62], &x1[49]);
+ btf_32_add_sub_avx2(&x1[61], &x1[50]);
+ btf_32_add_sub_avx2(&x1[60], &x1[51]);
+ btf_32_add_sub_avx2(&x1[59], &x1[52]);
+ btf_32_add_sub_avx2(&x1[58], &x1[53]);
+ btf_32_add_sub_avx2(&x1[57], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+ // stage 4: add_sub on x1[0..7] and x1[16..31]; +/-cospi[32] type0 steps on
+ // x1[10..13]; cospi[16]/cospi[48] type0 steps pairing x1[36..43] with
+ // x1[52..59]
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5: add_sub on x1[0..3], x1[8..15] and x1[32..63]; cospi[32] type0
+ // step on (5, 6); cospi[16]/cospi[48] type0 steps pairing x1[18..21] with
+ // x1[26..29]
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[39]);
+ btf_32_add_sub_avx2(&x1[33], &x1[38]);
+ btf_32_add_sub_avx2(&x1[34], &x1[37]);
+ btf_32_add_sub_avx2(&x1[35], &x1[36]);
+ btf_32_add_sub_avx2(&x1[47], &x1[40]);
+ btf_32_add_sub_avx2(&x1[46], &x1[41]);
+ btf_32_add_sub_avx2(&x1[45], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[43]);
+ btf_32_add_sub_avx2(&x1[48], &x1[55]);
+ btf_32_add_sub_avx2(&x1[49], &x1[54]);
+ btf_32_add_sub_avx2(&x1[50], &x1[53]);
+ btf_32_add_sub_avx2(&x1[51], &x1[52]);
+ btf_32_add_sub_avx2(&x1[63], &x1[56]);
+ btf_32_add_sub_avx2(&x1[62], &x1[57]);
+ btf_32_add_sub_avx2(&x1[61], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+ // stage 6: last update of x1[0..3]; add_sub on x1[4..7] and x1[16..31];
+ // type0 steps on (9, 14), (10, 13) and on pairs inside x1[34..61]
+ btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7: last update of x1[4..7] (type1); add_sub on x1[8..15] and
+ // x1[32..63]; type0 steps on (17, 30), (18, 29), (21, 26), (22, 25)
+ btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[35]);
+ btf_32_add_sub_avx2(&x1[33], &x1[34]);
+ btf_32_add_sub_avx2(&x1[39], &x1[36]);
+ btf_32_add_sub_avx2(&x1[38], &x1[37]);
+ btf_32_add_sub_avx2(&x1[40], &x1[43]);
+ btf_32_add_sub_avx2(&x1[41], &x1[42]);
+ btf_32_add_sub_avx2(&x1[47], &x1[44]);
+ btf_32_add_sub_avx2(&x1[46], &x1[45]);
+ btf_32_add_sub_avx2(&x1[48], &x1[51]);
+ btf_32_add_sub_avx2(&x1[49], &x1[50]);
+ btf_32_add_sub_avx2(&x1[55], &x1[52]);
+ btf_32_add_sub_avx2(&x1[54], &x1[53]);
+ btf_32_add_sub_avx2(&x1[56], &x1[59]);
+ btf_32_add_sub_avx2(&x1[57], &x1[58]);
+ btf_32_add_sub_avx2(&x1[63], &x1[60]);
+ btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+ // stage 8: last update of x1[8..15] (type1); add_sub on x1[16..31]; type0
+ // steps on pairs inside x1[33..62]
+ btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+ btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9: last update of x1[16..31] (type1 on pairs (16 + i, 31 - i));
+ // add_sub on x1[32..63]
+ btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[33]);
+ btf_32_add_sub_avx2(&x1[35], &x1[34]);
+ btf_32_add_sub_avx2(&x1[36], &x1[37]);
+ btf_32_add_sub_avx2(&x1[39], &x1[38]);
+ btf_32_add_sub_avx2(&x1[40], &x1[41]);
+ btf_32_add_sub_avx2(&x1[43], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[45]);
+ btf_32_add_sub_avx2(&x1[47], &x1[46]);
+ btf_32_add_sub_avx2(&x1[48], &x1[49]);
+ btf_32_add_sub_avx2(&x1[51], &x1[50]);
+ btf_32_add_sub_avx2(&x1[52], &x1[53]);
+ btf_32_add_sub_avx2(&x1[55], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[57]);
+ btf_32_add_sub_avx2(&x1[59], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[61]);
+ btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+ // stage 10: last update of x1[32..63]: type1 steps on pairs (32 + i, 63 - i)
+ btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11: reorder only; output[k] = x1[k with its 6 index bits reversed]
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[15]);
+ x1[2] = _mm256_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm256_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm256_subs_epi16(__zero, input[11]);
+ x1[8] = _mm256_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm256_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm256_subs_epi16(__zero, input[13]);
+ x1[14] = _mm256_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[1];
+ output[1] = x1[14];
+ output[2] = x1[3];
+ output[3] = x1[12];
+ output[4] = x1[5];
+ output[5] = x1[10];
+ output[6] = x1[7];
+ output[7] = x1[8];
+ output[8] = x1[9];
+ output[9] = x1[6];
+ output[10] = x1[11];
+ output[11] = x1[4];
+ output[12] = x1[13];
+ output[13] = x1[2];
+ output[14] = x1[15];
+ output[15] = x1[0];
+}
+
+// Multiply-accumulates adjacent 16-bit pairs of `a` against the pair
+// (scale, rounding bias) and arithmetic-shifts each 32-bit sum right by
+// NewSqrt2Bits. Callers in this file interleave their data with 1, so each
+// lane evaluates (x * scale + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits.
+// NOTE(review): assumes pair_set_w16_epi16(a, b) places `a` in the even and
+// `b` in the odd 16-bit slot of every 32-bit lane -- confirm against its
+// definition.
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const int rounding = 1 << (NewSqrt2Bits - 1);
+  const __m256i scale_and_round = pair_set_w16_epi16(scale, rounding);
+  const __m256i sums = _mm256_madd_epi16(a, scale_and_round);
+  return _mm256_srai_epi32(sums, NewSqrt2Bits);
+}
+
+// 16-point identity transform over 16 vectors of 16-bit values: every element
+// is scaled by 2 * NewSqrt2 with rounding (via scale_round_avx2) and the
+// 32-bit results are packed back to 16 bits with signed saturation.
+// `cos_bit` is unused; the parameter exists so the function fits the
+// transform_1d_avx2 tables below.
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  const int scale = 2 * NewSqrt2;
+  (void)cos_bit;
+
+  int row = 0;
+  while (row < 16) {
+    const __m256i src = input[row];
+    // Pair every value with 1 so the madd in scale_round_avx2 adds the
+    // rounding bias exactly once per element.
+    const __m256i lo_pairs = _mm256_unpacklo_epi16(src, ones);
+    const __m256i hi_pairs = _mm256_unpackhi_epi16(src, ones);
+    output[row] = _mm256_packs_epi32(scale_round_avx2(lo_pairs, scale),
+                                     scale_round_avx2(hi_pairs, scale));
+    ++row;
+  }
+}
+
+// 32-point identity transform over 32 vectors of 16-bit values: each element
+// is multiplied by 4 through a 16-bit left shift by 2 (no saturation).
+// `cos_bit` is unused; it is kept for the transform_1d_avx2 signature.
+static INLINE void fidentity16x32_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  int row = 0;
+  while (row < 32) {
+    output[row] = _mm256_slli_epi16(input[row], 2);
+    ++row;
+  }
+}
+
+// Shifts `size` vectors of 32-bit values from `input` into `output`:
+//   bit > 0  -> rounding right shift by `bit`
+//   bit <= 0 -> plain left shift by -bit
+// `input` and `output` may be the same array (callers here work in place).
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+                                                 __m256i *output,
+                                                 const int size,
+                                                 const int bit) {
+  if (bit <= 0) {
+    for (int idx = 0; idx < size; idx++) {
+      output[idx] = _mm256_slli_epi32(input[idx], -bit);
+    }
+    return;
+  }
+
+  for (int idx = 0; idx < size; idx++) {
+    output[idx] = av1_round_shift_32_avx2(input[idx], bit);
+  }
+}
+
+// Rectangular-block variant of av1_round_shift_array_32_avx2: after the shift
+// (rounding right shift by `bit` when bit > 0, left shift by -bit otherwise)
+// every 32-bit element is multiplied by NewSqrt2 and round-shifted right by
+// NewSqrt2Bits.
+static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
+                                                      __m256i *output,
+                                                      const int size,
+                                                      const int bit) {
+  const __m256i sqrt2_vec = _mm256_set1_epi32(NewSqrt2);
+  const int shift_right = bit > 0;
+
+  for (int idx = 0; idx < size; idx++) {
+    const __m256i shifted = shift_right
+                                ? av1_round_shift_32_avx2(input[idx], bit)
+                                : _mm256_slli_epi32(input[idx], -bit);
+    const __m256i scaled = _mm256_mullo_epi32(sqrt2_vec, shifted);
+    output[idx] = av1_round_shift_32_avx2(scaled, NewSqrt2Bits);
+  }
+}
+
+// Transposes an 8x8 block of 32-bit values. inputA[0..7] each hold one row of
+// eight values; the eight transposed rows are written to output[0],
+// output[stride], ..., output[7 * stride], with `stride` counted in __m256i
+// units.
+static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA,
+ __m256i *output) {
+ // Round 1: interleave rows two apart (0/2, 1/3, 4/6, 5/7) within each
+ // 128-bit lane.
+ __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]);
+ __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]);
+ __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]);
+ __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]);
+ __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]);
+ __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]);
+ __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]);
+ __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]);
+
+ // Round 2: t0..t3 hold column k (low lane) and column k + 4 (high lane) of
+ // rows 0..3; t4..t7 hold the same columns for rows 4..7.
+ __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2);
+ __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2);
+ __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3);
+ __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3);
+ __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6);
+ __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6);
+ __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7);
+ __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7);
+
+ // 0x20 joins the two low lanes (columns 0..3 become output rows 0..3);
+ // 0x31 joins the two high lanes (columns 4..7 become output rows 4..7).
+ output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20);
+ output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20);
+ output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20);
+ output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20);
+ output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31);
+ output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31);
+ output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31);
+ output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31);
+}
+
+// Stores `out_size` rows of sixteen 16-bit values each, sign-extended to
+// 32 bits. Row i of `in` is written to out + i * stride (stride counted in
+// int32_t elements) as two 8-element halves. The stores are the aligned
+// variant, so every row address must be 32-byte aligned.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ int32_t *out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ // Low 128 bits -> elements 0..7 of the row.
+ _mm256_store_si256((__m256i *)(out),
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
+ // High 128 bits -> elements 8..15 of the row.
+ _mm256_store_si256(
+ (__m256i *)(out + 8),
+ _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
+ out += stride;
+ }
+}
+
+// Scales sixteen 16-bit values by NewSqrt2 (with rounding, see
+// scale_round_avx2) and stores the sixteen 32-bit results at b[0..15] using
+// aligned stores.
+static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+                                                  int32_t *const b) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  // 0xd8 orders the 64-bit quarters as 0, 2, 1, 3 so that the per-lane
+  // unpacks below produce elements 0..7 and 8..15 in their original order.
+  const __m256i reordered = _mm256_permute4x64_epi64(a, 0xd8);
+  const __m256i lo_pairs = _mm256_unpacklo_epi16(reordered, ones);
+  const __m256i hi_pairs = _mm256_unpackhi_epi16(reordered, ones);
+  const __m256i scaled_lo = scale_round_avx2(lo_pairs, NewSqrt2);
+  const __m256i scaled_hi = scale_round_avx2(hi_pairs, NewSqrt2);
+  _mm256_store_si256((__m256i *)b, scaled_lo);
+  _mm256_store_si256((__m256i *)(b + 8), scaled_hi);
+}
+
+// Stores `out_size` rows through store_rect_16bit_to_32bit_avx2: row r of
+// `in` goes to out + r * stride (stride counted in int32_t elements).
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    store_rect_16bit_to_32bit_avx2(in[row], &out[row * stride]);
+    ++row;
+  }
+}
+
+// Column-pass 1-D kernels for blocks that are 32 rows tall, indexed by
+// TX_TYPE. NULL marks a transform type with no kernel in this table; the
+// drivers below call the selected entry without a NULL check, so they must
+// only be reached with a tx_type whose entry is non-NULL.
+static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_new_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_new_avx2, // IDTX
+ fdct16x32_new_avx2, // V_DCT
+ fidentity16x32_new_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Row-pass 1-D kernels for blocks that are 32 columns wide, indexed by
+// TX_TYPE. It mirrors col_txfm16x32_arr with the V_DCT / H_DCT entries
+// swapped. NULL marks a transform type with no kernel in this table; the
+// drivers call the selected entry without a NULL check.
+static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_new_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_new_avx2, // IDTX
+ fidentity16x32_new_avx2, // V_DCT
+ fdct16x32_new_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Column-pass 1-D kernels for blocks that are 16 rows tall, indexed by
+// TX_TYPE. Every transform type has an entry. FLIPADST types map to the same
+// ADST kernel as their non-flipped counterparts; the drivers apply the flip
+// separately (see their get_flip_cfg handling).
+static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fadst16x16_new_avx2, // ADST_DCT
+ fdct16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fadst16x16_new_avx2, // FLIPADST_DCT
+ fdct16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fdct16x16_new_avx2, // V_DCT
+ fidentity16x16_new_avx2, // H_DCT
+ fadst16x16_new_avx2, // V_ADST
+ fidentity16x16_new_avx2, // H_ADST
+ fadst16x16_new_avx2, // V_FLIPADST
+ fidentity16x16_new_avx2 // H_FLIPADST
+};
+
+// Row-pass 1-D kernels for blocks that are 16 columns wide, indexed by
+// TX_TYPE. Every transform type has an entry. FLIPADST types map to the same
+// ADST kernel as their non-flipped counterparts; the drivers apply the flip
+// separately (see their get_flip_cfg handling).
+static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fdct16x16_new_avx2, // ADST_DCT
+ fadst16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fdct16x16_new_avx2, // FLIPADST_DCT
+ fadst16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fidentity16x16_new_avx2, // V_DCT
+ fdct16x16_new_avx2, // H_DCT
+ fidentity16x16_new_avx2, // V_ADST
+ fadst16x16_new_avx2, // H_ADST
+ fidentity16x16_new_avx2, // V_FLIPADST
+ fadst16x16_new_avx2 // H_FLIPADST
+};
+
+// Forward 2-D transform of a 16x16 block of 16-bit residuals.
+//   input   - residual block; `stride` int16_t elements between rows
+//   output  - receives 16x16 int32_t coefficients with a row stride of
+//             `width`; written with aligned stores
+//   tx_type - selects the column/row kernels and the flip configuration
+//   bd      - unused
+// Steps: load (optionally flipped vertically) -> shift[0] -> column kernel ->
+// shift[1] -> transpose -> (optional horizontal flip) -> row kernel ->
+// shift[2] -> transpose -> sign-extending store.
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X16;
+ __m256i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // `i` is fixed at 0: this driver processes a single 16-column strip, so
+ // every `... * i` offset below evaluates to 0.
+ const int32_t i = 0;
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ // Column pass, in place in buf0.
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+
+ // Row pass: with lr_flip the flipped copy is placed in buf0 and processed
+ // there; otherwise the transposed data in buf1 is processed in place.
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ // Transpose back (source and destination are the same buffer) and store.
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16);
+}
+
+// Forward 2-D transform of a 32x32 block of 16-bit residuals.
+//   input   - residual block; `stride` int16_t elements between rows
+//   output  - receives 32x32 int32_t coefficients with a row stride of
+//             `width`; written with aligned stores
+//   tx_type - indexes col_txfm16x32_arr / row_txfm16x32_arr; the selected
+//             entries are called without a NULL check
+//   bd      - unused
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_32X32;
+ __m256i buf0[32], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // Column pass: one 16-column strip per iteration, processed in buf0.
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ // Transpose the upper and lower 16x16 halves of the strip into buf1;
+ // each 16-row band of the result occupies `width` consecutive entries.
+ transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
+ transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
+ }
+
+ // Row pass: one 16-row band per iteration.
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ // Transpose each 16x16 half back and store it: the first half at column
+ // offset 0 of the band, the second half at column offset 16.
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width,
+ 16);
+ transpose_16bit_16x16_avx2(buf + 16, buf + 16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16,
+ width, 16);
+ }
+}
+
+// Forward 2-D DCT of a 64x64 block of 16-bit residuals. Only the first 32
+// rows and the first 32 coefficients of each row are produced: `output`
+// receives a 32x32 int32_t block with a row stride of 32.
+//   input   - residual block; `stride` int16_t elements between rows
+//   tx_type - must be DCT_DCT (asserted)
+//   bd      - unused
+static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass: one 16-column strip per iteration, in 16-bit precision.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ // Only the first two 16-row bands are transposed into buf1; the rest of
+ // the column output is not used.
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ // Row pass: one 16-row band per iteration, in 32-bit precision.
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ // View the band as 128-bit halves and sign-extend each half to eight
+ // 32-bit values: bufA gets the low halves, bufB the high halves.
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ // shift[2] is passed negated, so a negative table entry becomes a
+ // rounding right shift. Only the first 32 outputs are shifted and kept.
+ av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ // Transpose 8x8 tiles straight into `output`; a stride of 4 vectors is
+ // 32 int32_t per output row. bufB lands 8 rows below bufA.
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+// Forward 2-D transform of a 16-wide, 32-tall block of 16-bit residuals.
+//   input   - residual block; `stride` int16_t elements between rows
+//   output  - receives the coefficients as int32_t with a row stride of
+//             `width`, scaled by NewSqrt2 on store (rectangular block)
+//   tx_type - indexes col_txfm16x32_arr (column) and row_txfm16x16_arr (row);
+//             the selected entries are called without a NULL check
+//   bd      - unused
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X32;
+ __m256i buf0[32], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // Column pass over the single 16-column strip, in place in buf0.
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ // Transpose the upper and lower 16x16 halves separately.
+ transpose_16bit_16x16_avx2(buf0, buf1);
+ transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
+
+ // Row pass: one 16-row band per iteration.
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ transpose_16bit_16x16_avx2(buf, buf);
+ // The rect store applies the NewSqrt2 scaling while widening to 32 bits.
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i,
+ width, 16);
+ }
+}
+
+// Forward 2-D transform of a 32-wide, 16-tall block of 16-bit residuals.
+//   input   - residual block; `stride` int16_t elements between rows
+//   output  - receives the coefficients as int32_t with a row stride of 32,
+//             scaled by NewSqrt2 on store (rectangular block)
+//   tx_type - indexes col_txfm16x16_arr (column) and row_txfm16x32_arr (row);
+//             the selected entries are called without a NULL check
+//   bd      - unused
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // Column pass: one 16-column strip per iteration; each transposed strip is
+ // placed at offset 16 * i in buf1.
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+ }
+
+ // Row pass over the single 16-row band (32 vectors).
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ // First 16x16 half -> output columns 0..15.
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16);
+
+ // Second 16x16 half -> output columns 16..31.
+ transpose_16bit_16x16_avx2(buf + 16, buf + 16);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16);
+}
+
+// Forward 2-D DCT of a 64-wide, 32-tall block of 16-bit residuals. Only the
+// first 32 coefficients of each row are produced: `output` receives a 32x32
+// int32_t block with a row stride of 32, including the rectangular NewSqrt2
+// scaling.
+//   input   - residual block; `stride` int16_t elements between rows
+//   tx_type - must be DCT_DCT (asserted before any kernel is called)
+//   bd      - unused
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // Checked up front: col_txfm16x32_arr has NULL entries for most transform
+  // types, so the precondition must be verified before col_txfm is called.
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X32;
+  __m256i buf0[64], buf1[256];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+
+  // Column pass: one 16-column strip per iteration, in 16-bit precision.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+
+  // Row pass: one 16-row band per iteration, in 32-bit precision.
+  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+    __m256i bufA[64];
+    __m256i bufB[64];
+    // Sign-extend the low (bufA) and high (bufB) 128-bit halves of every
+    // vector in the band to 32-bit values.
+    __m128i *buf = (__m128i *)(buf1 + width * i);
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+    }
+    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+    // shift[2] is passed negated; only the first 32 outputs are kept.
+    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+    // Transpose 8x8 tiles straight into `output`; a stride of 4 vectors is
+    // 32 int32_t per output row. bufB lands 8 rows below bufA.
+    int32_t *output8 = output + 16 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      __m256i *out = (__m256i *)(output8 + 8 * j);
+      transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+      transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+    }
+  }
+}
+
+// Forward 2-D DCT of a 32-wide, 64-tall block of 16-bit residuals. Only the
+// first 32 rows of column-transform output are carried into the row pass:
+// `output` receives a 32x32 int32_t block with a row stride of 32, including
+// the rectangular NewSqrt2 scaling.
+//   input   - residual block; `stride` int16_t elements between rows
+//   tx_type - must be DCT_DCT (asserted)
+//   bd      - unused
+static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass: one 16-column strip per iteration, in 16-bit precision.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ // Only the first two 16-row bands are transposed into buf1.
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ // Row pass: one 16-row band per iteration, in 32-bit precision.
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[32];
+ __m256i bufB[32];
+ // Sign-extend the low (bufA) and high (bufB) 128-bit halves of every
+ // vector in the band to 32-bit values.
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
+ // shift[2] is passed negated, so a negative table entry becomes a
+ // rounding right shift.
+ av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ // Transpose 8x8 tiles straight into `output`; a stride of 4 vectors is
+ // 32 int32_t per output row. bufB lands 8 rows below bufA.
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+// Forward 2-D DCT of a 16-wide, 64-tall block of 16-bit residuals. Only the
+// top 16x32 coefficients are computed; the bottom 16x32 area of `output` is
+// set to zero.
+//   input   - residual block; `stride` int16_t elements between rows
+//   output  - receives 16 * 64 int32_t values with a row stride of `width`;
+//             written with aligned stores
+//   tx_type - must be DCT_DCT (asserted)
+//   bd      - unused
+static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  __m256i buf0[64], buf1[64];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+  const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+  // Only the first two 16-row bands (32 rows) are kept. The remaining bands
+  // are overwritten with zeros below, so transposing and row-transforming
+  // them would be wasted work.
+  const int kept_div16 = AOMMIN(2, height_div16);
+
+  // Column pass: one 16-column strip per iteration, in place in buf0.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+    for (int j = 0; j < kept_div16; ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+
+  // Row pass: one kept 16-row band per iteration.
+  for (int i = 0; i < kept_div16; i++) {
+    __m256i *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);
+    int32_t *output16 = output + 16 * width * i;
+    for (int j = 0; j < width_div16; ++j) {
+      __m256i *buf16 = buf + 16 * j;
+      transpose_16bit_16x16_avx2(buf16, buf16);
+      store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16);
+    }
+  }
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+// Forward 2-D DCT of a 64-wide, 16-tall block of 16-bit residuals. Only the
+// first 32 coefficients of each row are stored: `output` receives a 32-wide,
+// 16-tall int32_t block with a row stride of 32.
+//   input   - residual block; `stride` int16_t elements between rows
+//   tx_type - must be DCT_DCT (asserted)
+//   bd      - unused
+static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass: one 16-column strip per iteration, in place in buf0.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ // Row pass: one 16-row band per iteration.
+ for (int i = 0; i < height_div16; i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ int32_t *output16 = output + 16 * 32 * i;
+ // Only the first two 16x16 tiles (32 coefficients per row) are
+ // transposed back and stored.
+ for (int j = 0; j < 2; ++j) {
+ __m256i *buf16 = buf + 16 * j;
+ transpose_16bit_16x16_avx2(buf16, buf16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16);
+ }
+ }
+}
+
+// Forward 2-D transform entry points, indexed by TX_SIZE. Sizes without an
+// AVX2 driver in this file use the SSE2 implementation. The table is
+// read-only, so it is declared const like the other lookup tables in this
+// file.
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+  av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_sse2,   // 8x8 transform
+  lowbd_fwd_txfm2d_16x16_avx2,     // 16x16 transform
+  lowbd_fwd_txfm2d_32x32_avx2,     // 32x32 transform
+  lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_sse2,  // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_sse2,  // 16x8 transform
+  lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
+  lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
+  lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
+  lowbd_fwd_txfm2d_64x32_avx2,     // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_sse2,  // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_sse2,  // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_sse2,  // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,  // 32x8 transform
+  lowbd_fwd_txfm2d_16x64_avx2,     // 16x64 transform
+  lowbd_fwd_txfm2d_64x16_avx2,     // 64x16 transform
+};
+
+// Low-bitdepth forward transform entry point. Looks up the driver for
+// txfm_param->tx_size in fwd_txfm2d_func_ls and calls it. Falls back to
+// av1_lowbd_fwd_txfm_c when the table entry is NULL or when the block is a
+// lossless 4x4.
+//   src_diff    - residual block; `diff_stride` elements between rows
+//   coeff       - destination for the transform coefficients
+//   txfm_param  - supplies tx_size, tx_type, bd and the lossless flag
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  const FwdTxfm2dFunc simd_txfm = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  const int lossless_4x4 =
+      txfm_param->lossless && txfm_param->tx_size == TX_4X4;
+
+  if (simd_txfm != NULL && !lossless_4x4) {
+    simd_txfm(src_diff, coeff, diff_stride, txfm_param->tx_type,
+              txfm_param->bd);
+    return;
+  }
+  av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 0000000000..c582ca0e34
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_FWD_TXFM_AVX2_H_
+#define AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// Rounding arithmetic right shift of every 32-bit element:
+//   (vec + (1 << (bit - 1))) >> bit
+// `bit` must be non-negative. bit == 0 returns `vec` unchanged.
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+  // 1 << (bit - 1) is an undefined shift for bit == 0, so the rounding bias
+  // is only formed for positive shift counts.
+  const __m256i round = _mm256_set1_epi32(bit > 0 ? 1 << (bit - 1) : 0);
+  const __m256i tmp = _mm256_add_epi32(vec, round);
+  return _mm256_srai_epi32(tmp, bit);
+}
+
+// Butterfly on 32-bit elements with scalar weights, updated in place:
+//   *in0 = (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   *in1 = (in0 * w1 - in1 * w0 + _r) >> cos_bit
+// `_r` is the rounding bias supplied by the caller; both results are computed
+// from the values held by in0/in1 on entry.
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  const __m256i weight0 = _mm256_set1_epi32(w0);
+  const __m256i weight1 = _mm256_set1_epi32(w1);
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, weight0),
+                                       _mm256_mullo_epi32(b, weight1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, weight1),
+                                        _mm256_mullo_epi32(b, weight0));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Butterfly on 32-bit elements with scalar weights, updated in place:
+//   *in0 = (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   *in1 = (in1 * w0 - in0 * w1 + _r) >> cos_bit
+// Same as btf_32_avx2_type0 except that the second output has the opposite
+// sign before rounding. `_r` is the rounding bias supplied by the caller.
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  const __m256i weight0 = _mm256_set1_epi32(w0);
+  const __m256i weight1 = _mm256_set1_epi32(w1);
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, weight0),
+                                       _mm256_mullo_epi32(b, weight1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, weight0),
+                                        _mm256_mullo_epi32(a, weight1));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Butterfly on 32-bit elements with pre-broadcast weight vectors, updated in
+// place:
+//   *in0 = (in0 * ww0 + in1 * ww1 + _r) >> cos_bit
+//   *in1 = (in0 * ww1 - in1 * ww0 + _r) >> cos_bit
+// `_r` is the rounding bias supplied by the caller; both results are computed
+// from the values held by in0/in1 on entry.
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
+                                       _mm256_mullo_epi32(b, ww1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, ww1),
+                                        _mm256_mullo_epi32(b, ww0));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Butterfly on 32-bit elements with pre-broadcast weight vectors, updated in
+// place:
+//   *in0 = (in0 * ww0 + in1 * ww1 + _r) >> cos_bit
+//   *in1 = (in1 * ww0 - in0 * ww1 + _r) >> cos_bit
+// `_r` is the rounding bias supplied by the caller; both results are computed
+// from the values held by in0/in1 on entry.
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
+                                       _mm256_mullo_epi32(b, ww1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, ww0),
+                                        _mm256_mullo_epi32(a, ww1));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+#endif // AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
index 381f757da7..93f37b71d3 100644
--- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c
+++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include <stdlib.h>
#include <memory.h>
#include <math.h>
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 0000000000..f776e84c77
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+// Weight equal to 1 << WEDGE_WEIGHT_BITS; av1_wedge_sse_from_residuals_avx2
+// multiplies r1 by it. Going by the name this is the largest value a mask
+// byte can take (confirm against the definition of WEDGE_WEIGHT_BITS).
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * AVX2 counterpart of av1_wedge_sse_from_residuals_c.
+ *
+ * For each i in [0, N) forms t = MAX_MASK_VALUE * r1[i] + m[i] * d[i],
+ * saturates t to the int16_t range and adds t * t to a 64-bit total. The
+ * total is returned after ROUND_POWER_OF_TWO(total, 2 * WEDGE_WEIGHT_BITS).
+ *
+ * N is asserted to be a multiple of 64. The loop consumes 16 elements per
+ * pass and checks its exit condition only after a pass has run, so N must
+ * also be positive.
+ */
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  // Partner of every mask word, so one madd gives m * d + MAX_MASK_VALUE * r1.
+  const __m256i max_mask_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+  // Selects the low 32 bits of each 64-bit lane.
+  const __m256i low32_q = _mm256_set1_epi64x(0xffffffff);
+
+  __m256i sse_q = _mm256_setzero_si256();
+  uint64_t csse;
+  int i = 0;
+
+  assert(N % 64 == 0);
+
+  do {
+    const __m256i r_w = _mm256_lddqu_si256((const __m256i *)(r1 + i));
+    const __m256i d_w = _mm256_lddqu_si256((const __m256i *)(d + i));
+    const __m256i m_w =
+        _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(m + i)));
+
+    // Interleave (d, r1) against (m, MAX_MASK_VALUE) and multiply-add the
+    // pairs into 32-bit values.
+    const __m256i lo_d =
+        _mm256_madd_epi16(_mm256_unpacklo_epi16(d_w, r_w),
+                          _mm256_unpacklo_epi16(m_w, max_mask_w));
+    const __m256i hi_d =
+        _mm256_madd_epi16(_mm256_unpackhi_epi16(d_w, r_w),
+                          _mm256_unpackhi_epi16(m_w, max_mask_w));
+
+    // Saturate to 16 bits, then square and add adjacent pairs.
+    const __m256i t_w = _mm256_packs_epi32(lo_d, hi_d);
+    const __m256i sq_d = _mm256_madd_epi16(t_w, t_w);
+
+    // Zero-extend the even and odd 32-bit sums to 64 bits before
+    // accumulating, i.e. the pair sums are handled as unsigned values.
+    const __m256i even_q = _mm256_and_si256(sq_d, low32_q);
+    const __m256i odd_q = _mm256_srli_epi64(sq_d, 32);
+    sse_q = _mm256_add_epi64(sse_q, _mm256_add_epi64(even_q, odd_q));
+
+    i += 16;
+  } while (i != N);
+
+  // Fold the four 64-bit partial sums into the lowest lane.
+  sse_q = _mm256_add_epi64(sse_q, _mm256_srli_si256(sse_q, 8));
+  const __m128i total_q = _mm_add_epi64(_mm256_castsi256_si128(sse_q),
+                                        _mm256_extracti128_si256(sse_q, 1));
+#if ARCH_X86_64
+  csse = (uint64_t)_mm_extract_epi64(total_q, 0);
+#else
+  xx_storel_64(&csse, total_q);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * AVX2 counterpart of av1_wedge_sign_from_residuals_c.
+ *
+ * Sums ds[i] * m[i] over i in [0, N) and returns 1 when that sum is greater
+ * than limit, 0 otherwise.
+ *
+ * N is asserted to be a multiple of 64 and below 8192. The loop consumes 64
+ * elements per pass and checks N only after a pass has run, so N must also
+ * be positive.
+ */
+int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+                                       int N, int64_t limit) {
+  int64_t acc;
+  __m256i v_acc0_d = _mm256_setzero_si256();
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
+  assert(N < 8192);
+  assert(N % 64 == 0);
+
+  do {
+    // 64 mask bytes and 64 residual words per pass.
+    const __m256i v_m01_b = _mm256_lddqu_si256((const __m256i *)(m));
+    const __m256i v_m23_b = _mm256_lddqu_si256((const __m256i *)(m + 32));
+
+    const __m256i v_d0_w = _mm256_lddqu_si256((const __m256i *)(ds));
+    const __m256i v_d1_w = _mm256_lddqu_si256((const __m256i *)(ds + 16));
+    const __m256i v_d2_w = _mm256_lddqu_si256((const __m256i *)(ds + 32));
+    const __m256i v_d3_w = _mm256_lddqu_si256((const __m256i *)(ds + 48));
+
+    // Zero-extend the mask bytes to words, 16 at a time.
+    const __m256i v_m0_w =
+        _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+    const __m256i v_m1_w =
+        _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+    const __m256i v_m2_w =
+        _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+    const __m256i v_m3_w =
+        _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+
+    // ds[i] * m[i], adjacent pairs summed into 32-bit lanes.
+    const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+    const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+    const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+    const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+    const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+    const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+    const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+    v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+    ds += 64;
+    m += 64;
+
+    N -= 64;
+  } while (N);
+
+  // Sign-extend the eight 32-bit partial sums to 64 bits by interleaving each
+  // with its own sign mask, then add the two interleaved halves.
+  const __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+  v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+                              _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+  // Fold the four 64-bit sums into the lowest lane.
+  const __m256i v_acc_q =
+      _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+  __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+  const __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+  v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+#if ARCH_X86_64
+  // The total is signed, so keep it signed on the way into acc.
+  acc = (int64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+  xx_storel_64(&acc, v_acc_q_0);
+#endif
+
+  return acc > limit;
+}
+
+// Returns saturate_int16(a[i] * a[i] - b[i] * b[i]) for the 16 values starting
+// at a and b. v_neg_w must hold +1 in the low word and -1 in the high word of
+// every 32-bit lane.
+static INLINE __m256i wedge_delta_squares_16(const int16_t *a,
+                                             const int16_t *b,
+                                             const __m256i v_neg_w) {
+  const __m256i v_a_w = _mm256_lddqu_si256((const __m256i *)a);
+  const __m256i v_b_w = _mm256_lddqu_si256((const __m256i *)b);
+
+  // Interleave so that every 32-bit lane holds the pair (a[i], b[i]).
+  const __m256i v_abl_w = _mm256_unpacklo_epi16(v_a_w, v_b_w);
+  const __m256i v_abh_w = _mm256_unpackhi_epi16(v_a_w, v_b_w);
+
+  // Negate top word of pairs: (a[i], -b[i]).
+  const __m256i v_abln_w = _mm256_sign_epi16(v_abl_w, v_neg_w);
+  const __m256i v_abhn_w = _mm256_sign_epi16(v_abh_w, v_neg_w);
+
+  // a[i] * a[i] + b[i] * -b[i] as 32-bit values.
+  const __m256i v_rl_d = _mm256_madd_epi16(v_abl_w, v_abln_w);
+  const __m256i v_rh_d = _mm256_madd_epi16(v_abh_w, v_abhn_w);
+
+  // The saturating pack puts the elements back in their original order.
+  return _mm256_packs_epi32(v_rl_d, v_rh_d);
+}
+
+/**
+ * AVX2 counterpart of av1_wedge_compute_delta_squares_c.
+ *
+ * Writes d[i] = a[i] * a[i] - b[i] * b[i], saturated to the int16_t range,
+ * for i in [0, N).
+ *
+ * N is asserted to be a multiple of 64. The loop handles 64 elements per pass
+ * and checks N only after a pass has run, so N must also be positive. All
+ * loads and stores are unaligned-tolerant, so a, b and d need no particular
+ * alignment.
+ *
+ * Note: b[i] is negated in 16 bits, which wraps for b[i] == INT16_MIN, so
+ * that input does not follow the formula above. Residual inputs are
+ * presumably far from that extreme (confirm against callers).
+ */
+void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+                                          const int16_t *b, int N) {
+  // -0xffff has the bit pattern 0xffff0001: word +1 below word -1.
+  const __m256i v_neg_w = _mm256_set1_epi32(-0xffff);
+
+  assert(N % 64 == 0);
+
+  do {
+    // Compute all four results before storing anything, so the 64 inputs of
+    // this pass are read before d is written.
+    const __m256i v_r0_w = wedge_delta_squares_16(a, b, v_neg_w);
+    const __m256i v_r1_w = wedge_delta_squares_16(a + 16, b + 16, v_neg_w);
+    const __m256i v_r2_w = wedge_delta_squares_16(a + 32, b + 32, v_neg_w);
+    const __m256i v_r3_w = wedge_delta_squares_16(a + 48, b + 48, v_neg_w);
+
+    _mm256_storeu_si256((__m256i *)(d), v_r0_w);
+    _mm256_storeu_si256((__m256i *)(d + 16), v_r1_w);
+    _mm256_storeu_si256((__m256i *)(d + 32), v_r2_w);
+    _mm256_storeu_si256((__m256i *)(d + 48), v_r3_w);
+
+    a += 64;
+    b += 64;
+    d += 64;
+    N -= 64;
+  } while (N);
+}