Diffstat (limited to 'media/libaom/src/aom_dsp/x86')
75 files changed, 14360 insertions(+), 2318 deletions(-)
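For orientation before the per-file diffs: the largest additions are AVX2/SSE2 implementations of the adaptive quantizers (aom_quantize_b_adaptive and its 32x32/64x64 variants) plus new 4-tap SSE2 subpixel filters. Each quantize kernel follows the same per-coefficient model: compare |coeff| against the zbin dead-zone, run the round/quant/quant_shift pipeline, multiply by dequant for the reconstruction value, and use a prescan threshold of zbin * (1 << AOM_QM_BITS) + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1 to trim trailing near-zero coefficients before the eob is computed. The following is a minimal scalar sketch of that quantize step only (hypothetical helper name, 32-bit scalars instead of the kernels' 16-bit lanes, and ignoring the log_scale handling of the 32x32/64x64 variants); it is an illustration, not the library's reference C path.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar reference for one 16-bit coefficient; the kernels in
 * this diff do the same math 8 or 16 lanes at a time. */
static int16_t quantize_one(int16_t coeff, int16_t zbin, int16_t round,
                            int16_t quant, int16_t shift, int16_t dequant,
                            int32_t *dqcoeff) {
  const int32_t abs_coeff = abs((int32_t)coeff);
  if (abs_coeff < zbin) {               /* dead-zone: coefficient is zeroed */
    *dqcoeff = 0;
    return 0;
  }
  int32_t tmp = abs_coeff + round;      /* adds_epi16 in the vector code */
  if (tmp > INT16_MAX) tmp = INT16_MAX;
  tmp += (tmp * quant) >> 16;           /* mulhi_epi16(qcoeff, quant) + add */
  int32_t q = (tmp * shift) >> 16;      /* mulhi_epi16(qcoeff, shift) */
  if (coeff < 0) q = -q;                /* reinsert the sign */
  *dqcoeff = q * dequant;               /* mullo_epi16(qcoeff, dequant) */
  return (int16_t)q;
}

In the SIMD versions below, the two >> 16 steps are _mm_mulhi_epi16 / _mm256_mulhi_epi16, the sign is reinserted with invert_sign_sse2 or _mm256_sign_epi16, and the zbin test appears as a cmpgt against zbin - 1 whose mask also zeroes the stored qcoeff/dqcoeff lanes.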
diff --git a/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c b/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c new file mode 100644 index 0000000000..e33dff20c2 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/av1_quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, + const int16_t *round_ptr, __m256i *round, + const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, + __m256i *dequant, + const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { + const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr)); + const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +} + +static INLINE void update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, int *is_found, + __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m256i *mask) { + __m256i zero = _mm256_setzero_si256(); + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero); + coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero); + coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, + const __m256i *quant, + const __m256i *shift) { + __m256i tmp, qcoeff; + qcoeff = _mm256_adds_epi16(*coeff, *round); + tmp = 
_mm256_mulhi_epi16(qcoeff, *quant); + qcoeff = _mm256_add_epi16(tmp, qcoeff); + *coeff = _mm256_mulhi_epi16(qcoeff, *shift); +} + +static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { + return _mm256_mullo_epi16(qcoeff, dequant); +} + +static INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +} + +void aom_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff, qcoeff; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff = load_coefficients_avx2(coeff_ptr); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); + __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + // Reinsert signs + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + // Mask out zbin threshold coeffs + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff = load_coefficients_avx2(coeff_ptr + index); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr + index); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff0 = qcoeff_ptr[rc]; + if (qcoeff0) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff0 = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff0); + const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c b/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c new file mode 100644 index 0000000000..584cd671f1 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/av1_quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +void aom_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = 
calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 1; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const 
int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 2; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + 
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + 
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c index 5f5bf5f14e..ce8285e43d 100644 --- a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c +++ b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c @@ -21,13 +21,13 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2; filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v4_sse2; +filter8_1dfunction aom_filter_block1d16_h4_sse2; -#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 -#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 -#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 -#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 -#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 -#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 +filter8_1dfunction aom_filter_block1d8_h4_sse2; +filter8_1dfunction aom_filter_block1d8_v4_sse2; +filter8_1dfunction aom_filter_block1d4_h4_sse2; +filter8_1dfunction aom_filter_block1d4_v4_sse2; filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; @@ -49,7 +49,7 @@ filter8_1dfunction aom_filter_block1d4_h2_sse2; FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -#if ARCH_X86_64 +#if CONFIG_AV1_HIGHBITDEPTH highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; @@ -57,6 +57,13 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; + highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; @@ -84,6 +91,5 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; // int w, int h, int bd); HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); - -#endif // ARCH_X86_64 +#endif #endif // HAVE_SSE2 diff --git a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm index 7b3fe6419a..a7152be57c 100644 --- a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -67,7 +67,6 @@ dec rcx %endm -%if ARCH_X86_64 %macro HIGH_GET_PARAM 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr @@ -86,14 +85,17 @@ mov rdx, 0x00010001 movsxd rcx, DWORD PTR arg(6) ;bps - movq xmm8, rdx + movq xmm3, rdx movq xmm5, rcx - pshufd xmm8, xmm8, 0b - movdqa xmm1, xmm8 - psllw xmm8, xmm5 - psubw xmm8, xmm1 ;max value (for clamping) + pshufd xmm3, xmm3, 0b + movdqa xmm1, xmm3 + 
psllw xmm3, xmm5 + psubw xmm3, xmm1 ;max value (for clamping) pxor xmm5, xmm5 ;min value (for clamping) + movdqa max, xmm3 + movdqa min, xmm5 + movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height @@ -113,8 +115,8 @@ packssdw xmm0, xmm6 ;pack back to word ;clamp the values - pminsw xmm0, xmm8 - pmaxsw xmm0, xmm5 + pminsw xmm0, max + pmaxsw xmm0, min %if %1 movdqu xmm1, [rdi] @@ -128,36 +130,36 @@ %endm %macro HIGH_APPLY_FILTER_16 1 - movdqa xmm9, xmm0 + movdqa xmm5, xmm0 movdqa xmm6, xmm2 - punpckhwd xmm9, xmm1 + punpckhwd xmm5, xmm1 punpckhwd xmm6, xmm3 punpcklwd xmm0, xmm1 punpcklwd xmm2, xmm3 - pmaddwd xmm9, xmm7 + pmaddwd xmm5, xmm7 pmaddwd xmm6, xmm7 pmaddwd xmm0, xmm7 pmaddwd xmm2, xmm7 - paddd xmm9, xmm4 ;rounding + paddd xmm5, xmm4 ;rounding paddd xmm6, xmm4 paddd xmm0, xmm4 paddd xmm2, xmm4 - psrad xmm9, 7 ;shift + psrad xmm5, 7 ;shift psrad xmm6, 7 psrad xmm0, 7 psrad xmm2, 7 - packssdw xmm0, xmm9 ;pack back to word + packssdw xmm0, xmm5 ;pack back to word packssdw xmm2, xmm6 ;pack back to word ;clamp the values - pminsw xmm0, xmm8 - pmaxsw xmm0, xmm5 - pminsw xmm2, xmm8 - pmaxsw xmm2, xmm5 + pminsw xmm0, max + pmaxsw xmm0, min + pminsw xmm2, max + pmaxsw xmm2, min %if %1 movdqu xmm1, [rdi] @@ -172,7 +174,6 @@ lea rdi, [rdi + 2*rdx] dec rcx %endm -%endif SECTION .text @@ -200,7 +201,6 @@ sym(aom_highbd_filter_block1d4_v2_sse2): pop rbp ret -%if ARCH_X86_64 global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE sym(aom_highbd_filter_block1d8_v2_sse2): push rbp @@ -211,6 +211,11 @@ sym(aom_highbd_filter_block1d8_v2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;0 @@ -219,6 +224,9 @@ sym(aom_highbd_filter_block1d8_v2_sse2): HIGH_APPLY_FILTER_8 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -237,6 +245,11 @@ sym(aom_highbd_filter_block1d16_v2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;0 @@ -247,6 +260,9 @@ sym(aom_highbd_filter_block1d16_v2_sse2): HIGH_APPLY_FILTER_16 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -254,7 +270,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2): UNSHADOW_ARGS pop rbp ret -%endif global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE sym(aom_highbd_filter_block1d4_h2_sse2): @@ -281,7 +296,6 @@ sym(aom_highbd_filter_block1d4_h2_sse2): pop rbp ret -%if ARCH_X86_64 global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE sym(aom_highbd_filter_block1d8_h2_sse2): push rbp @@ -292,6 +306,11 @@ sym(aom_highbd_filter_block1d8_h2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;load src @@ -300,6 +319,9 @@ sym(aom_highbd_filter_block1d8_h2_sse2): HIGH_APPLY_FILTER_8 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -318,6 +340,11 @@ sym(aom_highbd_filter_block1d16_h2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;load src @@ -328,6 +355,9 @@ sym(aom_highbd_filter_block1d16_h2_sse2): HIGH_APPLY_FILTER_16 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -335,4 +365,3 @@ 
sym(aom_highbd_filter_block1d16_h2_sse2): UNSHADOW_ARGS pop rbp ret -%endif diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c new file mode 100644 index 0000000000..cff7f43eee --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_ports/mem.h" + +void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, + srcRegFilt32b2_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); + __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // reading stride of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + ss_2 = _mm_srli_si128(srcReg32b2, 2); + ss_4 = _mm_srli_si128(srcReg32b2, 4); + ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_1, secondFilters); + d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); + + ss_1 = _mm_srli_si128(srcReg32b2, 3); + ss_3 = 
_mm_srli_si128(srcReg32b2, 5); + ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); + + res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); + __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); + __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); 
+ + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); + resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); + resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); + __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); + resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); + __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); + resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = 
_mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg23_hi_1 = resReg45_hi_1; + resReg23_hi_2 = resReg45_hi_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + resReg34_hi_1 = resReg56_hi_1; + resReg34_hi_2 = resReg56_hi_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + 
ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + 
// save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + + __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); + __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); + + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23_34, resReg45_56; + __m128i resReg23_34_45_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + 
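  /* Editor's note: two output rows are produced per loop iteration below,
     which is why both the source and destination strides are doubled here. */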
dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + tmp_0 = _mm_madd_epi16(resReg23, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34, secondFilters); + resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); + __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); + + tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); + resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); + + // shift by 6 bit each 16 bit + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); + resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_34_45_56 = + _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); + *((uint32_t *)(output_ptr + out_pitch)) = + _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23 = resReg45; + resReg34 = resReg56; + srcReg4 = srcReg6; + } +} diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 325a21b761..f64b821ea4 100644 --- a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -20,29 +20,44 @@ #include "aom_ports/emmintrin_compat.h" // filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3, + 3, 4, 2, 3, 3, 4, + 4, 5, 5, 6 }; -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7, + 7, 8, 6, 7, 7, 8, + 8, 9, 9, 10 }; // filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +DECLARE_ALIGNED(16, static const uint8_t, + filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, 6, 6, 7, 7, 8 }; + 
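/*
 * Editor's sketch (not part of the upstream change): judging from the mask
 * contents and the 4-tap kernels added below, each filtN_global table feeds
 * _mm_shuffle_epi8 so that a single _mm_maddubs_epi16 evaluates one pair of
 * adjacent taps for eight consecutive output pixels of the 8-tap horizontal
 * kernels.  The helper below is a scalar model only (maddubs saturation
 * omitted); `taps` is assumed to hold the packed 8-bit filter coefficients.
 */
static inline int filt_pair_model(const uint8_t *src, const int8_t *taps,
                                  int x, int pair) {
  /* filt1_global gathers bytes (x, x+1), filt2_global (x+2, x+3),
     filt3_global (x+4, x+5), filt4_global (x+6, x+7):
     offsets 2*pair and 2*pair + 1. */
  return src[x + 2 * pair] * taps[2 * pair] +
         src[x + 2 * pair + 1] * taps[2 * pair + 1];
}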
+DECLARE_ALIGNED(16, static const uint8_t, + filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt3_global[16]) = { 4, 5, 5, 6, 6, 7, 7, 8, + 8, 9, 9, 10, 10, 11, 11, 12 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; // These are reused by the avx2 intrinsics. @@ -50,6 +65,133 @@ filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; +static void aom_filter_block1d4_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
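  // Editor's note: the _mm_set1_epi32(0x5040302u) shuffle below broadcasts
  // packed taps 2..5 (the only taps this 4-tap kernel uses) to every 4-byte
  // lane, while filtd4 gathers source bytes at offsets 2..5 of the already
  // adjusted src_ptr for each of the four output pixels, so one
  // _mm_maddubs_epi16 plus _mm_hadds_epi16 yields all four 4-tap sums.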
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); + + for (i = output_height; i > 0; i -= 1) { + // load the 2 strides of source + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d4_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, + srcReg6, srcReg56; + __m128i srcReg23_34_lo, srcReg45_56_lo; + __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; + __m128i resReglo, resReghi; + __m128i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
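  // Editor's note on the interleaves built below: srcReg2345_3456_lo ends up
  // holding, for each of the four pixels of the first output row, the bytes
  // from source rows 2..5, while the _hi half holds rows 3..6 for the second
  // output row; a single maddubs + hadds per half then produces the full
  // 4-tap sums.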
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); + + srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); + + // merge every two consecutive registers + srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); + + resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); + resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + resReglo = _mm_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm_srai_epi16(resReglo, 6); + resReghi = _mm_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReglo = _mm_packus_epi16(resReglo, resReglo); + resReghi = _mm_packus_epi16(resReghi, resReghi); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); + *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4 = srcReg6; + } +} + void aom_filter_block1d4_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -118,6 +260,145 @@ void aom_filter_block1d4_h8_intrin_ssse3( } } +static void aom_filter_block1d8_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt2Reg, filt3Reg; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
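  // Editor's note: the "across 256 bit register" wording in the comments
  // below appears to be carried over from the AVX2 variant of this kernel;
  // the shuffles here operate on 128-bit registers.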
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); + filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d8_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23, resReg34, resReg45, resReg56; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
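  // Editor's note: each interleaved row pair is filtered directly as bytes:
  // maddubs(srcReg23, {f2,f3}) + maddubs(srcReg45, {f4,f5}) gives the 4-tap
  // sum for the first output row (34/56 likewise for the second), followed
  // by the +32 rounding and >> 6 shift before packing back to bytes.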
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters); + resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters); + resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters); + resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters); + + // add and saturate the results together + resReg23_45 = _mm_adds_epi16(resReg23, resReg45); + resReg34_56 = _mm_adds_epi16(resReg34, resReg56); + + // shift by 6 bit each 16 bit + resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32); + resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32); + resReg23_45 = _mm_srai_epi16(resReg23_45, 6); + resReg34_56 = _mm_srai_epi16(resReg34_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23 = srcReg45; + srcReg34 = srcReg56; + srcReg4 = srcReg6; + } +} + void aom_filter_block1d8_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -280,6 +561,187 @@ void aom_filter_block1d8_v8_intrin_ssse3( } } +static void aom_filter_block1d16_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt2Reg, filt3Reg; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
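  // Editor's note: the 16-wide path filters the row as two 8-pixel halves
  // using overlapping 16-byte loads (the second starts at src_ptr + 8) and
  // merges them with a single _mm_packus_epi16 before the aligned 16-byte
  // store.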
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); + filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading stride of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d16_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
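  // Editor's note: here the low and high 8-pixel halves of each interleaved
  // row pair (_lo/_hi) are filtered separately and recombined by one
  // _mm_packus_epi16 per output row, so every iteration writes two full
  // 16-byte rows.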
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg23_hi = 
srcReg45_hi; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + filter8_1dfunction aom_filter_block1d16_v8_ssse3; filter8_1dfunction aom_filter_block1d16_h8_ssse3; filter8_1dfunction aom_filter_block1d8_v8_ssse3; @@ -287,13 +749,6 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; -#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 -#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 -#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 -#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 -#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 -#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 - filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 0000000000..3bbffbd805 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_avx2.h" +#include "aom_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); 
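    // Editor's note: this unpack chain (epi16 -> epi32 -> epi64) transposes
    // the 8x8 block of 16-bit values within each 128-bit half, so the second
    // call (iter != 0) applies the same column butterflies to what were
    // originally the rows.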
+ b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 
64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t *t_coeff = coeff; + for (int idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (int idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3)); + coeff += 16; + t_coeff += 16; + } +} + +void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
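  // Editor's note: the 32x32 transform is assembled from four 16x16
  // transforms written to temp_coeff; each pass of the loop below
  // cross-combines one group of 16 coefficients from each quarter (offsets
  // 0/256/512/768), shifting the intermediate sums and differences right by
  // 2 before store_tran_low().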
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 2); + b1 = _mm256_srai_epi16(b1, 2); + b2 = _mm256_srai_epi16(b2, 2); + b3 = _mm256_srai_epi16(b3, 2); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = 
_mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = 
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int aom_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +int aom_satd_lp_avx2(const int16_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + + for (int i = 0; i < length; i += 16) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 0000000000..260ca2ad17 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_sse2.h" +#include "aom_ports/mem.h" + +void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = 
_mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = 
_mm_unpacklo_epi32(a4, a5); + b2 = _mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } +} + +void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + 
coeff += 8; + _mm_store_si128((__m128i *)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
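As a plain-C point of reference for the second-stage loop that follows (an illustrative sketch only, not part of the patch; the helper name is ours, and ordinary int arithmetic is used, so it ignores the wraparound and shift semantics of the 16-bit vector operations), the combine of the four 16x16 quadrant outputs can be written as:

#include <stdint.h>

typedef int32_t tran_low_t; /* assumption: matches the high-bitdepth build */

/* Merge one coefficient from each 16x16 quadrant of the int16_t scratch
 * buffer, pre-scale by 1/4 so the values stay within int16 range, and only
 * widen to tran_low_t at the final store. */
static void hadamard_32x32_second_stage(const int16_t *t_coeff,
                                        tran_low_t *coeff) {
  for (int i = 0; i < 256; ++i) {
    const int a0 = t_coeff[i];
    const int a1 = t_coeff[i + 256];
    const int a2 = t_coeff[i + 512];
    const int a3 = t_coeff[i + 768];
    const int b0 = (a0 + a1) >> 2; /* add/sub, then srai by 2 */
    const int b1 = (a0 - a1) >> 2;
    const int b2 = (a2 + a3) >> 2;
    const int b3 = (a2 - a3) >> 2;
    coeff[i] = (tran_low_t)(b0 + b2);
    coeff[i + 256] = (tran_low_t)(b1 + b3);
    coeff[i + 512] = (tran_low_t)(b0 - b2);
    coeff[i + 768] = (tran_low_t)(b1 - b3);
  }
}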
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + t_coeff += 8; + } +} + +int aom_satd_sse2(const tran_low_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = load_tran_low(coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int height) { + int idx = 1; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + do { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + idx += 2; + } while (idx < height_1); + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + if (height == 128) { + s0 = _mm_srai_epi16(s0, 6); + s1 = _mm_srai_epi16(s1, 6); + } else if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + assert(height == 16); + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + 
_mm_storeu_si128((__m128i *)hbuf, s1); +} + +int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) { + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_sad_epu8(src_line, zero); + __m128i s1; + int i; + + for (i = 16; i < width; i += 16) { + ref += 16; + src_line = _mm_loadu_si128((const __m128i *)ref); + s1 = _mm_sad_epu8(src_line, zero); + s0 = _mm_adds_epu16(s0, s1); + } + + s1 = _mm_srli_si128(s0, 8); + s0 = _mm_adds_epu16(s0, s1); + + return _mm_extract_epi16(s0, 0); +} diff --git a/media/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000000..85896e2768 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +static INLINE __m256i load_tran_low(const tran_low_t *a) { + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); +} + +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +} diff --git a/media/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h new file mode 100644 index 0000000000..42bb2d1d32 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <xmmintrin.h> + +#include "config/aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m128i load_tran_low(const tran_low_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. 
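In plain C, the trick that comment describes looks roughly like this (an illustrative sketch, not part of the header; it assumes tran_low_t is a 32-bit type and the helper name is ours). _mm_mulhi_epi16(a, 1) produces the upper 16 bits of each value's signed 32-bit extension, 0xffff for negatives and 0 otherwise, and interleaving that with _mm_mullo_epi16(a, 1), which is just the value itself, reassembles the sign-extended 32-bit result:

#include <stdint.h>

/* Scalar model: widen eight int16_t values to int32_t by pairing each value
 * (the mullo-by-1 result) with its sign word (the mulhi-by-1 result). */
static void store_tran_low_scalar(const int16_t *a, int32_t *b) {
  for (int i = 0; i < 8; ++i) {
    const uint16_t lo = (uint16_t)a[i];            /* _mm_mullo_epi16(a, one) */
    const uint16_t hi = (a[i] < 0) ? 0xffffu : 0u; /* _mm_mulhi_epi16(a, one) */
    b[i] = (int32_t)(((uint32_t)hi << 16) | lo);   /* unpacklo/unpackhi + store */
  }
}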
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c index 4f5e3f8c1b..e0289abe12 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -24,6 +24,7 @@ void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, src1_stride, mask, 0, w, h, 0, 0); } +#if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, @@ -32,3 +33,4 @@ void aom_highbd_blend_a64_hmask_sse4_1( src1_8, src1_stride, mask, 0, w, h, 0, 0, bd); } +#endif diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c b/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c index 67fb4d32bd..95383d2fd1 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c @@ -870,7 +870,7 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { + int h, int subw, int subh) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); @@ -881,15 +881,15 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); + mask, mask_stride, w, h, subw, subh); } else { - if (subx & suby) { + if (subw & subh) { blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); - } else if (subx) { + } else if (subw) { blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); - } else if (suby) { + } else if (subh) { blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } else { @@ -898,3 +898,477 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, } } } + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_avx2() +////////////////////////////////////////////////////////////////////////////// + +static INLINE void highbd_blend_a64_d16_mask_w4_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + // Load 4x u16 pixels from each of 4 rows from each source + const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride), + *(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 1 * src0_stride), + *(uint64_t *)(src0 + 0 * src0_stride)); + const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride), + *(uint64_t *)(src1 + 2 * src1_stride), + 
*(uint64_t *)(src1 + 1 * src1_stride), + *(uint64_t *)(src1 + 0 * src1_stride)); + // Generate the inverse mask + const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0); + + // Multiply each mask by the respective source + const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0); + const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0); + const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs); + const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference + + const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1); + const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1); + const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs); + const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m256i sumh = _mm256_add_epi32(mul0h, mul1h); + const __m256i suml = _mm256_add_epi32(mul0l, mul1l); + + const __m256i roundh = + _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift); + const __m256i roundl = + _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift); + + const __m256i pack = _mm256_packs_epi32(roundl, roundh); + const __m256i clip = + _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high); + + // _mm256_extract_epi64 doesn't exist on x86, so do it the old-fashioned way: + const __m128i cliph = _mm256_extracti128_si256(clip, 1); + xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8)); + xx_storel_64(dst + 2 * dst_stride, cliph); + const __m128i clipl = _mm256_castsi256_si128(clip); + xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8)); + xx_storel_64(dst + 0 * dst_stride, clipl); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows of the mask, pad each to u16 + const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride), + *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 1 * mask_stride), + *(uint32_t *)(mask + 0 * mask_stride)); + const __m256i mask0 = _mm256_cvtepu8_epi16(mask08); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m0246 = + 
_mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride), + *(uint64_t *)(mask + 4 * mask_stride), + *(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 0 * mask_stride)); + const __m256i m1357 = + _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride), + *(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride), + *(uint64_t *)(mask + 1 * mask_stride)); + const __m256i addrows = _mm256_adds_epu8(m0246, m1357); + const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b); + const __m256i mask0 = + _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 8x u16 pixels from each of 4 rows from each source + const __m256i s0a = + yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride); + const __m256i s0b = + yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride); + const __m256i s1a = + yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride); + const __m256i s1b = + yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride); + + // Generate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply sources by respective masks + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. 
The intermediate values being reordered makes no difference + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i sumal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl); + + // Divide down each result, with rounding + const __m256i roundah = + _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift); + const __m256i roundal = + _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift); + const __m256i roundbh = + _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift); + const __m256i roundbl = + _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift); + + // Pack each i32 down to an i16 with saturation, then clip to valid range + const __m256i packa = _mm256_packs_epi32(roundal, roundah); + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i packb = _mm256_packs_epi32(roundbl, roundbh); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 8x u16 pixels to each of 4 rows in the destination + yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa); + yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows in the mask + const __m128i mask0a8 = + _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + 
const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 16x u8 pixels from each of 8 rows in the mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m02 = + yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride); + const __m256i m13 = + yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride); + const __m256i m0123 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b); + const __m256i mask_0a = + _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2); + const __m256i m46 = + yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride); + const __m256i m57 = + yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride); + const __m256i m4567 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b); + const __m256i mask_0b = + _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 16x pixels from each of 2 rows from each source + const __m256i s0a = yy_loadu_256(src0); + const __m256i s0b = yy_loadu_256(src0 + src0_stride); + const __m256i s1a = yy_loadu_256(src1); + const __m256i s1b = yy_loadu_256(src1 + src1_stride); + + // Calculate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply each source by appropriate mask + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. 
The intermediate values being reordered makes no difference + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i mulal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl); + + const __m256i resah = + _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift); + const __m256i resal = + _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift); + const __m256i resbh = + _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift); + const __m256i resbl = + _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m256i packa = _mm256_packs_epi32(resal, resah); + const __m256i packb = _mm256_packs_epi32(resbl, resbh); + + // Clip the values to the valid range + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 16 pixels + yy_storeu_256(dst, clipa); + yy_storeu_256(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values from each of two rows and pad to u16 + const __m128i masks_a8 = xx_loadu_128(mask + j); + const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j); + const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 
32x u8 alpha-mask values from each of four rows + // (saturating) add pairs of rows, then use madd to add adjacent values + // Finally, divide down each result with rounding + const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j); + const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j); + const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j); + const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j); + + const __m256i m01_8 = _mm256_adds_epu8(m0, m1); + const __m256i m23_8 = _mm256_adds_epu8(m2, m3); + + const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b); + const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b); + + const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2); + const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } +} + +void aom_highbd_blend_a64_d16_mask_avx2( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m256i v_round_offset = _mm256_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m256i clip_low = _mm256_set1_epi16(0); + const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1); + const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, 
mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c index 9d6b4c2f74..4a368ef947 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c @@ -339,8 +339,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { @@ -386,7 +386,7 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { + int h, int subw, int subh) { typedef void (*blend_fn)( uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, @@ -415,14 +415,15 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); + mask, mask_stride, w, h, subw, subh); } else { - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } +#if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -518,8 +519,8 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -565,8 +566,8 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 
do { @@ -710,8 +711,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -762,8 +763,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -812,20 +813,19 @@ static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// - void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, - int subx, int suby, int bd) { + int subw, int subh, int bd) { typedef void (*blend_fn)( uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h); - // Dimensions are: bd_index X width_index X subx X suby + // Dimensions are: bd_index X width_index X subw X subh static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 { // w % 8 == 0 @@ -858,18 +858,19 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, mask_stride, w, h, subx, - suby, bd); + src1_stride, mask, mask_stride, w, h, subw, + subh, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void blend_a64_d16_mask_w16_sse41( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, @@ -1107,3 +1108,453 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( } } } + +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_sse4_1() +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const 
__m128i *mask_max) { + // Load 4 pixels from each of 4 rows from each source + const __m128i s0a = + _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride)); + const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 3 * src0_stride)); + const __m128i s1a = + _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride)); + const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride), + *(uint64_t *)(src1 + 3 * src1_stride)); + + // Generate the inverse masks + const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); + + // Multiply each mask by the respective source + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storel_64(dst, _mm_srli_si128(clipa, 8)); + xx_storel_64(dst + dst_stride, clipa); + xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); + xx_storel_64(dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + do { + const __m128i mask0a8 = _mm_set_epi32(0, 0, *(uint32_t *)mask, + *(uint32_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi32(0, 0, *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 3 * mask_stride)); + const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); + const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); + + 
highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask), + *(uint64_t *)(mask + 2 * mask_stride)); + const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); + const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); + const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride), + *(uint64_t *)(mask + 6 * mask_stride)); + const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 7 * mask_stride)); + const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); + const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *max_mask) { + // Load 8x pixels from each of 2 rows from each source + const __m128i s0a = xx_loadu_128(src0); + const __m128i s0b = xx_loadu_128(src0 + src0_stride); + const __m128i s1a = xx_loadu_128(src1); + const __m128i s1b = xx_loadu_128(src1 + src1_stride); + + // Generate inverse masks + const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); + + // Multiply sources by respective masks + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); 
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storeu_128(dst, clipa); + xx_storeu_128(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + do { + const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); + const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + const __m128i mask_thisrowa = xx_loadu_128(mask); + const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); + const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); + const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); + const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); + const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); + const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); + const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); + const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); + const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); + + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, + &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } while (h -= 2); +} + 
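The w4 and w8 kernels above and the w16 kernel below all reduce to the same per-pixel operation; a scalar model of it (illustrative only, not part of the patch, with the helper name ours and C's right shift standing in for _mm_srai_epi32) is:

#include <stdint.h>

/* Per-pixel model of the highbd d16 mask blend: weight the two CONV_BUF_TYPE
 * samples by a 0..64 alpha, remove the compound round offset, shift down and
 * clamp to the valid pixel range for the given bit depth. */
static uint16_t highbd_d16_blend_pixel(uint16_t s0, uint16_t s1, uint8_t m,
                                       int32_t round_offset, int shift,
                                       int bd) {
  const int32_t sum = m * s0 + (64 - m) * s1;  /* 64 == AOM_BLEND_A64_MAX_ALPHA */
  int32_t res = (sum - round_offset) >> shift; /* _mm_sub_epi32 + _mm_srai_epi32 */
  if (res < 0) res = 0;                        /* clip_low */
  if (res > (1 << bd) - 1) res = (1 << bd) - 1; /* clip_high */
  return (uint16_t)res;
}

The subw1/subh1 variants additionally average each 2x2 block of the alpha mask before this step: rows are added with _mm_adds_epu8, _mm_maddubs_epi16 against a vector of ones sums adjacent columns, and the add of 2 followed by a right shift by 2 rounds the result.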
+static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( + uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *round_offset, int shift, const __m128i *mask0l, + const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 16x u16 pixels for this row from each src + const __m128i s0l = xx_loadu_128(src0); + const __m128i s0h = xx_loadu_128(src0 + 8); + const __m128i s1l = xx_loadu_128(src1); + const __m128i s1h = xx_loadu_128(src1 + 8); + + // Calculate inverse masks + const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); + const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); + + const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); + const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); + const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); + const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); + + const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); + const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); + const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); + const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); + const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); + + const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); + const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l); + const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); + const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); + + const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); + const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); + const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); + const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); + + const __m128i mullh = _mm_add_epi32(mul2h, mul3h); + const __m128i mulll = _mm_add_epi32(mul2l, mul3l); + + const __m128i reshh = + _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); + const __m128i reshl = + _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); + const __m128i reslh = + _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); + const __m128i resll = + _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m128i packh = _mm_packs_epi32(reshl, reshh); + const __m128i packl = _mm_packs_epi32(resll, reslh); + + // Clip the values to the valid range + const __m128i cliph = + _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); + const __m128i clipl = + _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); + + // Store 16 pixels + xx_storeu_128(dst, clipl); + xx_storeu_128(dst + 8, cliph); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values and pad to u16 + const __m128i masks_u8 = xx_loadu_128(mask + j); + const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); + const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, + clip_low, clip_high, mask_max); + } + 
dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, + clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride * 2; + } +} + +void aom_highbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m128i clip_low = _mm_set1_epi16(0); + const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); + const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, 
shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c index 0649102325..75fb1c5a94 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -143,6 +143,7 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, h); } +#if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -281,3 +282,4 @@ void aom_highbd_blend_a64_vmask_sse4_1( src1_stride, mask, w, h); } } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c b/media/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c new file mode 100644 index 0000000000..f7c0eb0370 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, + int *x_sum, int64_t *x2_sum) { + __m256i sum_buffer, sse_buffer; + __m128i out_buffer; + + // Accumulate the various elements of register into first element. 
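+  // The permute below swaps the 128-bit halves of the accumulator so adding
+  // them folds the upper half into the lower one; the srli_si256/add steps
+  // then fold the remaining lanes, leaving the complete 32-bit sum in lane 0
+  // and, for the 64-bit path, the complete sum of squares in the low
+  // quadword.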
+ sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8)); + regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4)); + + sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8)); + + out_buffer = _mm256_castsi256_si128(regx_sum); + *x_sum += _mm_cvtsi128_si32(out_buffer); + out_buffer = _mm256_castsi256_si128(regx2_sum); +#if ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(out_buffer); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, out_buffer); + *x2_sum += tmp; + } +#endif +} + +static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + __m128i row1, row2, row3; + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int j = 0; j < (bh >> 2); ++j) { + // Load 4 rows at a time. + row1 = _mm_loadl_epi64((__m128i const *)(data_tmp)); + row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); + row1 = _mm_unpacklo_epi64(row1, row2); + row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride)); + row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride)); + row2 = _mm_unpacklo_epi64(row2, row3); + load_pixels = + _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 4 * stride; + } + + // To prevent 32-bit variable overflow, unpack the elements to 64-bit. + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + __m128i load_128bit, load_next_128bit; + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int j = 0; j < (bh >> 1); ++j) { + // Load 2 rows at a time. 
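+    // As in the width-4 path, _mm256_madd_epi16 against a vector of ones
+    // yields pairwise 32-bit sums of the pixels, while _mm256_madd_epi16 of
+    // the pixels with themselves yields pairwise 32-bit sums of squares.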
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp)); + load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride)); + load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit), + load_next_128bit, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 2 * stride; + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_count) { + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int i = 0; i < loop_count; ++i) { + data_tmp = data + 16 * i; + for (int j = 0; j < bh; ++j) { + load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp)); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += stride; + } + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + // For smaller block widths, compute multiple rows simultaneously. + case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break; + case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break; + case 16: + case 32: + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + break; + case 64: + // 32-bit variables will overflow for 64 rows at a single time, so + // compute 32 rows at a time. + if (bh <= 32) { + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + } else { + sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4); + sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 4); + } + break; + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/media/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c b/media/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c new file mode 100644 index 0000000000..ef0a024eeb --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + const int16_t *data_tmp = data; + __m128i temp_buffer1, temp_buffer2; + __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer; + __m128i one = _mm_set1_epi16(1); + __m128i regx_sum = _mm_setzero_si128(); + __m128i regx2_sum = regx_sum; + + for (int j = 0; j < (bh >> 1); ++j) { + // Load 2 rows (8 pixels) at a time. + load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp)); + load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); + load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi); + sum_buffer = _mm_madd_epi16(load_pixels_low, one); + sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low); + regx_sum = _mm_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); + data_tmp += 2 * stride; + } + + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); + *x_sum = _mm_cvtsi128_si32(regx_sum); + temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); + temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); + regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); + regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); +#if ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(regx2_sum); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, regx2_sum); + *x2_sum += tmp; + } +#endif +} + +static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_cycles) { + const int16_t *data_tmp; + __m128i temp_buffer1, temp_buffer2; + __m128i one = _mm_set1_epi16(1); + __m128i regx_sum = _mm_setzero_si128(); + __m128i regx2_sum = regx_sum; + __m128i load_pixels, sum_buffer, sse_buffer; + + for (int i = 0; i < loop_cycles; ++i) { + data_tmp = data + (8 * i); + for (int j = 0; j < bh; ++j) { + // Load 1 row (8-pixels) at a time. 
+ load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp)); + sum_buffer = _mm_madd_epi16(load_pixels, one); + sse_buffer = _mm_madd_epi16(load_pixels, load_pixels); + regx_sum = _mm_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); + data_tmp += stride; + } + } + + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); + *x_sum += _mm_cvtsi128_si32(regx_sum); + temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); + temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); + regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); + regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); +#if ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(regx2_sum); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, regx2_sum); + *x2_sum += tmp; + } +#endif +} + +// This functions adds SSE2 Support for the functions 'get_blk_sse_sum_c' +void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break; + case 8: + case 16: + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + // For widths 32 and 64, the registers may overflow. So compute + // partial widths at a time. + case 32: + if (bh <= 32) { + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + } else { + sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3); + sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 3); + break; + } + + case 64: + if (bh <= 16) { + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + } else { + for (int i = 0; i < bh; i += 16) + sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum, + bw >> 3); + break; + } + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/media/libaom/src/aom_dsp/x86/convolve.h b/media/libaom/src/aom_dsp/x86/convolve.h index 3e19682cd9..b4ff6975cd 100644 --- a/media/libaom/src/aom_dsp/x86/convolve.h +++ b/media/libaom/src/aom_dsp/x86/convolve.h @@ -107,6 +107,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } +#if CONFIG_AV1_HIGHBITDEPTH typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, const ptrdiff_t src_pitch, uint16_t *output_ptr, @@ -122,7 +123,30 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) { \ + if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ 
aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ @@ -174,5 +198,6 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ } \ } +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/media/libaom/src/aom_dsp/x86/convolve_avx2.h b/media/libaom/src/aom_dsp/x86/convolve_avx2.h index 30253f65c2..d516de5f2f 100644 --- a/media/libaom/src/aom_dsp/x86/convolve_avx2.h +++ b/media/libaom/src/aom_dsp/x86/convolve_avx2.h @@ -34,6 +34,239 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; +DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ + \ + s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); 
\ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve(s, coeffs_v); \ + __m256i res_b = convolve(s + 4, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } + +#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < im_h; i += 2) { \ + __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ + if (i + 1 < im_h) \ + data = _mm256_inserti128_si256( \ + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ + src_h += (src_stride << 1); \ + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ + \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } + +#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ + __m256i s[8]; \ + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(s0, s1); \ + s[1] = _mm256_unpacklo_epi16(s2, s3); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + \ + s[4] = _mm256_unpackhi_epi16(s0, s1); \ + s[5] = _mm256_unpackhi_epi16(s2, s3); \ + s[6] = _mm256_unpackhi_epi16(s4, s5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + const __m256i res_a = convolve(s, coeffs_y); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + \ + if (w - j > 4) { \ + const __m256i res_b = convolve(s + 4, coeffs_y); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16b = 
_mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ + _mm_storel_epi64( \ + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } else { \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ + _mm_cvtsi128_si32(res_1); \ + \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { @@ -48,7 +281,7 @@ static INLINE void prepare_coeffs_lowbd( // Since all filter co-efficients are even, this change will not affect the // end result assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), - _mm_set1_epi16(0xffff))); + _mm_set1_epi16((short)0xffff))); const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); @@ -95,6 +328,17 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s, return res; } +static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(res_45, res_23); + + return res; +} + static INLINE __m256i convolve(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = 
_mm256_madd_epi16(s[0], coeffs[0]); @@ -108,6 +352,15 @@ static INLINE __m256i convolve(const __m256i *const s, return res; } +static INLINE __m256i convolve_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); + + const __m256i res = _mm256_add_epi32(res_1, res_2); + return res; +} + static INLINE __m256i convolve_lowbd_x(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { @@ -121,6 +374,17 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data, return convolve_lowbd(s, coeffs); } +static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[2]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + + return convolve_lowbd_4tap(s, coeffs); +} + static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, const __m256i *const res, const int do_average) { @@ -138,9 +402,9 @@ static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, static INLINE __m256i comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt, - const int use_jnt_comp_avg) { + const int use_dist_wtd_comp_avg) { __m256i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); @@ -172,9 +436,9 @@ static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt0, const __m256i *const wt1, - const int use_jnt_comp_avg) { + const int use_dist_wtd_comp_avg) { __m256i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); diff --git a/media/libaom/src/aom_dsp/x86/convolve_sse2.h b/media/libaom/src/aom_dsp/x86/convolve_sse2.h index 445d04b103..385c7c7e18 100644 --- a/media/libaom/src/aom_dsp/x86/convolve_sse2.h +++ b/media/libaom/src/aom_dsp/x86/convolve_sse2.h @@ -78,9 +78,9 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, static INLINE __m128i comp_avg(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt, - const int use_jnt_comp_avg) { + const int use_dist_wtd_avg) { __m128i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_avg) { const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); diff --git a/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h b/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h index 6b8388d84a..b1a3bb4664 100644 --- a/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h +++ b/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h @@ -35,9 +35,9 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt0, const __m128i *const wt1, - const int use_jnt_comp_avg) { + const int use_dist_wtd_avg) { __m128i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_avg) { const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); diff --git a/media/libaom/src/aom_dsp/x86/fft_avx2.c b/media/libaom/src/aom_dsp/x86/fft_avx2.c 
index 54da022538..4cccc5f00f 100644 --- a/media/libaom/src/aom_dsp/x86/fft_avx2.c +++ b/media/libaom/src/aom_dsp/x86/fft_avx2.c @@ -11,6 +11,7 @@ #include <immintrin.h> +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" diff --git a/media/libaom/src/aom_dsp/x86/fft_sse2.c b/media/libaom/src/aom_dsp/x86/fft_sse2.c index 12bdc3e185..6f20a3cc01 100644 --- a/media/libaom/src/aom_dsp/x86/fft_sse2.c +++ b/media/libaom/src/aom_dsp/x86/fft_sse2.c @@ -11,6 +11,7 @@ s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. #include <xmmintrin.h> +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h b/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h index 1e3d13ec85..89fe1899bb 100644 --- a/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -30,6 +30,206 @@ #define SUB_EPI16 _mm_sub_epi16 #endif +static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, + __m128i *in1) { + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + + // Load inputs. 
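+  // Rows 0 and 3 of the block are packed into in0 and rows 1 and 2 into in1,
+  // so that the unpack/shuffle sequence in stage 1 can pair row 0 with row 3
+  // and row 1 with row 2 column by column, i.e. the usual 4-point DCT
+  // butterflies.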
+ *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + *in1 = _mm_unpacklo_epi64( + *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + *in0 = _mm_unpacklo_epi64( + *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + // multiply by 16 to give some extra precision + *in0 = _mm_slli_epi16(*in0, 4); + *in1 = _mm_slli_epi16(*in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a); + *in0 = _mm_add_epi16(*in0, mask); + *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + *in0 = _mm_shuffle_epi32(x0, 0xD8); + *in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. 
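+    // The 0xD8/0x8D shuffles above left in0/in1 arranged so that adding and
+    // subtracting the two registers pairs each value with its horizontal
+    // butterfly partner (b0 with b3, b1 with b2, and so on for every row),
+    // avoiding an explicit transpose between the two passes.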
+ // Stage 3: Add/subtract + + const __m128i t0 = ADD_EPI16(*in0, *in1); + const __m128i t1 = SUB_EPI16(*in0, *in1); + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + *in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + *in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } +} + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. 
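+  // in0 now holds output rows 0 and 1 (o0..o7) and in1 holds rows 2 and 3
+  // (o8..oF), so the two stores below cover the whole 4x4 block.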
+ storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); + _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); +} + void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int pass; // Constants diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c index 2d8f8f71e4..0e4fb80468 100644 --- a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c @@ -17,53 +17,23 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" -void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i u0, u1, sum; - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(u0, u1); - - in0 = _mm_add_epi16(in0, in1); - in2 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, in0); - - u0 = _mm_setzero_si128(); - sum = _mm_add_epi16(sum, in2); - - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); -} - #define DCT_HIGH_BIT_DEPTH 0 +#define FDCT4x4_2D_HELPER fdct4x4_helper +#define FDCT4x4_2D aom_fdct4x4_sse2 +#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2 #define FDCT8x8_2D aom_fdct8x8_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT4x4_2D_HELPER +#undef FDCT4x4_2D +#undef FDCT4x4_2D_LP #undef FDCT8x8_2D +#if CONFIG_AV1_HIGHBITDEPTH + #undef DCT_HIGH_BIT_DEPTH #define DCT_HIGH_BIT_DEPTH 1 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT #undef FDCT8x8_2D + +#endif diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h index 260d8dd58e..ab3cd91557 100644 --- a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h @@ -136,16 +136,21 @@ static INLINE int check_epi16_overflow_x32( } static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { - if (sizeof(tran_low_t) == 4) { - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_store_si128((__m128i *)(dst_ptr), out0); - _mm_store_si128((__m128i *)(dst_ptr + 4), out1); - } else { - _mm_store_si128((__m128i *)(dst_ptr), *poutput); - } + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = 
_mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +} + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); } #ifdef __cplusplus diff --git a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c new file mode 100644 index 0000000000..c500b0a26c --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +#include "av1/encoder/av1_quantize.h" + +static INLINE void highbd_load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); + *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + *dequant = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr)); +} + +static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, + __m256i *threshold, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, + __m256i *p, const int shift) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + 
const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, shift); + prod_hi = _mm256_srli_epi64(prod_hi, shift); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); +} + +static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, + const __m256i *round, + const __m256i *quant, + const __m256i *shift, + const int *log_scale) { + __m256i tmp, qcoeff; + qcoeff = _mm256_add_epi32(*coeff, *round); + highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm256_add_epi32(tmp, qcoeff); + highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, + __m256i dequant) { + return _mm256_mullo_epi32(qcoeff, dequant); +} + +static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2( + __m256i qcoeff, __m256i dequant, const int log_scale) { + __m256i abs_coeff = _mm256_abs_epi32(qcoeff); + highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return _mm256_sign_epi32(abs_coeff, qcoeff); +} + +static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, + __m256i coeff1, + tran_low_t *coeff_ptr) { + _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); +} + +void aom_highbd_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const int log_scale = 0; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, + &quant, dequant_ptr, &dequant, quant_shift_ptr, + &shift); + + // Do DC and first 15 AC. 
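+  // The quantizer vectors are expected to carry the DC entry in lane 0 and
+  // the AC entry in the remaining lanes, so only coefficient 0 sees DC
+  // parameters here; after this first group the _mm256_unpackhi_epi64()
+  // calls broadcast the AC entry across all lanes.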
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. 
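+  // From here on round/quant/shift/dequant contain only the AC entry, so
+  // every remaining group of 16 coefficients is quantized with identical
+  // parameters; groups that fail the zbin comparison are zeroed and skipped.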
+ while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; 
+ int is_found0 = 0, is_found1 = 0; + int eob = -1; + const int log_scale = 1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + const __m256i one = _mm256_set1_epi32(1); + const __m256i log_scale_vec = _mm256_set1_epi32(log_scale); + int prescan_add[2]; + int thresh[2]; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + shift = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr)); + + // Shift with rounding. + zbin = _mm256_add_epi32(zbin, log_scale_vec); + round = _mm256_add_epi32(round, log_scale_vec); + zbin = _mm256_srli_epi32(zbin, log_scale); + round = _mm256_srli_epi32(round, log_scale); + zbin = _mm256_sub_epi32(zbin, one); + + // Do DC and first 15 AC. + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, 
log_scale); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c new file mode 100644 
index 0000000000..8f31f3596f --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" +#include "av1/encoder/av1_quantize.h" + +static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi64(a, sign); +} + +static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, + __m128i *p, const int shift) { + __m128i sign = _mm_srai_epi32(*y, 31); + __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); + __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); + __m128i abs_y = invert_sign_32_sse2(*y, sign); + __m128i prod_lo = _mm_mul_epu32(*x, abs_y); + __m128i prod_hi = _mm_srli_epi64(*x, 32); + const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); + prod_hi = _mm_mul_epu32(prod_hi, mult_hi); + prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); + prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); + + prod_lo = _mm_srli_epi64(prod_lo, shift); + const __m128i mask = _mm_set_epi32(0, -1, 0, -1); + prod_lo = _mm_and_si128(prod_lo, mask); + prod_hi = _mm_srli_epi64(prod_hi, shift); + + prod_hi = _mm_slli_epi64(prod_hi, 32); + *p = _mm_or_si128(prod_lo, prod_hi); +} + +static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, + const __m128i *quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, qcoeff; + qcoeff = _mm_add_epi32(*coeff, *round); + highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm_add_epi32(tmp, qcoeff); + highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE void highbd_update_mask1(__m128i *cmp_mask0, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i temp_mask = _mm_setzero_si128(); + if (_mm_movemask_epi8(*cmp_mask0)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + temp_mask = mask0; + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i coeff[2], cmp_mask0, cmp_mask1; + + coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + + highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, + const int log_scale) { + __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); + __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); + highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return invert_sign_32_sse2(abs_coeff, coeff_sign); +} 
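The SSE2 helpers above reproduce, lane by lane, the same per-coefficient arithmetic that the scalar quantizer uses (compare the scalar pre-scan path of aom_highbd_quantize_b_64x64_sse2 added to highbd_quantize_intrin_sse2.c later in this patch). As an editor's illustration only, here is a plain-C sketch of that arithmetic; the names round_pow2 and quantize_coeff_scalar are invented for the sketch, and the constants in main() are arbitrary placeholders rather than real quantizer tables:

/* Editor's sketch, not part of the patch: scalar reference for one
 * coefficient.  round_pow2() matches ROUND_POWER_OF_TWO(); log_scale is 0 for
 * the base quantizer, 1 for the 32x32 variant and 2 for the 64x64 variant. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int32_t round_pow2(int32_t v, int n) {
  return (v + ((1 << n) >> 1)) >> n;
}

static int32_t quantize_coeff_scalar(int32_t coeff, int is_dc, int log_scale,
                                     const int16_t zbin[2],
                                     const int16_t round[2],
                                     const int16_t quant[2],
                                     const int16_t quant_shift[2],
                                     const int16_t dequant[2],
                                     int32_t *dqcoeff) {
  const int k = is_dc ? 0 : 1;
  const int32_t abs_coeff = abs(coeff);
  const int32_t sign = coeff < 0 ? -1 : 0;

  /* Dead zone: anything below the (scaled) zero-bin quantizes to zero. */
  if (abs_coeff < round_pow2(zbin[k], log_scale)) {
    *dqcoeff = 0;
    return 0;
  }

  /* abs_q = ((t + (t * quant >> 16)) * quant_shift) >> (16 - log_scale) with
   * t = abs_coeff + round; these are the two highbd_mul_shift_sse2() calls
   * inside highbd_calculate_qcoeff(). */
  const int64_t t = abs_coeff + round_pow2(round[k], log_scale);
  const int64_t t2 = ((t * quant[k]) >> 16) + t;
  const int32_t abs_q =
      (int32_t)((t2 * quant_shift[k]) >> (16 - log_scale));

  /* Dequant mirrors highbd_calculate_dqcoeff(): scale back, reapply sign. */
  const int32_t abs_dq =
      (int32_t)(((int64_t)abs_q * dequant[k]) >> log_scale);
  *dqcoeff = (abs_dq ^ sign) - sign;
  return (abs_q ^ sign) - sign;
}

int main(void) {
  /* Arbitrary placeholder constants, only to show the call shape. */
  const int16_t zbin[2] = { 24, 21 }, round[2] = { 20, 16 };
  const int16_t quant[2] = { 16384, 16384 };
  const int16_t quant_shift[2] = { 16384, 16384 };
  const int16_t dequant[2] = { 40, 34 };
  int32_t dq;
  const int32_t q = quantize_coeff_scalar(-150, /*is_dc=*/0, /*log_scale=*/1,
                                          zbin, round, quant, quant_shift,
                                          dequant, &dq);
  printf("qcoeff=%d dqcoeff=%d\n", q, dq);
  return 0;
}

The 64-bit intermediate products in this sketch are why highbd_mul_shift_sse2() widens each vector half with _mm_mul_epu32 before shifting back down: once 12-bit input is scaled by the 16-bit quantizer constants, the products no longer fit in 32 bits.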
+ +void aom_highbd_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 0; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
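+ // In this SSE2 version the two 4-lane loads below cover eight coefficients + // (the DC term plus the first 7 AC); the rest are handled in the AC-only loop.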
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
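+ // Each iteration below quantizes eight more coefficients as two 4-lane + // halves. The movemask early-out stores zeros and skips the multiplies when + // an entire group falls inside the zero-bin. mask0 collects iscan positions + // that clear the prescan threshold and mask1 those that clear the zero-bin; + // they become non_zero_count and non_zero_count_prescan_add_zero, which + // drive the trimming and eob search after the loop.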
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 1; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 2; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c index 099fcf7fc6..b43a7d7b5b 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c @@ -20,6 +20,14 @@ // ----------------------------------------------------------------------------- // Copy and average +static const uint8_t 
ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; +static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, @@ -107,13 +115,13 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -130,7 +138,7 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -256,12 +264,12 @@ void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)filter_params_y; // Check that, even with 12-bit input, the intermediate values will fit @@ -285,7 +293,7 @@ void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -444,6 +452,17 @@ static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { f[3] = _mm256_shuffle_epi8(hh, p3); } +static INLINE void pack_filters_4tap(const int16_t *filter, + __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(h); + + // coeffs 2 3 2 3 2 3 2 3 + f[0] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + f[1] = _mm256_shuffle_epi32(coeff, 0xaa); +} + static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, const __m256i *fil /*fil[4]*/, __m256i *y) { @@ -544,6 +563,176 @@ static void aom_highbd_filter_block1d16_h8_avx2( } while (height > 0); } +static void aom_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + for (i = 0; i <= (height - 2); i += 2) { + __m256i row0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + __m256i row1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2])); + + s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + s[1] = _mm256_alignr_epi8(s[0], s[0], 4); + + s[0] = _mm256_shuffle_epi8(s[0], mask); + s[1] = _mm256_shuffle_epi8(s[1], mask); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + const __m256i row0_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6])); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + } +} + +static void aom_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i = 0; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + + /* Horizontal filter */ + + for (i = 0; i <= (height - 2); i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]); + + const __m256i r0 = + _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = r0; + s[1] = _mm256_alignr_epi8(r1, r0, 4); + + __m256i res_even = convolve_4tap(s, ff); + res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), + CONV8_ROUNDING_BITS); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + + __m256i res_odd = convolve_4tap(s, ff); + res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), + CONV8_ROUNDING_BITS); + + __m256i res = _mm256_packs_epi32(res_even, res_odd); + res = _mm256_shuffle_epi8(res, mask); + + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + const __m256i row0_1 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4], + _mm256_extracti128_si256(res, 1)); + } +} + +static void aom_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + // ----------------------------------------------------------------------------- // 2-tap horizontal filtering @@ -875,6 +1064,142 @@ static void aom_highbd_filter_block1d16_v8_avx2( } while (height > 0); } +static void aom_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + 
_mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + uint32_t i; + __m256i s[2], ff[2]; + + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel); + res_16bit = _mm256_max_epi32(res_16bit, zero); + res_16bit = _mm256_packs_epi32(res_16bit, res_16bit); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], ff[2]; + uint32_t i; + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpackhi_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + s[3] = _mm256_unpackhi_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + const __m256i res_b = convolve_4tap(s + 2, ff); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s[2] = s[3]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + + aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + // ----------------------------------------------------------------------------- // 2-tap vertical filtering diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c new file mode 100644 index 0000000000..a2bb283222 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- + +void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = dst_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + + // shrink to 16 bit each 32 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128()); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg4 = 
srcReg6; + } +} + +void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); + __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); + + ss_23 = _mm_madd_epi16(ss_23, secondFilters); + ss_45 = _mm_madd_epi16(ss_45, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); + + // shift by 7 bit each 32 bit + srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = dst_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + 
src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); + + resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); + resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); + resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); + + // shrink to 16 bit each 32 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg23_hi = srcReg45_hi; + srcReg34_lo = srcReg56_lo; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = 
_mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); + + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); + __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); + + __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); + __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); + __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); + __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); + + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + + // shift by 7 bit each 32 bit + res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); + res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); + res_lo_1 = _mm_srai_epi32(res_lo_1, 7); + res_hi_1 = _mm_srai_epi32(res_hi_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); + + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} + +void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c index e7b33d1c46..a79350f5a6 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c @@ -20,14 +20,14 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, + const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -44,7 +44,7 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, _mm_set1_epi16(bd == 10 
? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -168,13 +168,13 @@ void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, + const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)filter_params_y; // Check that, even with 12-bit input, the intermediate values will fit @@ -195,7 +195,7 @@ void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ diff --git a/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/media/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm index 91b3d126ca..91b3d126ca 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm +++ b/media/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm diff --git a/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c index 097e0778ff..ea7dc6a9e5 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -90,7 +90,7 @@ static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); @@ -112,7 +112,7 @@ static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, __m128i *hev, __m128i *mask) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; __m128i max, max01, h; @@ -497,8 +497,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( } void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, - const uint8_t *blt, const uint8_t *lt, - const uint8_t *thr, int bd) { + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { __m128i p[7], q[7], pq[7]; int i; @@ -507,7 +508,7 @@ void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); } - highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); for (i = 0; i < 6; i++) { _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); @@ -727,8 +728,8 @@ void aom_highbd_lpf_horizontal_14_dual_sse2( _limit1, _thresh1, bd); for (i = 0; i < 6; i++) { - _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + _mm_storeu_si128((__m128i *)(s - (i + 1) * 
pitch), p[i]); + _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); } } diff --git a/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 58e5f98e58..1764a4952a 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -134,7 +134,7 @@ void aom_highbd_quantize_b_32x32_sse2( for (i = 0; i < idx; i++) { const int rc = idx_arr[i]; const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); + const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; @@ -146,3 +146,61 @@ void aom_highbd_quantize_b_32x32_sse2( } *eob_ptr = eob + 1; } + +void aom_highbd_quantize_b_64x64_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
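/* Hedged scalar sketch of the branchless sign/abs idiom behind the AOMSIGN()
 * uses in the quantizers above, assuming AOMSIGN(x) yields -1 for negative x
 * and 0 otherwise (an all-ones mask). The same mask restores the sign after
 * the magnitude has been quantized. Function name is illustrative only.
 */
static int apply_sign_sketch(int coeff, int abs_quantized) {
  const int sign = (coeff < 0) ? -1 : 0;        /* stand-in for AOMSIGN()    */
  const int abs_coeff = (coeff ^ sign) - sign;  /* |coeff| without a branch  */
  (void)abs_coeff;                              /* quantization happens here */
  return (abs_quantized ^ sign) - sign;         /* signed qcoeff             */
}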
iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm index 3398d8a2ae..09e64d510e 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm @@ -372,3 +372,71 @@ HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 + +; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD4XN 1-2 0 + HIGH_SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movq m1, [refq] + movq m2, [refq+ref_strideq*2] + movq m3, [refq+ref_strideq*4] + movq m4, [refq+ref_stride3q*2] + punpcklwd m1, m3 + punpcklwd m2, m4 +%if %2 == 1 + movq m3, [second_predq+8*0] + movq m5, [second_predq+8*2] + punpcklwd m3, m5 + movq m4, [second_predq+8*1] + movq m5, [second_predq+8*3] + punpcklwd m4, m5 + lea second_predq, [second_predq+8*4] + pavgw m1, m3 + pavgw m2, m4 +%endif + movq m5, [srcq] + movq m3, [srcq+src_strideq*4] + punpcklwd m5, m3 + movdqa m3, m1 + psubusw m1, m5 + psubusw m5, m3 + por m1, m5 + movq m5, [srcq+src_strideq*2] + movq m4, [srcq+src_stride3q*2] + punpcklwd m5, m4 + movdqa m4, m2 + psubusw m2, m5 + psubusw m5, m4 + por m2, m5 + paddw m1, m2 + movdqa m2, m1 + punpcklwd m1, m6 + punpckhwd m2, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD4XN 16 ; highbd_sad4x16_sse2 +HIGH_SAD4XN 8 ; highbd_sad4x8_sse2 +HIGH_SAD4XN 4 ; highbd_sad4x4_sse2 +HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2 +HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2 +HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2 diff --git a/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 61f5b8e865..5c78933df5 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -114,45 +114,33 @@ SECTION .text cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + dst, dst_stride, height, sse %define block_height heightd + %endif - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif - lea ecx, [GLOBAL(pw_8)] - mov 
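/* Illustrative scalar equivalent of the psubusw/por trick in the HIGH_SAD4XN
 * macro above: unsigned saturating subtraction in both directions leaves the
 * "wrong" direction at zero, so OR-ing the two results gives |a - b| for
 * 16-bit pixels without a dedicated absolute-difference instruction; the
 * word sums are then widened to 32 bits before accumulation.
 */
#include <stdint.h>
static uint16_t absdiff_u16_sketch(uint16_t a, uint16_t b) {
  const uint16_t d0 = (a > b) ? (uint16_t)(a - b) : 0; /* psubusw m1, m5 */
  const uint16_t d1 = (b > a) ? (uint16_t)(b - a) : 0; /* psubusw m5, m3 */
  return (uint16_t)(d0 | d1);                          /* por m1, m5     */
}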
g_pw_8m, ecx + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ diff --git a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c index 18eb03d12c..b72d1cf8ba 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c @@ -29,15 +29,15 @@ static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, __m128i x0, x1, x2, x3; int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -61,23 +61,23 @@ static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, __m128i x0, x1, x2, x3, x4, x5, x6, x7; int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); - u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); - v7 
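/* Sketch of why _mm_loadl_epi64() is the natural width for the 4-wide
 * high bit-depth subtract paths above: one 4-pixel row of uint16_t is
 * exactly 8 bytes, so a 64-bit load reads one row and nothing more, while
 * the previous 16-byte _mm_loadu_si128() fetched 8 extra bytes that were
 * never used (and that may lie beyond the row). Illustrative only.
 */
#include <emmintrin.h>
static __m128i load_4x16bit_row_sketch(const uint16_t *row) {
  return _mm_loadl_epi64((const __m128i *)row); /* low 64 bits, 4 pixels */
}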
= _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c index 47b052abc9..b7d15f93ec 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c @@ -20,9 +20,10 @@ #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" +#include "av1/encoder/reconinter_enc.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, @@ -192,7 +193,6 @@ VAR_FN(16, 16, 16, 8); VAR_FN(16, 8, 8, 7); VAR_FN(8, 16, 8, 7); VAR_FN(8, 8, 8, 6); -VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); VAR_FN(32, 8, 8, 8); VAR_FN(16, 64, 16, 10); @@ -287,30 +287,38 @@ DECLS(sse2); uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ + int se = 0; \ + unsigned int sse = 0; \ + unsigned int sse2; \ + int row_rep = (w > 64) ? 
2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ + if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ } \ } \ *sse_ptr = sse; \ @@ -322,33 +330,42 @@ DECLS(sse2); const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ + uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - uint32_t sse2; \ + int se = 0; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ + long_sse += sse; \ + if (w > wf) { \ + uint32_t sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ - sse += sse2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ } \ } \ se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? (uint32_t)var : 0; \ @@ -364,35 +381,38 @@ DECLS(sse2); uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int row_rep = (w > 64) ? 
2 : 1; \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ - NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ - &sse2, NULL, NULL); \ + uint16_t *src_tmp = src + (start_row * src_stride); \ + uint16_t *dst_tmp = dst + (start_row * dst_stride); \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src_tmp += wd_64 * 64; \ + dst_tmp += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ + height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ - if (w > wf * 2) { \ + if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ + src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 2 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 3 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ } \ } \ } \ @@ -403,22 +423,25 @@ DECLS(sse2); return (var >= 0) ? 
(uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); \ - FN(16, 4, 16, 4, 2, opt, (int64_t)); \ - FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t)); \ +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)) FNS(sse2); @@ -456,19 +479,19 @@ DECLS(sse2); if (w > wf) { \ uint32_t sse2; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -492,19 +515,19 @@ DECLS(sse2); if (w > wf) { \ uint32_t sse2; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -539,22 +562,22 @@ 
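/* Hedged sketch of the accumulation scheme in the rewritten sub-pixel
 * variance wrappers above: the block is covered by wf-wide column strips
 * (plus a second 64-column pass when w > 64, via row_rep), the per-strip
 * sums of squared error are gathered in a 64-bit long_sse so 128-wide
 * high bit-depth blocks cannot overflow 32 bits, and only after the final
 * rounding shift is the result narrowed and turned into a variance.
 */
#include <stdint.h>
static uint32_t variance_from_sums_sketch(uint32_t sse, int64_t se, int wlog2,
                                          int hlog2) {
  const int64_t var = (int64_t)sse - ((se * se) >> (wlog2 + hlog2));
  return (var >= 0) ? (uint32_t)var : 0; /* clamp negative rounding error */
}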
DECLS(sse2); long_sse += sse2; \ if (w > wf) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + src + wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + wf + (start_row * dst_stride), dst_stride, \ + sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \ + sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \ + sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ @@ -603,85 +626,34 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. 
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. - const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); return; } } - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - + const InterpFilterParams *filter = av1_get_filter(subpel_search); + int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); @@ -729,17 +701,20 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); + uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
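/* Hedged sketch of the intermediate-buffer sizing used by the two-pass
 * (horizontal, then vertical) convolve in this function: producing `height`
 * output rows with a T-tap vertical filter needs height + T - 1 filtered
 * rows from the horizontal pass, plus the sub-pixel carry, which is what
 * (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps evaluates to.
 * Example: height = 16, subpel_y_q3 = 4 and 8 taps gives
 * ((15 * 8 + 4) >> 3) + 8 = 23 intermediate rows.
 */
static int intermediate_height_sketch(int height, int subpel_y_q3, int taps) {
  return (((height - 1) * 8 + subpel_y_q3) >> 3) + taps;
}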
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, - bd); + aom_highbd_convolve8_horiz( + ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); + aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, + comp_pred8, width, NULL, -1, kernel_y, 16, width, + height, bd); } } @@ -765,11 +740,11 @@ void aom_highbd_comp_avg_upsampled_pred_sse2( } } -static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, - const __m128i *w0, - const __m128i *w1, - const __m128i *r, - void *const result) { +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { assert(DIST_PRECISION_BITS <= 4); __m128i mult0 = _mm_mullo_epi16(*p0, *w0); __m128i mult1 = _mm_mullo_epi16(*p1, *w1); @@ -780,11 +755,10 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, shift); } -void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, - const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_highbd_dist_wtd_comp_avg_pred_sse2( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i; const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; @@ -806,7 +780,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, __m128i p0 = xx_loadu_128(ref); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); comp_pred += 8; pred += 8; @@ -823,7 +797,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); comp_pred += 8; pred += 8; @@ -832,11 +806,11 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, } } -void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; @@ -860,7 +834,7 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( __m128i p0 = xx_loadu_128(comp_pred16); __m128i p1 = xx_loadu_128(pred); - 
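/* Hedged scalar sketch of the distance-weighted compound average that
 * highbd_compute_dist_wtd_comp_avg() vectorizes: the two predictions are
 * blended as (p0*w0 + p1*w1 + r) >> DIST_PRECISION_BITS. The code above only
 * asserts DIST_PRECISION_BITS <= 4; the value 4 and the rounding constant
 * r = 1 << (DIST_PRECISION_BITS - 1) are assumptions of this sketch.
 */
#include <stdint.h>
static uint16_t dist_wtd_avg_sketch(uint16_t p0, uint16_t p1, uint16_t w0,
                                    uint16_t w1) {
  enum { kDistPrecisionBitsAssumed = 4 };            /* assumed value     */
  const int r = 1 << (kDistPrecisionBitsAssumed - 1); /* assumed rounding */
  return (uint16_t)((p0 * w0 + p1 * w1 + r) >> kDistPrecisionBitsAssumed);
}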
highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); comp_pred16 += 8; pred += 8; diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm b/media/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm index 9aece27beb..0eb632326b 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm +++ b/media/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm @@ -27,23 +27,6 @@ pw2_32: times 8 dw 16 SECTION .text -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - INIT_XMM sse2 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq diff --git a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c index 1e67d392e8..546ee74bb3 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c +++ b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c @@ -12,6 +12,8 @@ #include <immintrin.h> #include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/intrapred_x86.h" +#include "aom_dsp/x86/lpf_common_sse2.h" static INLINE __m256i dc_sum_64(const uint8_t *ref) { const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); @@ -63,6 +65,255 @@ static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, } } +static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, + 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, + 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, + 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } +}; + +static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 }, + { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, + 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, + 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, + 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } +}; + +static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + + r0 = _mm_unpacklo_epi16(x[0], x[1]); + r1 = _mm_unpacklo_epi16(x[2], x[3]); + r2 = _mm_unpacklo_epi16(x[4], x[5]); + r3 = _mm_unpacklo_epi16(x[6], x[7]); + + r4 = _mm_unpacklo_epi16(x[8], x[9]); + r5 = _mm_unpacklo_epi16(x[10], x[11]); + r6 = _mm_unpacklo_epi16(x[12], x[13]); + r7 = _mm_unpacklo_epi16(x[14], x[15]); + + r8 = _mm_unpacklo_epi32(r0, r1); + r9 = _mm_unpackhi_epi32(r0, r1); + r10 = _mm_unpacklo_epi32(r2, r3); + r11 = _mm_unpackhi_epi32(r2, r3); + + r12 = _mm_unpacklo_epi32(r4, r5); + r13 = _mm_unpackhi_epi32(r4, r5); + r14 = _mm_unpacklo_epi32(r6, r7); + r15 = _mm_unpackhi_epi32(r6, r7); + + r0 = _mm_unpacklo_epi64(r8, r9); + r1 = _mm_unpackhi_epi64(r8, r9); + r2 = _mm_unpacklo_epi64(r10, r11); + r3 = _mm_unpackhi_epi64(r10, r11); + + r4 = _mm_unpacklo_epi64(r12, r13); + r5 = _mm_unpackhi_epi64(r12, r13); + r6 = _mm_unpacklo_epi64(r14, r15); + r7 = _mm_unpackhi_epi64(r14, r15); + + d[0] = _mm_unpacklo_epi64(r0, r2); + d[1] = _mm_unpacklo_epi64(r4, r6); + d[2] = _mm_unpacklo_epi64(r1, r3); + d[3] = _mm_unpacklo_epi64(r5, r7); + + d[4] = _mm_unpackhi_epi64(r0, r2); + d[5] = _mm_unpackhi_epi64(r4, r6); + d[6] = _mm_unpackhi_epi64(r1, r3); + d[7] = _mm_unpackhi_epi64(r5, r7); +} + +static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + 
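/* Illustrative reduction of the unpack-based transpose idiom used by the
 * highbd_transpose*() helpers above: interleaving two rows with
 * _mm_unpacklo_epi16 produces column pairs, and repeating the idiom at 32-
 * and 64-bit granularity builds up the full 8x8 / 16x16 transposes. The
 * helper below only shows the first interleave stage on two 4-pixel rows.
 */
#include <emmintrin.h>
#include <stdint.h>
static void interleave_rows_sketch(const uint16_t r0[4], const uint16_t r1[4],
                                   uint16_t out[8]) {
  const __m128i a = _mm_loadl_epi64((const __m128i *)r0); /* 00 01 02 03 */
  const __m128i b = _mm_loadl_epi64((const __m128i *)r1); /* 10 11 12 13 */
  /* out holds 00 10 01 11 02 12 03 13: the transposed columns, pairwise. */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi16(a, b));
}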
d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 + w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 + w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 + w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + __m256i dd[16]; + w0 = _mm256_unpacklo_epi16(x[0], x[1]); + w1 = _mm256_unpacklo_epi16(x[2], x[3]); + w2 = _mm256_unpacklo_epi16(x[4], x[5]); + w3 = _mm256_unpacklo_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[0] = _mm256_unpacklo_epi64(ww0, ww1); + dd[1] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[2] = _mm256_unpacklo_epi64(ww0, ww1); + dd[3] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); + w1 = _mm256_unpackhi_epi16(x[2], x[3]); + w2 = _mm256_unpackhi_epi16(x[4], x[5]); + w3 = _mm256_unpackhi_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[4] = _mm256_unpacklo_epi64(ww0, ww1); + dd[5] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[6] = _mm256_unpacklo_epi64(ww0, ww1); + dd[7] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpacklo_epi16(x[8], x[9]); + w1 = _mm256_unpacklo_epi16(x[10], x[11]); + w2 = _mm256_unpacklo_epi16(x[12], x[13]); + w3 = _mm256_unpacklo_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[8] = 
_mm256_unpacklo_epi64(ww0, ww1); + dd[9] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[10] = _mm256_unpacklo_epi64(ww0, ww1); + dd[11] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[8], x[9]); + w1 = _mm256_unpackhi_epi16(x[10], x[11]); + w2 = _mm256_unpackhi_epi16(x[12], x[13]); + w3 = _mm256_unpackhi_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[12] = _mm256_unpacklo_epi64(ww0, ww1); + dd[13] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[14] = _mm256_unpacklo_epi64(ww0, ww1); + dd[15] = _mm256_unpackhi_epi64(ww0, ww1); + + for (int i = 0; i < 8; i++) { + d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); + d[i + 8] = _mm256_insertf128_si256(dd[i + 8], + _mm256_extracti128_si256(dd[i], 1), 0); + } +} + void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_32(above); @@ -169,34 +420,12 @@ void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // Rectangle - -// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. -// Use a header file, intrapred_common_x86.h -static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { - __m128i x = _mm_load_si128((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_sad_epu8(x, zero); - const __m128i high = _mm_unpackhi_epi64(x, x); - return _mm_add_epi16(x, high); -} - -static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x0 = _mm_add_epi16(x0, x1); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i top_sum = dc_sum_32_sse2(above); __m128i left_sum = dc_sum_16_sse2(left); left_sum = _mm_add_epi16(top_sum, left_sum); - uint32_t sum = _mm_cvtsi128_si32(left_sum); + uint16_t sum = _mm_cvtsi128_si32(left_sum); sum += 24; sum /= 48; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -208,7 +437,7 @@ void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_32(above); __m256i sum_left = dc_sum_64(left); sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 48; sum /= 96; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -220,7 +449,7 @@ void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_64(above); __m256i sum_left = dc_sum_64(left); sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 64; sum /= 128; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -232,7 +461,7 @@ void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_64(above); __m256i sum_left = dc_sum_32(left); sum_left = 
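/* Hedged scalar sketch of the rectangular DC value the predictors above
 * compute: the above-row and left-column sums are combined and divided by
 * (width + height) with round-to-nearest, e.g. the 32x16 kernel adds 24 and
 * divides by 48. The largest 8-bit case (64x64) sums to at most
 * 128 * 255 = 32640, which is why a 16-bit scalar is wide enough here.
 */
#include <stdint.h>
static uint8_t dc_value_sketch(unsigned sum_above, unsigned sum_left, int w,
                               int h) {
  unsigned sum = sum_above + sum_left;
  sum += (unsigned)(w + h) / 2; /* 24 for 32x16, 48 for 32x64, ... */
  return (uint8_t)(sum / (unsigned)(w + h));
}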
_mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 48; sum /= 96; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -244,7 +473,7 @@ void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_64(above); __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 40; sum /= 80; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -525,7 +754,7 @@ void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, __m128i x = _mm_loadl_epi64((const __m128i *)left); const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); @@ -549,7 +778,7 @@ void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i l = get_left_vector(left); const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); @@ -568,7 +797,7 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i l = get_left_vector(left); const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); @@ -583,7 +812,7 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, } l = get_left_vector(left + 16); - rep = _mm256_set1_epi16(0x8000); + rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); @@ -602,7 +831,7 @@ void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, for (int j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); @@ -635,7 +864,7 @@ void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); int i; @@ -657,7 +886,7 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); int i; @@ -675,7 +904,7 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t 
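/* Sketch of the narrowing issue the (short) casts above are presumably
 * addressing: 0x8000 and 0xFFFF do not fit in int16_t, so passing the bare
 * int literal to _mm_set1_epi16() / _mm256_set1_epi16() relies on an
 * implicit narrowing conversion that newer compilers flag; the explicit cast
 * keeps the same bit pattern in every lane while silencing the warning.
 */
#include <immintrin.h>
static __m256i broadcast_0x8000_sketch(void) {
  return _mm256_set1_epi16((short)0x8000); /* every 16-bit lane = 0x8000 */
}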
stride, } l = get_left_vector(left + 16); - rep = _mm256_set1_epi16(0x8000); + rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -700,7 +929,7 @@ void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -728,7 +957,7 @@ void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 2; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -760,7 +989,7 @@ void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -791,7 +1020,7 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, int i; const __m256i l = get_left_vector(left); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -809,3 +1038,3858 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, rep = _mm256_add_epi16(rep, one); } } + +#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) +#define PERM2x128(c0, c1) c0 + (c1 << 4) + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + __m128i a0_128, a1_128; + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), + _mm256_set1_epi16(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, + base + 5, base + 6, base + 7); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_128); + a1 = 
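/* Hedged scalar restatement of the per-pixel arithmetic described in the
 * comments of the z1 helpers here (non-upsampled case): the 1/64-pel
 * position x selects a base sample and a 5-bit blend weight, and columns at
 * or beyond max_base_x simply repeat above[max_base_x].
 */
#include <stdint.h>
static uint16_t dr_z1_pixel_sketch(const uint16_t *above, int x, int frac_bits,
                                   int max_base_x) {
  const int base = x >> frac_bits;
  if (base >= max_base_x) return above[max_base_x];
  const int shift = (x & 0x3f) >> 1; /* 0..31 */
  return (uint16_t)(
      (above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift) >> 5);
}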
_mm256_castsi128_si256(a1_128); + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res1 = _mm256_castsi256_si128(res); + + mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_castsi256_si128(res); + res1 = _mm_packus_epi32(res1, res1); + + mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); + mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[16]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - 
above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a0_1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a0_1 = _mm256_permutevar8x32_epi32( + a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); + + a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); + a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); + base_inc256 = + _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); + mask256 = _mm256_packs_epi32( + mask256, _mm256_castsi128_si256( + _mm256_extracti128_si256(mask256, 1))); // goto 16 bit + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0_x128 = _mm_loadu_si128((__m128i *)(above + base)); + 
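+ // The even/odd shuffle in the branch below de-interleaves the upsampled
+ // edge so that a0 holds a[x] and a1 holds a[x+1] in each lane. An
+ // illustrative scalar form of the interpolation that follows (same for
+ // both branches; upsample_above is 0 in the else path):
+ //   shift = ((x << upsample_above) & 0x3f) >> 1;
+ //   pixel = (a[x] * 32 + 16 + (a[x+1] - a[x]) * shift) >> 5;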
if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); + atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp2 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + atmp3 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + mask = + _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, + base + 8, base + 10, base + 12, base + 14, + 0, 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, 0, + 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_x128); + a1 = _mm256_castsi128_si256(a1_x128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + diff = 
_mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + + int mdif = max_base_x - base; + if (mdif > 8) { + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a1_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + a0 = _mm256_loadu_si256((__m256i *)(above + base)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); + x += dx; + } +} + +static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[64]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, 
upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res1 = a_mbase_x; + } else { + a0 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + j))); + a1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 1 + j))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 8 + j))); + a1_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 9 + j))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + } + if (!j) { + dstvec[r] = res1; + } else { + dstvec[r + N] = res1; + } + } + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int 
max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res = a_mbase_x; + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + } + if (!j) { + dstvec[r] = res; + } else { + dstvec[r + N] = res; + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[128]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); + } +} + +static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + 
_mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + __m128i a0_128, a0_1_128, a1_128, a1_1_128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu16_epi32(a0_128); + a1 = _mm256_cvtepu16_epi32(a1_128); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); + a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); + a0_1 = _mm256_cvtepu16_epi32(a0_1_128); + a1_1 = _mm256_cvtepu16_epi32(a1_1_128); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res1); + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 
64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + (void)left; + (void)dy; + + switch (bw) { + case 4: + highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 8: + highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 16: + highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 32: + highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 64: + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, + upsample_above, dx); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above, + upsample_above, dx); + } + break; + default: break; + } + return; +} + +static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst) { + __m256i r[16]; + __m256i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc)); + } + highbd_transpose16x16_avx2(r, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]); + } +} + +static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst, int width, + int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void highbd_dr_prediction_32bit_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm_set1_epi32(0x3f); + min_base_y128 = _mm_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + 
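+ // resx below is interpolated from the above row and resy from the left
+ // column; HighbdBaseMask[base_min_diff] later picks, per lane, whichever
+ // reference is valid for this row.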
__m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int, base_y_c[4]); + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] 
* 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0, + 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, 
const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256; + __m256i diff; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm256_set1_epi32(0x3f); + min_base_y256 = _mm256_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, (4 << 6) - y * dx, + (5 << 6) - y * dx, (6 << 6) - y * dx, + (7 << 6) - y * dx), + c3f), + 1); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[8]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = 
_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + if (upsample_left) { + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + } + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = resx; + } + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i c3f, min_base_y128; + __m256i a0_x, a1_x, diff, a32, a16; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = 
_mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + 
// final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1; + __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + DECLARE_ALIGNED(32, int, base_y_c[16]); + + a16 = _mm256_set1_epi32(16); + c1 = _mm256_srli_epi32(a16, 4); + c8 = _mm256_srli_epi32(a16, 1); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi32(0x3f); + dy256 = _mm256_set1_epi32(dy); + c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + c1234 = _mm256_add_epi32(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, ydx; + __m256i resx[2], resy[2]; + __m256i resxy, j256, r6; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi32(j); + int y = r + 1; + ydx = _mm256_set1_epi32(y * dx); + + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx[0] = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + } + int base_shift8 = 0; + if ((base_x + 8) < (min_base_x - 1)) { + base_shift8 = (min_base_x - (base_x + 8) - 1); + } + if (base_shift8 > 7) { + resx[1] = _mm256_setzero_si256(); + } else { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + + a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); + a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); + + r6 = _mm256_slli_epi32( + _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + resx[1] = _mm256_add_epi32(a32, b); + resx[1] = _mm256_srli_epi32(resx[1], 5); + resx[1] = _mm256_packus_epi32( + resx[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); + } + resx[0] = + _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), + 1); // 16 16bit values + + // y calc + resy[0] = _mm256_setzero_si256(); + if ((base_x < min_base_x)) { + __m256i 
c256, y_c256, y_c_1_256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + c256 = _mm256_add_epi32(j256, c1234); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + c256 = _mm256_add_epi32(c256, c8); + y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], + left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]])); + a1_y = _mm256_cvtepu16_epi32( + _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], + left[base_y_c[10] + 1], left[base_y_c[11] + 1], + left[base_y_c[12] + 1], left[base_y_c[13] + 1], + left[base_y_c[14] + 1], left[base_y_c[15] + 1])); + shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[1] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + resy[0] = + _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), + 1); // 16 16bit values + } + + resxy = _mm256_blendv_epi8(resx[0], resy[0], + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +static void highbd_dr_prediction_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, 
c3f, c1; + __m256i diff, min_base_y256, dy256, c1234, c0123; + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx, resy, ydx; + __m256i resxy, j256, r6; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + int y = r + 1; + ydx = _mm256_set1_epi16((short)(y * dx)); + + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x)-1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 8) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } else { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + } + + int base_shift1 = 0; + if (base_shift > 8) { + base_shift1 = base_shift - 8; + } + if (base_shift1 < 8) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); + } + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + resx = _mm256_srli_epi16(res, 5); // 16 16-bit values + + // y calc + resy = _mm256_setzero_si256(); + __m256i a0_y, a1_y, shifty; + if ((base_x < min_base_x)) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i 
*)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + resy = _mm256_srli_epi16(res, 5); + } + + resxy = _mm256_blendv_epi8(resx, resy, + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + if (bd < 12) { + highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + case 8: + if (bd < 12) { + highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + default: + if (bd < 12) { + highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + } +} + +// Directional prediction, zone 3 functions +static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], + &dstvec[3], &d[0], &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); + return; +} + +static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_4x8_avx2(uint16_t 
*dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + + highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); +} + +static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x16_16x8_avx2(dstvec, d); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_extracti128_si256(d[i - 8], 1)); + } +} + +static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + } +} + +static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[4], d[4], d1; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x16_avx2(dstvec, d); + for (int i = 0; i < 4; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + d1 = _mm256_bsrli_epi128(d[i], 8); + _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), + _mm256_castsi256_si128(d1)); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * 
stride), + _mm256_extracti128_si256(d[i], 1)); + _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), + _mm256_extracti128_si256(d1, 1)); + } +} + +static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x4_8x8_sse2(dstvec, d); + + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); +} + +static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); + } +} + +static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + + highbd_transpose16x16_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); + } +} + +static void 
highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x16_avx2(dstvec, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 16, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 32, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 48, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); + } +} + +static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 64, 64); +} + +static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + // store + for (int j = 0; j < 32; j += 16) { + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), + _mm256_castsi256_si128(d[(i + j)])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), + _mm256_castsi256_si128(d[(i + j) + 8])); + } + for (int i = 8; i < 16; i++) { + _mm256_storeu_si256( + (__m256i *)(dst + (i + j) * stride), + _mm256_inserti128_si256( + d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); + } + } +} + +static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + uint16_t dstT[64 * 32]; + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 32, 64); +} + +static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + 
DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); + highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd); + highbd_transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 16, 64); +} + +static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 64; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + if (bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 8: + highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 16: + highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 32: + highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 64: + highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bh) { + case 4: + highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_64x16_avx2(dst, 
stride, left, upsample_left, + dy, bd); + break; + } + } + } + } + return; +} + +// Low bit depth functions +static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, + { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, + { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, + { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } +}; +/* clang-format off */ +static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { + { -1, 0, 0, 0, 0, 0, 0, 0}, + { -1, -1, 0, 0, 0, 0, 0, 0}, + { -1, -1, -1, 0, 0, 0, 0, 0}, + { -1, -1, -1, -1, 0, 0, 0, 0}, + { -1, -1, -1, -1, -1, 0, 0, 0}, + { -1, -1, -1, -1, -1, -1, 0, 0}, + { -1, -1, -1, -1, -1, -1, -1, 0}, + { -1, -1, -1, -1, -1, -1, -1, -1}, +}; +/* clang-format on */ +static AOM_FORCE_INLINE void 
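The lookup tables above feed byte-granular blends and shuffles: BaseMask[k] holds exactly k leading 0xff bytes, so a blend such as _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)BaseMask[k]) keeps the first k computed pixels of a row and pads the remainder with the replicated last edge sample. A scalar sketch of that blend (illustrative helper name, not from the patch):

#include <stdint.h>

/* Scalar equivalent of blending a computed row against BaseMask[k]: bytes with
 * a 0xff mask byte keep the computed value, bytes with 0x00 take `fill`. */
static void blend_base_mask_ref(uint8_t *row, int n, int k, uint8_t fill) {
  for (int i = k; i < n; ++i) row[i] = fill;
}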
dr_prediction_z1_HxW_internal_avx2( + int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < W; r++) { + __m256i b, res, shift; + __m128i res1, a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > H) base_max_diff = H; + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // goto 8 bit + res1 = _mm256_castsi256_si128(res); // 16 8bit values + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated 
as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res16[2]; + __m128i a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res16[jj] = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + } + } + res16[1] = + _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), + 1); // 32 8bit values + + dstvec[r] = _mm256_blendv_epi8( + a_mbase_x, res16[1], + *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values + x += dx; + } +} + +static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m256i dstvec[64]; + dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + __m128i max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + max_base_x128 = _mm_set1_epi8(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + __m128i a0_128, a1_128, res128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), + _mm256_castsi256_si128(a_mbase_x)); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = 
_mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + + base_inc128 = + _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1), + (uint8_t)(base + j + 2), (uint8_t)(base + j + 3), + (uint8_t)(base + j + 4), (uint8_t)(base + j + 5), + (uint8_t)(base + j + 6), (uint8_t)(base + j + 7), + (uint8_t)(base + j + 8), (uint8_t)(base + j + 9), + (uint8_t)(base + j + 10), (uint8_t)(base + j + 11), + (uint8_t)(base + j + 12), (uint8_t)(base + j + 13), + (uint8_t)(base + j + 14), (uint8_t)(base + j + 15)); + + mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), + _mm_setzero_si128()); + res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), + _mm256_castsi256_si128(res), mask128); + _mm_storeu_si128((__m128i *)(dst + j), res128); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + default: break; + } + return; +} + +static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0_x, a1_x, a32, a16, diff; + __m128i c3f, min_base_y128, c1234, dy128; + + a16 = _mm_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); + dy128 = _mm_set1_epi16(dy); + + for (int r = 0; r < N; r++) { + __m128i b, res, shift, r6, ydx; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm_setzero_si128(); + a1_x = _mm_setzero_si128(); + shift = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = 
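The block comments in the zone-1 kernels above give the per-pixel formula; the scalar reference below (a hedged sketch with upsample_above taken as 0, not code from the patch) spells out the loop structure the SIMD versions unroll:

#include <stddef.h>
#include <stdint.h>

/* Zone-1 reference: row r samples the above edge at x = (r + 1) * dx,
 * split into a 6-bit integer base and a 1/32-pel shift. */
static void dr_z1_ref(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                      const uint8_t *above, int dx) {
  const int max_base_x = bw + bh - 1;
  for (int r = 0, x = dx; r < bh; ++r, x += dx, dst += stride) {
    const int base = x >> 6;            /* frac_bits = 6 when not upsampled */
    const int shift = (x & 0x3f) >> 1;  /* 0..31 */
    for (int c = 0; c < bw; ++c) {
      if (base + c >= max_base_x) {
        dst[c] = above[max_base_x];     /* ran off the edge: replicate */
      } else {
        const int a0 = above[base + c], a1 = above[base + c + 1];
        dst[c] = (uint8_t)((a0 * 32 + 16 + (a1 - a0) * shift) >> 5);
      }
    }
  }
}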
_mm_slli_epi16(c1234, 6); + + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 1); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_x128); + a1_x = _mm_cvtepu8_epi16(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i y_c128, base_y_c128, mask128, c1234_; + c1234_ = _mm_srli_si128(c1234, 2); + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm_unpacklo_epi64(a0_x, a0_y); + a1_x = _mm_unpacklo_epi64(a1_x, a1_y); + shift = _mm_unpacklo_epi64(shift, shifty); + } + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + resx = _mm_packus_epi16(res, res); + resy = _mm_srli_si128(resx, 4); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i diff, a32, a16; + __m256i a0_x, a1_x; + __m128i a0_x128, a1_x128, min_base_y128, c3f; + __m128i c1234, dy128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy, r6, ydx; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if 
(base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1)); + } else { + a1_x128 = _mm_srli_si128(a0_x128, 1); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256( + _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1)); + } + a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); + a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + base_y_c128 = _mm_add_epi16( + base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm_packus_epi16(_mm256_castsi256_si128(res), + _mm256_castsi256_si128(res)); + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123; + __m256i diff, min_base_y256, c3f, shifty, dy256, c1; + __m128i 
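In the zone-2 kernels, each row is split where the projected above-edge index walks off the left end of the array: base_min_diff counts the leading pixels that must instead come from the left edge, and the final _mm_blendv_epi8 with BaseMask[base_min_diff] stitches the two partial results together. A small scalar restatement of that bookkeeping (illustrative, no upsampling):

/* Number of leading pixels in row r (0-based) of a zone-2 block that are
 * predicted from the left edge rather than the above edge. */
static int z2_left_count_ref(int r, int dx, int bw) {
  const int y = r + 1;
  const int base_x = (-y * dx) >> 6;  /* frac_bits_x = 6; arithmetic shift,
                                         as the kernels themselves assume */
  int n = -1 - base_x;                /* min_base_x - base_x, min_base_x = -1 */
  if (n < 0) n = 0;
  if (n > bw) n = bw;
  return n;
}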
a0_x128, a1_x128; + + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, j256, r6, ydx; + __m128i resx, resy; + __m128i resxy; + int y = r + 1; + ydx = _mm256_set1_epi16((uint16_t)(y * dx)); + + int base_x = (-y * dx) >> frac_bits_x; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 16) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu8_epi16(a0_x128); + a1_x = _mm256_cvtepu8_epi16(a1_x128); + + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resx = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resx = _mm_setzero_si128(); + } + + // y calc + if (base_x < min_base_x) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + + base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256); + int16_t min_y = (int16_t)_mm_extract_epi16( + _mm256_extracti128_si256(base_y_c256, 1), 7); + int16_t max_y = + (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); + int16_t offset_diff = max_y - min_y; + + if (offset_diff < 16) { + __m256i min_y256 = _mm256_set1_epi16(min_y); + + __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256); + __m128i base_y_offset128 = + _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0), + _mm256_extracti128_si256(base_y_offset, 1)); + + __m128i a0_y128 = _mm_maskload_epi32( + (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]); + __m128i a1_y128 = + _mm_maskload_epi32((int *)(left + min_y + 1), + *(__m128i *)LoadMaskz2[offset_diff / 4]); + a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128); + a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128); + a0_y = _mm256_cvtepu8_epi16(a0_y128); + a1_y = _mm256_cvtepu8_epi16(a1_y128); + } else { + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], 
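The offset_diff < 16 branch above is a gather optimization: when the 16 left-edge indices of a chunk span fewer than 16 bytes, one masked 16-byte load at the smallest index plus a byte shuffle replaces 16 scalar lookups. A scalar model of why the two paths agree (illustrative helper, not from the patch):

#include <stdint.h>

/* Both paths produce the same 16 samples; the first mirrors the
 * maskload + pshufb fast path, the second the per-lane lookups. */
static void gather_left_ref(const uint8_t *left, const int16_t base_y[16],
                            uint8_t out[16]) {
  int16_t min_y = base_y[0], max_y = base_y[0];
  for (int i = 1; i < 16; ++i) {
    if (base_y[i] < min_y) min_y = base_y[i];
    if (base_y[i] > max_y) max_y = base_y[i];
  }
  if (max_y - min_y < 16) {
    uint8_t window[16];  /* the SIMD path does one masked 16-byte load here */
    for (int i = 0; i <= max_y - min_y; ++i) window[i] = left[min_y + i];
    for (int i = 0; i < 16; ++i) out[i] = window[base_y[i] - min_y];
  } else {
    for (int i = 0; i < 16; ++i) out[i] = left[base_y[i]];
  }
}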
left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + } + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resy = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + } + return; +} + +// z3 functions +static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[0], x[1]); + w3 = _mm_unpackhi_epi8(x[2], x[3]); + + ww0 = _mm_unpacklo_epi16(w0, w1); + ww1 = _mm_unpacklo_epi16(w2, w3); + ww2 = _mm_unpackhi_epi16(w0, w1); + ww3 = _mm_unpackhi_epi16(w2, w3); + + w0 = _mm_unpacklo_epi32(ww0, ww1); + w2 = _mm_unpacklo_epi32(ww2, ww3); + w1 = _mm_unpackhi_epi32(ww0, ww1); + w3 = _mm_unpackhi_epi32(ww2, ww3); + + d[0] = _mm_unpacklo_epi64(w0, w2); + d[1] = _mm_unpackhi_epi64(w0, w2); + d[2] = _mm_unpacklo_epi64(w1, w3); + d[3] = _mm_unpackhi_epi64(w1, w3); + + d[4] = _mm_srli_si128(d[0], 8); + d[5] = _mm_srli_si128(d[1], 8); + d[6] = _mm_srli_si128(d[2], 8); + d[7] = _mm_srli_si128(d[3], 8); + + d[8] = _mm_srli_si128(d[0], 4); + d[9] = _mm_srli_si128(d[1], 4); + d[10] = _mm_srli_si128(d[2], 4); + d[11] = _mm_srli_si128(d[3], 4); + + d[12] = _mm_srli_si128(d[0], 12); + d[13] = _mm_srli_si128(d[1], 12); + d[14] = _mm_srli_si128(d[2], 12); + d[15] = _mm_srli_si128(d[3], 12); +} + +static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m256i w10, w11, w12, w13, w14, w15; + + w0 = _mm256_unpacklo_epi8(x[0], x[1]); + w1 = _mm256_unpacklo_epi8(x[2], x[3]); + w2 = _mm256_unpacklo_epi8(x[4], 
x[5]); + w3 = _mm256_unpacklo_epi8(x[6], x[7]); + + w8 = _mm256_unpacklo_epi8(x[8], x[9]); + w9 = _mm256_unpacklo_epi8(x[10], x[11]); + w10 = _mm256_unpacklo_epi8(x[12], x[13]); + w11 = _mm256_unpacklo_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm256_unpacklo_epi64(w6, w14); + d[1] = _mm256_unpackhi_epi64(w6, w14); + d[2] = _mm256_unpacklo_epi64(w7, w15); + d[3] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm256_unpacklo_epi64(w6, w14); + d[5] = _mm256_unpackhi_epi64(w6, w14); + d[6] = _mm256_unpacklo_epi64(w7, w15); + d[7] = _mm256_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm256_unpackhi_epi8(x[0], x[1]); + w1 = _mm256_unpackhi_epi8(x[2], x[3]); + w2 = _mm256_unpackhi_epi8(x[4], x[5]); + w3 = _mm256_unpackhi_epi8(x[6], x[7]); + + w8 = _mm256_unpackhi_epi8(x[8], x[9]); + w9 = _mm256_unpackhi_epi8(x[10], x[11]); + w10 = _mm256_unpackhi_epi8(x[12], x[13]); + w11 = _mm256_unpackhi_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm256_unpacklo_epi64(w6, w14); + d[9] = _mm256_unpackhi_epi64(w6, w14); + d[10] = _mm256_unpacklo_epi64(w7, w15); + d[11] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm256_unpacklo_epi64(w6, w14); + d[13] = _mm256_unpackhi_epi64(w6, w14); + d[14] = _mm256_unpacklo_epi64(w7, w15); + d[15] = _mm256_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpacklo_epi8(x[4], x[5]); + w3 = _mm_unpacklo_epi8(x[6], x[7]); + + w8 = _mm_unpacklo_epi8(x[8], x[9]); + w9 = _mm_unpacklo_epi8(x[10], x[11]); + w10 = _mm_unpacklo_epi8(x[12], x[13]); + w11 = _mm_unpacklo_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm_unpacklo_epi64(w6, w14); + d[1] = _mm_unpackhi_epi64(w6, w14); + d[2] = 
_mm_unpacklo_epi64(w7, w15); + d[3] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm_unpacklo_epi64(w6, w14); + d[5] = _mm_unpackhi_epi64(w6, w14); + d[6] = _mm_unpacklo_epi64(w7, w15); + d[7] = _mm_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm_unpackhi_epi8(x[0], x[1]); + w1 = _mm_unpackhi_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[4], x[5]); + w3 = _mm_unpackhi_epi8(x[6], x[7]); + + w8 = _mm_unpackhi_epi8(x[8], x[9]); + w9 = _mm_unpackhi_epi8(x[10], x[11]); + w10 = _mm_unpackhi_epi8(x[12], x[13]); + w11 = _mm_unpackhi_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm_unpacklo_epi64(w6, w14); + d[9] = _mm_unpackhi_epi64(w6, w14); + d[10] = _mm_unpacklo_epi64(w7, w15); + d[11] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm_unpacklo_epi64(w6, w14); + d[13] = _mm_unpackhi_epi64(w6, w14); + d[14] = _mm_unpacklo_epi64(w7, w15); + d[15] = _mm_unpackhi_epi64(w7, w15); +} + +static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + __m128i r[16]; + __m128i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); + } + transpose16x16_sse2(r, d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); + } +} + +static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy); + transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); 
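The dr_prediction_z3_* helpers that follow all use the same plan: run a zone-1 kernel along `left` to build the prediction column-wise, then transpose it into the destination, either register-to-register for small blocks or through a temporary buffer via transpose() above for the large ones. What that transpose step computes, element-wise (a plain scalar restatement):

#include <stddef.h>
#include <stdint.h>

/* Element-wise view of the tiled transpose above: dst[r][c] = src[c][r].
 * width and height are the dimensions of the destination block. */
static void transpose_ref(const uint8_t *src, ptrdiff_t src_pitch, uint8_t *dst,
                          ptrdiff_t dst_pitch, int width, int height) {
  for (int r = 0; r < height; ++r)
    for (int c = 0; c < width; ++c)
      dst[r * dst_pitch + c] = src[c * src_pitch + r];
}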
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + 
&dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm256_setzero_si256(); + } + transpose16x32_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[32], d[32]; + + dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + transpose16x32_avx2(dstvec + 16, d + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), + _mm256_castsi256_si128(d[j + 16])); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), + _mm256_extracti128_si256(d[j + 16], 1)); + } +} + +static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i 
dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + } +} + +static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_avx2(dst, stride, 
left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } + } +} diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse2.c b/media/libaom/src/aom_dsp/x86/intrapred_sse2.c index 5b2452c8eb..5afef68c39 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_sse2.c +++ b/media/libaom/src/aom_dsp/x86/intrapred_sse2.c @@ -10,7 +10,7 @@ */ #include <emmintrin.h> - +#include "aom_dsp/x86/intrapred_x86.h" #include "config/aom_dsp_rtcd.h" static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, @@ -75,25 +75,6 @@ static INLINE __m128i dc_sum_8(const uint8_t *ref) { return _mm_sad_epu8(x, zero); } -static INLINE __m128i dc_sum_16(const uint8_t *ref) { - __m128i x = _mm_load_si128((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_sad_epu8(x, zero); - const __m128i high = _mm_unpackhi_epi64(x, x); - return _mm_add_epi16(x, high); -} - -static INLINE __m128i dc_sum_32(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x0 = _mm_add_epi16(x0, x1); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - static INLINE __m128i dc_sum_64(const uint8_t *ref) { __m128i x0 = _mm_load_si128((__m128i const *)ref); __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); @@ -142,7 +123,7 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_16(left); + const __m128i sum_left = dc_sum_16_sse2(left); __m128i sum_above = dc_sum_4(above); sum_above = _mm_add_epi16(sum_left, sum_above); @@ -171,7 +152,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_16(left); + const __m128i sum_left = dc_sum_16_sse2(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -184,7 +165,7 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_32(left); + const __m128i sum_left = dc_sum_32_sse2(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -198,7 +179,7 @@ void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_4(left); - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -211,7 +192,7 @@ void 
aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -223,8 +204,8 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_32(left); - __m128i sum_above = dc_sum_16(above); + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -237,7 +218,7 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_64(left); - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -249,7 +230,7 @@ void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_8(left); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -262,8 +243,8 @@ void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); - const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_16_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -275,7 +256,7 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_64(left); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -302,7 +283,7 @@ void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_32(left); + const __m128i sum_left = dc_sum_32_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -315,7 +296,7 @@ void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_16(left); + const __m128i sum_left = dc_sum_16_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -395,7 +376,7 @@ void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = 
dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -408,7 +389,7 @@ void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -422,7 +403,7 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -436,7 +417,7 @@ void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -449,7 +430,7 @@ void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); @@ -463,7 +444,7 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); @@ -477,7 +458,7 @@ void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); @@ -550,7 +531,7 @@ void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); @@ -577,7 +558,7 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); @@ -590,7 +571,7 @@ void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_32(left); + __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = 
_mm_set1_epi16((uint16_t)16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); @@ -631,7 +612,7 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_32(left); + __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); @@ -673,7 +654,7 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); @@ -715,7 +696,7 @@ void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_32(left); + __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); @@ -729,7 +710,7 @@ void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); diff --git a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c index 807ed1770f..5a34ea0c8e 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c @@ -48,7 +48,7 @@ void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -69,7 +69,7 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -90,7 +90,7 @@ void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 16; ++i) { @@ -110,7 +110,7 @@ void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -131,7 +131,7 @@ void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); 
- __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -152,7 +152,7 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -176,7 +176,7 @@ void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, for (int j = 0; j < 2; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); @@ -205,7 +205,7 @@ void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 4; ++i) { @@ -226,7 +226,7 @@ void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -249,7 +249,7 @@ void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -272,7 +272,7 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l16; @@ -287,7 +287,7 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, } l = _mm_load_si128((const __m128i *)(left + 16)); - rep = _mm_set1_epi16(0x8000); + rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); @@ -310,7 +310,7 @@ void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, for (int j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); @@ -332,7 +332,7 @@ void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); const __m128i l = 
_mm_loadl_epi64((const __m128i *)left); __m128i l16; @@ -361,7 +361,7 @@ void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l = _mm_load_si128((const __m128i *)left); __m128i l16; @@ -391,7 +391,7 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l = _mm_load_si128((const __m128i *)left); __m128i l16; @@ -408,7 +408,7 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, rep = _mm_add_epi16(rep, one); } - rep = _mm_set1_epi16(0x8000); + rep = _mm_set1_epi16((short)0x8000); l = _mm_load_si128((const __m128i *)(left + 16)); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); @@ -440,7 +440,7 @@ void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -478,7 +478,7 @@ void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 2; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -520,7 +520,7 @@ void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -561,7 +561,7 @@ void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, int i; const __m128i l = _mm_load_si128((const __m128i *)left); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -636,7 +636,8 @@ static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, const __m128i one = _mm_set1_epi16(1); const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); __m128i d = _mm_set1_epi16(0x100); for (int i = 0; i < h; ++i) { @@ -792,7 +793,8 @@ static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); __m128i d = _mm_set1_epi16(0x100); int i; @@ -1400,7 +1402,7 @@ static INLINE void smooth_h_pred_4xh(const __m128i *pixel, const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); const __m128i one = _mm_set1_epi16(1); const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < h; ++i) { __m128i b = _mm_shuffle_epi8(pixel[0], rep); @@ -1499,7 +1501,8 @@ static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); const __m128i one = _mm_set1_epi16(1); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); for (int i = 0; i < h; ++i) { __m128i b = _mm_shuffle_epi8(pixels[0], rep); diff --git a/media/libaom/src/aom_dsp/x86/intrapred_x86.h b/media/libaom/src/aom_dsp/x86/intrapred_x86.h new file mode 100644 index 0000000000..b13f575a76 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/intrapred_x86.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_ +#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_ + +#include <emmintrin.h> // SSE2 +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_ diff --git a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c index c3c88245a4..2e3e2be105 100644 --- a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c @@ -192,47 +192,47 @@ unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, return res; } -#define jnt_sadMxN_sse2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ +#define dist_wtd_sadMxN_sse2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ } -#define jnt_sadMxN_avx2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ +#define dist_wtd_sadMxN_avx2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ } /* clang-format off */ -jnt_sadMxN_sse2(128, 128) -jnt_sadMxN_sse2(128, 64) -jnt_sadMxN_sse2(64, 128) -jnt_sadMxN_sse2(64, 64) -jnt_sadMxN_sse2(64, 32) -jnt_sadMxN_sse2(32, 64) -jnt_sadMxN_sse2(32, 32) -jnt_sadMxN_sse2(32, 16) -jnt_sadMxN_sse2(16, 32) -jnt_sadMxN_sse2(16, 16) -jnt_sadMxN_sse2(16, 8) -jnt_sadMxN_sse2(8, 16) -jnt_sadMxN_sse2(8, 8) -jnt_sadMxN_sse2(8, 4) -jnt_sadMxN_sse2(4, 8) -jnt_sadMxN_sse2(4, 4) -jnt_sadMxN_sse2(4, 16) -jnt_sadMxN_sse2(16, 4) -jnt_sadMxN_sse2(8, 32) -jnt_sadMxN_sse2(32, 8) -jnt_sadMxN_sse2(16, 64) -jnt_sadMxN_sse2(64, 16) +dist_wtd_sadMxN_sse2(128, 128) +dist_wtd_sadMxN_sse2(128, 64) +dist_wtd_sadMxN_sse2(64, 128) +dist_wtd_sadMxN_sse2(64, 64) +dist_wtd_sadMxN_sse2(64, 32) +dist_wtd_sadMxN_sse2(32, 64) +dist_wtd_sadMxN_sse2(32, 32) +dist_wtd_sadMxN_sse2(32, 16) +dist_wtd_sadMxN_sse2(16, 32) +dist_wtd_sadMxN_sse2(16, 16) +dist_wtd_sadMxN_sse2(16, 8) +dist_wtd_sadMxN_sse2(8, 16) 
+dist_wtd_sadMxN_sse2(8, 8) +dist_wtd_sadMxN_sse2(8, 4) +dist_wtd_sadMxN_sse2(4, 8) +dist_wtd_sadMxN_sse2(4, 4) +dist_wtd_sadMxN_sse2(4, 16) +dist_wtd_sadMxN_sse2(16, 4) +dist_wtd_sadMxN_sse2(8, 32) +dist_wtd_sadMxN_sse2(32, 8) +dist_wtd_sadMxN_sse2(16, 64) +dist_wtd_sadMxN_sse2(64, 16) /* clang-format on */ diff --git a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c index f9a41a210b..c8b02f5560 100644 --- a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c @@ -29,7 +29,7 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, void *const result) { __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); @@ -45,10 +45,10 @@ static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); } -void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i; const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; const uint8_t w1 = (uint8_t)jcp_param->bck_offset; @@ -67,7 +67,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, __m128i p0 = xx_loadu_128(ref); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -85,7 +85,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -107,7 +107,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, row3[0], row3[1], row3[2], row3[3]); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -116,11 +116,11 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, } } -void aom_jnt_comp_avg_upsampled_pred_ssse3( +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, @@ -141,52 +141,52 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3( __m128i p0 = xx_loadu_128(comp_pred); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; } } -#define JNT_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, 
int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_ssse3( \ - a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_ssse3( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ - jcp_param); \ - \ - return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ +#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ } -JNT_SUBPIX_AVG_VAR(128, 128) -JNT_SUBPIX_AVG_VAR(128, 64) -JNT_SUBPIX_AVG_VAR(64, 128) -JNT_SUBPIX_AVG_VAR(64, 64) -JNT_SUBPIX_AVG_VAR(64, 32) -JNT_SUBPIX_AVG_VAR(32, 64) -JNT_SUBPIX_AVG_VAR(32, 32) -JNT_SUBPIX_AVG_VAR(32, 16) -JNT_SUBPIX_AVG_VAR(16, 32) -JNT_SUBPIX_AVG_VAR(16, 16) -JNT_SUBPIX_AVG_VAR(16, 8) -JNT_SUBPIX_AVG_VAR(8, 16) -JNT_SUBPIX_AVG_VAR(8, 8) -JNT_SUBPIX_AVG_VAR(8, 4) -JNT_SUBPIX_AVG_VAR(4, 8) -JNT_SUBPIX_AVG_VAR(4, 4) -JNT_SUBPIX_AVG_VAR(4, 16) -JNT_SUBPIX_AVG_VAR(16, 4) -JNT_SUBPIX_AVG_VAR(8, 32) -JNT_SUBPIX_AVG_VAR(32, 8) -JNT_SUBPIX_AVG_VAR(16, 64) -JNT_SUBPIX_AVG_VAR(64, 16) +DIST_WTD_SUBPIX_AVG_VAR(128, 128) +DIST_WTD_SUBPIX_AVG_VAR(128, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 128) +DIST_WTD_SUBPIX_AVG_VAR(64, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 64) +DIST_WTD_SUBPIX_AVG_VAR(32, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 32) +DIST_WTD_SUBPIX_AVG_VAR(16, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 16) +DIST_WTD_SUBPIX_AVG_VAR(8, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 8) +DIST_WTD_SUBPIX_AVG_VAR(4, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 4) +DIST_WTD_SUBPIX_AVG_VAR(8, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 8) +DIST_WTD_SUBPIX_AVG_VAR(16, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c index 9d88b5e493..d534683fce 100644 --- a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c +++ b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c @@ -16,237 +16,69 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" +#include "aom_dsp/x86/lpf_common_sse2.h" static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 
xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - *d0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - - *d1 = _mm_srli_si128(*d0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(*d0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(*d0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, __m128i *d4, - __m128i *d5, __m128i *d6, - __m128i *d7) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1, ww0, ww1; - +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontally. 
+// Used for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - - *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - *d1 = _mm_srli_si128(ww0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(ww0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(ww0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - *d5 = _mm_srli_si128(ww1, - 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - *d6 = _mm_srli_si128(ww1, - 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - *d7 = _mm_srli_si128(ww1, - 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0, - __m128i *d1, __m128i *d2, - __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - // output - // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx - // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx - // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx - // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx - - __m128i w0, w1, w2, w3, w4, w5; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d1 = _mm_srli_si128(*d0, 8); - *d2 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - *d3 = _mm_srli_si128(*d2, 8); -} - -static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0d1, - __m128i *d2d3, __m128i *d4d5, - __m128i *d6d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7; - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - // x2 20 21 22 23 24 25 26 
27 - // x3 30 31 32 33 34 35 36 37 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0d1 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d2d3 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - - w6 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - w7 = _mm_unpackhi_epi16( - w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - - *d4d5 = _mm_unpacklo_epi32( - w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - *d6d7 = _mm_unpackhi_epi32( - w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 -} + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 -static INLINE void transpose16x8_8x16_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, - __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, - __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, - __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpacklo_epi8(*x8, *x9); - w9 = _mm_unpacklo_epi8(*x10, *x11); - w10 = _mm_unpacklo_epi8(*x12, *x13); - w11 = _mm_unpacklo_epi8(*x14, *x15); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0 = _mm_unpacklo_epi64(w6, w14); - *d1 = _mm_unpackhi_epi64(w6, w14); - *d2 = _mm_unpacklo_epi64(w7, w15); - *d3 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d4 = _mm_unpacklo_epi64(w6, w14); - *d5 = _mm_unpackhi_epi64(w6, w14); - *d6 = _mm_unpacklo_epi64(w7, w15); - *d7 = _mm_unpackhi_epi64(w7, w15); + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + 
_mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx } // this function treats its input as 2 parallel 8x4 matrices, transposes each of @@ -306,116 +138,6 @@ static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, *pq3 = _mm_unpackhi_epi64(d2, d3); // pq } -static INLINE void transpose8x16_16x8_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, - __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, - __m128i *d12d13, __m128i *d14d15) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpackhi_epi8(*x0, *x1); - w9 = _mm_unpackhi_epi8(*x2, *x3); - w10 = _mm_unpackhi_epi8(*x4, *x5); - w11 = _mm_unpackhi_epi8(*x6, *x7); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0d1 = _mm_unpacklo_epi64(w6, w14); - *d2d3 = _mm_unpackhi_epi64(w6, w14); - *d4d5 = _mm_unpacklo_epi64(w7, w15); - *d6d7 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d8d9 = _mm_unpacklo_epi64(w6, w14); - *d10d11 = _mm_unpackhi_epi64(w6, w14); - *d12d13 = _mm_unpacklo_epi64(w7, w15); - *d14d15 = _mm_unpackhi_epi64(w7, w15); -} - -// this function treats its input as 2 parallel 8x4 matrices, transposes each of -// them to 4x8 independently while flipping the second matrix horizontaly. 
Used -// for 14 taps pq pairs creation -static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *q0p0, - __m128i *q1p1, __m128i *q2p2, - __m128i *q3p3, __m128i *q4p4, - __m128i *q5p5, __m128i *q6p6, - __m128i *q7p7) { - __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - w2 = _mm_unpackhi_epi8( - *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 - w3 = _mm_unpackhi_epi8( - *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 - - ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - ww2 = _mm_unpacklo_epi16( - w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 - ww3 = _mm_unpackhi_epi16( - w2, - w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 - - *q7p7 = _mm_unpacklo_epi32( - ww0, - _mm_srli_si128( - ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx - *q6p6 = _mm_unpackhi_epi32( - _mm_slli_si128(ww0, 4), - ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx - *q5p5 = _mm_unpackhi_epi32( - ww0, - _mm_slli_si128( - ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx - *q4p4 = _mm_unpacklo_epi32( - _mm_srli_si128(ww0, 12), - ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx - *q3p3 = _mm_unpacklo_epi32( - ww1, - _mm_srli_si128( - ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx - *q2p2 = _mm_unpackhi_epi32( - _mm_slli_si128(ww1, 4), - ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx - *q1p1 = _mm_unpackhi_epi32( - ww1, - _mm_slli_si128( - ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx - *q0p0 = _mm_unpacklo_epi32( - _mm_srli_si128(ww1, 12), - ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx -} - static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { @@ -424,7 +146,7 @@ static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i hev1; const __m128i t3t4 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((char)0x80); const __m128i ff = _mm_cmpeq_epi8(t80, t80); ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ @@ -473,7 +195,7 @@ static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *ps1ps0) { const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((char)0x80); __m128i filter, filter2filter1, work; __m128i ps1ps0_work, qs1qs0_work; __m128i hev1; @@ -616,10 +338,10 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, __m128i qs1qs0, ps1ps0; __m128i p1, p0, q0, q1; - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); @@ -688,7 +410,7 @@ static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( __m128i fe, ff, 
work; abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((char)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); @@ -992,7 +714,7 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( __m128i abs_p1q1, abs_p0q0, abs_q1q0; abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((char)0xfe); ff = _mm_cmpeq_epi8(fe, fe); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); @@ -1241,23 +963,16 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), - _mm_cvtsi32_si128(*(int *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), - _mm_cvtsi32_si128(*(int *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), - _mm_cvtsi32_si128(*(int *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), - _mm_cvtsi32_si128(*(int *)(s + 1 * p))); + q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p)); + q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p)); + q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p)); + q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p)); - q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), - _mm_cvtsi32_si128(*(int *)(s - 0 * p))); + q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p)); - q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), - _mm_cvtsi32_si128(*(int *)(s + 5 * p))); + q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p)); - q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), - _mm_cvtsi32_si128(*(int *)(s + 6 * p))); + q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p)); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); @@ -1288,7 +1003,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); { @@ -1417,7 +1132,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( *q1q0 = _mm_unpacklo_epi32(*q0, *q1); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); { // filter_mask and hev_mask @@ -1543,12 +1258,12 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); - p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, 
&thresh); @@ -1622,7 +1337,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( // otherwise - not const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -1777,7 +1492,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( // otherwise - not const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -1895,20 +1610,20 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p)); - p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); - q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); + p3 = xx_loadl_32(s - 4 * p); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + q3 = xx_loadl_32(s + 3 * p); lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, &thresh); diff --git a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h index 8970fe7dd6..6ed2cbfdf4 100644 --- a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h +++ b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h @@ -212,4 +212,284 @@ static INLINE void highbd_transpose8x16_sse2( d4 + 1, d5 + 1, d6 + 1, d7 + 1); } +// Low bit depth functions +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i 
*d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 
00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = _mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, 
*x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); + *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = _mm_unpackhi_epi64(w7, w15); +} + #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c new file mode 100644 index 0000000000..8ef7ee0d7b --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdio.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" + +#define MASK_SAD16XH_ONE_REF(idx) \ + a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \ + data_l = _mm_unpacklo_epi8(a, b); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpackhi_epi8(a, b); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, int inv_mask, + unsigned sad_array[]) { + int x, y; + __m128i a; + __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const uint8_t *ref0 = a_ptr[0]; + const uint8_t *ref1 = a_ptr[1]; + const uint8_t *ref2 = a_ptr[2]; + const uint8_t *ref3 = a_ptr[3]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD16XH_ONE_REF(0) + MASK_SAD16XH_ONE_REF(1) + MASK_SAD16XH_ONE_REF(2) + MASK_SAD16XH_ONE_REF(3) + } + + src_ptr += src_stride; + ref0 += a_stride; + ref1 += a_stride; + ref2 += a_stride; + ref3 += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD8XH_ONE_REF(idx) \ + const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \ + const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \ + data_l = _mm_unpacklo_epi8(a##idx##0, b0); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpacklo_epi8(a##idx##1, b1); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride))); + const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride)); + const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr); + const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride)); + __m128i m_copy = _mm_unpacklo_epi64(m0, m1); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD8XH_ONE_REF(0) + MASK_SAD8XH_ONE_REF(1) + MASK_SAD8XH_ONE_REF(2) + MASK_SAD8XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD4XH_ONE_REF(idx) \ + a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx), \ + _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \ + data = _mm_unpacklo_epi8(a, b); \ + mask = _mm_unpacklo_epi8(m, m_inv); \ + pred = _mm_maddubs_epi16(data, mask); \ + pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data, pred, mask; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + __m128i a; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m_copy = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD4XH_ONE_REF(0) + MASK_SAD4XH_ONE_REF(1) + MASK_SAD4XH_ONE_REF(2) + MASK_SAD4XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_unpacklo_epi32(res0, res1); + res2 = _mm_unpacklo_epi32(res2, res3); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASKSADMXN_SSSE3(m, n) \ + void aom_masked_sad##m##x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n, inv_mask, sad_array); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 8, msk, msk_stride, n, inv_mask, sad_array); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 4, msk, msk_stride, n, inv_mask, sad_array); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c index 584b5e7e37..60f0ab3390 100644 --- a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -17,7 +17,7 @@ #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" static INLINE unsigned int masked_sad32xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, @@ -64,7 +64,7 @@ static INLINE unsigned int masked_sad32xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); - return (sad + 31) >> 6; + return sad; } static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { @@ -117,7 +117,7 @@ static INLINE unsigned int masked_sad16xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); - return (sad + 31) >> 6; + return sad; } static INLINE unsigned int aom_masked_sad_avx2( @@ -253,7 +253,7 @@ static INLINE unsigned int highbd_masked_sad8xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = 
_mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); - return (sad + 31) >> 6; + return sad; } static INLINE unsigned int highbd_masked_sad16xh_avx2( @@ -311,7 +311,7 @@ static INLINE unsigned int highbd_masked_sad16xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); - return (sad + 31) >> 6; + return sad; } static INLINE unsigned int aom_highbd_masked_sad_avx2( diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c index 493f9bd8f2..7168277963 100644 --- a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -19,7 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" // For width a multiple of 16 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, @@ -134,7 +134,7 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); - return (sad + 31) >> 6; + return sad; } unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, @@ -179,7 +179,7 @@ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, } int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); - return (sad + 31) >> 6; + return sad; } unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, @@ -223,7 +223,7 @@ unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, } // At this point, the SAD is stored in lane 0 of 'res' int32_t sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; + return sad; } // For width a multiple of 8 @@ -338,7 +338,7 @@ static INLINE unsigned int highbd_masked_sad_ssse3( res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; + return sad; } unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, @@ -398,5 +398,5 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; + return sad; } diff --git a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c index d7dbefd7d9..fa93f0df4f 100644 --- a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -218,15 +218,15 @@ static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, } } -static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0, - const __m128i a1, const __m128i b1, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi8(a0, b0); - v0 = _mm_maddubs_epi16(v0, filter); +static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, + const __m128i *a1, const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); + v0 = _mm_maddubs_epi16(v0, *filter); v0 = xx_roundn_epu16(v0, FILTER_BITS); - __m128i v1 = _mm_unpacklo_epi8(a1, b1); - v1 = _mm_maddubs_epi16(v1, filter); + __m128i v1 = _mm_unpacklo_epi8(*a1, *b1); + v1 = _mm_maddubs_epi16(v1, 
*filter); v1 = xx_roundn_epu16(v1, FILTER_BITS); return _mm_packus_epi16(v0, v1); @@ -262,7 +262,7 @@ static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, const __m128i z0 = _mm_srli_si128(x0, 1); const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 1); - const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec); + const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 2; @@ -296,7 +296,7 @@ static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, const __m128i x = _mm_loadl_epi64((__m128i *)dst); const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); - const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec); + const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 16; @@ -343,7 +343,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, const __m128i b0 = _mm_unpacklo_epi32(z0, z1); const __m128i a1 = _mm_unpacklo_epi32(x2, x3); const __m128i b1 = _mm_unpacklo_epi32(z2, z3); - const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 4; @@ -384,7 +384,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, const __m128i b0 = _mm_unpacklo_epi32(b, c); const __m128i a1 = _mm_unpacklo_epi32(c, d); const __m128i b1 = _mm_unpacklo_epi32(d, e); - const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 16; @@ -392,29 +392,29 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, } } -static INLINE void accumulate_block(const __m128i src, const __m128i a, - const __m128i b, const __m128i m, +static INLINE void accumulate_block(const __m128i *src, const __m128i *a, + const __m128i *b, const __m128i *m, __m128i *sum, __m128i *sum_sq) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i m_inv = _mm_sub_epi8(mask_max, m); + const __m128i m_inv = _mm_sub_epi8(mask_max, *m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. 
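   // Each predicted pixel computed below is the AOM_BLEND_A64 weighted
   // average, pred = (m * a + (64 - m) * b + 32) >> 6 with m in [0, 64],
   // formed with maddubs on interleaved pixel/mask pairs.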
- const __m128i data_l = _mm_unpacklo_epi8(a, b); - const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + const __m128i data_l = _mm_unpacklo_epi8(*a, *b); + const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); - const __m128i data_r = _mm_unpackhi_epi8(a, b); - const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + const __m128i data_r = _mm_unpackhi_epi8(*a, *b); + const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); - const __m128i src_l = _mm_unpacklo_epi8(src, zero); - const __m128i src_r = _mm_unpackhi_epi8(src, zero); + const __m128i src_l = _mm_unpacklo_epi8(*src, zero); + const __m128i src_r = _mm_unpackhi_epi8(*src, zero); const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); @@ -440,7 +440,7 @@ static void masked_variance(const uint8_t *src_ptr, int src_stride, const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); - accumulate_block(src, a, b, m, &sum, &sum_sq); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); } src_ptr += src_stride; @@ -471,7 +471,7 @@ static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, const __m128i m = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); - accumulate_block(src, a, b, m, &sum, &sum_sq); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); src_ptr += src_stride * 2; a_ptr += 16; @@ -503,7 +503,7 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, const __m128i m = _mm_setr_epi32( *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); - accumulate_block(src, a, b, m, &sum, &sum_sq); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); src_ptr += src_stride * 4; a_ptr += 16; @@ -517,6 +517,7 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } +#if CONFIG_AV1_HIGHBITDEPTH // For width a multiple of 8 static void highbd_bilinear_filter(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, @@ -797,17 +798,17 @@ static void highbd_bilinear_filter(const uint16_t *src, int src_stride, } } -static INLINE __m128i highbd_filter_block_2rows(const __m128i a0, - const __m128i b0, - const __m128i a1, - const __m128i b1, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi16(a0, b0); - v0 = _mm_madd_epi16(v0, filter); +static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0, + const __m128i *b0, + const __m128i *a1, + const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi16(*a0, *b0); + v0 = _mm_madd_epi16(v0, *filter); v0 = xx_roundn_epu32(v0, FILTER_BITS); - __m128i v1 = _mm_unpacklo_epi16(a1, b1); - v1 = _mm_madd_epi16(v1, filter); + __m128i v1 = _mm_unpacklo_epi16(*a1, *b1); + v1 = _mm_madd_epi16(v1, *filter); v1 = xx_roundn_epu32(v1, FILTER_BITS); return _mm_packs_epi32(v0, v1); @@ -845,7 +846,7 @@ static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 2); const __m128i res = - highbd_filter_block_2rows(x0, z0, x1, z1, 
hfilter_vec); + highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 2; @@ -879,7 +880,8 @@ static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, const __m128i x = _mm_loadl_epi64((__m128i *)dst); const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); - const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec); + const __m128i res = + highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 8; @@ -1024,6 +1026,7 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, *sum_ = _mm_cvtsi128_si32(sum); *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } +#endif // CONFIG_AV1_HIGHBITDEPTH void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, diff --git a/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c b/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c index 72eda0e578..aa73c392dd 100644 --- a/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c +++ b/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c @@ -166,7 +166,7 @@ OBMC_SUBPIX_VAR(64, 16) //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// - +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void hbd_obmc_variance_w4( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { @@ -378,3 +378,4 @@ HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) HBD_OBMCVARWXH(16, 64) HBD_OBMCVARWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm index 216a0bd8f9..d6e15c4be5 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm @@ -126,7 +126,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 pmovsxwd m11, m8 @@ -198,10 +198,7 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero + pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob @@ -255,9 +252,26 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m8, m6 ; m8 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m8, m7 @@ -289,7 +303,7 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ psignw m13, m10 %endif - ; store 16bit numbers as 32bit 
numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 pmovsxwd m11, m8 @@ -359,8 +373,23 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + %endif pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m14, m7 @@ -391,7 +420,7 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ psignw m13, m10 %endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff pcmpgtw m6, m5, m14 punpckhwd m6, m14, m6 pmovsxwd m11, m14 diff --git a/media/libaom/src/aom_dsp/x86/quantize_sse2.c b/media/libaom/src/aom_dsp/x86/quantize_sse2.c index d3de6e24db..ebef1fbac2 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_sse2.c +++ b/media/libaom/src/aom_dsp/x86/quantize_sse2.c @@ -18,28 +18,6 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -} - void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, diff --git a/media/libaom/src/aom_dsp/x86/quantize_ssse3.c b/media/libaom/src/aom_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..25980a055a --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/quantize_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <tmmintrin.h> +#include <emmintrin.h> +#include <xmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, + const __m128i quant, + const __m128i *shift) { + __m128i tmp, qcoeff, tmp1; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, 14); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, 2); + *coeff = _mm_or_si128(tmp, tmp1); +} + +static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 4. + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i two = _mm_set1_epi16(2); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, all_zero; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, two); + round = _mm_add_epi16(round, two); + zbin = _mm_srli_epi16(zbin, 2); + round = _mm_srli_epi16(round, 2); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. 
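+  // 64x64 blocks quantize with log_scale = 2: zbin/round were divided by 4
+  // (with rounding) above, calculate_qcoeff_64x64() keeps (q * shift) >> 14,
+  // and calculate_dqcoeff_and_store_64x64() divides the dequantized value by
+  // 4 before restoring its sign.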
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
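+  // A 64x64 transform keeps only its top-left 32x32 block of coefficients,
+  // so the loop below covers 1024 coefficients and n_coeffs is ignored.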
+ for (index = 16; index < 1024; index += 16) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + continue; + } + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm index 39d4ca674c..fa616a6f1a 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -48,9 +48,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] @@ -78,9 +75,26 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m8, m6 ; m8 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m8, m7 @@ -117,7 +131,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ psignw m8, m9 psignw m13, m10 %endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff mova m11, m8 mova m6, m8 pcmpgtw m5, m8 @@ -169,12 +183,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + 
%endif pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m14, m7 pand m13, m12 + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pxor m11, m11 mova m11, m14 @@ -207,7 +237,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ psignw m13, m10 %endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff mova m11, m14 mova m6, m14 pcmpgtw m5, m14 diff --git a/media/libaom/src/aom_dsp/x86/quantize_x86.h b/media/libaom/src/aom_dsp/x86/quantize_x86.h index 4eed7dd29a..5b040a278a 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_x86.h +++ b/media/libaom/src/aom_dsp/x86/quantize_x86.h @@ -32,6 +32,11 @@ static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { return _mm_sub_epi16(a, sign); } +static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi32(a, sign); +} + static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i shift) { __m128i tmp, qcoeff; @@ -41,10 +46,53 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } +static INLINE void calculate_qcoeff_log_scale(__m128i *coeff, + const __m128i round, + const __m128i quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, tmp1, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, (16 - *log_scale)); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, *log_scale); + *coeff = _mm_or_si128(tmp, tmp1); +} + static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } +static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, + __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff, + const int *log_scale) { + // calculate abs + __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15); + __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign); + + const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero); + const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale); + + dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0); + dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + // Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing // to zbin to add 1 to the index in 'scan'. 
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, @@ -75,3 +123,80 @@ static INLINE int16_t accumulate_eob(__m128i eob) { eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); + const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + return _mm_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} + +static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i all_zero; + __m128i temp_mask = _mm_setzero_si128(); + all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1); + if (_mm_movemask_epi8(all_zero)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); + __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1); + temp_mask = _mm_max_epi16(mask0, mask1); + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, const int16_t *iscan_ptr, + int *is_found, __m128i *mask) { + __m128i zero = _mm_setzero_si128(); + __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3; + + coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero); + coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero); + coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero); + coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero); + + coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS); + cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]); + coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS); + cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask); +} + +static INLINE int calculate_non_zero_count(__m128i mask) { + __m128i mask0, mask1; + int non_zero_count = 0; + mask0 = _mm_unpackhi_epi64(mask, mask); + mask1 = _mm_max_epi16(mask0, mask); + mask0 = _mm_shuffle_epi32(mask1, 1); + mask0 = _mm_max_epi16(mask0, mask1); + mask1 = _mm_srli_epi32(mask0, 16); + mask0 = _mm_max_epi16(mask0, mask1); + non_zero_count = _mm_extract_epi16(mask0, 0) + 1; + + return non_zero_count; +} diff --git a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c index f662b62b16..0771252584 100644 --- a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c @@ -14,41 +14,43 @@ #include "aom/aom_integer.h" -void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const 
ref[4], int ref_stride, - uint32_t res[4]) { +void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; + int i, j; const uint8_t *ref0, *ref1, *ref2, *ref3; ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + sum_ref3 = _mm256_setzero_si256(); + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j += 32) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)(src + j)); + ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j)); + ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j)); + ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j)); + ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j)); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + } src += src_stride; ref0 += ref_stride; ref1 += ref_stride; @@ -57,6 +59,7 @@ void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, } { __m128i sum; + __m256i sum_mlow, sum_mhigh; // in sum_ref-i the result is saved in the first 4 bytes // the other 4 bytes are zeroed. 
// sum_ref1 and sum_ref3 are shifted left by 4 bytes @@ -80,139 +83,24 @@ void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, _mm_storeu_si128((__m128i *)(res), sum); } - _mm256_zeroupper(); } -void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); - - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); +#define sadMxN_avx2(m, n) \ + void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ } - _mm256_zeroupper(); -} -void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; +sadMxN_avx2(32, 8); +sadMxN_avx2(32, 16); +sadMxN_avx2(32, 32); +sadMxN_avx2(32, 64); - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += src_stride << 5; - rf[0] += ref_stride << 5; - rf[1] += ref_stride << 5; - rf[2] += ref_stride << 5; - rf[3] += ref_stride << 5; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} +sadMxN_avx2(64, 16); +sadMxN_avx2(64, 32); +sadMxN_avx2(64, 64); +sadMxN_avx2(64, 128); -void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - unsigned int half_width = 32; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += half_width; - rf[0] += half_width; - rf[1] += half_width; - rf[2] += half_width; - rf[3] += half_width; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} +sadMxN_avx2(128, 64); +sadMxN_avx2(128, 128); diff --git a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm index 55a856985a..a9043742d4 100644 --- a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm @@ -15,15 +15,85 @@ SECTION .text -; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_4x2x4 5-6 0 - movd m0, [srcq +%2] +%macro AVG_4x2x4 2 + movh m2, [second_predq] + movlhps m2, m2 + pavgb %1, m2 + pavgb %2, m2 + lea second_predq, [second_predq+8] +%endmacro +; 'mflag' affect a lot how the code works. +; +; When 'mflag' is false, the 'src_strideq' resides in register, +; [srcq + src_strideq + offset] is allowed, so we can simply +; use such form to access src memory and don't bother to update +; 'srcq' at each line. We only update 'srcq' each two-lines using +; a compact LEA instruction like [srcq+src_strideq*2]. +; +; When 'mflag' is true, the 'src_strideq' resides in memory. 
+; we cannot use above form to access memory, we have to update +; 'srcq' at each line break. As we process two parts (first,second) +; together in each macro function, the second part may also sit +; in the next line, which means we also need to possibly add +; one 'src_strideq' to 'srcq' before processing second part. + +%macro HANDLE_FIRST_OFFSET 2 + %define first_offset %2 + %if mflag == 0 && %1 == 1 + %define first_offset (src_strideq + %2) + %endif +%endmacro + +; first_extraline, second_extraline, in_line_offset +%macro HANDLE_SECOND_OFFSET 3 + %define second_offset %3 + %if mflag && %1 == 0 && %2 == 1 + add srcq, src_strideq + %endif + %if mflag == 0 && %2 == 1 + %define second_offset (src_strideq + %3) + %endif +%endmacro + +; Notes for line_ending: +; 0 -- not a line ending +; 1 -- line ending of a odd line [line numbers starts from one] +; 2 -- line ending of a even line +; This is specically designed to handle when src_strideq is a +; memory position, under such case, we can not accomplish +; complex address calculation using LEA, and fall back to +; using simple ADD instruction at each line ending. +%macro ADVANCE_END_OF_LINE 1 + %if mflag + add srcq, src_strideq + %endif + %if mflag == 0 && %1 == 2 + lea srcq, [srcq +src_strideq*2] + %endif + + %if %1 == 2 + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] + %endif +%endmacro + +; Please note that the second_offset of src is for in_line_offset, +; so it is less than src_stride. +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first, second}_extraline, line_ending +%macro PROCESS_4x2x4 9 + HANDLE_FIRST_OFFSET %7, %2 + movd m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 %if %1 == 1 movd m6, [ref1q+%3] movd m4, [ref2q+%3] movd m7, [ref3q+%3] movd m5, [ref4q+%3] - movd m1, [srcq +%4] + + movd m1, [srcq + second_offset] movd m2, [ref1q+%5] punpckldq m0, m1 punpckldq m6, m2 @@ -36,6 +106,9 @@ SECTION .text movlhps m0, m0 movlhps m6, m4 movlhps m7, m5 +%if %6 == 1 + AVG_4x2x4 m6, m7 +%endif psadbw m6, m0 psadbw m7, m0 %else @@ -51,38 +124,48 @@ SECTION .text movd m4, [ref4q+%3] movd m5, [ref4q+%5] punpckldq m4, m5 - movd m5, [srcq +%4] + movd m5, [srcq + second_offset] punpckldq m0, m5 movlhps m0, m0 movlhps m1, m2 movlhps m3, m4 +%if %6 == 1 + AVG_4x2x4 m1, m3 +%endif psadbw m1, m0 psadbw m3, m0 paddd m6, m1 paddd m7, m3 %endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] +%if %9 > 0 + ADVANCE_END_OF_LINE %9 %endif %endmacro -; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_8x2x4 5-6 0 - movh m0, [srcq +%2] +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_8x2x4 9 + HANDLE_FIRST_OFFSET %7, %2 + movh m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 %if %1 == 1 movh m4, [ref1q+%3] movh m5, [ref2q+%3] movh m6, [ref3q+%3] movh m7, [ref4q+%3] - movhps m0, [srcq +%4] + movhps m0, [srcq + second_offset] movhps m4, [ref1q+%5] movhps m5, [ref2q+%5] movhps m6, [ref3q+%5] movhps m7, [ref4q+%5] +%if %6 == 1 + movu m3, [second_predq] + pavgb m4, m3 + pavgb m5, m3 + pavgb m6, m3 + pavgb m7, m3 + lea second_predq, [second_predq+mmsize] +%endif psadbw m4, m0 psadbw m5, m0 psadbw m6, m0 @@ -90,105 +173,148 @@ SECTION .text %else movh m1, [ref1q+%3] movh m2, [ref2q+%3] - movh m3, 
[ref3q+%3] - movhps m0, [srcq +%4] + movhps m0, [srcq + second_offset] movhps m1, [ref1q+%5] movhps m2, [ref2q+%5] - movhps m3, [ref3q+%5] +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif psadbw m1, m0 psadbw m2, m0 - psadbw m3, m0 paddd m4, m1 - movh m1, [ref4q+%3] - movhps m1, [ref4q+%5] paddd m5, m2 - paddd m6, m3 + + movh m1, [ref3q+%3] + movhps m1, [ref3q+%5] + movh m2, [ref4q+%3] + movhps m2, [ref4q+%5] +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] +%endif psadbw m1, m0 - paddd m7, m1 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 %endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] +%if %9 > 0 + ADVANCE_END_OF_LINE %9 %endif %endmacro -; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_16x2x4 5-6 0 +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_16x2x4 9 ; 1st 16 px - mova m0, [srcq +%2] + HANDLE_FIRST_OFFSET %7, %2 + mova m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 %if %1 == 1 movu m4, [ref1q+%3] movu m5, [ref2q+%3] movu m6, [ref3q+%3] movu m7, [ref4q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m4, m3 + pavgb m5, m3 + pavgb m6, m3 + pavgb m7, m3 + lea second_predq, [second_predq+mmsize] +%endif psadbw m4, m0 psadbw m5, m0 psadbw m6, m0 psadbw m7, m0 -%else +%else ; %1 == 1 movu m1, [ref1q+%3] movu m2, [ref2q+%3] - movu m3, [ref3q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif psadbw m1, m0 psadbw m2, m0 - psadbw m3, m0 paddd m4, m1 - movu m1, [ref4q+%3] paddd m5, m2 - paddd m6, m3 - psadbw m1, m0 - paddd m7, m1 + + movu m1, [ref3q+%3] + movu m2, [ref4q+%3] +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] %endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif ; %1 == 1 ; 2nd 16 px - mova m0, [srcq +%4] + mova m0, [srcq + second_offset] movu m1, [ref1q+%5] movu m2, [ref2q+%5] - movu m3, [ref3q+%5] + +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif psadbw m1, m0 psadbw m2, m0 - psadbw m3, m0 paddd m4, m1 - movu m1, [ref4q+%5] paddd m5, m2 - paddd m6, m3 + + movu m1, [ref3q+%5] + movu m2, [ref4q+%5] + +%if %9 > 0 + ADVANCE_END_OF_LINE %9 +%endif + %if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] %endif psadbw m1, m0 - paddd m7, m1 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 %endmacro -; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_32x2x4 5-6 0 - PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 - PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_32x2x4 9 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9 %endmacro -; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_64x2x4 5-6 0 - PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 - PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending 
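For readers following the macro changes above: what each PROCESS_*x2x4 variant accumulates is an ordinary sum of absolute differences of one source block against four reference blocks, with the new do_avg path first averaging every reference pixel with a second predictor (rounded, as pavgb does). The scalar sketch below states that definition in C; the function name sad_4d_ref and its flat argument layout are illustrative only and are not part of libaom.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the x4d SAD the assembly macros accumulate:
 * one w x h source block is compared against four reference blocks.
 * When second_pred is non-NULL, each reference pixel is first averaged
 * (rounded, as pavgb does) with the corresponding second_pred pixel. */
static void sad_4d_ref(const uint8_t *src, int src_stride,
                       const uint8_t *const ref[4], int ref_stride,
                       const uint8_t *second_pred, /* may be NULL */
                       int w, int h, uint32_t res[4]) {
  for (int i = 0; i < 4; ++i) {
    uint32_t sad = 0;
    for (int y = 0; y < h; ++y) {
      for (int x = 0; x < w; ++x) {
        int r = ref[i][y * ref_stride + x];
        if (second_pred) r = (r + second_pred[y * w + x] + 1) >> 1;
        sad += abs(src[y * src_stride + x] - r);
      }
    }
    res[i] = sad;
  }
}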
+%macro PROCESS_64x2x4 9 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9 %endmacro -; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_128x2x4 5-6 0 - PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64 - PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6 +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_128x2x4 9 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9 %endmacro ; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 0 %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -196,18 +322,41 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; avg + +%if UNIX64 +cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \ + second_pred, res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \ + second_pred, ref2, ref3 + %define src_strideq r1mp + %define src_strided r1mp +%endif +%endif + + %define mflag ((1 - UNIX64) & %3) movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] mov ref3q, [ref1q+gprsize*2] mov ref4q, [ref1q+gprsize*3] mov ref1q, [ref1q+gprsize*0] - PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 + PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2 %rep (%2-4)/2 - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 + PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2 %endrep - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2 + +%if %3 == 0 + %define resultq r4 + %define resultmp r4mp +%else + %define resultq r5 + %define resultmp r5mp +%endif %if %1 > 4 pslldq m5, 4 @@ -218,16 +367,16 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mova m7, m6 punpcklqdq m4, m6 punpckhqdq m5, m7 - movifnidn r4, r4mp paddd m4, m5 - movu [r4], m4 + movifnidn resultq, resultmp + movu [resultq], m4 RET %else - movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 - movq [r4+0], m6 - movq [r4+8], m7 + movifnidn resultq, resultmp + movq [resultq+0], m6 + movq [resultq+8], m7 RET %endif %endmacro @@ -255,3 +404,25 @@ SADNXN4D 8, 32 SADNXN4D 32, 8 SADNXN4D 16, 64 SADNXN4D 64, 16 +SADNXN4D 128, 128, 1 +SADNXN4D 128, 64, 1 +SADNXN4D 64, 128, 1 +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 8, 4, 1 +SADNXN4D 4, 8, 1 +SADNXN4D 4, 4, 1 +SADNXN4D 4, 16, 1 +SADNXN4D 16, 4, 1 +SADNXN4D 8, 32, 1 +SADNXN4D 32, 8, 1 +SADNXN4D 16, 64, 1 +SADNXN4D 64, 16, 1 diff --git a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c b/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c index b506d46639..2cff2e6a9f 100644 --- a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c @@ -37,487 +37,257 @@ static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { return (unsigned int)_mm_cvtsi128_si32(lo128); } -unsigned int 
aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); - const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); +static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, + __m256i *sad_acc) { + const __m256i zero = _mm256_setzero_si256(); + int i; + for (i = 0; i < 4; i++) { + s[i] = _mm256_sub_epi16(s[i], r[i]); + s[i] = _mm256_abs_epi16(s[i]); + } - // first 4 rows - __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - __m256i u0 = _mm256_sub_epi16(s0, r0); - __m256i u1 = _mm256_sub_epi16(s1, r1); - __m256i u2 = _mm256_sub_epi16(s2, r2); - __m256i u3 = _mm256_sub_epi16(s3, r3); - __m256i zero = _mm256_setzero_si256(); - __m256i sum0, sum1; - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum0 = _mm256_add_epi16(u0, u1); - sum0 = _mm256_add_epi16(sum0, u2); - sum0 = _mm256_add_epi16(sum0, u3); - - // second 4 rows - src_ptr += src_stride << 2; - ref_ptr += ref_stride << 2; - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - u0 = _mm256_sub_epi16(s0, r0); - u1 = _mm256_sub_epi16(s1, r1); - u2 = _mm256_sub_epi16(s2, r2); - u3 = _mm256_sub_epi16(s3, r3); - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum1 = _mm256_add_epi16(u0, u1); - sum1 = _mm256_add_epi16(sum1, u2); - sum1 = _mm256_add_epi16(sum1, u3); - - // find out the SAD - s0 = _mm256_unpacklo_epi16(sum0, zero); - s1 = _mm256_unpackhi_epi16(sum0, zero); - r0 = _mm256_unpacklo_epi16(sum1, zero); - r1 = _mm256_unpackhi_epi16(sum1, zero); - s0 = _mm256_add_epi32(s0, s1); - r0 = _mm256_add_epi32(r0, r1); - sum0 = _mm256_add_epi32(s0, r0); - // 8 32-bit summation + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); - return (unsigned int)get_sad_from_mm256_epi32(&sum0); + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } -unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. 
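The 16-bit accumulation inside highbd_sad16x4_core_avx2 is safe because at most four absolute differences of up-to-12-bit samples land in each lane (4 * 4095 = 16380, below the int16 limit of 32767) before the sums are widened to 32 bits. A scalar sketch of the same 16-wide, 4-row core follows; highbd_sad16x4_ref is an illustrative name, not a libaom function, and the optional sec pointer mirrors the rounded-average path used by the *_avg kernels.

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of the 16x4 high-bit-depth SAD core: accumulate
 * |src - ref| over a 16-wide, 4-row tile into a 32-bit total.
 * If sec is non-NULL, ref is first averaged (rounded) with sec, which
 * matches the _mm256_avg_epu16 path taken when sec_ptr != 0. */
static uint32_t highbd_sad16x4_ref(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   const uint16_t *sec /* may be NULL */) {
  uint32_t sad = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 16; ++x) {
      int r = ref[y * ref_stride + x];
      if (sec) r = (r + sec[y * 16 + x] + 1) >> 1;
      sad += abs(src[y * src_stride + x] - r);
    }
  }
  return sad;
}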
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + highbd_sad16x4_core_avx2(s, r, sad_acc); +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); - __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3; - __m256i sum0; - __m256i sum = _mm256_setzero_si256(); - const __m256i zero = _mm256_setzero_si256(); - int row = 0; - - // Loop for every 4 rows - while (row < 16) { - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - u0 = _mm256_sub_epi16(s0, r0); - u1 = _mm256_sub_epi16(s1, r1); - u2 = _mm256_sub_epi16(s2, r2); - u3 = _mm256_sub_epi16(s3, r3); - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum0 = _mm256_add_epi16(u0, u1); - sum0 = _mm256_add_epi16(sum0, u2); - sum0 = _mm256_add_epi16(sum0, u3); - - s0 = _mm256_unpacklo_epi16(sum0, zero); - s1 = _mm256_unpackhi_epi16(sum0, zero); - sum = _mm256_add_epi32(sum, s0); - sum = _mm256_add_epi32(sum, s1); - // 8 32-bit summation - - row += 4; + int i; + __m256i sad = _mm256_setzero_si256(); + for (i = 0; i < N; i += 4) { + sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad); src_ptr += src_stride << 2; ref_ptr += ref_stride << 2; } - return get_sad_from_mm256_epi32(&sum); + return (unsigned int)get_sad_from_mm256_epi32(&sad); } static void sad32x4(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s0, s1, s2, s3, r0, r1, r2, r3; - const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], r[4]; int row_sections = 0; while (row_sections < 2) { - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s3 = _mm256_loadu_si256((const __m256i 
*)(src_ptr + src_stride + 16)); + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); if (sec_ptr) { - r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); - r1 = _mm256_avg_epu16( - r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r2 = _mm256_avg_epu16( - r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r3 = _mm256_avg_epu16( - r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 32 << 1; } - s0 = _mm256_sub_epi16(s0, r0); - s1 = _mm256_sub_epi16(s1, r1); - s2 = _mm256_sub_epi16(s2, r2); - s3 = _mm256_sub_epi16(s3, r3); - - s0 = _mm256_abs_epi16(s0); - s1 = _mm256_abs_epi16(s1); - s2 = _mm256_abs_epi16(s2); - s3 = _mm256_abs_epi16(s3); - - s0 = _mm256_add_epi16(s0, s1); - s0 = _mm256_add_epi16(s0, s2); - s0 = _mm256_add_epi16(s0, s3); - - r0 = _mm256_unpacklo_epi16(s0, zero); - r1 = _mm256_unpackhi_epi16(s0, zero); - - r0 = _mm256_add_epi32(r0, r1); - *sad_acc = _mm256_add_epi32(*sad_acc, r0); + highbd_sad16x4_core_avx2(s, r, sad_acc); row_sections += 1; src_ptr += src_stride << 1; ref_ptr += ref_stride << 1; - if (sec_ptr) sec_ptr += 32 << 1; } } -unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); const int left_shift = 2; - int row_section = 0; + int i; - while (row_section < 4) { + for (i = 0; i < N; i += 4) { sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; - row_section += 1; } return get_sad_from_mm256_epi32(&sad); } -unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 4; - ref += ref_stride << 4; - sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 4; - ref += ref_stride << 4; - sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); - return sum; -} 
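The fixed-size forwarding functions removed here (16x32 built from two 16x16 calls, 32x32 from two 32x16 calls, and so on) are superseded by height-parameterized loops plus a small macro that stamps out each entry point, as seen later in this patch. A plain-C sketch of that pattern, with invented names (sad_w32_4rows, sad_32xN, SAD_32xN_FN) standing in for the AVX2 helpers:

#include <stdint.h>

/* Illustrative scalar stand-in for the width-32, 4-row kernel that the
 * real code implements with AVX2 (sad32x4). */
static uint32_t sad_w32_4rows(const uint16_t *src, int src_stride,
                              const uint16_t *ref, int ref_stride) {
  uint32_t sad = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 32; ++x) {
      int d = src[y * src_stride + x] - ref[y * ref_stride + x];
      sad += d < 0 ? -d : d;
    }
  }
  return sad;
}

/* Height-parameterized driver: walk the block in 4-row steps. */
static uint32_t sad_32xN(int N, const uint16_t *src, int src_stride,
                         const uint16_t *ref, int ref_stride) {
  uint32_t sad = 0;
  for (int i = 0; i < N; i += 4) {
    sad += sad_w32_4rows(src, src_stride, ref, ref_stride);
    src += src_stride << 2; /* advance 4 rows */
    ref += ref_stride << 2;
  }
  return sad;
}

/* One macro then stamps out each fixed-size entry point. */
#define SAD_32xN_FN(n)                                       \
  uint32_t sad_32x##n(const uint16_t *src, int src_stride,   \
                      const uint16_t *ref, int ref_stride) { \
    return sad_32xN(n, src, src_stride, ref, ref_stride);    \
  }

SAD_32xN_FN(16)
SAD_32xN_FN(32)
SAD_32xN_FN(64)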
- -unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 5; - ref += ref_stride << 5; - sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); - return sum; -} - static void sad64x2(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s[8], r[8]; - const __m256i zero = _mm256_setzero_si256(); - - s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); - s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); - s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); - s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); - s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32)); - s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48)); - - r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); - r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); - r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); - r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); - r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32)); - r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48)); - - if (sec_ptr) { - r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); - r[1] = _mm256_avg_epu16( - r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r[2] = _mm256_avg_epu16( - r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r[3] = _mm256_avg_epu16( - r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); - r[4] = _mm256_avg_epu16( - r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); - r[5] = _mm256_avg_epu16( - r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); - r[6] = _mm256_avg_epu16( - r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); - r[7] = _mm256_avg_epu16( - r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += src_stride; + ref_ptr += ref_stride; } - - s[0] = _mm256_sub_epi16(s[0], r[0]); - s[1] = _mm256_sub_epi16(s[1], r[1]); - s[2] = _mm256_sub_epi16(s[2], r[2]); - s[3] = _mm256_sub_epi16(s[3], r[3]); - s[4] = 
_mm256_sub_epi16(s[4], r[4]); - s[5] = _mm256_sub_epi16(s[5], r[5]); - s[6] = _mm256_sub_epi16(s[6], r[6]); - s[7] = _mm256_sub_epi16(s[7], r[7]); - - s[0] = _mm256_abs_epi16(s[0]); - s[1] = _mm256_abs_epi16(s[1]); - s[2] = _mm256_abs_epi16(s[2]); - s[3] = _mm256_abs_epi16(s[3]); - s[4] = _mm256_abs_epi16(s[4]); - s[5] = _mm256_abs_epi16(s[5]); - s[6] = _mm256_abs_epi16(s[6]); - s[7] = _mm256_abs_epi16(s[7]); - - s[0] = _mm256_add_epi16(s[0], s[1]); - s[0] = _mm256_add_epi16(s[0], s[2]); - s[0] = _mm256_add_epi16(s[0], s[3]); - - s[4] = _mm256_add_epi16(s[4], s[5]); - s[4] = _mm256_add_epi16(s[4], s[6]); - s[4] = _mm256_add_epi16(s[4], s[7]); - - r[0] = _mm256_unpacklo_epi16(s[0], zero); - r[1] = _mm256_unpackhi_epi16(s[0], zero); - r[2] = _mm256_unpacklo_epi16(s[4], zero); - r[3] = _mm256_unpackhi_epi16(s[4], zero); - - r[0] = _mm256_add_epi32(r[0], r[1]); - r[0] = _mm256_add_epi32(r[0], r[2]); - r[0] = _mm256_add_epi32(r[0], r[3]); - *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } -unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); const int left_shift = 1; - int row_section = 0; - - while (row_section < 16) { + int i; + for (i = 0; i < N; i += 2) { sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; - row_section += 1; } return get_sad_from_mm256_epi32(&sad); } -unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 5; - ref += ref_stride << 5; - sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); - return sum; -} - static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s[8], r[8]; - const __m256i zero = _mm256_setzero_si256(); - - s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); - s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); - s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); - s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64)); - s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80)); - s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96)); - s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112)); - - r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); - r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); - r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); - r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64)); - r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80)); - r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96)); - r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112)); - - if (sec_ptr) { - r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); - r[1] = _mm256_avg_epu16( - r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r[2] = _mm256_avg_epu16( - r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r[3] = _mm256_avg_epu16( - r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 
48))); - r[4] = _mm256_avg_epu16( - r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); - r[5] = _mm256_avg_epu16( - r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); - r[6] = _mm256_avg_epu16( - r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); - r[7] = _mm256_avg_epu16( - r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += 64; + ref_ptr += 64; } - - s[0] = _mm256_sub_epi16(s[0], r[0]); - s[1] = _mm256_sub_epi16(s[1], r[1]); - s[2] = _mm256_sub_epi16(s[2], r[2]); - s[3] = _mm256_sub_epi16(s[3], r[3]); - s[4] = _mm256_sub_epi16(s[4], r[4]); - s[5] = _mm256_sub_epi16(s[5], r[5]); - s[6] = _mm256_sub_epi16(s[6], r[6]); - s[7] = _mm256_sub_epi16(s[7], r[7]); - - s[0] = _mm256_abs_epi16(s[0]); - s[1] = _mm256_abs_epi16(s[1]); - s[2] = _mm256_abs_epi16(s[2]); - s[3] = _mm256_abs_epi16(s[3]); - s[4] = _mm256_abs_epi16(s[4]); - s[5] = _mm256_abs_epi16(s[5]); - s[6] = _mm256_abs_epi16(s[6]); - s[7] = _mm256_abs_epi16(s[7]); - - s[0] = _mm256_add_epi16(s[0], s[1]); - s[0] = _mm256_add_epi16(s[0], s[2]); - s[0] = _mm256_add_epi16(s[0], s[3]); - - s[4] = _mm256_add_epi16(s[4], s[5]); - s[4] = _mm256_add_epi16(s[4], s[6]); - s[4] = _mm256_add_epi16(s[4], s[7]); - - r[0] = _mm256_unpacklo_epi16(s[0], zero); - r[1] = _mm256_unpackhi_epi16(s[0], zero); - r[2] = _mm256_unpacklo_epi16(s[4], zero); - r[3] = _mm256_unpackhi_epi16(s[4], zero); - - r[0] = _mm256_add_epi32(r[0], r[1]); - r[0] = _mm256_add_epi32(r[0], r[2]); - r[0] = _mm256_add_epi32(r[0], r[3]); - *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } -unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); int row = 0; - while (row < 64) { + while (row < N) { sad128x1(srcp, refp, NULL, &sad); srcp += src_stride; refp += ref_stride; - row += 1; + row++; } return get_sad_from_mm256_epi32(&sad); } -unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 6; - ref += ref_stride << 6; - sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int 
ref_stride) { - uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 6; - ref += ref_stride << 6; - sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. -static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s0, s1, s2, s3, r0, r1, r2, r3; - const __m256i zero = _mm256_setzero_si256(); - - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - if (sec_ptr) { - r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); - r1 = _mm256_avg_epu16(r1, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r2 = _mm256_avg_epu16(r2, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r3 = _mm256_avg_epu16(r3, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); +#define highbd_sadMxN_avx2(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ } - s0 = _mm256_sub_epi16(s0, r0); - s1 = _mm256_sub_epi16(s1, r1); - s2 = _mm256_sub_epi16(s2, r2); - s3 = _mm256_sub_epi16(s3, r3); +highbd_sadMxN_avx2(16, 4); +highbd_sadMxN_avx2(16, 8); +highbd_sadMxN_avx2(16, 16); +highbd_sadMxN_avx2(16, 32); +highbd_sadMxN_avx2(16, 64); + +highbd_sadMxN_avx2(32, 8); +highbd_sadMxN_avx2(32, 16); +highbd_sadMxN_avx2(32, 32); +highbd_sadMxN_avx2(32, 64); - s0 = _mm256_abs_epi16(s0); - s1 = _mm256_abs_epi16(s1); - s2 = _mm256_abs_epi16(s2); - s3 = _mm256_abs_epi16(s3); +highbd_sadMxN_avx2(64, 16); +highbd_sadMxN_avx2(64, 32); +highbd_sadMxN_avx2(64, 64); +highbd_sadMxN_avx2(64, 128); - s0 = _mm256_add_epi16(s0, s1); - s0 = _mm256_add_epi16(s0, s2); - s0 = _mm256_add_epi16(s0, s3); +highbd_sadMxN_avx2(128, 64); +highbd_sadMxN_avx2(128, 128); - r0 = _mm256_unpacklo_epi16(s0, zero); - r1 = _mm256_unpackhi_epi16(s0, zero); +unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); - r0 = _mm256_add_epi32(r0, r1); - *sad_acc = _mm256_add_epi32(*sad_acc, r0); + return get_sad_from_mm256_epi32(&sad); } unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, @@ -566,6 +336,40 @@ unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, return sum; } +unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << 
left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 2) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -614,6 +418,26 @@ unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, return sum; } +unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 8) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -697,7 +521,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, } // SAD 4D -// Combine 4 __m256i vectors to uint32_t result[4] +// Combine 4 __m256i input vectors v to uint32_t result[4] static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; @@ -752,287 +576,124 @@ static void init_sad(__m256i *s) { s[3] = _mm256_setzero_si256(); } -void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_4_rows = 2; - int i; + int i, j; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); - srcp += src_stride << shift_for_4_rows; - refp[i] += ref_stride << shift_for_4_rows; - sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + for (j = 0; j < N; j += 4) { + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first8rows[4]; - uint32_t second8rows[4]; - const 
uint8_t *ref[4]; - const int shift_for_8_rows = 3; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows); - src += src_stride << shift_for_8_rows; - ref[0] += ref_stride << shift_for_8_rows; - ref[1] += ref_stride << shift_for_8_rows; - ref[2] += ref_stride << shift_for_8_rows; - ref[3] += ref_stride << shift_for_8_rows; - aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows); - sad_array[0] = first8rows[0] + second8rows[0]; - sad_array[1] = first8rows[1] + second8rows[1]; - sad_array[2] = first8rows[2] + second8rows[2]; - sad_array[3] = first8rows[3] + second8rows[3]; -} - -void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 4; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_4_rows = 2; - int i; - int rows_section; + int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - rows_section = 0; - while (rows_section < 4) { + for (r = 0; r < N; r += 4) { sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); srcp += src_stride << shift_for_4_rows; refp[i] += ref_stride << shift_for_4_rows; - rows_section++; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 4; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride, - 
const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 5; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_rows = 1; - int i; - int rows_section; + int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - rows_section = 0; - while (rows_section < 16) { + for (r = 0; r < N; r += 2) { sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); srcp += src_stride << shift_for_rows; refp[i] += ref_stride << shift_for_rows; - rows_section++; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 5; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 6; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - 
sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; - int i; - int rows_section; + int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - rows_section = 0; - while (rows_section < 64) { + for (r = 0; r < N; r++) { sad128x1(srcp, refp[i], NULL, &sad_vec[i]); srcp += src_stride; refp[i] += ref_stride; - rows_section++; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 6; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} +#define highbd_sadMxNx4d_avx2(m, n) \ + void aom_highbd_sad##m##x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \ + sad_array); \ + } + +highbd_sadMxNx4d_avx2(16, 4); +highbd_sadMxNx4d_avx2(16, 8); +highbd_sadMxNx4d_avx2(16, 16); +highbd_sadMxNx4d_avx2(16, 32); +highbd_sadMxNx4d_avx2(16, 64); + +highbd_sadMxNx4d_avx2(32, 8); +highbd_sadMxNx4d_avx2(32, 16); +highbd_sadMxNx4d_avx2(32, 32); +highbd_sadMxNx4d_avx2(32, 64); + +highbd_sadMxNx4d_avx2(64, 16); +highbd_sadMxNx4d_avx2(64, 32); +highbd_sadMxNx4d_avx2(64, 64); +highbd_sadMxNx4d_avx2(64, 128); + +highbd_sadMxNx4d_avx2(128, 64); +highbd_sadMxNx4d_avx2(128, 128); diff --git a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c index c6fd62c9e2..f77a585b4c 100644 --- a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c @@ -84,81 +84,6 @@ unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, return sum; } -static void sad64x64x4d(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - __m128i *res) { - uint32_t sum[4]; - aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum); - *res = _mm_loadu_si128((const __m128i *)sum); -} - -void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m128i sum0, sum1; - const uint8_t *rf[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); - src += src_stride << 6; - rf[0] += ref_stride << 6; - 
rf[1] += ref_stride << 6; - rf[2] += ref_stride << 6; - rf[3] += ref_stride << 6; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); - sum0 = _mm_add_epi32(sum0, sum1); - _mm_storeu_si128((__m128i *)res, sum0); -} - -void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m128i sum0, sum1; - unsigned int half_width = 64; - const uint8_t *rf[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); - src += half_width; - rf[0] += half_width; - rf[1] += half_width; - rf[2] += half_width; - rf[3] += half_width; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); - sum0 = _mm_add_epi32(sum0, sum1); - _mm_storeu_si128((__m128i *)res, sum0); -} - -void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += src_stride << 6; - rf[0] += ref_stride << 6; - rf[1] += ref_stride << 6; - rf[2] += ref_stride << 6; - rf[3] += ref_stride << 6; - aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} - static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int h, const uint8_t *second_pred, diff --git a/media/libaom/src/aom_dsp/x86/sse_avx2.c b/media/libaom/src/aom_dsp/x86/sse_avx2.c index 305dde5c08..e6ee2fcab9 100644 --- a/media/libaom/src/aom_dsp/x86/sse_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sse_avx2.c @@ -21,12 +21,11 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, const uint8_t *b) { const __m256i v_a0 = yy_loadu_256(a); const __m256i v_b0 = yy_loadu_256(b); - const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); - const __m256i v_a01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); - const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); - const __m256i v_b01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); @@ -35,11 +34,29 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void 
summary_32_avx2(const __m256i *sum32, __m256i *sum) { const __m256i sum0_4x64 = - _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); const __m256i sum1_4x64 = - _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), _mm256_extracti128_si256(sum_4x64, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); @@ -47,31 +64,48 @@ static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { xx_storel_64(&sum, sum_1x64); return sum; } +#endif +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int32_t y = 0; int64_t sse = 0; __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); switch (width) { case 4: do { - const __m128i v_a0 = xx_loadl_32(a); - const __m128i v_a1 = xx_loadl_32(a + a_stride); - const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); - const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); - const __m128i v_b0 = xx_loadl_32(b); - const __m128i v_b1 = xx_loadl_32(b + b_stride); - const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); - const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); - const __m128i v_a0123 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); - const __m128i v_b0123 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); - const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + sse_w4x4_avx2(a, a_stride, b, 
b_stride, &sum); a += a_stride << 2; b += b_stride << 2; y += 4; @@ -80,16 +114,7 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, break; case 8: do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m256i v_a_w = - _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); - const __m256i v_b_w = - _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -99,14 +124,26 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, case 16: do { const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_a1 = xx_loadu_128(a + a_stride); const __m128i v_b0 = xx_loadu_128(b); - const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride; - b += b_stride; - y += 1; + const __m128i v_b1 = xx_loadu_128(b + b_stride); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; } while (y < height); sse = summary_all_avx2(&sum); break; @@ -141,12 +178,42 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_avx2(&sum); break; - default: break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + const uint8_t *a2 = a + i + (a_stride << 1); + const uint8_t *b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; } return sse; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, const uint16_t *b) { const __m256i v_a_w = yy_loadu_256(a); @@ -155,6 +222,33 @@ static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = 
xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int32_t y = 0; @@ -165,20 +259,7 @@ int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, switch (width) { case 4: do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); - const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); - const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); - const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), - _mm_unpacklo_epi64(v_a2, v_a3)); - const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), - _mm_unpacklo_epi64(v_b2, v_b3)); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); a += a_stride << 2; b += b_stride << 2; y += 4; @@ -187,10 +268,7 @@ int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, break; case 8: do { - const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); - const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -208,43 +286,99 @@ int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, break; case 32: do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16, b + 16); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; } while (y < height); - sse = summary_all_avx2(&sum); + sse = summary_4x64_avx2(sum); break; case 64: do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + 
highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; } while (y < height); - sse = summary_all_avx2(&sum); + sse = summary_4x64_avx2(sum); break; case 128: do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); - highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); - highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); - highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); - highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; } while (y < height); - sse = summary_all_avx2(&sum); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + const uint16_t *a2 = a + i + (a_stride << 1); + const uint16_t *b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); break; - default: break; } return sse; } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/sse_sse4.c b/media/libaom/src/aom_dsp/x86/sse_sse4.c index 8b5af84691..5f95eb9aeb 100644 --- a/media/libaom/src/aom_dsp/x86/sse_sse4.c +++ b/media/libaom/src/aom_dsp/x86/sse_sse4.c @@ -28,6 +28,15 @@ static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { return sum; } +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, const uint8_t *b) { const __m128i v_a0 = xx_loadu_128(a); @@ -42,6 +51,27 @@ static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); } +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = 
xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int y = 0; @@ -50,14 +80,7 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, switch (width) { case 4: do { - const __m128i v_a0 = xx_loadl_32(a); - const __m128i v_a1 = xx_loadl_32(a + a_stride); - const __m128i v_b0 = xx_loadl_32(b); - const __m128i v_b1 = xx_loadl_32(b + b_stride); - const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); - const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -66,12 +89,7 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, break; case 8: do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); - const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + sse8_sse4_1(a, b, &sum); a += a_stride; b += b_stride; y += 1; @@ -125,12 +143,53 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_sse4(&sum); break; - default: break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; } return sse; } +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, const uint16_t *b) { const __m128i v_a_w = xx_loadu_128(a); @@ -150,14 +209,7 @@ int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, switch (width) { case 4: do { 
- const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); - const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -175,67 +227,127 @@ int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, break; case 16: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 32: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 64: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 128: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); - highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); 
- highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); - highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); - highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); - highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); - highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); - highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); - highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; - default: break; } return sse; } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/ssim_opt_x86_64.asm b/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm index 6d9b5a12f1..6d9b5a12f1 100644 --- a/media/libaom/src/aom_dsp/x86/ssim_opt_x86_64.asm +++ b/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm diff --git a/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm b/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm index 45bf6ec3c5..cbf28901be 100644 --- a/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm @@ -135,44 +135,33 @@ SECTION .text %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea 
ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ - height, sse, g_bilin_filter, g_pw_8 + height, sse %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ diff --git a/media/libaom/src/aom_dsp/x86/subtract_avx2.c b/media/libaom/src/aom_dsp/x86/subtract_avx2.c index 4389d123db..40831600a6 100644 --- a/media/libaom/src/aom_dsp/x86/subtract_avx2.c +++ b/media/libaom/src/aom_dsp/x86/subtract_avx2.c @@ -26,7 +26,7 @@ static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); } -static INLINE void aom_subtract_block_16xn_avx2( +static INLINE void subtract_block_16xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -42,7 +42,7 @@ static INLINE void aom_subtract_block_16xn_avx2( } } -static INLINE void aom_subtract_block_32xn_avx2( +static INLINE void subtract_block_32xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -53,7 +53,7 @@ static INLINE void aom_subtract_block_32xn_avx2( } } -static INLINE void aom_subtract_block_64xn_avx2( +static INLINE void subtract_block_64xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -65,7 +65,7 @@ static INLINE void aom_subtract_block_64xn_avx2( } } -static INLINE void aom_subtract_block_128xn_avx2( +static INLINE void subtract_block_128xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -85,20 +85,20 @@ void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t pred_stride) { switch (cols) { case 16: - aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); break; case 32: - aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); break; case 64: - aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, 
pred_stride); break; case 128: - aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); break; default: aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c index 0af44e3a4c..97d78b6842 100644 --- a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c @@ -77,3 +77,172 @@ uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, return aom_sum_squares_2d_i16_c(src, stride, width, height); } } + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 2); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + return _mm_extract_epi16(vtmp1, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + return _mm_cvtsi128_si32(vtmp1); +} + +uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 32 elements in a row + for (i = 0; i < width - 31; i += 32) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += 
mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c index 22d7739ec4..85b301a88e 100644 --- a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c +++ b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c @@ -201,3 +201,166 @@ uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { return aom_sum_squares_i16_c(src, n); } } + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 2); + vec_a = _mm_add_epi16(vec_a, vtmp); + return _mm_extract_epi16(vec_a, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi32(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi32(vec_a, vtmp); + return _mm_cvtsi128_si32(vec_a); +} + +uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, 
int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 8 elements in a row + for (i = 0; i < width - 8; i += 8) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, 
vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/media/libaom/src/aom_dsp/x86/synonyms.h b/media/libaom/src/aom_dsp/x86/synonyms.h index 1e9f1e27b8..2e99bee3e9 100644 --- a/media/libaom/src/aom_dsp/x86/synonyms.h +++ b/media/libaom/src/aom_dsp/x86/synonyms.h @@ -13,6 +13,7 @@ #define AOM_AOM_DSP_X86_SYNONYMS_H_ #include <immintrin.h> +#include <string.h> #include "config/aom_config.h" @@ -28,7 +29,9 @@ // Loads and stores to do away with the tedium of casting the address // to the right type. static INLINE __m128i xx_loadl_32(const void *a) { - return _mm_cvtsi32_si128(*(const uint32_t *)a); + int val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); } static INLINE __m128i xx_loadl_64(const void *a) { @@ -44,7 +47,8 @@ static INLINE __m128i xx_loadu_128(const void *a) { } static INLINE void xx_storel_32(void *const a, const __m128i v) { - *(uint32_t *)a = _mm_cvtsi128_si32(v); + const int val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); } static INLINE void xx_storel_64(void *const a, const __m128i v) { diff --git a/media/libaom/src/aom_dsp/x86/synonyms_avx2.h b/media/libaom/src/aom_dsp/x86/synonyms_avx2.h index 3f69b120ea..4d6ee6ad64 100644 --- a/media/libaom/src/aom_dsp/x86/synonyms_avx2.h +++ b/media/libaom/src/aom_dsp/x86/synonyms_avx2.h @@ -67,6 +67,11 @@ static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { return yy_set_m128i(mhi, mlo); } +static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { + _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); + _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a)); +} + static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); diff --git a/media/libaom/src/aom_dsp/x86/transpose_sse2.h b/media/libaom/src/aom_dsp/x86/transpose_sse2.h index d0d1ee6845..7ac692c78b 100644 --- a/media/libaom/src/aom_dsp/x86/transpose_sse2.h +++ b/media/libaom/src/aom_dsp/x86/transpose_sse2.h @@ -17,7 +17,7 @@ #include "config/aom_config.h" static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { - // Unpack 16 bit elements. Goes from: + // Unpack 8 bit elements. 
Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 @@ -28,7 +28,7 @@ static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); - // Unpack 32 bit elements resulting in: + // Unpack 16 bit elements resulting in: // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 return _mm_unpacklo_epi16(a0, a1); } diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h index b1611ba870..ea57c9f35e 100644 --- a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h +++ b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h @@ -20,9 +20,6 @@ extern "C" { #endif -typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, - int8_t cos_bit); - static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { return _mm256_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); @@ -117,58 +114,115 @@ static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, } } -static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, - __m256i *const out) { - // Unpack 16 bit elements. Goes from: - // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f - // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f - // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f - // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f - // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f - // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f - // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f - // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f - // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f - // to: - // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - // ... 
- __m256i a[16]; - for (int i = 0; i < 16; i += 2) { - a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); - a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (int i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); } - __m256i b[16]; - for (int i = 0; i < 16; i += 2) { - b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); - b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (int i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); } - __m256i c[16]; - for (int i = 0; i < 16; i += 2) { - c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); - c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (int i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); } - out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); - out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); - out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); - out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; - out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); - out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); - out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); - out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); - out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); - out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); - out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); - out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); - out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); - out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); - out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); - out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // 
get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, + __m256i *const out) { + const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); + const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); + const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); + const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); + const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); + const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); + const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); + const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); + + out[0] = _mm256_unpacklo_epi64(b0, b1); + out[1] = _mm256_unpackhi_epi64(b0, b1); + out[2] = _mm256_unpacklo_epi64(b4, b5); + out[3] = _mm256_unpackhi_epi64(b4, b5); + out[4] = _mm256_unpacklo_epi64(b2, b3); + out[5] = _mm256_unpackhi_epi64(b2, b3); + out[6] = _mm256_unpacklo_epi64(b6, b7); + out[7] = _mm256_unpackhi_epi64(b6, b7); } static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { @@ -192,6 +246,113 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { } } +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit, + const int val) { + const __m256i sqrt2 = _mm256_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = av1_round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale_rounding = + pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale_rounding); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); + _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo)); + 
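/* scale_round_avx2() multiplies each 16-bit coefficient by NewSqrt2 and
 * rounds/shifts by NewSqrt2Bits -- a fixed-point sqrt(2) scaling for
 * rectangular transforms (as far as I know those constants are 5793 and 12
 * elsewhere in libaom, so x * 5793 >> 12 is roughly x * 1.4142). The two
 * 128-bit stores here write coefficients 0..7 at b, and the 256-bit store
 * of temp places coefficients 8..15 at b + 64, i.e. the next row of the
 * 64-element-stride output buffer this helper assumes. */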
_mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); + _mm256_store_si256((__m256i *)(b + 64), temp); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); + } +} + +static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, + __m256i *out) { + out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); + out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); + out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); + out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); + out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); + out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); + out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); + out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); +} + +static INLINE void extract_reg(const __m256i *in, __m128i *out1) { + out1[0] = _mm256_castsi256_si128(in[0]); + out1[1] = _mm256_castsi256_si128(in[1]); + out1[2] = _mm256_castsi256_si128(in[2]); + out1[3] = _mm256_castsi256_si128(in[3]); + out1[4] = _mm256_castsi256_si128(in[4]); + out1[5] = _mm256_castsi256_si128(in[5]); + out1[6] = _mm256_castsi256_si128(in[6]); + out1[7] = _mm256_castsi256_si128(in[7]); + + out1[8] = _mm256_extracti128_si256(in[0], 0x01); + out1[9] = _mm256_extracti128_si256(in[1], 0x01); + out1[10] = _mm256_extracti128_si256(in[2], 0x01); + out1[11] = _mm256_extracti128_si256(in[3], 0x01); + out1[12] = _mm256_extracti128_si256(in[4], 0x01); + out1[13] = _mm256_extracti128_si256(in[5], 0x01); + out1[14] = _mm256_extracti128_si256(in[6], 0x01); + out1[15] = _mm256_extracti128_si256(in[7], 0x01); +} + #ifdef __cplusplus } #endif diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h b/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h index ed82eee962..9c99eb93bd 100644 --- a/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h +++ b/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h @@ -26,4 +26,8 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + #endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/variance_avx2.c b/media/libaom/src/aom_dsp/x86/variance_avx2.c index 800aef1266..c4919ba9b4 100644 --- a/media/libaom/src/aom_dsp/x86/variance_avx2.c +++ b/media/libaom/src/aom_dsp/x86/variance_avx2.c @@ -28,7 +28,7 @@ static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, __m256i *const sse, __m256i *const sum) { - const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1) + const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) // unpack into pairs of source and reference values const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); @@ -234,6 +234,10 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, int height, unsigned int *sse); +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + 
int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, @@ -276,6 +280,11 @@ AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6); +AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5); +AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3); +AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2); #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ diff --git a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c index 88e27aef3a..f779270ae3 100644 --- a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c +++ b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c @@ -104,6 +104,65 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); +// Functions related to sub pixel variance width 16 +#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + /* load source and destination of 2 rows and insert*/ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define LOAD_SRC_NEXT_BYTE_INSERT \ + /* load source and another source from next row */ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + /* load source and next row source from 1 byte onwards */ \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \ + _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1); + +#define LOAD_DST_INSERT \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define LOAD_SRC_MERGE_128BIT(filter) \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \ + __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \ + __m128i filter_128bit = _mm256_castsi256_si128(filter); \ + __m128i pw8_128bit = _mm256_castsi256_si128(pw8); + +#define FILTER_SRC_128BIT(filter) \ + /* filter the source */ \ + src_lo = _mm_maddubs_epi16(src_lo, filter); \ + src_hi = _mm_maddubs_epi16(src_hi, 
filter); \ + \ + /* add 8 to source */ \ + src_lo = _mm_add_epi16(src_lo, pw8_128bit); \ + src_hi = _mm_add_epi16(src_hi, pw8_128bit); \ + \ + /* divide source by 16 */ \ + src_lo = _mm_srai_epi16(src_lo, 4); \ + src_hi = _mm_srai_epi16(src_hi, 4); + unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, @@ -127,8 +186,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, src += src_stride; dst += dst_stride; } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { __m256i src_next_reg; for (i = 0; i < height; i++) { LOAD_SRC_DST @@ -156,8 +215,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { if (y_offset == 0) { __m256i src_next_reg; for (i = 0; i < height; i++) { @@ -169,8 +228,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, src += src_stride; dst += dst_stride; } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte @@ -189,7 +248,7 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 4 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; @@ -228,8 +287,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, src += src_stride; dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; filter = _mm256_load_si256( @@ -292,6 +351,244 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, return sum; } +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + AVG_NEXT_SRC_INSERT(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + y_offset <<= 5; 
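/* y_offset is scaled by 32 (<<= 5) because each bilinear_filters_avx2[]
 * entry appears to be one 32-byte __m256i row of interleaved 8-bit taps,
 * so offset * 32 selects the filter for that eighth-pel position; the
 * half-pel case (offset == 4) never reaches this branch and is handled
 * above with plain _mm256_avg_epu8 averaging. */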
+ filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + /* average between current and next stride source */ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg, src_avg, src_temp; + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + src_temp = _mm256_avg_epu8(src_avg, src_temp); + LOAD_DST_INSERT + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_temp, zero_reg) + // save current source average + src_avg = src_next_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + src_avg = _mm256_avg_epu8(src_avg, src_next_reg); + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = 4 and y_offset = bilin interpolation + __m256i filter, pw8, src_next_reg, src_avg, src_temp; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_temp) + // save current source average + src_avg = src_next_reg; + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_next_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, 
dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(filter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = bilin interpolation and y_offset = bilin interpolation + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + MERGE_WITH_SRC(src_pack, src_next_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(xfilter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + MERGE_WITH_SRC(src_pack, src_next_reg) + FILTER_SRC(yfilter) + CALC_SUM_SSE_INSIDE_LOOP + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, diff --git a/media/libaom/src/aom_dsp/x86/variance_sse2.c 
b/media/libaom/src/aom_dsp/x86/variance_sse2.c index 3c37e77c06..4e2b5a1aa0 100644 --- a/media/libaom/src/aom_dsp/x86/variance_sse2.c +++ b/media/libaom/src/aom_dsp/x86/variance_sse2.c @@ -21,9 +21,10 @@ #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" +#include "av1/encoder/reconinter_enc.h" unsigned int aom_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); @@ -144,6 +145,7 @@ static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, __m128i *const sum) { assert(h <= 128); // May overflow for larger height. *sum = _mm_setzero_si128(); + *sse = _mm_setzero_si128(); for (int i = 0; i < h; i++) { const __m128i s = load8_8to16_sse2(src); const __m128i r = load8_8to16_sse2(ref); @@ -236,6 +238,14 @@ static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, } } +void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + #define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ unsigned int aom_variance##bw##x##bh##_sse2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ @@ -494,88 +504,36 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. 
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. - const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); - + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); return; } } - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS; + const InterpFilterParams *filter = av1_get_filter(subpel_search); + // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for + // 2-tap yet. + int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { @@ -638,20 +596,13 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); - uint8_t *temp_start_horiz = - (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; + uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                  : temp;
   uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
   int intermediate_height =
       (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
   assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-  // TODO(Deepa): Remove the memset below when we have
-  // 4 tap simd for sse2 and ssse3.
-  if (subpel_search == 1) {
-    memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
-    memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
-    memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
-    memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
-  }
   aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
                       kernel_x, 16, NULL, -1, width, intermediate_height);
   aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
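To make the AVX2 control flow above easier to follow, here is a minimal scalar sketch of what the 16-wide sub-pixel variance path computes. It is not libaom code: the helper names are hypothetical, and the (16 - 2*offset, 2*offset) taps with round-by-8 / shift-by-4 mirror the 4-bit bilinear convention implied by pw8 and FILTER_SRC. x_offset/y_offset are the same 1/8-pel offsets; offset 4 reduces to the (a + b + 1) >> 1 average that the _mm256_avg_epu8 branches use.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical 2-tap bilinear filter: (f0*a + f1*b + 8) >> 4 with f0+f1 = 16. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, int offset) {
  const int f0 = 16 - 2 * offset, f1 = 2 * offset;
  return (uint8_t)((f0 * a + f1 * b + 8) >> 4);
}

/* Scalar reference for a 16-wide block: horizontal pass, vertical pass,
 * then sum/SSE accumulation against dst (what CALC_SUM_SSE_INSIDE_LOOP and
 * CALC_SUM_AND_SSE fold into the AVX2 loops).  Like the SIMD code, it reads
 * one extra source row and column when an offset is non-zero. */
static unsigned int subpel_variance16_ref(const uint8_t *src, int src_stride,
                                          int x_offset, int y_offset,
                                          const uint8_t *dst, int dst_stride,
                                          int height, unsigned int *sse) {
  enum { W = 16 };
  uint8_t *temp = (uint8_t *)malloc((size_t)(height + 1) * W);
  for (int r = 0; r < height + 1; ++r) {
    for (int c = 0; c < W; ++c) {
      temp[r * W + c] =
          x_offset ? bilinear_tap(src[r * src_stride + c],
                                  src[r * src_stride + c + 1], x_offset)
                   : src[r * src_stride + c];
    }
  }
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < W; ++c) {
      const uint8_t p = y_offset ? bilinear_tap(temp[r * W + c],
                                                temp[(r + 1) * W + c], y_offset)
                                 : temp[r * W + c];
      const int diff = p - dst[r * dst_stride + c];
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
  }
  free(temp);
  *sse = (unsigned int)sse64;
  /* aom-style return value: SSE - sum^2 / N for an N-pixel block. */
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (W * height)));
}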
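The variance_sse2.c hunk above also zero-initializes *sse inside variance8_sse2 (previously only *sum was cleared) and adds the aom_get8x8var_sse2 entry point on top of it. A hypothetical caller that turns the returned (SSE, sum) pair into the usual aom-style variance value for one 8x8 block might look like this; the wrapper name is illustrative, not part of the diff.

#include <stdint.h>

/* Prototype matching the aom_get8x8var_sse2 helper added in the hunk above. */
void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum);

/* Hypothetical wrapper: variance = SSE - sum^2 / 64; the >> 6 is the
 * divide-by-64 used for an 8x8 block, as in the SSE2 variance macros. */
static unsigned int var8x8_from_get8x8var(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  int sum;
  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, &sse, &sum);
  return sse - (unsigned int)(((int64_t)sum * sum) >> 6);
}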