diff options
author | Moonchild <moonchild@palemoon.org> | 2021-03-03 18:48:48 +0000 |
---|---|---|
committer | Moonchild <moonchild@palemoon.org> | 2021-03-04 00:03:46 +0000 |
commit | d192e6808fab971ffbe8c15ff3695c7d08b280c6 (patch) | |
tree | 9d9cc4d21c93ae3e1a88ab5c160c3be5f6af0ca9 /media/libaom/src/av1/common/arm/selfguided_neon.c | |
parent | c81c0395a36b0a6b21a6a0ae7b31953fe4a460d9 (diff) | |
download | uxp-d192e6808fab971ffbe8c15ff3695c7d08b280c6.tar.gz |
Issue #1737 - Import libaom 2.0.2 source
Diffstat (limited to 'media/libaom/src/av1/common/arm/selfguided_neon.c')
-rw-r--r-- | media/libaom/src/av1/common/arm/selfguided_neon.c | 140 |
1 file changed, 111 insertions, 29 deletions
diff --git a/media/libaom/src/av1/common/arm/selfguided_neon.c b/media/libaom/src/av1/common/arm/selfguided_neon.c index b3a37c4cb8..fc404a64ab 100644 --- a/media/libaom/src/av1/common/arm/selfguided_neon.c +++ b/media/libaom/src/av1/common/arm/selfguided_neon.c @@ -19,8 +19,8 @@ #include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/common.h" -#include "av1/common/onyxc_int.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/arm/mem_neon.h" @@ -86,7 +86,7 @@ static INLINE void calc_ab_fast_internal_common( for (int x = 0; x < 4; x++) { for (int y = 0; y < 4; y++) { - dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); @@ -214,7 +214,7 @@ static INLINE void calc_ab_internal_common( for (int x = 0; x < 4; x++) { for (int y = 0; y < 8; y++) { - dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); @@ -376,6 +376,21 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, w -= 8; count++; } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } } { @@ -467,7 +482,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); - const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; @@ -509,6 +524,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, } while (h > 0); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, @@ -522,7 +538,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); - const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -573,6 +589,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, h -= (ht_inc * 4); } while (h > 0); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, @@ -584,7 +601,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); - const uint32x4_t 
one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -626,6 +643,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, } while (h > 0); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, @@ -638,7 +656,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); - const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -679,6 +697,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, h -= (ht_inc * 4); } while (h > 0); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, int32_t *dst2, const int dst_stride, const int width, @@ -788,6 +807,21 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, w -= 8; count++; } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } } { @@ -1145,7 +1179,7 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -1181,17 +1215,25 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. - if (8 == bit_depth) { - calc_ab_fast_internal_lbd( - (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), - (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, - params->s[radius_idx], 2); - } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { calc_ab_fast_internal_hbd( (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); } @@ -1200,7 +1242,7 @@ static INLINE void 
restoration_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -1235,19 +1277,27 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; +#if CONFIG_AV1_HIGHBITDEPTH // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. - if (8 == bit_depth) { - calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, - height + 2, r, params->s[radius_idx], 1); + height + 2, bit_depth, r, params->s[radius_idx], 1); } else { - calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, - height + 2, bit_depth, r, params->s[radius_idx], 1); + height + 2, r, params->s[radius_idx], 1); } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); } @@ -1299,8 +1349,14 @@ static INLINE void src_convert_u8_to_u16(const uint8_t *src, dst_ptr[y + x * dst_stride] = 
src_ptr[y + x * src_stride]; } } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, uint16_t *dst, const int dst_stride, int width, int height) { @@ -1339,13 +1395,18 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), sizeof(uint16_t) * width); } + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } +#endif // CONFIG_AV1_HIGHBITDEPTH int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, int stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; assert(!(params->r[0] == 0 && params->r[1] == 0)); uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; @@ -1356,6 +1417,7 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( @@ -1370,6 +1432,13 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + 
dgd16_stride, width_ext, height_ext); +#endif if (params->r[0] > 0) restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, @@ -1380,11 +1449,11 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, return 0; } -void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -1395,11 +1464,12 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; assert(!(params->r[0] == 0 && params->r[1] == 0)); +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( @@ -1414,7 +1484,13 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } - +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif if (params->r[0] > 0) restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, bit_depth, eps, 0); @@ -1422,7 +1498,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, 
restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, bit_depth, eps, 1); - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); { int16_t *src_ptr; @@ -1485,6 +1561,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { r4 = vminq_u16(r4, max); vst1q_u16(dst16_ptr, r4); @@ -1492,6 +1569,11 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, t0 = vqmovn_u16(r4); vst1_u8(dst_ptr, t0); } +#else + (void)max; + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); +#endif w -= 8; count += 8; dst_ptr += 8; |