diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /media/libvpx/vpx_dsp/x86 | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | uxp-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz |
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/libvpx/vpx_dsp/x86')
-rw-r--r-- | media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 289 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 365 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm | 313 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 245 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad4d_avx2.c | 168 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad4d_sse2.asm | 233 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_avx2.c | 181 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_mmx.asm | 427 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_sse2.asm | 269 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_sse3.asm | 374 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_sse4.asm | 359 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_ssse3.asm | 370 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/variance_avx2.c | 93 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/variance_impl_avx2.c | 215 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm | 424 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/variance_mmx.c | 107 | ||||
-rw-r--r-- | media/libvpx/vpx_dsp/x86/variance_sse2.c | 309 |
17 files changed, 4741 insertions, 0 deletions
diff --git a/media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 0000000000..95cc4372ec --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,289 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 
+ pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, 
m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 or 4x4 +%macro HIGH_SADNXN4D 2 +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. 
XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 diff --git a/media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 0000000000..4d422dde3a --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,365 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 
+ movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 + + +; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, 
m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 + +; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 
+ RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 + + +; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 diff --git a/media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 
0000000000..923418a992 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,313 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vpx_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vpx_highbd_calc16x16var_sse2) PRIVATE +sym(vpx_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, 
[rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vpx_highbd_calc8x8var_sse2) PRIVATE +sym(vpx_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR 
arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + 
psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 0000000000..343c0478b9 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" +/* + * Signature shared by the fixed-size block kernels declared below. + * src/ref point at 16-bit samples and the strides are counted in samples + * (the assembly implementations double them to obtain byte strides). + * The kernel stores the block's sum of squared differences through sse and + * the sum of differences through sum. The uint32_t return value is not + * used by any caller in this file. + */ +typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); +/* 8x8 kernel, implemented in assembly (highbd_variance_impl_sse2.asm). */ +uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); +/* 16x16 kernel, implemented in assembly (highbd_variance_impl_sse2.asm). */ +uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); +/* + * Accumulates SSE and sum over a w x h area by calling var_fn once per + * block_size x block_size tile. The totals are written to *sse and *sum + * without any rescaling; both outputs are reset to zero first. + * NOTE(review): the loops step by block_size, so w and h are presumably + * multiples of block_size - confirm against the callers. + */ +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + 
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + + +#define HIGH_GET_VAR(S) \ +void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = 
ROUND_POWER_OF_TWO(*sse, 4); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ +} + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ +uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} + +VAR_FN(64, 64, 16, 12); 
+VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +/* In every instantiation above the last argument equals log2(w * h). */ +#undef VAR_FN +/* + * 16x16 sum of squared errors over 16-bit samples, computed with + * highbd_8_variance_sse2 (no rescaling). The value is stored through sse + * and also returned; the block sum computed alongside is discarded. + */ +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} +/* + * 16x16 sum of squared errors computed with highbd_10_variance_sse2, which + * rounds the SSE by 4 bits before storing it. Stored through sse and + * returned; the block sum is discarded. + */ +unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} +/* + * 16x16 sum of squared errors computed with highbd_12_variance_sse2, which + * rounds the SSE by 8 bits before storing it. Stored through sse and + * returned; the block sum is discarded. + */ +unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} +/* + * 8x8 counterpart of vpx_highbd_8_mse16x16_sse2: a single call to the 8x8 + * kernel through highbd_8_variance_sse2, no rescaling. + */ +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} +/* + * 8x8 sum of squared errors computed with highbd_10_variance_sse2 (SSE + * rounded by 4 bits). Stored through sse and returned. + */ +unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + 
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} +/* + * 8x8 sum of squared errors computed with highbd_12_variance_sse2 (SSE + * rounded by 8 bits). Stored through sse and returned. + */ +unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} diff --git a/media/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/vpx_dsp/x86/sad4d_avx2.c new file mode 100644 index 0000000000..793658f9ea --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +/* + * Computes the sum of absolute differences between one 32x32 block at src + * and four 32x32 reference blocks ref[0..3], all read with unaligned + * 32-byte loads, one row per loop iteration. res[k] receives the SAD + * against ref[k]; the four results are written with one unaligned 16-byte + * store. + */ +void vpx_sad32x32x4d_avx2(const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32 ; i++) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + // each 64-bit lane of sum_ref-i holds its partial sum in the low 4 bytes; + // the upper 4 bytes of the lane are zero. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} + +void vpx_sad64x64x4d_avx2(const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64 ; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); + ref3_reg = _mm256_loadu_si256((const __m256i 
*)ref3); + ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} diff --git a/media/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/vpx_dsp/x86/sad4d_sse2.asm new file mode 100644 index 0000000000..0f7fb93d47 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,233 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m6, [ref1q+%5] + punpckldq m4, [ref2q+%5] + punpckldq m7, [ref3q+%5] + punpckldq m5, [ref4q+%5] + psadbw m6, m0 + psadbw m4, m0 + psadbw m7, m0 + psadbw m5, m0 + punpckldq m6, m4 + punpckldq m7, m5 +%else + movd m1, [ref1q+%3] + movd m2, [ref2q+%3] + movd m3, [ref3q+%3] + movd m4, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m1, [ref1q+%5] + punpckldq m2, [ref2q+%5] + punpckldq m3, [ref3q+%5] + punpckldq m4, [ref4q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + punpckldq m1, m2 + punpckldq m3, m4 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + 
lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov 
ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if mmsize == 16 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 + +INIT_MMX sse +SADNXN4D 4, 8 +SADNXN4D 4, 4 diff --git a/media/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/vpx_dsp/x86/sad_avx2.c new file mode 100644 index 0000000000..ce9ad8f780 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_avx2.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <immintrin.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +#define FSAD64_H(h) \ +unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0 ; i < h ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref_stride; \ + src_ptr+= src_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSAD32_H(h) \ +unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0 ; i < max ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = 
_mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref2_stride; \ + src_ptr+= src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSAD64 \ +FSAD64_H(64); \ +FSAD64_H(32); + +#define FSAD32 \ +FSAD32_H(64); \ +FSAD32_H(32); \ +FSAD32_H(16); + +FSAD64; +FSAD32; + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H + +#define FSADAVG64_H(h) \ +unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0 ; i < h ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref_stride; \ + src_ptr+= src_stride; \ + second_pred+= 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define 
FSADAVG32_H(h) \ +unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0 ; i < max ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = _mm256_add_epi32(sum_sad, \ + _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref2_stride; \ + src_ptr+= src2_stride; \ + second_pred+= 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSADAVG64 \ +FSADAVG64_H(64); \ +FSADAVG64_H(32); + +#define FSADAVG32 \ +FSADAVG32_H(64); \ +FSADAVG32_H(32); \ +FSADAVG32_H(16); + +FSADAVG64; +FSADAVG32; + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/media/libvpx/vpx_dsp/x86/sad_mmx.asm b/media/libvpx/vpx_dsp/x86/sad_mmx.asm new file mode 100644 index 0000000000..9968992bd1 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_mmx.asm @@ -0,0 +1,427 @@ +; +; Copyright (c) 2010 The WebM project authors. 
All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vpx_sad16x16_mmx) PRIVATE +global sym(vpx_sad8x16_mmx) PRIVATE +global sym(vpx_sad8x8_mmx) PRIVATE +global sym(vpx_sad4x4_mmx) PRIVATE +global sym(vpx_sad16x8_mmx) PRIVATE + +;unsigned int vpx_sad16x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x16x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + + paddw mm0, mm2 + paddw mm1, mm3 + + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + paddw mm7, mm1 + + cmp rsi, rcx + jne .x16x16sad_mmx_loop + + + movq mm0, mm7 + + punpcklwd mm0, mm6 + punpckhwd mm7, mm6 + + paddw mm0, mm7 + movq mm7, mm0 + + + psrlq mm0, 32 + paddw mm7, mm0 + + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad8x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) 
+sym(vpx_sad8x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x8x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + paddw mm7, mm2 + cmp rsi, rcx + + jne .x8x16sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad8x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x8x8sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + paddw mm0, mm2 + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + cmp rsi, rcx + + jne .x8x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad4x4_mmx( +; unsigned char *src_ptr, +; int src_stride, +; 
unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rdi] + + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + pxor mm3, mm3 + + punpcklbw mm0, mm3 + punpckhbw mm2, mm3 + + paddw mm0, mm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm4, DWORD PTR [rsi] + movd mm5, DWORD PTR [rdi] + + movd mm6, DWORD PTR [rsi+rax] + movd mm7, DWORD PTR [rdi+rdx] + + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + + movq mm6, mm4 + psubusb mm4, mm5 + + psubusb mm5, mm6 + por mm4, mm5 + + movq mm5, mm4 + punpcklbw mm4, mm3 + + punpckhbw mm5, mm3 + paddw mm4, mm5 + + paddw mm0, mm4 + movq mm1, mm0 + + punpcklwd mm0, mm3 + punpckhwd mm1, mm3 + + paddw mm0, mm1 + movq mm1, mm0 + + psrlq mm0, 32 + paddw mm0, mm1 + + movq rax, mm0 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad16x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad16x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x16x8sad_mmx_loop: + + movq mm0, [rsi] + movq mm1, [rdi] + + movq mm2, [rsi+8] + movq mm3, [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, 
mm6 + punpckhbw mm1, mm6 + + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + + + paddw mm0, mm2 + paddw mm1, mm3 + + paddw mm0, mm1 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne .x16x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/vpx_dsp/x86/sad_sse2.asm new file mode 100644 index 0000000000..c6a829dc21 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_sse2.asm @@ -0,0 +1,269 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 + +; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + 
mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 + +; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 + +; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int 
src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 + +; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m3, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movd m2, [srcq] + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m6, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m6 + psadbw m1, m2 + psadbw m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movd eax, m0 + RET +%endmacro + +INIT_MMX sse +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse diff --git a/media/libvpx/vpx_dsp/x86/sad_sse3.asm b/media/libvpx/vpx_dsp/x86/sad_sse3.asm new file mode 100644 index 0000000000..18279bdb9d --- /dev/null +++ 
b/media/libvpx/vpx_dsp/x86/sad_sse3.asm @@ -0,0 +1,374 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +%macro PROCESS_16X2X3 5 +%if %1==0 + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm5, XMMWORD PTR [%3] + lddqu xmm6, XMMWORD PTR [%3+1] + lddqu xmm7, XMMWORD PTR [%3+2] + + psadbw xmm5, xmm0 + psadbw xmm6, 
xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm1, XMMWORD PTR [%3] + lddqu xmm2, XMMWORD PTR [%3+1] + lddqu xmm3, XMMWORD PTR [%3+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [%2+%4] + lddqu xmm1, XMMWORD PTR [%3+%5] + lddqu xmm2, XMMWORD PTR [%3+%5+1] + lddqu xmm3, XMMWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_8X2X3 5 +%if %1==0 + movq mm0, QWORD PTR [%2] + movq mm5, QWORD PTR [%3] + movq mm6, QWORD PTR [%3+1] + movq mm7, QWORD PTR [%3+2] + + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, QWORD PTR [%2] + movq mm1, QWORD PTR [%3] + movq mm2, QWORD PTR [%3+1] + movq mm3, QWORD PTR [%3+2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endif + movq mm0, QWORD PTR [%2+%4] + movq mm1, QWORD PTR [%3+%5] + movq mm2, QWORD PTR [%3+%5+1] + movq mm3, QWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endmacro + +;void int vpx_sad16x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x16x3_sse3) PRIVATE +sym(vpx_sad16x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + 
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad16x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x8x3_sse3) PRIVATE +sym(vpx_sad16x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad8x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad8x16x3_sse3) PRIVATE +sym(vpx_sad8x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + 
STACK_FRAME_DESTROY_X3 + +;void int vpx_sad8x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad8x8x3_sse3) PRIVATE +sym(vpx_sad8x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad4x4x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad4x4x3_sse3) PRIVATE +sym(vpx_sad4x4x3_sse3): + + STACK_FRAME_CREATE_X3 + + movd mm0, DWORD PTR [src_ptr] + movd mm1, DWORD PTR [ref_ptr] + + movd mm2, DWORD PTR [src_ptr+src_stride] + movd mm3, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, DWORD PTR [ref_ptr+1] + movd mm5, DWORD PTR [ref_ptr+2] + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm3, DWORD PTR [ref_ptr+ref_stride+2] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + psadbw mm4, mm0 + psadbw mm5, mm0 + + lea src_ptr, [src_ptr+src_stride*2] + lea ref_ptr, [ref_ptr+ref_stride*2] + + movd mm0, DWORD PTR [src_ptr] + movd mm2, DWORD PTR [ref_ptr] + + movd mm3, DWORD PTR [src_ptr+src_stride] + movd mm6, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm6 + + movd mm3, DWORD PTR [ref_ptr+1] + movd mm7, DWORD PTR [ref_ptr+2] + + psadbw mm2, mm0 + + paddw mm1, mm2 + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm6, DWORD PTR [ref_ptr+ref_stride+2] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm6 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + paddw mm3, mm4 + paddw mm7, mm5 + + mov rcx, result_ptr + + punpckldq mm1, mm3 + + movq [rcx], mm1 + movd [rcx+8], mm7 + + 
STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/vpx_dsp/x86/sad_sse4.asm b/media/libvpx/vpx_dsp/x86/sad_sse4.asm new file mode 100644 index 0000000000..bc67447971 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_sse4.asm @@ -0,0 +1,359 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X8 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm1, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endif + movdqa xmm0, XMMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + movq xmm2, MMWORD PTR [rdi+ rdx+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + 
paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endmacro +; PROCESS_8X2X8 first: accumulates two 8-byte source rows into the eight word sums in xmm1; when the argument is non-zero the first row overwrites xmm1 instead of adding to it. +%macro PROCESS_8X2X8 1 +%if %1 + movq xmm0, MMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 ; source bytes 0-3 + mpsadbw xmm2, xmm0, 0x5 ; source bytes 4-7, reference window shifted by 4 + paddw xmm1, xmm2 +%else + movq xmm0, MMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endif + movq xmm0, MMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endmacro +; PROCESS_4X2X8 first: same as PROCESS_8X2X8 but for 4-byte source rows, so a single mpsadbw per row is enough. +%macro PROCESS_4X2X8 1 +%if %1 + movd xmm0, [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + mpsadbw xmm1, xmm0, 0x0 +%else + movd xmm0, [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endif + movd xmm0, [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endmacro +; WRITE_AS_INTS: zero-extends the eight 16-bit sums in xmm1 to 32 bits and stores all eight (32 bytes) at arg(4). +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results + pxor xmm0, xmm0 ; zero, supplies the upper 16 bits of every result + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 ; results 0-3; movdqa needs a 16-byte aligned destination + movdqa [rdi + 16], xmm2 ; results 4-7 +%endmacro + +;void vpx_sad16x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array); eight 32-bit results, 16-byte aligned (see WRITE_AS_INTS) +global sym(vpx_sad16x16x8_sse4_1) PRIVATE +sym(vpx_sad16x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) 
;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad16x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array ; eight 32-bit results, 16-byte aligned (see WRITE_AS_INTS) +;); +global sym(vpx_sad16x8x8_sse4_1) PRIVATE +sym(vpx_sad16x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad8x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array ; eight 32-bit results, 16-byte aligned (see WRITE_AS_INTS) +;); +global sym(vpx_sad8x8x8_sse4_1) PRIVATE +sym(vpx_sad8x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad8x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array ; eight 32-bit results, 16-byte aligned (see WRITE_AS_INTS) +;); +global sym(vpx_sad8x16x8_sse4_1) PRIVATE +sym(vpx_sad8x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + 
+ movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad4x4x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array ; eight 32-bit results, 16-byte aligned (see WRITE_AS_INTS) +;); +global sym(vpx_sad4x4x8_sse4_1) PRIVATE +sym(vpx_sad4x4x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + + diff --git a/media/libvpx/vpx_dsp/x86/sad_ssse3.asm b/media/libvpx/vpx_dsp/x86/sad_ssse3.asm new file mode 100644 index 0000000000..49f204fa04 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_ssse3.asm @@ -0,0 +1,370 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm5, XMMWORD PTR [rdi] + lddqu xmm6, XMMWORD PTR [rdi+1] + lddqu xmm7, XMMWORD PTR [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rdi] + lddqu xmm2, XMMWORD PTR [rdi+1] + lddqu xmm3, XMMWORD PTR [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rdi+rdx] + lddqu xmm2, XMMWORD PTR [rdi+rdx+1] + lddqu xmm3, XMMWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X2X3_OFFSET 2 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm7, XMMWORD PTR [rdi+16] + + movdqa xmm5, xmm7 + palignr xmm5, xmm4, %2 + + movdqa xmm6, xmm7 + palignr xmm6, xmm4, (%2+1) + + palignr xmm7, xmm4, (%2+2) + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm3, XMMWORD PTR [rdi+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + movdqa xmm4, XMMWORD PTR [rdi+rdx] + movdqa xmm3, XMMWORD PTR [rdi+rdx+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 
+%endmacro + +%macro PROCESS_16X16X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +%macro PROCESS_16X8X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +;void int vpx_sad16x16x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x16x3_ssse3) PRIVATE +sym(vpx_sad16x16x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .vpx_sad16x16x3_ssse3_skiptable +.vpx_sad16x16x3_ssse3_jumptable: + dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_12 - 
.vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump +.vpx_sad16x16x3_ssse3_skiptable: + + call .vpx_sad16x16x3_ssse3_do_jump +.vpx_sad16x16x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 + +.vpx_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.vpx_sad16x16x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; 
begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void int vpx_sad16x8x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x8x3_ssse3) PRIVATE +sym(vpx_sad16x8x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .vpx_sad16x8x3_ssse3_skiptable +.vpx_sad16x8x3_ssse3_jumptable: + dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump +.vpx_sad16x8x3_ssse3_skiptable: + + call .vpx_sad16x8x3_ssse3_do_jump +.vpx_sad16x8x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable + + movsxd 
rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 + +.vpx_sad16x8x3_ssse3_aligned_by_15: + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.vpx_sad16x8x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/vpx_dsp/x86/variance_avx2.c new file mode 100644 index 0000000000..82cef4af0a --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_avx2.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_dsp_rtcd.h" +// Tile kernel signature: writes the sum of squared differences to *sse and the signed sum of differences to *sum. +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); +// Accumulates *sse and *sum over a w x h block by calling var_fn on tiles of block_size columns by 16 rows. +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { // every var_fn call is expected to consume 16 rows + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +// The variance entry points return sse - sum^2 / (w * h); the shift count is log2(w * h). +unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); // |sum| <= 255 * 256, so sum * sum < 2^32 and the 32-bit product cannot wrap +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; // filled by the kernel but not used: MSE is the raw SSE + vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; +} + +unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 
10); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); +} diff --git a/media/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/media/libvpx/vpx_dsp/x86/variance_impl_avx2.c new file mode 100644 index 0000000000..0e40959aa9 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 + +#include "./vpx_dsp_rtcd.h" +// Writes to *SSE the sum of squared differences and to *Sum the sum of differences (src - ref) of a 16x16 block. +void vpx_get16x16var_avx2(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + int i, src_2strides, ref_2strides; // signed: an unsigned step would move the pointers ~4 GiB forward for negative strides + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing two strides in a 256 bit register reducing the number + // of loop stride by half (comparing to the sse2 code) + src_2strides = source_stride * 2; + ref_2strides = recon_stride * 2; + for (i = 0; i < 8; i++) { + src = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i const *) (src_ptr))); + src = _mm256_inserti128_si256(src, + _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1); + + ref =_mm256_castsi128_si256( + _mm_loadu_si128((__m128i const *) (ref_ptr))); + ref = _mm256_inserti128_si256(ref, + _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = _mm256_add_epi32(madd_ref_src, + _mm256_add_epi32(madd_low, 
madd_high)); + + src_ptr+= src_2strides; + ref_ptr+= ref_2strides; + } + + { + __m128i sum_res, madd_res; + __m128i expand_sum_low, expand_sum_high, expand_sum; + __m128i expand_madd_low, expand_madd_high, expand_madd; + __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // extract the low lane and add it to the high lane + sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), + _mm256_extractf128_si256(sum_ref_src, 1)); + + madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), + _mm256_extractf128_si256(madd_ref_src, 1)); + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), + sum_res); + expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), + sum_res); + + // shifting the sign 16 bits right + expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); + + expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm_unpacklo_epi32(madd_res, + _mm256_castsi256_si128(zero_reg)); + expand_madd_high = _mm_unpackhi_epi32(madd_res, + _mm256_castsi256_si128(zero_reg)); + + expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum, + _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum, + _mm256_castsi256_si128(zero_reg)); + + ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift 8 bytes right + madd_res = _mm_srli_si128(expand_madd, 8); + sum_res = _mm_srli_si128(ex_expand_sum, 8); + + madd_res = _mm_add_epi32(madd_res, expand_madd); + sum_res = _mm_add_epi32(sum_res, ex_expand_sum); + + *((int*)SSE)= _mm_cvtsi128_si32(madd_res); + + *((int*)Sum)= _mm_cvtsi128_si32(sum_res); + } +} +// NOTE: despite its name this kernel covers 32 columns by 16 rows per call (its row loop runs 16 times). +void vpx_get32x32var_avx2(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int 
recon_stride, + unsigned int *SSE, + int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing 32 elements in parallel + for (i = 0; i < 16; i++) { + src = _mm256_loadu_si256((__m256i const *) (src_ptr)); + + ref = _mm256_loadu_si256((__m256i const *) (ref_ptr)); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = _mm256_add_epi32(madd_ref_src, + _mm256_add_epi32(madd_low, madd_high)); + + src_ptr+= source_stride; + ref_ptr+= recon_stride; + } + + { + __m256i expand_sum_low, expand_sum_high, expand_sum; + __m256i expand_madd_low, expand_madd_high, expand_madd; + __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); + expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); + + // shifting the sign 16 bits right + expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); + + expand_sum = 
_mm256_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); + expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); + + expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); + ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); + + ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift each 128-bit lane right by 8 bytes + madd_ref_src = _mm256_srli_si256(expand_madd, 8); + sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); + + madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); + sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); + + // extract the low lane and the high lane and add the results + *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); + + *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); + } +} diff --git a/media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm b/media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm new file mode 100644 index 0000000000..a8d7d99dbc --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm @@ -0,0 +1,424 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
;


%include "vpx_ports/x86_abi_support.asm"

; MMX_VAR_ROW_8
; Folds one row of eight pixels into the running accumulators.
;   in:       rax = source row, rbx = reference row,
;             rcx = source stride, rdx = reference stride, mm6 = 0
;   out:      mm5 += (src - ref), four signed 16-bit lanes
;             mm7 += (src - ref)^2, two 32-bit lanes
;             rax += rcx, rbx += rdx
;   clobbers: mm0-mm3
%macro MMX_VAR_ROW_8 0
        movq        mm0,        [rax]           ; eight source bytes
        movq        mm1,        [rbx]           ; eight reference bytes
        movq        mm2,        mm0             ; copies for the high halves
        movq        mm3,        mm1

        punpcklbw   mm0,        mm6             ; low four bytes -> words
        punpcklbw   mm1,        mm6
        punpckhbw   mm2,        mm6             ; high four bytes -> words
        punpckhbw   mm3,        mm6
        psubsw      mm0,        mm1             ; src - ref (low half)
        psubsw      mm2,        mm3             ; src - ref (high half)

        paddw       mm5,        mm0             ; accumulate differences
        paddw       mm5,        mm2

        pmaddwd     mm0,        mm0             ; square, add adjacent pairs
        pmaddwd     mm2,        mm2
        add         rbx,        rdx             ; next reference row
        add         rax,        rcx             ; next source row
        paddd       mm7,        mm0             ; accumulate squares
        paddd       mm7,        mm2
%endmacro

; MMX_VAR_ROW_4
; Same as MMX_VAR_ROW_8 for a row of four pixels (only the low four bytes
; of each row are read).
;   clobbers: mm0, mm1
%macro MMX_VAR_ROW_4 0
        movd        mm0,        [rax]           ; four source bytes
        movd        mm1,        [rbx]           ; four reference bytes
        punpcklbw   mm0,        mm6             ; bytes -> words
        punpcklbw   mm1,        mm6
        psubsw      mm0,        mm1             ; src - ref
        paddw       mm5,        mm0             ; accumulate differences
        pmaddwd     mm0,        mm0             ; square, add adjacent pairs
        add         rbx,        rdx             ; next reference row
        add         rax,        rcx             ; next source row
        paddd       mm7,        mm0             ; accumulate squares
%endmacro

; MMX_VAR_STORE_RESULTS
; Reduces mm5 (four signed words) to *Sum and mm7 (two dwords) to *SSE,
; using the 16 bytes of scratch space at [rsp], then zeroes rax.
;   in:       mm5, mm7, arg(4) = SSE pointer, arg(5) = Sum pointer
;   clobbers: rax, rbx, rcx, rdx, rsi, rdi
%macro MMX_VAR_STORE_RESULTS 0
        movq        QWORD PTR [rsp+8], mm5      ; spill difference lanes
        movq        QWORD PTR [rsp], mm7        ; spill squared-difference lanes
        movsx       rdx,        WORD PTR [rsp+8]
        movsx       rcx,        WORD PTR [rsp+10]
        movsx       rbx,        WORD PTR [rsp+12]
        movsx       rax,        WORD PTR [rsp+14]
        add         rdx,        rcx
        add         rbx,        rax
        add         rdx,        rbx             ;XSum
        movsxd      rax,        DWORD PTR [rsp]
        movsxd      rcx,        DWORD PTR [rsp+4]
        add         rax,        rcx             ;XXSum
        mov         rsi,        arg(4)          ;SSE
        mov         rdi,        arg(5)          ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax,        rax             ; return 0
%endmacro

;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of the 256 consecutive 16-bit values at
; src_ptr (16 iterations of 16 values each).
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16                     ; iteration counter
        pxor        mm4, mm4                    ; clear the accumulator

.NEXTROW:
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec does not write CF, so branch on ZF alone instead of using
        ; ja, which would also consult the carry left behind by the add.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Writes the sum of (src - ref) over an 8x8 block to *Sum and the sum of
; (src - ref)^2 to *SSE.
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16
    ; end prolog


        pxor        mm5, mm5                    ; clear the difference accumulator
        pxor        mm6, mm6                    ; zero, used for byte -> word unpacking
        pxor        mm7, mm7                    ; clear the squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-8
%rep 8
        MMX_VAR_ROW_8
%endrep

        ; Now accumulate the final results.
        MMX_VAR_STORE_RESULTS


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Writes the sum of (src - ref) over a 4x4 block to *Sum and the sum of
; (src - ref)^2 to *SSE.
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16
    ; end prolog


        pxor        mm5, mm5                    ; clear the difference accumulator
        pxor        mm6, mm6                    ; zero, used for byte -> word unpacking
        pxor        mm7, mm7                    ; clear the squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-4
%rep 4
        MMX_VAR_ROW_4
%endrep

        ; Now accumulate the final results.
        MMX_VAR_STORE_RESULTS


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
diff --git a/media/libvpx/vpx_dsp/x86/variance_mmx.c b/media/libvpx/vpx_dsp/x86/variance_mmx.c new file mode 100644 index 0000000000..99dd741bca --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_mmx.c @@ -0,0 +1,107 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" + +extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 4)); +} + +unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 6)); +} + +unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + *sse = var; + return var; +} + +unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + 
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 8)); +} + +unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 7)); +} + +unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 7)); +} diff --git a/media/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/vpx_dsp/x86/variance_sse2.c new file mode 100644 index 0000000000..6256bc5362 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_sse2.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" + +typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +static void get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); +} + +void vpx_get8x8var_sse2(const 
uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + +void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = 
_mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + getNxMvar_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 4); +} + +unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 
8, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 6); +} + +unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + 
variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} |