diff options
Diffstat (limited to 'media/ffvpx/libavcodec/x86/h264_intrapred.asm')
-rw-r--r-- | media/ffvpx/libavcodec/x86/h264_intrapred.asm | 743 |
1 files changed, 15 insertions, 728 deletions
diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred.asm b/media/ffvpx/libavcodec/x86/h264_intrapred.asm index f3aa3172f0..8a38ba2bb5 100644 --- a/media/ffvpx/libavcodec/x86/h264_intrapred.asm +++ b/media/ffvpx/libavcodec/x86/h264_intrapred.asm @@ -42,32 +42,12 @@ SECTION .text cextern pb_1 cextern pb_3 cextern pw_4 -cextern pw_5 cextern pw_8 -cextern pw_16 -cextern pw_17 -cextern pw_32 ;----------------------------------------------------------------------------- ; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- -INIT_MMX mmx -cglobal pred16x16_vertical_8, 2,3 - sub r0, r1 - mov r2, 8 - movq mm0, [r0+0] - movq mm1, [r0+8] -.loop: - movq [r0+r1*1+0], mm0 - movq [r0+r1*1+8], mm1 - movq [r0+r1*2+0], mm0 - movq [r0+r1*2+8], mm1 - lea r0, [r0+r1*2] - dec r2 - jg .loop - REP_RET - INIT_XMM sse cglobal pred16x16_vertical_8, 2,3 sub r0, r1 @@ -82,7 +62,7 @@ cglobal pred16x16_vertical_8, 2,3 lea r0, [r0+r1*2] dec r2 jg .loop - REP_RET + RET ;----------------------------------------------------------------------------- ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride) @@ -115,11 +95,9 @@ cglobal pred16x16_horizontal_8, 2,3 lea r0, [r0+r1*2] dec r2 jg .loop - REP_RET + RET %endmacro -INIT_MMX mmx -PRED16x16_H INIT_MMX mmxext PRED16x16_H INIT_XMM ssse3 @@ -158,14 +136,6 @@ cglobal pred16x16_dc_8, 2,7 %endif SPLATB_REG m0, r2, m1 -%if mmsize==8 - mov r3d, 8 -.loop: - mova [r4+r1*0+0], m0 - mova [r4+r1*0+8], m0 - mova [r4+r1*1+0], m0 - mova [r4+r1*1+8], m0 -%else mov r3d, 4 .loop: mova [r4+r1*0], m0 @@ -173,15 +143,12 @@ cglobal pred16x16_dc_8, 2,7 lea r4, [r4+r1*2] mova [r4+r1*0], m0 mova [r4+r1*1], m0 -%endif lea r4, [r4+r1*2] dec r3d jg .loop - REP_RET + RET %endmacro -INIT_MMX mmxext -PRED16x16_DC INIT_XMM sse2 PRED16x16_DC INIT_XMM ssse3 @@ -191,47 +158,6 @@ PRED16x16_DC ; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- -%macro PRED16x16_TM 0 -cglobal pred16x16_tm_vp8_8, 2,5 - sub r0, r1 - pxor mm7, mm7 - movq mm0, [r0+0] - movq mm2, [r0+8] - movq mm1, mm0 - movq mm3, mm2 - punpcklbw mm0, mm7 - punpckhbw mm1, mm7 - punpcklbw mm2, mm7 - punpckhbw mm3, mm7 - movzx r3d, byte [r0-1] - mov r4d, 16 -.loop: - movzx r2d, byte [r0+r1-1] - sub r2d, r3d - movd mm4, r2d - SPLATW mm4, mm4, 0 - movq mm5, mm4 - movq mm6, mm4 - movq mm7, mm4 - paddw mm4, mm0 - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 - packuswb mm4, mm5 - packuswb mm6, mm7 - movq [r0+r1+0], mm4 - movq [r0+r1+8], mm6 - add r0, r1 - dec r4d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED16x16_TM -INIT_MMX mmxext -PRED16x16_TM - INIT_XMM sse2 cglobal pred16x16_tm_vp8_8, 2,6,6 sub r0, r1 @@ -266,7 +192,7 @@ cglobal pred16x16_tm_vp8_8, 2,6,6 lea r0, [r0+r1*2] dec r5d jg .loop - REP_RET + RET %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 @@ -302,7 +228,7 @@ cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration lea dstq, [dstq+strideq*4] dec iterationd jg .loop - REP_RET + RET %endif ;----------------------------------------------------------------------------- @@ -315,22 +241,6 @@ cglobal pred16x16_plane_%1_8, 2,9,7 neg r1 ; -stride movh m0, [r0+r1 -1] -%if mmsize == 8 - pxor m4, m4 - movh m1, [r0+r1 +3 ] - movh m2, [r0+r1 +8 ] - movh m3, [r0+r1 +12] - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 - pmullw m0, [pw_m8tom1 ] - pmullw m1, [pw_m8tom1+8] - pmullw m2, [pw_1to8 ] - pmullw m3, [pw_1to8 +8] - paddw m0, m2 - paddw m1, m3 -%else ; mmsize == 16 %if cpuflag(ssse3) movhps m0, [r0+r1 +8] pmaddubsw m0, [plane_shuf] ; H coefficients @@ -344,21 +254,10 @@ cglobal pred16x16_plane_%1_8, 2,9,7 paddw m0, m1 %endif movhlps m1, m0 -%endif paddw m0, m1 -%if cpuflag(mmxext) PSHUFLW m1, m0, 0xE -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 32 -%endif paddw m0, m1 -%if cpuflag(mmxext) PSHUFLW m1, m0, 0x1 -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 16 -%endif paddw m0, m1 ; sum of H coefficients lea r4, [r0+r2*8-1] @@ -500,24 +399,10 @@ cglobal pred16x16_plane_%1_8, 2,9,7 SWAP 0, 1 %endif mova m2, m0 -%if mmsize == 8 - mova m5, m0 -%endif pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) -%if mmsize == 16 psllw m2, 3 -%else - psllw m5, 3 - psllw m2, 2 - mova m6, m5 - paddw m6, m2 -%endif paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H -%if mmsize == 8 - paddw m5, m0 ; a + {8,9,10,11}*H - paddw m6, m0 ; a + {12,13,14,15}*H -%endif mov r4, 8 .loop: @@ -527,20 +412,8 @@ cglobal pred16x16_plane_%1_8, 2,9,7 psraw m4, 5 packuswb m3, m4 mova [r0], m3 -%if mmsize == 8 - mova m3, m5 ; b[8..11] - mova m4, m6 ; b[12..15] - psraw m3, 5 - psraw m4, 5 - packuswb m3, m4 - mova [r0+8], m3 -%endif paddw m0, m1 paddw m2, m1 -%if mmsize == 8 - paddw m5, m1 - paddw m6, m1 -%endif mova m3, m0 ; b[0..7] mova m4, m2 ; b[8..15] @@ -548,35 +421,15 @@ cglobal pred16x16_plane_%1_8, 2,9,7 psraw m4, 5 packuswb m3, m4 mova [r0+r2], m3 -%if mmsize == 8 - mova m3, m5 ; b[8..11] - mova m4, m6 ; b[12..15] - psraw m3, 5 - psraw m4, 5 - packuswb m3, m4 - mova [r0+r2+8], m3 -%endif paddw m0, m1 paddw m2, m1 -%if mmsize == 8 - paddw m5, m1 - paddw m6, m1 -%endif lea r0, [r0+r2*2] dec r4 jg .loop - REP_RET + RET %endmacro -INIT_MMX mmx -H264_PRED16x16_PLANE h264 -H264_PRED16x16_PLANE rv40 -H264_PRED16x16_PLANE svq3 -INIT_MMX mmxext -H264_PRED16x16_PLANE h264 -H264_PRED16x16_PLANE rv40 -H264_PRED16x16_PLANE svq3 INIT_XMM sse2 H264_PRED16x16_PLANE h264 H264_PRED16x16_PLANE rv40 @@ -596,14 +449,6 @@ cglobal pred8x8_plane_8, 2,9,7 neg r1 ; -stride movd m0, [r0+r1 -1] -%if mmsize == 8 - pxor m2, m2 - movh m1, [r0+r1 +4 ] - punpcklbw m0, m2 - punpcklbw m1, m2 - pmullw m0, [pw_m4to4] - pmullw m1, [pw_m4to4+8] -%else ; mmsize == 16 %if cpuflag(ssse3) movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary pmaddubsw m0, [plane8_shuf] ; H coefficients @@ -615,25 +460,14 @@ cglobal pred8x8_plane_8, 2,9,7 pmullw m0, [pw_m4to4] %endif movhlps m1, m0 -%endif paddw m0, m1 %if notcpuflag(ssse3) -%if cpuflag(mmxext) PSHUFLW m1, m0, 0xE -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 32 -%endif paddw m0, m1 %endif ; !ssse3 -%if cpuflag(mmxext) PSHUFLW m1, m0, 0x1 -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 16 -%endif paddw m0, m1 ; sum of H coefficients lea r4, [r0+r2*4-1] @@ -703,20 +537,12 @@ cglobal pred8x8_plane_8, 2,9,7 SPLATW m0, m0, 0 ; H SPLATW m1, m1, 0 ; V SPLATW m3, m3, 0 ; a -%if mmsize == 8 - mova m2, m0 -%endif pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H -%if mmsize == 8 - psllw m2, 2 - paddw m2, m0 ; a + {4,5,6,7}*H -%endif mov r4, 4 ALIGN 16 .loop: -%if mmsize == 16 mova m3, m0 ; b[0..7] paddw m0, m1 psraw m3, 5 @@ -726,35 +552,13 @@ ALIGN 16 packuswb m3, m4 movh [r0], m3 movhps [r0+r2], m3 -%else ; mmsize == 8 - mova m3, m0 ; b[0..3] - mova m4, m2 ; b[4..7] - paddw m0, m1 - paddw m2, m1 - psraw m3, 5 - psraw m4, 5 - mova m5, m0 ; V+b[0..3] - mova m6, m2 ; V+b[4..7] - paddw m0, m1 - paddw m2, m1 - psraw m5, 5 - psraw m6, 5 - packuswb m3, m4 - packuswb m5, m6 - mova [r0], m3 - mova [r0+r2], m5 -%endif lea r0, [r0+r2*2] dec r4 jg .loop - REP_RET + RET %endmacro -INIT_MMX mmx -H264_PRED8x8_PLANE -INIT_MMX mmxext -H264_PRED8x8_PLANE INIT_XMM sse2 H264_PRED8x8_PLANE INIT_XMM ssse3 @@ -795,11 +599,9 @@ cglobal pred8x8_horizontal_8, 2,3 lea r0, [r0+r1*2] dec r2 jg .loop - REP_RET + RET %endmacro -INIT_MMX mmx -PRED8x8_H INIT_MMX mmxext PRED8x8_H INIT_MMX ssse3 @@ -935,52 +737,12 @@ cglobal pred8x8_dc_rv40_8, 2,7 lea r4, [r4+r1*2] dec r3d jg .loop - REP_RET + RET ;----------------------------------------------------------------------------- ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_TM 0 -cglobal pred8x8_tm_vp8_8, 2,6 - sub r0, r1 - pxor mm7, mm7 - movq mm0, [r0] - movq mm1, mm0 - punpcklbw mm0, mm7 - punpckhbw mm1, mm7 - movzx r4d, byte [r0-1] - mov r5d, 4 -.loop: - movzx r2d, byte [r0+r1*1-1] - movzx r3d, byte [r0+r1*2-1] - sub r2d, r4d - sub r3d, r4d - movd mm2, r2d - movd mm4, r3d - SPLATW mm2, mm2, 0 - SPLATW mm4, mm4, 0 - movq mm3, mm2 - movq mm5, mm4 - paddw mm2, mm0 - paddw mm3, mm1 - paddw mm4, mm0 - paddw mm5, mm1 - packuswb mm2, mm3 - packuswb mm4, mm5 - movq [r0+r1*1], mm2 - movq [r0+r1*2], mm4 - lea r0, [r0+r1*2] - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED8x8_TM -INIT_MMX mmxext -PRED8x8_TM - INIT_XMM sse2 cglobal pred8x8_tm_vp8_8, 2,6,4 sub r0, r1 @@ -1008,7 +770,7 @@ cglobal pred8x8_tm_vp8_8, 2,6,4 lea r0, [r0+r1*2] dec r5d jg .loop - REP_RET + RET INIT_XMM ssse3 cglobal pred8x8_tm_vp8_8, 2,3,6 @@ -1035,7 +797,7 @@ cglobal pred8x8_tm_vp8_8, 2,3,6 lea r0, [r0+r1*2] dec r2d jg .loop - REP_RET + RET ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 @@ -1337,114 +1099,6 @@ PRED8x8L_VERTICAL ; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred8x8l_down_left_8, 4,5 - sub r0, r3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1d, r1d - jz .fix_lt_2 - test r2d, r2d - jz .fix_tr_1 - jmp .do_top -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2d, r2d - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.fix_tr_2: - punpckhbw mm3, mm3 - pshufw mm1, mm3, 0xFF - jmp .do_topright -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq mm7, mm4 - test r2d, r2d - jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 - psrlq mm5, 56 - PALIGNR mm2, mm3, 7, mm3 - PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 -.do_topright: - lea r1, [r0+r3*2] - movq mm6, mm1 - psrlq mm1, 56 - movq mm4, mm1 - lea r2, [r1+r3*2] - movq mm2, mm6 - PALIGNR mm2, mm7, 1, mm0 - movq mm3, mm6 - PALIGNR mm3, mm7, 7, mm0 - PALIGNR mm4, mm6, 1, mm0 - movq mm5, mm7 - movq mm1, mm7 - movq mm7, mm6 - lea r4, [r2+r3*2] - psllq mm1, 8 - PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 - PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 - movq [r4+r3*2], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r4+r3*1], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r2+r3*2], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r2+r3*1], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r1+r3*2], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r1+r3*1], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r0+r3*2], mm1 - psllq mm1, 8 - psrlq mm0, 56 - por mm1, mm0 - movq [r0+r3*1], mm1 - RET - %macro PRED8x8L_DOWN_LEFT 0 cglobal pred8x8l_down_left_8, 4,4 sub r0, r3 @@ -1534,142 +1188,10 @@ INIT_MMX ssse3 PRED8x8L_DOWN_LEFT ;----------------------------------------------------------------------------- -; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, -; int has_topright, ptrdiff_t stride) +; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred8x8l_down_right_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1d, r1d ; top_left - jz .fix_lt_1 -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - movq mm6, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1d, r1d ; top_left - jz .fix_lt_2 - test r2d, r2d ; top_right - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq mm5, mm4 - jmp .body -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2d, r2d ; top_right - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.body: - lea r1, [r0+r3*2] - movq mm1, mm7 - movq mm7, mm5 - movq mm5, mm6 - movq mm2, mm7 - lea r2, [r1+r3*2] - PALIGNR mm2, mm6, 1, mm0 - movq mm3, mm7 - PALIGNR mm3, mm6, 7, mm0 - movq mm4, mm7 - lea r4, [r2+r3*2] - psrlq mm4, 8 - PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 - PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 - movq [r4+r3*2], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r4+r3*1], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r2+r3*2], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r2+r3*1], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r1+r3*2], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r1+r3*1], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r0+r3*2], mm0 - psrlq mm0, 8 - psllq mm1, 56 - por mm0, mm1 - movq [r0+r3*1], mm0 - RET - %macro PRED8x8L_DOWN_RIGHT 0 cglobal pred8x8l_down_right_8, 4,5 sub r0, r3 @@ -1790,113 +1312,6 @@ PRED8x8L_DOWN_RIGHT ; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred8x8l_vertical_right_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1d, r1d - jz .fix_lt_1 - jmp .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2d, r2d - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm7, mm2 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1d, r1d - jz .fix_lt_2 - test r2d, r2d - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 - lea r1, [r0+r3*2] - movq mm2, mm6 - movq mm3, mm6 - PALIGNR mm3, mm7, 7, mm0 - PALIGNR mm6, mm7, 6, mm1 - movq mm4, mm3 - pavgb mm3, mm2 - lea r2, [r1+r3*2] - PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 - movq [r0+r3*1], mm3 - movq [r0+r3*2], mm0 - movq mm5, mm0 - movq mm6, mm3 - movq mm1, mm7 - movq mm2, mm1 - psllq mm2, 8 - movq mm3, mm1 - psllq mm3, 16 - lea r4, [r2+r3*2] - PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 - PALIGNR mm6, mm0, 7, mm2 - movq [r1+r3*1], mm6 - psllq mm0, 8 - PALIGNR mm5, mm0, 7, mm1 - movq [r1+r3*2], mm5 - psllq mm0, 8 - PALIGNR mm6, mm0, 7, mm2 - movq [r2+r3*1], mm6 - psllq mm0, 8 - PALIGNR mm5, mm0, 7, mm1 - movq [r2+r3*2], mm5 - psllq mm0, 8 - PALIGNR mm6, mm0, 7, mm2 - movq [r4+r3*1], mm6 - psllq mm0, 8 - PALIGNR mm5, mm0, 7, mm1 - movq [r4+r3*2], mm5 - RET - %macro PRED8x8L_VERTICAL_RIGHT 0 cglobal pred8x8l_vertical_right_8, 4,5,7 ; manually spill XMM registers for Win64 because @@ -2196,121 +1611,6 @@ PRED8x8L_HORIZONTAL_UP ; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred8x8l_horizontal_down_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1d, r1d - jnz .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2d, r2d - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - movq mm6, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1d, r1d - jz .fix_lt_2 - test r2d, r2d - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq mm5, mm4 - lea r1, [r0+r3*2] - psllq mm7, 56 - movq mm2, mm5 - movq mm3, mm6 - movq mm4, mm2 - PALIGNR mm2, mm6, 7, mm5 - PALIGNR mm6, mm7, 7, mm0 - lea r2, [r1+r3*2] - PALIGNR mm4, mm3, 1, mm7 - movq mm5, mm3 - pavgb mm3, mm6 - PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7 - movq mm4, mm2 - movq mm1, mm2 - lea r4, [r2+r3*2] - psrlq mm4, 16 - psrlq mm1, 8 - PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5 - movq mm7, mm3 - punpcklbw mm3, mm0 - punpckhbw mm7, mm0 - movq mm1, mm7 - movq mm0, mm7 - movq mm4, mm7 - movq [r4+r3*2], mm3 - PALIGNR mm7, mm3, 2, mm5 - movq [r4+r3*1], mm7 - PALIGNR mm1, mm3, 4, mm5 - movq [r2+r3*2], mm1 - PALIGNR mm0, mm3, 6, mm3 - movq [r2+r3*1], mm0 - movq mm2, mm6 - movq mm3, mm6 - movq [r1+r3*2], mm4 - PALIGNR mm6, mm4, 2, mm5 - movq [r1+r3*1], mm6 - PALIGNR mm2, mm4, 4, mm5 - movq [r0+r3*2], mm2 - PALIGNR mm3, mm4, 6, mm4 - movq [r0+r3*1], mm3 - RET - %macro PRED8x8L_HORIZONTAL_DOWN 0 cglobal pred8x8l_horizontal_down_8, 4,5 sub r0, r3 @@ -2476,7 +1776,7 @@ cglobal pred4x4_dc_8, 3,5 ; ptrdiff_t stride) ;----------------------------------------------------------------------------- -%macro PRED4x4_TM 0 +INIT_MMX mmxext cglobal pred4x4_tm_vp8_8, 3,6 sub r0, r2 pxor mm7, mm7 @@ -2491,15 +1791,8 @@ cglobal pred4x4_tm_vp8_8, 3,6 sub r3d, r4d movd mm2, r1d movd mm4, r3d -%if cpuflag(mmxext) pshufw mm2, mm2, 0 pshufw mm4, mm4, 0 -%else - punpcklwd mm2, mm2 - punpcklwd mm4, mm4 - punpckldq mm2, mm2 - punpckldq mm4, mm4 -%endif paddw mm2, mm0 paddw mm4, mm0 packuswb mm2, mm2 @@ -2509,13 +1802,7 @@ cglobal pred4x4_tm_vp8_8, 3,6 lea r0, [r0+r2*2] dec r5d jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED4x4_TM -INIT_MMX mmxext -PRED4x4_TM + RET INIT_XMM ssse3 cglobal pred4x4_tm_vp8_8, 3,3 |