| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| %include "vpx_ports/x86_abi_support.asm" |
| |
| %macro STACK_FRAME_CREATE_X3 0 |
| %if ABI_IS_32BIT |
| %define src_ptr rsi |
| %define src_stride rax |
| %define ref_ptr rdi |
| %define ref_stride rdx |
| %define end_ptr rcx |
| %define ret_var rbx |
| %define result_ptr arg(4) |
| %define max_err arg(4) |
| push rbp |
| mov rbp, rsp |
| push rsi |
| push rdi |
| push rbx |
| |
| mov rsi, arg(0) ; src_ptr |
| mov rdi, arg(2) ; ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; src_stride |
| movsxd rdx, dword ptr arg(3) ; ref_stride |
| %else |
| %ifidn __OUTPUT_FORMAT__,x64 |
| %define src_ptr rcx |
| %define src_stride rdx |
| %define ref_ptr r8 |
| %define ref_stride r9 |
| %define end_ptr r10 |
| %define ret_var r11 |
| %define result_ptr [rsp+8+4*8] |
| %define max_err [rsp+8+4*8] |
| %else |
| %define src_ptr rdi |
| %define src_stride rsi |
| %define ref_ptr rdx |
| %define ref_stride rcx |
| %define end_ptr r9 |
| %define ret_var r10 |
| %define result_ptr r8 |
| %define max_err r8 |
| %endif |
| %endif |
| |
| %endmacro |
| |
| %macro STACK_FRAME_DESTROY_X3 0 |
| %define src_ptr |
| %define src_stride |
| %define ref_ptr |
| %define ref_stride |
| %define end_ptr |
| %define ret_var |
| %define result_ptr |
| %define max_err |
| |
| %if ABI_IS_32BIT |
| pop rbx |
| pop rdi |
| pop rsi |
| pop rbp |
| %else |
| %ifidn __OUTPUT_FORMAT__,x64 |
| %endif |
| %endif |
| ret |
| %endmacro |
| |
| %macro STACK_FRAME_CREATE_X4 0 |
| %if ABI_IS_32BIT |
| %define src_ptr rsi |
| %define src_stride rax |
| %define r0_ptr rcx |
| %define r1_ptr rdx |
| %define r2_ptr rbx |
| %define r3_ptr rdi |
| %define ref_stride rbp |
| %define result_ptr arg(4) |
| push rbp |
| mov rbp, rsp |
| push rsi |
| push rdi |
| push rbx |
| |
| push rbp |
| mov rdi, arg(2) ; ref_ptr_base |
| |
| LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi |
| |
| mov rsi, arg(0) ; src_ptr |
| |
| movsxd rbx, dword ptr arg(1) ; src_stride |
| movsxd rbp, dword ptr arg(3) ; ref_stride |
| |
| xchg rbx, rax |
| %else |
| %ifidn __OUTPUT_FORMAT__,x64 |
| %define src_ptr rcx |
| %define src_stride rdx |
| %define r0_ptr rsi |
| %define r1_ptr r10 |
| %define r2_ptr r11 |
| %define r3_ptr r8 |
| %define ref_stride r9 |
| %define result_ptr [rsp+16+4*8] |
| push rsi |
| |
| LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr |
| %else |
| %define src_ptr rdi |
| %define src_stride rsi |
| %define r0_ptr r9 |
| %define r1_ptr r10 |
| %define r2_ptr r11 |
| %define r3_ptr rdx |
| %define ref_stride rcx |
| %define result_ptr r8 |
| |
| LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr |
| |
| %endif |
| %endif |
| %endmacro |
| |
| %macro STACK_FRAME_DESTROY_X4 0 |
| %define src_ptr |
| %define src_stride |
| %define r0_ptr |
| %define r1_ptr |
| %define r2_ptr |
| %define r3_ptr |
| %define ref_stride |
| %define result_ptr |
| |
| %if ABI_IS_32BIT |
| pop rbx |
| pop rdi |
| pop rsi |
| pop rbp |
| %else |
| %ifidn __OUTPUT_FORMAT__,x64 |
| pop rsi |
| %endif |
| %endif |
| ret |
| %endmacro |
| |
| %macro PROCESS_16X2X3 5 |
| %if %1==0 |
| movdqa xmm0, XMMWORD PTR [%2] |
| lddqu xmm5, XMMWORD PTR [%3] |
| lddqu xmm6, XMMWORD PTR [%3+1] |
| lddqu xmm7, XMMWORD PTR [%3+2] |
| |
| psadbw xmm5, xmm0 |
| psadbw xmm6, xmm0 |
| psadbw xmm7, xmm0 |
| %else |
| movdqa xmm0, XMMWORD PTR [%2] |
| lddqu xmm1, XMMWORD PTR [%3] |
| lddqu xmm2, XMMWORD PTR [%3+1] |
| lddqu xmm3, XMMWORD PTR [%3+2] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endif |
| movdqa xmm0, XMMWORD PTR [%2+%4] |
| lddqu xmm1, XMMWORD PTR [%3+%5] |
| lddqu xmm2, XMMWORD PTR [%3+%5+1] |
| lddqu xmm3, XMMWORD PTR [%3+%5+2] |
| |
| %if %1==0 || %1==1 |
| lea %2, [%2+%4*2] |
| lea %3, [%3+%5*2] |
| %endif |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endmacro |
| |
| %macro PROCESS_8X2X3 5 |
| %if %1==0 |
| movq mm0, QWORD PTR [%2] |
| movq mm5, QWORD PTR [%3] |
| movq mm6, QWORD PTR [%3+1] |
| movq mm7, QWORD PTR [%3+2] |
| |
| psadbw mm5, mm0 |
| psadbw mm6, mm0 |
| psadbw mm7, mm0 |
| %else |
| movq mm0, QWORD PTR [%2] |
| movq mm1, QWORD PTR [%3] |
| movq mm2, QWORD PTR [%3+1] |
| movq mm3, QWORD PTR [%3+2] |
| |
| psadbw mm1, mm0 |
| psadbw mm2, mm0 |
| psadbw mm3, mm0 |
| |
| paddw mm5, mm1 |
| paddw mm6, mm2 |
| paddw mm7, mm3 |
| %endif |
| movq mm0, QWORD PTR [%2+%4] |
| movq mm1, QWORD PTR [%3+%5] |
| movq mm2, QWORD PTR [%3+%5+1] |
| movq mm3, QWORD PTR [%3+%5+2] |
| |
| %if %1==0 || %1==1 |
| lea %2, [%2+%4*2] |
| lea %3, [%3+%5*2] |
| %endif |
| |
| psadbw mm1, mm0 |
| psadbw mm2, mm0 |
| psadbw mm3, mm0 |
| |
| paddw mm5, mm1 |
| paddw mm6, mm2 |
| paddw mm7, mm3 |
| %endmacro |
| |
| %macro LOAD_X4_ADDRESSES 5 |
| mov %2, [%1+REG_SZ_BYTES*0] |
| mov %3, [%1+REG_SZ_BYTES*1] |
| |
| mov %4, [%1+REG_SZ_BYTES*2] |
| mov %5, [%1+REG_SZ_BYTES*3] |
| %endmacro |
| |
| %macro PROCESS_16X2X4 8 |
| %if %1==0 |
| movdqa xmm0, XMMWORD PTR [%2] |
| lddqu xmm4, XMMWORD PTR [%3] |
| lddqu xmm5, XMMWORD PTR [%4] |
| lddqu xmm6, XMMWORD PTR [%5] |
| lddqu xmm7, XMMWORD PTR [%6] |
| |
| psadbw xmm4, xmm0 |
| psadbw xmm5, xmm0 |
| psadbw xmm6, xmm0 |
| psadbw xmm7, xmm0 |
| %else |
| movdqa xmm0, XMMWORD PTR [%2] |
| lddqu xmm1, XMMWORD PTR [%3] |
| lddqu xmm2, XMMWORD PTR [%4] |
| lddqu xmm3, XMMWORD PTR [%5] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm4, xmm1 |
| lddqu xmm1, XMMWORD PTR [%6] |
| paddw xmm5, xmm2 |
| paddw xmm6, xmm3 |
| |
| psadbw xmm1, xmm0 |
| paddw xmm7, xmm1 |
| %endif |
| movdqa xmm0, XMMWORD PTR [%2+%7] |
| lddqu xmm1, XMMWORD PTR [%3+%8] |
| lddqu xmm2, XMMWORD PTR [%4+%8] |
| lddqu xmm3, XMMWORD PTR [%5+%8] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm4, xmm1 |
| lddqu xmm1, XMMWORD PTR [%6+%8] |
| paddw xmm5, xmm2 |
| paddw xmm6, xmm3 |
| |
| %if %1==0 || %1==1 |
| lea %2, [%2+%7*2] |
| lea %3, [%3+%8*2] |
| |
| lea %4, [%4+%8*2] |
| lea %5, [%5+%8*2] |
| |
| lea %6, [%6+%8*2] |
| %endif |
| psadbw xmm1, xmm0 |
| paddw xmm7, xmm1 |
| |
| %endmacro |
| |
| %macro PROCESS_8X2X4 8 |
| %if %1==0 |
| movq mm0, QWORD PTR [%2] |
| movq mm4, QWORD PTR [%3] |
| movq mm5, QWORD PTR [%4] |
| movq mm6, QWORD PTR [%5] |
| movq mm7, QWORD PTR [%6] |
| |
| psadbw mm4, mm0 |
| psadbw mm5, mm0 |
| psadbw mm6, mm0 |
| psadbw mm7, mm0 |
| %else |
| movq mm0, QWORD PTR [%2] |
| movq mm1, QWORD PTR [%3] |
| movq mm2, QWORD PTR [%4] |
| movq mm3, QWORD PTR [%5] |
| |
| psadbw mm1, mm0 |
| psadbw mm2, mm0 |
| psadbw mm3, mm0 |
| |
| paddw mm4, mm1 |
| movq mm1, QWORD PTR [%6] |
| paddw mm5, mm2 |
| paddw mm6, mm3 |
| |
| psadbw mm1, mm0 |
| paddw mm7, mm1 |
| %endif |
| movq mm0, QWORD PTR [%2+%7] |
| movq mm1, QWORD PTR [%3+%8] |
| movq mm2, QWORD PTR [%4+%8] |
| movq mm3, QWORD PTR [%5+%8] |
| |
| psadbw mm1, mm0 |
| psadbw mm2, mm0 |
| psadbw mm3, mm0 |
| |
| paddw mm4, mm1 |
| movq mm1, QWORD PTR [%6+%8] |
| paddw mm5, mm2 |
| paddw mm6, mm3 |
| |
| %if %1==0 || %1==1 |
| lea %2, [%2+%7*2] |
| lea %3, [%3+%8*2] |
| |
| lea %4, [%4+%8*2] |
| lea %5, [%5+%8*2] |
| |
| lea %6, [%6+%8*2] |
| %endif |
| psadbw mm1, mm0 |
| paddw mm7, mm1 |
| |
| %endmacro |
| |
| ;void int vp8_sad16x16x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad16x16x3_sse3) |
| sym(vp8_sad16x16x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rcx], xmm0 |
| ;- |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rcx+4], xmm0 |
| ;- |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rcx+8], xmm0 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int vp8_sad16x8x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad16x8x3_sse3) |
| sym(vp8_sad16x8x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rcx], xmm0 |
| ;- |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rcx+4], xmm0 |
| ;- |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rcx+8], xmm0 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int vp8_sad8x16x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad8x16x3_sse3) |
| sym(vp8_sad8x16x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| punpckldq mm5, mm6 |
| |
| movq [rcx], mm5 |
| movd [rcx+8], mm7 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int vp8_sad8x8x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad8x8x3_sse3) |
| sym(vp8_sad8x8x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| punpckldq mm5, mm6 |
| |
| movq [rcx], mm5 |
| movd [rcx+8], mm7 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int vp8_sad4x4x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad4x4x3_sse3) |
| sym(vp8_sad4x4x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| movd mm0, DWORD PTR [src_ptr] |
| movd mm1, DWORD PTR [ref_ptr] |
| |
| movd mm2, DWORD PTR [src_ptr+src_stride] |
| movd mm3, DWORD PTR [ref_ptr+ref_stride] |
| |
| punpcklbw mm0, mm2 |
| punpcklbw mm1, mm3 |
| |
| movd mm4, DWORD PTR [ref_ptr+1] |
| movd mm5, DWORD PTR [ref_ptr+2] |
| |
| movd mm2, DWORD PTR [ref_ptr+ref_stride+1] |
| movd mm3, DWORD PTR [ref_ptr+ref_stride+2] |
| |
| psadbw mm1, mm0 |
| |
| punpcklbw mm4, mm2 |
| punpcklbw mm5, mm3 |
| |
| psadbw mm4, mm0 |
| psadbw mm5, mm0 |
| |
| lea src_ptr, [src_ptr+src_stride*2] |
| lea ref_ptr, [ref_ptr+ref_stride*2] |
| |
| movd mm0, DWORD PTR [src_ptr] |
| movd mm2, DWORD PTR [ref_ptr] |
| |
| movd mm3, DWORD PTR [src_ptr+src_stride] |
| movd mm6, DWORD PTR [ref_ptr+ref_stride] |
| |
| punpcklbw mm0, mm3 |
| punpcklbw mm2, mm6 |
| |
| movd mm3, DWORD PTR [ref_ptr+1] |
| movd mm7, DWORD PTR [ref_ptr+2] |
| |
| psadbw mm2, mm0 |
| |
| paddw mm1, mm2 |
| |
| movd mm2, DWORD PTR [ref_ptr+ref_stride+1] |
| movd mm6, DWORD PTR [ref_ptr+ref_stride+2] |
| |
| punpcklbw mm3, mm2 |
| punpcklbw mm7, mm6 |
| |
| psadbw mm3, mm0 |
| psadbw mm7, mm0 |
| |
| paddw mm3, mm4 |
| paddw mm7, mm5 |
| |
| mov rcx, result_ptr |
| |
| punpckldq mm1, mm3 |
| |
| movq [rcx], mm1 |
| movd [rcx+8], mm7 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;unsigned int vp8_sad16x16_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int max_err) |
| ;%define lddqu movdqu |
| global sym(vp8_sad16x16_sse3) |
| sym(vp8_sad16x16_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| mov end_ptr, 4 |
| pxor xmm7, xmm7 |
| |
| .vp8_sad16x16_sse3_loop: |
| movdqa xmm0, XMMWORD PTR [src_ptr] |
| movdqu xmm1, XMMWORD PTR [ref_ptr] |
| movdqa xmm2, XMMWORD PTR [src_ptr+src_stride] |
| movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride] |
| |
| lea src_ptr, [src_ptr+src_stride*2] |
| lea ref_ptr, [ref_ptr+ref_stride*2] |
| |
| movdqa xmm4, XMMWORD PTR [src_ptr] |
| movdqu xmm5, XMMWORD PTR [ref_ptr] |
| movdqa xmm6, XMMWORD PTR [src_ptr+src_stride] |
| |
| psadbw xmm0, xmm1 |
| |
| movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] |
| |
| psadbw xmm2, xmm3 |
| psadbw xmm4, xmm5 |
| psadbw xmm6, xmm1 |
| |
| lea src_ptr, [src_ptr+src_stride*2] |
| lea ref_ptr, [ref_ptr+ref_stride*2] |
| |
| paddw xmm7, xmm0 |
| paddw xmm7, xmm2 |
| paddw xmm7, xmm4 |
| paddw xmm7, xmm6 |
| |
| sub end_ptr, 1 |
| jne .vp8_sad16x16_sse3_loop |
| |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| paddw xmm0, xmm7 |
| movq rax, xmm0 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void vp8_sad16x16x4d_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr_base, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad16x16x4d_sse3) |
| sym(vp8_sad16x16x4d_sse3): |
| |
| STACK_FRAME_CREATE_X4 |
| |
| PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| |
| %if ABI_IS_32BIT |
| pop rbp |
| %endif |
| mov rcx, result_ptr |
| |
| movq xmm0, xmm4 |
| psrldq xmm4, 8 |
| |
| paddw xmm0, xmm4 |
| movd [rcx], xmm0 |
| ;- |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rcx+4], xmm0 |
| ;- |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rcx+8], xmm0 |
| ;- |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rcx+12], xmm0 |
| |
| STACK_FRAME_DESTROY_X4 |
| |
| ;void vp8_sad16x8x4d_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr_base, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad16x8x4d_sse3) |
| sym(vp8_sad16x8x4d_sse3): |
| |
| STACK_FRAME_CREATE_X4 |
| |
| PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| |
| %if ABI_IS_32BIT |
| pop rbp |
| %endif |
| mov rcx, result_ptr |
| |
| movq xmm0, xmm4 |
| psrldq xmm4, 8 |
| |
| paddw xmm0, xmm4 |
| movd [rcx], xmm0 |
| ;- |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rcx+4], xmm0 |
| ;- |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rcx+8], xmm0 |
| ;- |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rcx+12], xmm0 |
| |
| STACK_FRAME_DESTROY_X4 |
| |
| ;void int vp8_sad8x16x4d_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad8x16x4d_sse3) |
| sym(vp8_sad8x16x4d_sse3): |
| |
| STACK_FRAME_CREATE_X4 |
| |
| PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| |
| %if ABI_IS_32BIT |
| pop rbp |
| %endif |
| mov rcx, result_ptr |
| |
| punpckldq mm4, mm5 |
| punpckldq mm6, mm7 |
| |
| movq [rcx], mm4 |
| movq [rcx+8], mm6 |
| |
| STACK_FRAME_DESTROY_X4 |
| |
| ;void int vp8_sad8x8x4d_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad8x8x4d_sse3) |
| sym(vp8_sad8x8x4d_sse3): |
| |
| STACK_FRAME_CREATE_X4 |
| |
| PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride |
| |
| %if ABI_IS_32BIT |
| pop rbp |
| %endif |
| mov rcx, result_ptr |
| |
| punpckldq mm4, mm5 |
| punpckldq mm6, mm7 |
| |
| movq [rcx], mm4 |
| movq [rcx+8], mm6 |
| |
| STACK_FRAME_DESTROY_X4 |
| |
| ;void int vp8_sad4x4x4d_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(vp8_sad4x4x4d_sse3) |
| sym(vp8_sad4x4x4d_sse3): |
| |
| STACK_FRAME_CREATE_X4 |
| |
| movd mm0, DWORD PTR [src_ptr] |
| movd mm1, DWORD PTR [r0_ptr] |
| |
| movd mm2, DWORD PTR [src_ptr+src_stride] |
| movd mm3, DWORD PTR [r0_ptr+ref_stride] |
| |
| punpcklbw mm0, mm2 |
| punpcklbw mm1, mm3 |
| |
| movd mm4, DWORD PTR [r1_ptr] |
| movd mm5, DWORD PTR [r2_ptr] |
| |
| movd mm6, DWORD PTR [r3_ptr] |
| movd mm2, DWORD PTR [r1_ptr+ref_stride] |
| |
| movd mm3, DWORD PTR [r2_ptr+ref_stride] |
| movd mm7, DWORD PTR [r3_ptr+ref_stride] |
| |
| psadbw mm1, mm0 |
| |
| punpcklbw mm4, mm2 |
| punpcklbw mm5, mm3 |
| |
| punpcklbw mm6, mm7 |
| psadbw mm4, mm0 |
| |
| psadbw mm5, mm0 |
| psadbw mm6, mm0 |
| |
| |
| |
| lea src_ptr, [src_ptr+src_stride*2] |
| lea r0_ptr, [r0_ptr+ref_stride*2] |
| |
| lea r1_ptr, [r1_ptr+ref_stride*2] |
| lea r2_ptr, [r2_ptr+ref_stride*2] |
| |
| lea r3_ptr, [r3_ptr+ref_stride*2] |
| |
| movd mm0, DWORD PTR [src_ptr] |
| movd mm2, DWORD PTR [r0_ptr] |
| |
| movd mm3, DWORD PTR [src_ptr+src_stride] |
| movd mm7, DWORD PTR [r0_ptr+ref_stride] |
| |
| punpcklbw mm0, mm3 |
| punpcklbw mm2, mm7 |
| |
| movd mm3, DWORD PTR [r1_ptr] |
| movd mm7, DWORD PTR [r2_ptr] |
| |
| psadbw mm2, mm0 |
| %if ABI_IS_32BIT |
| mov rax, rbp |
| |
| pop rbp |
| %define ref_stride rax |
| %endif |
| mov rsi, result_ptr |
| |
| paddw mm1, mm2 |
| movd [rsi], mm1 |
| |
| movd mm2, DWORD PTR [r1_ptr+ref_stride] |
| movd mm1, DWORD PTR [r2_ptr+ref_stride] |
| |
| punpcklbw mm3, mm2 |
| punpcklbw mm7, mm1 |
| |
| psadbw mm3, mm0 |
| psadbw mm7, mm0 |
| |
| movd mm2, DWORD PTR [r3_ptr] |
| movd mm1, DWORD PTR [r3_ptr+ref_stride] |
| |
| paddw mm3, mm4 |
| paddw mm7, mm5 |
| |
| movd [rsi+4], mm3 |
| punpcklbw mm2, mm1 |
| |
| movd [rsi+8], mm7 |
| psadbw mm2, mm0 |
| |
| paddw mm2, mm6 |
| movd [rsi+12], mm2 |
| |
| |
| STACK_FRAME_DESTROY_X4 |