; vp8/common/x86/idctllm_mmx.asm - platform/external/libvpx - Git at Google

 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;


 %include "vpx_ports/x86_abi_support.asm"

 ; /****************************************************************************
 ; * Notes:
 ; *
 ; * This implementation makes use of 16 bit fixed point versions of two
 ; * multiply constants:
 ; *        1.   sqrt(2) * cos (pi/8)
 ; *        2.   sqrt(2) * sin (pi/8)
 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
 ; * fixed point precision as the second one, we use a trick of
 ; *        x * a = x + x*(a-1)
 ; * so
 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
 ; *
 ; * For the second constant, because the 16 bit version is 35468, which
 ; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative
 ; * number. The result is corrected by adding x back:
 ; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
 ; *
 ; **************************************************************************/


 ;void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch)
 ;
 ; 4x4 inverse transform using MMX. Reads 16 shorts (32 bytes) from input,
 ; runs the same butterfly pass twice with a 4x4 word transpose after each
 ; pass, and stores four rows of four shorts to output, with row r written
 ; at byte offset r*pitch. The second pass adds the rounding constant 4 and
 ; arithmetic-shifts every result right by 3.
 ;
 ; Arguments (fetched through the arg() macro):
 ;   arg(0) input  - pointer to the 16 input coefficients
 ;   arg(1) output - pointer to the destination block
 ;   arg(2) pitch  - destination row pitch in bytes (32-bit, sign-extended)
 ; Scratch registers used: rax, rdx, mm0-mm7.
 global sym(vp8_short_idct4x4llm_mmx)
 sym(vp8_short_idct4x4llm_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
     ; end prolog

         mov         rax,            arg(0) ;input
         mov         rdx,            arg(1) ;output

         ; Load the block: one group of four 16-bit coefficients per register.
         movq        mm0,            [rax   ]        ; ip0 = input[0..3]
         movq        mm1,            [rax+ 8]        ; ip1 = input[4..7]

         movq        mm2,            [rax+16]        ; ip2 = input[8..11]
         movq        mm3,            [rax+24]        ; ip3 = input[12..15]

         movsxd      rax,            dword ptr arg(2) ;pitch (rax is free: all input has been loaded)

         ; ---- first pass: every word lane combines ip0..ip3 independently ----
         psubw       mm0,            mm2             ; b1 = ip0 - ip2
         paddw       mm2,            mm2             ; 2 * ip2

         movq        mm5,            mm1
         paddw       mm2,            mm0             ; a1 = ip0 + ip2  (b1 + 2*ip2)

         pmulhw      mm5,            [x_s1sqr2 GLOBAL]        ; high 16 bits of the signed product
         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)  ("+ x" correction, see notes)

         movq        mm7,            mm3             ;
         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL]    ; ip3 * (constant - 1)

         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
         psubw       mm7,            mm5             ; c1 = ip3*cos - ip1*sin

         movq        mm5,            mm1
         movq        mm4,            mm3             ; keep ip3 for the "+ x" correction below

         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

         paddw       mm3,            mm5             ; d1 = ip1*cos + ip3*sin
         movq        mm6,            mm2             ; a1

         movq        mm4,            mm0             ; b1
         paddw       mm2,            mm3             ; result 0 = a1 + d1

         paddw       mm4,            mm7             ; result 2 = b1 + c1 (mm4 is consumed as "2x" below)
         psubw       mm0,            mm7             ; result 1 = b1 - c1 (mm0 is consumed as "1x" below)

         psubw       mm6,            mm3             ; result 3 = a1 - d1

         ; ---- transpose the 4x4 word matrix held in mm2, mm0, mm4, mm6 ----
         ; Lane comments list words from high to low; "rc" = result r, lane c.
         movq        mm1,            mm2             ; 03 02 01 00
         movq        mm3,            mm4             ; 23 22 21 20

         punpcklwd   mm1,            mm0             ; 11 01 10 00
         punpckhwd   mm2,            mm0             ; 13 03 12 02

         punpcklwd   mm3,            mm6             ; 31 21 30 20
         punpckhwd   mm4,            mm6             ; 33 23 32 22

         movq        mm0,            mm1             ; 11 01 10 00
         movq        mm5,            mm2             ; 13 03 12 02

         punpckldq   mm0,            mm3             ; 30 20 10 00
         punpckhdq   mm1,            mm3             ; 31 21 11 01

         punpckldq   mm2,            mm4             ; 32 22 12 02
         punpckhdq   mm5,            mm4             ; 33 23 13 03

         movq        mm3,            mm5             ; 33 23 13 03

         ; ---- second pass: same butterfly on the transposed data (mm0..mm3) ----
         psubw       mm0,            mm2             ; b1 = 0 - 2
         paddw       mm2,            mm2             ; 2 * (2)

         movq        mm5,            mm1
         paddw       mm2,            mm0             ; a1 = 0 + 2

         pmulhw      mm5,            [x_s1sqr2 GLOBAL]         ;
         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

         movq        mm7,            mm3             ;
         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL]    ;

         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
         psubw       mm7,            mm5             ; c1 = ip3*cos - ip1*sin

         movq        mm5,            mm1
         movq        mm4,            mm3

         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

         paddw       mm3,            mm5             ; d1
         paddw       mm0,            [fours GLOBAL]  ; b1 + 4: rounding bias for the >> 3 below

         paddw       mm2,            [fours GLOBAL]  ; a1 + 4: every output is a1 or b1 plus/minus a term
         movq        mm6,            mm2             ; a1

         movq        mm4,            mm0             ; b1
         paddw       mm2,            mm3             ; result 0 = a1 + d1

         paddw       mm4,            mm7             ; result 2 = b1 + c1
         psubw       mm0,            mm7             ; result 1 = b1 - c1

         psubw       mm6,            mm3             ; result 3 = a1 - d1
         psraw       mm2,            3               ; arithmetic >> 3 on each word

         psraw       mm0,            3
         psraw       mm4,            3

         psraw       mm6,            3

         ; ---- transpose back so that each register holds one output row ----
         movq        mm1,            mm2             ; 03 02 01 00
         movq        mm3,            mm4             ; 23 22 21 20

         punpcklwd   mm1,            mm0             ; 11 01 10 00
         punpckhwd   mm2,            mm0             ; 13 03 12 02

         punpcklwd   mm3,            mm6             ; 31 21 30 20
         punpckhwd   mm4,            mm6             ; 33 23 32 22

         movq        mm0,            mm1             ; 11 01 10 00
         movq        mm5,            mm2             ; 13 03 12 02

         punpckldq   mm0,            mm3             ; 30 20 10 00
         punpckhdq   mm1,            mm3             ; 31 21 11 01

         punpckldq   mm2,            mm4             ; 32 22 12 02
         punpckhdq   mm5,            mm4             ; 33 23 13 03

         ; ---- store four 8-byte rows, pitch bytes apart ----
         movq        [rdx],          mm0             ; output + 0*pitch

         movq        [rdx+rax],      mm1             ; output + 1*pitch
         movq        [rdx+rax*2],    mm2             ; output + 2*pitch

         add         rdx,            rax             ; rdx = output + pitch
         movq        [rdx+rax*2],    mm5             ; output + 3*pitch

     ; begin epilog
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
 ;
 ; DC-only variant: computes (input[0] + 4) >> 3 and writes that value to
 ; all 16 positions of the 4x4 output block (four rows of four shorts,
 ; consecutive rows pitch bytes apart).
 ; Scratch registers used: rax, rdx, mm0.
 global sym(vp8_short_idct4x4llm_1_mmx)
 sym(vp8_short_idct4x4llm_1_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
     ; end prolog

         ; Fetch arguments. The DC load has to happen before rax is
         ; recycled to hold pitch.
         mov         rax,            arg(0)              ; input
         mov         rdx,            arg(1)              ; output
         movd        mm0,            [rax]               ; low word = input[0]
         movsxd      rax,            dword ptr arg(2)    ; pitch, sign-extended

         ; dc = (input[0] + 4) >> 3, arithmetic shift
         paddw       mm0,            [fours GLOBAL]
         psraw       mm0,            3

         ; Replicate word 0 into all four word lanes.
         punpcklwd   mm0,            mm0
         punpckldq   mm0,            mm0

         ; Rows 0..2 are addressable directly; step rdx by one pitch to
         ; reach row 3 with the same [base + 2*pitch] form.
         movq        [rdx],          mm0
         movq        [rdx+rax],      mm0
         movq        [rdx+rax*2],    mm0
         add         rdx,            rax
         movq        [rdx+rax*2],    mm0

     ; begin epilog
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret

 ;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
 ;
 ; Adds the rounded DC value (input_dc + 4) >> 3 to a 4x4 block of
 ; prediction bytes and stores the sums, clamped to 0..255, to the
 ; destination. Prediction rows are pitch bytes apart; destination rows
 ; are stride bytes apart. rsi and rdi are saved and restored.
 global sym(vp8_dc_only_idct_add_mmx)
 sym(vp8_dc_only_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

         ; Arguments.
         movd        mm5,            arg(0)              ; input_dc in the low word
         mov         rsi,            arg(1)              ; prediction pointer
         mov         rdi,            arg(2)              ; destination pointer
         movsxd      rdx,            dword ptr arg(3)    ; pitch  (prediction row step)
         movsxd      rax,            dword ptr arg(4)    ; stride (destination row step)
         pxor        mm0,            mm0                 ; zero: high bytes for unpack, filler for pack

         ; dc = (input_dc + 4) >> 3, then copy word 0 to all four lanes.
         paddw       mm5,            [fours GLOBAL]
         psraw       mm5,            3
         punpcklwd   mm5,            mm5
         punpckldq   mm5,            mm5

         ; Each row: load 4 bytes, widen to words, add dc with signed
         ; saturation, narrow with unsigned saturation, store 4 bytes.

         ; row 0
         movd        mm1,            [rsi]
         punpcklbw   mm1,            mm0
         paddsw      mm1,            mm5
         packuswb    mm1,            mm0
         movd        [rdi],          mm1

         ; row 1
         movd        mm2,            [rsi+rdx]
         punpcklbw   mm2,            mm0
         paddsw      mm2,            mm5
         packuswb    mm2,            mm0
         movd        [rdi+rax],      mm2

         ; Step both pointers down two rows.
         lea         rsi,            [rsi+2*rdx]
         lea         rdi,            [rdi+2*rax]

         ; row 2
         movd        mm3,            [rsi]
         punpcklbw   mm3,            mm0
         paddsw      mm3,            mm5
         packuswb    mm3,            mm0
         movd        [rdi],          mm3

         ; row 3
         movd        mm4,            [rsi+rdx]
         punpcklbw   mm4,            mm0
         paddsw      mm4,            mm5
         packuswb    mm4,            mm0
         movd        [rdi+rax],      mm4

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret

 ; Read-only constants, each replicated across four 16-bit lanes so that
 ; one 64-bit MMX operand applies the same value to every lane.
 SECTION_RODATA
 align 16
 ; 0x8A8C = 35468: the 16 bit fixed point sqrt(2) * sin(pi/8) from the
 ; file's header notes. Negative as a signed word, so users add x back
 ; after pmulhw.
 x_s1sqr2:
     dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C
 align 16
 ; 0x4E7B = 20091: per the label and the header notes, the 16 bit fixed
 ; point form of sqrt(2) * cos(pi/8) - 1.
 x_c1sqr2less1:
     dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B
 align 16
 ; Rounding bias added before the arithmetic shift right by 3.
 fours:
     dw 0x0004, 0x0004, 0x0004, 0x0004
	;
	; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
	;
	; Use of this source code is governed by a BSD-style license
	; that can be found in the LICENSE file in the root of the source
	; tree. An additional intellectual property rights grant can be found
	; in the file PATENTS. All contributing project authors may
	; be found in the AUTHORS file in the root of the source tree.
	;


	%include "vpx_ports/x86_abi_support.asm"

	; /****************************************************************************
	; * Notes:
	; *
	; * This implementation makes use of 16 bit fixed point versions of two
	; * multiply constants:
	; * 1. sqrt(2) * cos (pi/8)
	; * 2. sqrt(2) * sin (pi/8)
	; * Because the first constant is bigger than 1, to maintain the same 16 bit
	; * fixed point precision as the second one, we use a trick of
	; * x * a = x + x*(a-1)
	; * so
	; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
	; *
	; * For the second constant, because the 16 bit version is 35468, which
	; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative
	; * number. The result is corrected by adding x back:
	; * (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
	; *
	; **************************************************************************/


	;void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch)
	;
	; 4x4 inverse transform using MMX. Reads 16 shorts (32 bytes) from input,
	; runs the same butterfly pass twice with a 4x4 word transpose after each
	; pass, and stores four rows of four shorts to output, with row r written
	; at byte offset r*pitch. The second pass adds the rounding constant 4 and
	; arithmetic-shifts every result right by 3.
	;
	; Arguments (fetched through the arg() macro):
	;   arg(0) input  - pointer to the 16 input coefficients
	;   arg(1) output - pointer to the destination block
	;   arg(2) pitch  - destination row pitch in bytes (32-bit, sign-extended)
	; Scratch registers used: rax, rdx, mm0-mm7.
	global sym(vp8_short_idct4x4llm_mmx)
	sym(vp8_short_idct4x4llm_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 3
	GET_GOT rbx
	; end prolog

	mov rax, arg(0) ;input
	mov rdx, arg(1) ;output

	; Load the block: one group of four 16-bit coefficients per register.
	movq mm0, [rax ] ; ip0 = input[0..3]
	movq mm1, [rax+ 8] ; ip1 = input[4..7]

	movq mm2, [rax+16] ; ip2 = input[8..11]
	movq mm3, [rax+24] ; ip3 = input[12..15]

	movsxd rax, dword ptr arg(2) ;pitch (rax is free: all input has been loaded)

	; ---- first pass: every word lane combines ip0..ip3 independently ----
	psubw mm0, mm2 ; b1 = ip0 - ip2
	paddw mm2, mm2 ; 2 * ip2

	movq mm5, mm1
	paddw mm2, mm0 ; a1 = ip0 + ip2  (b1 + 2*ip2)

	pmulhw mm5, [x_s1sqr2 GLOBAL] ; high 16 bits of the signed product
	paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)  ("+ x" correction, see notes)

	movq mm7, mm3 ;
	pmulhw mm7, [x_c1sqr2less1 GLOBAL] ; ip3 * (constant - 1)

	paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
	psubw mm7, mm5 ; c1 = ip3*cos - ip1*sin

	movq mm5, mm1
	movq mm4, mm3 ; keep ip3 for the "+ x" correction below

	pmulhw mm5, [x_c1sqr2less1 GLOBAL]
	paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)

	pmulhw mm3, [x_s1sqr2 GLOBAL]
	paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)

	paddw mm3, mm5 ; d1 = ip1*cos + ip3*sin
	movq mm6, mm2 ; a1

	movq mm4, mm0 ; b1
	paddw mm2, mm3 ; result 0 = a1 + d1

	paddw mm4, mm7 ; result 2 = b1 + c1 (mm4 is consumed as "2x" below)
	psubw mm0, mm7 ; result 1 = b1 - c1 (mm0 is consumed as "1x" below)

	psubw mm6, mm3 ; result 3 = a1 - d1

	; ---- transpose the 4x4 word matrix held in mm2, mm0, mm4, mm6 ----
	; Lane comments list words from high to low; "rc" = result r, lane c.
	movq mm1, mm2 ; 03 02 01 00
	movq mm3, mm4 ; 23 22 21 20

	punpcklwd mm1, mm0 ; 11 01 10 00
	punpckhwd mm2, mm0 ; 13 03 12 02

	punpcklwd mm3, mm6 ; 31 21 30 20
	punpckhwd mm4, mm6 ; 33 23 32 22

	movq mm0, mm1 ; 11 01 10 00
	movq mm5, mm2 ; 13 03 12 02

	punpckldq mm0, mm3 ; 30 20 10 00
	punpckhdq mm1, mm3 ; 31 21 11 01

	punpckldq mm2, mm4 ; 32 22 12 02
	punpckhdq mm5, mm4 ; 33 23 13 03

	movq mm3, mm5 ; 33 23 13 03

	; ---- second pass: same butterfly on the transposed data (mm0..mm3) ----
	psubw mm0, mm2 ; b1 = 0 - 2
	paddw mm2, mm2 ; 2 * (2)

	movq mm5, mm1
	paddw mm2, mm0 ; a1 = 0 + 2

	pmulhw mm5, [x_s1sqr2 GLOBAL] ;
	paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)

	movq mm7, mm3 ;
	pmulhw mm7, [x_c1sqr2less1 GLOBAL] ;

	paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
	psubw mm7, mm5 ; c1 = ip3*cos - ip1*sin

	movq mm5, mm1
	movq mm4, mm3

	pmulhw mm5, [x_c1sqr2less1 GLOBAL]
	paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)

	pmulhw mm3, [x_s1sqr2 GLOBAL]
	paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)

	paddw mm3, mm5 ; d1
	paddw mm0, [fours GLOBAL] ; b1 + 4: rounding bias for the >> 3 below

	paddw mm2, [fours GLOBAL] ; a1 + 4: every output is a1 or b1 plus/minus a term
	movq mm6, mm2 ; a1

	movq mm4, mm0 ; b1
	paddw mm2, mm3 ; result 0 = a1 + d1

	paddw mm4, mm7 ; result 2 = b1 + c1
	psubw mm0, mm7 ; result 1 = b1 - c1

	psubw mm6, mm3 ; result 3 = a1 - d1
	psraw mm2, 3 ; arithmetic >> 3 on each word

	psraw mm0, 3
	psraw mm4, 3

	psraw mm6, 3

	; ---- transpose back so that each register holds one output row ----
	movq mm1, mm2 ; 03 02 01 00
	movq mm3, mm4 ; 23 22 21 20

	punpcklwd mm1, mm0 ; 11 01 10 00
	punpckhwd mm2, mm0 ; 13 03 12 02

	punpcklwd mm3, mm6 ; 31 21 30 20
	punpckhwd mm4, mm6 ; 33 23 32 22

	movq mm0, mm1 ; 11 01 10 00
	movq mm5, mm2 ; 13 03 12 02

	punpckldq mm0, mm3 ; 30 20 10 00
	punpckhdq mm1, mm3 ; 31 21 11 01

	punpckldq mm2, mm4 ; 32 22 12 02
	punpckhdq mm5, mm4 ; 33 23 13 03

	; ---- store four 8-byte rows, pitch bytes apart ----
	movq [rdx], mm0 ; output + 0*pitch

	movq [rdx+rax], mm1 ; output + 1*pitch
	movq [rdx+rax*2], mm2 ; output + 2*pitch

	add rdx, rax ; rdx = output + pitch
	movq [rdx+rax*2], mm5 ; output + 3*pitch

	; begin epilog
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret


	;void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
	;
	; DC-only variant: computes (input[0] + 4) >> 3 and writes that value to
	; all 16 positions of the 4x4 output block (four rows of four shorts,
	; consecutive rows pitch bytes apart).
	; Scratch registers used: rax, rdx, mm0.
	global sym(vp8_short_idct4x4llm_1_mmx)
	sym(vp8_short_idct4x4llm_1_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 3
	GET_GOT rbx
	; end prolog

	; Fetch arguments. The DC load has to happen before rax is
	; recycled to hold pitch.
	mov rax, arg(0) ; input
	mov rdx, arg(1) ; output
	movd mm0, [rax] ; low word = input[0]
	movsxd rax, dword ptr arg(2) ; pitch, sign-extended

	; dc = (input[0] + 4) >> 3, arithmetic shift
	paddw mm0, [fours GLOBAL]
	psraw mm0, 3

	; Replicate word 0 into all four word lanes.
	punpcklwd mm0, mm0
	punpckldq mm0, mm0

	; Rows 0..2 are addressable directly; step rdx by one pitch to
	; reach row 3 with the same [base + 2*pitch] form.
	movq [rdx], mm0
	movq [rdx+rax], mm0
	movq [rdx+rax*2], mm0
	add rdx, rax
	movq [rdx+rax*2], mm0

	; begin epilog
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret

	;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
	;
	; Adds the rounded DC value (input_dc + 4) >> 3 to a 4x4 block of
	; prediction bytes and stores the sums, clamped to 0..255, to the
	; destination. Prediction rows are pitch bytes apart; destination rows
	; are stride bytes apart. rsi and rdi are saved and restored.
	global sym(vp8_dc_only_idct_add_mmx)
	sym(vp8_dc_only_idct_add_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 5
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	; Arguments.
	movd mm5, arg(0) ; input_dc in the low word
	mov rsi, arg(1) ; prediction pointer
	mov rdi, arg(2) ; destination pointer
	movsxd rdx, dword ptr arg(3) ; pitch  (prediction row step)
	movsxd rax, dword ptr arg(4) ; stride (destination row step)
	pxor mm0, mm0 ; zero: high bytes for unpack, filler for pack

	; dc = (input_dc + 4) >> 3, then copy word 0 to all four lanes.
	paddw mm5, [fours GLOBAL]
	psraw mm5, 3
	punpcklwd mm5, mm5
	punpckldq mm5, mm5

	; Each row: load 4 bytes, widen to words, add dc with signed
	; saturation, narrow with unsigned saturation, store 4 bytes.

	; row 0
	movd mm1, [rsi]
	punpcklbw mm1, mm0
	paddsw mm1, mm5
	packuswb mm1, mm0
	movd [rdi], mm1

	; row 1
	movd mm2, [rsi+rdx]
	punpcklbw mm2, mm0
	paddsw mm2, mm5
	packuswb mm2, mm0
	movd [rdi+rax], mm2

	; Step both pointers down two rows.
	lea rsi, [rsi+2*rdx]
	lea rdi, [rdi+2*rax]

	; row 2
	movd mm3, [rsi]
	punpcklbw mm3, mm0
	paddsw mm3, mm5
	packuswb mm3, mm0
	movd [rdi], mm3

	; row 3
	movd mm4, [rsi+rdx]
	punpcklbw mm4, mm0
	paddsw mm4, mm5
	packuswb mm4, mm0
	movd [rdi+rax], mm4

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret

	; Read-only constants, each replicated across four 16-bit lanes so that
	; one 64-bit MMX operand applies the same value to every lane.
	SECTION_RODATA
	align 16
	; 0x8A8C = 35468: the 16 bit fixed point sqrt(2) * sin(pi/8) from the
	; file's header notes. Negative as a signed word, so users add x back
	; after pmulhw.
	x_s1sqr2:
	dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C
	align 16
	; 0x4E7B = 20091: per the label and the header notes, the 16 bit fixed
	; point form of sqrt(2) * cos(pi/8) - 1.
	x_c1sqr2less1:
	dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B
	align 16
	; Rounding bias added before the arithmetic shift right by 3.
	fours:
	dw 0x0004, 0x0004, 0x0004, 0x0004