vp8/common/x86/subpixel_mmx.asm - platform/external/libvpx - Git at Google

 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;


 %include "vpx_ports/x86_abi_support.asm"


 ; Build-time constants. VP8_FILTER_SHIFT is the arithmetic right shift
 ; applied after every filter sum in this file: 2^7 = 128, which is the value
 ; the taps of each filter in the tables at the end of this file add up to.
 ; BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced by the code
 ; reviewed here.
 %define BLOCK_HEIGHT_WIDTH 4
 %define vp8_filter_weight 128
 %define VP8_FILTER_SHIFT  7


 ;void vp8_filter_block1d_h6_mmx
 ;(
 ;    unsigned char   *src_ptr,
 ;    unsigned short  *output_ptr,
 ;    unsigned int    src_pixels_per_line,
 ;    unsigned int    pixel_step,
 ;    unsigned int    output_height,
 ;    unsigned int    output_width,
 ;    short           * vp8_filter
 ;)
 ;
 ; Horizontal 6-tap filter pass producing 4 output pixels per row.
 ;
 ; For every row, output pixel i (i = 0..3) is
 ;     sum over k = 0..5 of  tap[k] * src_ptr[i + k - 2]
 ; accumulated with signed-saturating 16-bit adds, plus the rounding constant
 ; at rd, arithmetically shifted right by VP8_FILTER_SHIFT and clamped to
 ; 0..255. The four results are stored as 16-bit words (8 bytes) at output_ptr.
 ;
 ;   src_ptr              bytes src_ptr-2 .. src_ptr+6 are read on every row;
 ;                        advanced by src_pixels_per_line bytes per row.
 ;   output_ptr           advanced by output_width bytes per row.
 ;   pixel_step           not read by this routine.
 ;   output_height        row count; must be non-zero because the loop tests
 ;                        the counter only after the first row (dec / jnz).
 ;   vp8_filter           tap k is read as 4 words at byte offset 16*k.
 ;
 ; Register roles in the loop:
 ;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4 (taps 0 and 5 are multiplied
 ;   straight from memory), mm3 = accumulator, mm4/mm5 = scratch,
 ;   rsi = source row, rdi = destination row, rcx = rows left,
 ;   rax = destination stride, rdx = vp8_filter.
 global sym(vp8_filter_block1d_h6_mmx)
 sym(vp8_filter_block1d_h6_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

         mov         rdx,    arg(6) ;vp8_filter

         movq        mm1,    [rdx + 16]             ; mm1 = tap 1 (4 words)
         movq        mm2,    [rdx + 32]         ; mm2 = tap 2
         movq        mm6,    [rdx + 48]        ; mm6 = tap 3
         movq        mm7,    [rdx + 64]        ; mm7 = tap 4

         mov         rdi,    arg(1) ;output_ptr
         mov         rsi,    arg(0) ;src_ptr
         movsxd      rcx,    dword ptr arg(4) ;output_height
         movsxd      rax,    dword ptr arg(5) ;output_width, used as the destination stride in bytes
         pxor        mm0,    mm0              ; mm0 = 0, the zero half for byte->word unpacking

 nextrow:
         movq        mm3,    [rsi-2]          ; mm3 = bytes p-2..p5
         movq        mm4,    mm3              ; mm4 = p-2..p5 (copy)
         psrlq       mm3,    8                ; drop p-2: mm3 = p-1..p5
         punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 as words
         pmullw      mm3,    mm1              ; mm3 = tap 1 * p-1..p2

         movq        mm5,    mm4              ; mm5 = p-2..p5 (copy)
         punpckhbw   mm4,    mm0              ; mm4 = p2..p5 as words
         pmullw      mm4,    mm7              ; mm4 *= tap 4
         paddsw      mm3,    mm4              ; mm3 += mm4 (signed saturating)

         movq        mm4,    mm5              ; mm4 = p-2..p5 (copy)
         psrlq       mm5,    16               ; mm5 = p0..p5
         punpcklbw   mm5,    mm0              ; mm5 = p0..p3 as words
         pmullw      mm5,    mm2              ; mm5 *= tap 2
         paddsw      mm3,    mm5              ; mm3 += mm5

         movq        mm5,    mm4              ; mm5 = p-2..p5 (kept for tap 0 below)
         psrlq       mm4,    24               ; mm4 = p1..p5
         punpcklbw   mm4,    mm0              ; mm4 = p1..p4 as words
         pmullw      mm4,    mm6              ; mm4 *= tap 3
         paddsw      mm3,    mm4              ; mm3 += mm4

         ; outer taps (5 and 0), multiplied directly from the filter in memory
         movd        mm4,    [rsi+3]          ; mm4 = bytes p3..p6
         punpcklbw   mm4,    mm0              ; mm4 = p3..p6 as words
         pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
         paddsw      mm3,    mm4              ; mm3 += mm4

         punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 as words
         pmullw      mm5,    [rdx]            ; mm5 *= tap 0
         paddsw      mm3,    mm5              ; mm3 += mm5

         paddsw      mm3,    [rd GLOBAL]               ; mm3 += rounding constant (0x40 per word)
         psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic)
         packuswb    mm3,    mm0              ; clamp each word to 0..255 ...
         punpcklbw   mm3,    mm0              ; ... and widen back to 4 words

         movq        [rdi],  mm3              ; store 4 words to the destination row

 %if ABI_IS_32BIT
         add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next source row
         add         rdi,    rax              ; next destination row
 %else
         movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
         add         rdi,    rax              ; next destination row

         add         rsi,    r8               ; next source row
 %endif

         dec         rcx                      ; one row done
         jnz         nextrow                  ; loop while rows remain

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;
 ; THIS FUNCTION APPEARS TO BE UNUSED
 ;
 ;void vp8_filter_block1d_v6_mmx
 ;(
 ;   short *src_ptr,
 ;   unsigned char *output_ptr,
 ;   unsigned int pixels_per_line,
 ;   unsigned int pixel_step,
 ;   unsigned int output_height,
 ;   unsigned int output_width,
 ;   short * vp8_filter
 ;)
 ;
 ; Vertical 6-tap filter pass over 16-bit input, 4 output pixels per row.
 ;
 ; For every output row, 4 words are read from each of six consecutive source
 ; rows (rows -2..+3 relative to the current one), multiplied by taps 0..5,
 ; summed with signed-saturating adds, rounded with the constant at rd,
 ; arithmetically shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and
 ; stored as 4 bytes at output_ptr.
 ;
 ;   pixels_per_line      added directly to the byte address of src_ptr, i.e.
 ;                        it is used as the distance in bytes between rows.
 ;   output_ptr           advanced by output_width bytes per row.
 ;   pixel_step           not read by this routine.
 ;   output_height        row count; must be non-zero (dec / jnz loop).
 ;   vp8_filter           tap k is read as 4 words at byte offset 16*k.
 ;
 ; Register roles in the loop:
 ;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = rounding constant,
 ;   mm3 = accumulator, mm4 = scratch, rbx = vp8_filter, rdx = row stride,
 ;   rsi = address of row -2, rdi = destination, rcx = rows left,
 ;   rax = destination stride.
 global sym(vp8_filter_block1d_v6_mmx)
 sym(vp8_filter_block1d_v6_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

         ; The rounding constant is fetched before rbx is reused below.
         ; NOTE(review): presumably because [rd GLOBAL] can address through the
         ; register set up by GET_GOT rbx - confirm in x86_abi_support.asm.
         movq      mm5, [rd GLOBAL]
         push        rbx                   ; saved here, restored before the epilog
         mov         rbx, arg(6) ;vp8_filter
         movq      mm1, [rbx + 16]             ; mm1 = tap 1 (4 words)
         movq      mm2, [rbx + 32]         ; mm2 = tap 2
         movq      mm6, [rbx + 48]        ; mm6 = tap 3
         movq      mm7, [rbx + 64]        ; mm7 = tap 4

         movsxd      rdx, dword ptr arg(2) ;pixels_per_line
         mov         rdi, arg(1) ;output_ptr
         mov         rsi, arg(0) ;src_ptr
         sub         rsi, rdx              ; step back two rows so that
         sub         rsi, rdx              ; rsi addresses row -2
         movsxd      rcx, DWORD PTR arg(4) ;output_height
         movsxd      rax, DWORD PTR arg(5) ;output_width, used as the destination stride in bytes
         pxor        mm0, mm0              ; mm0 = 0


 nextrow_v:
         movq        mm3, [rsi+rdx]        ; mm3 = 4 words of row -1
         pmullw      mm3, mm1              ; mm3 *= tap 1


         movq        mm4, [rsi + 4*rdx]      ; mm4 = 4 words of row 2
         pmullw      mm4, mm7              ; mm4 *= tap 4
         paddsw      mm3, mm4              ; mm3 += mm4 (signed saturating)

         movq        mm4, [rsi + 2*rdx]           ; mm4 = 4 words of row 0
         pmullw      mm4, mm2              ; mm4 *= tap 2
         paddsw      mm3, mm4              ; mm3 += mm4

         movq        mm4, [rsi]            ; mm4 = 4 words of row -2
         pmullw      mm4, [rbx]            ; mm4 *= tap 0 (from memory)
         paddsw      mm3, mm4              ; mm3 += mm4


         ; Advancing rsi by one row here avoids needing 3*pitch and 5*pitch
         ; addressing, and doubles as the per-iteration source advance.
         add         rsi, rdx              ; rsi now addresses row -1
         movq        mm4, [rsi + 2*rdx]     ; mm4 = 4 words of row 1
         pmullw      mm4, mm6              ; mm4 *= tap 3
         paddsw      mm3, mm4              ; mm3 += mm4

         movq        mm4, [rsi + 4*rdx]    ; mm4 = 4 words of row 3
         pmullw      mm4, [rbx +80]        ; mm4 *= tap 5 (from memory)
         paddsw      mm3, mm4              ; mm3 += mm4


         paddsw      mm3, mm5               ; mm3 += rounding constant
         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic)
         packuswb    mm3, mm0              ; clamp to 0..255, results in the low 4 bytes

         movd        [rdi],mm3             ; store 4 bytes to the destination row

         add         rdi,rax               ; next destination row

         dec         rcx                   ; one row done
         jnz         nextrow_v             ; loop while rows remain

         pop         rbx                   ; matches the push after the prolog

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;void vp8_filter_block1dc_v6_mmx
 ;(
 ;   short *src_ptr,
 ;   unsigned char *output_ptr,
 ;    int output_pitch,
 ;   unsigned int pixels_per_line,
 ;   unsigned int pixel_step,
 ;   unsigned int output_height,
 ;   unsigned int output_width,
 ;   short * vp8_filter
 ;)
 ;
 ; Vertical 6-tap filter pass over 16-bit input, 4 output pixels per row,
 ; with an explicit destination pitch.
 ;
 ; The arithmetic is the same as in vp8_filter_block1d_v6_mmx: 4 words from
 ; each of six consecutive source rows (rows -2..+3) are multiplied by taps
 ; 0..5, summed with signed-saturating adds, rounded with the constant at rd,
 ; shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and stored as 4 bytes.
 ;
 ;   output_pitch         destination advance in bytes per row.
 ;   pixels_per_line      added directly to the byte address of src_ptr, i.e.
 ;                        used as the distance in bytes between source rows.
 ;   pixel_step           not read by this routine.
 ;   output_height        row count; must be non-zero (dec / jnz loop).
 ;   output_width         not read by this routine.
 ;   vp8_filter           tap k is read as 4 words at byte offset 16*k.
 ;
 ; Register roles in the loop:
 ;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = rounding constant,
 ;   mm3 = accumulator, mm4 = scratch, rbx = vp8_filter, rdx = row stride,
 ;   rsi = address of row -2, rdi = destination, rcx = rows left,
 ;   rax = output_pitch.
 global sym(vp8_filter_block1dc_v6_mmx)
 sym(vp8_filter_block1dc_v6_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

         ; The rounding constant is fetched before rbx is reused below.
         ; NOTE(review): presumably because [rd GLOBAL] can address through the
         ; register set up by GET_GOT rbx - confirm in x86_abi_support.asm.
         movq      mm5, [rd GLOBAL]
         push        rbx                   ; saved here, restored before the epilog
         mov         rbx, arg(7) ;vp8_filter
         movq      mm1, [rbx + 16]             ; mm1 = tap 1 (4 words)
         movq      mm2, [rbx + 32]         ; mm2 = tap 2
         movq      mm6, [rbx + 48]        ; mm6 = tap 3
         movq      mm7, [rbx + 64]        ; mm7 = tap 4

         movsxd      rdx, dword ptr arg(3) ;pixels_per_line
         mov         rdi, arg(1) ;output_ptr
         mov         rsi, arg(0) ;src_ptr
         sub         rsi, rdx              ; step back two rows so that
         sub         rsi, rdx              ; rsi addresses row -2
         movsxd      rcx, DWORD PTR arg(5) ;output_height
         movsxd      rax, DWORD PTR arg(2) ;output_pitch, the destination stride in bytes
         pxor        mm0, mm0              ; mm0 = 0


 nextrow_cv:
         movq        mm3, [rsi+rdx]        ; mm3 = 4 words of row -1
         pmullw      mm3, mm1              ; mm3 *= tap 1


         movq        mm4, [rsi + 4*rdx]      ; mm4 = 4 words of row 2
         pmullw      mm4, mm7              ; mm4 *= tap 4
         paddsw      mm3, mm4              ; mm3 += mm4 (signed saturating)

         movq        mm4, [rsi + 2*rdx]           ; mm4 = 4 words of row 0
         pmullw      mm4, mm2              ; mm4 *= tap 2
         paddsw      mm3, mm4              ; mm3 += mm4

         movq        mm4, [rsi]            ; mm4 = 4 words of row -2
         pmullw      mm4, [rbx]            ; mm4 *= tap 0 (from memory)
         paddsw      mm3, mm4              ; mm3 += mm4


         ; Advancing rsi by one row here avoids needing 3*pitch and 5*pitch
         ; addressing, and doubles as the per-iteration source advance.
         add         rsi, rdx              ; rsi now addresses row -1
         movq        mm4, [rsi + 2*rdx]     ; mm4 = 4 words of row 1
         pmullw      mm4, mm6              ; mm4 *= tap 3
         paddsw      mm3, mm4              ; mm3 += mm4

         movq        mm4, [rsi + 4*rdx]    ; mm4 = 4 words of row 3
         pmullw      mm4, [rbx +80]        ; mm4 *= tap 5 (from memory)
         paddsw      mm3, mm4              ; mm3 += mm4


         paddsw      mm3, mm5               ; mm3 += rounding constant
         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic)
         packuswb    mm3, mm0              ; clamp to 0..255, results in the low 4 bytes

         movd        [rdi],mm3             ; store 4 bytes to the destination row
         ; Original author's note: the subsequent iterations repeat most of
         ; the source reads made above. Since the recon block should be in
         ; cache this shouldn't cost much, but it is obviously avoidable.
         lea         rdi,  [rdi+rax] ; next destination row
         dec         rcx                   ; one row done
         jnz         nextrow_cv             ; loop while rows remain

         pop         rbx                   ; matches the push after the prolog

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;void vp8_bilinear_predict8x8_mmx
 ;(
 ;    unsigned char  *src_ptr,
 ;    int   src_pixels_per_line,
 ;    int  xoffset,
 ;    int  yoffset,
 ;   unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
 ;
 ; Two-pass bilinear prediction of an 8-pixel-wide, 8-row block.
 ;
 ; HFilter and VFilter are the 32-byte entries xoffset and yoffset of
 ; vp8_bilinear_filters_mmx (tap 0 at +0, tap 1 at +16). With
 ;     h[r][i] = (src[r][i]*HFilter[0] + src[r][i+1]*HFilter[1] + 64) >> 7
 ; every stored byte is
 ;     (h[r][i]*VFilter[0] + h[r+1][i]*VFilter[1] + 64) >> 7
 ; clamped to 0..255. The previous row's h values are carried between
 ; iterations as packed bytes in mm7.
 ;
 ; Nine source rows of nine bytes each are read. The loop ends when the
 ; destination pointer equals dst_ptr + 8*dst_pitch (compared with jne), so
 ; dst_pitch must be non-zero.
 ;
 ; Register roles in the loop:
 ;   mm0 = 0, mm1/mm2 = HFilter taps 0/1, mm7 = previous filtered row (bytes),
 ;   rax = VFilter, rsi = source row, rdx = source stride,
 ;   rdi = destination row, rcx = destination end address.
 global sym(vp8_bilinear_predict8x8_mmx)
 sym(vp8_bilinear_predict8x8_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

     ;const short *HFilter = bilinear_filters_mmx[xoffset];
     ;const short *VFilter = bilinear_filters_mmx[yoffset];

         movsxd      rax,        dword ptr arg(2) ;xoffset
         mov         rdi,        arg(4) ;dst_ptr

         shl         rax,        5 ; xoffset * 32 bytes per table entry
         lea         rcx,        [sym(vp8_bilinear_filters_mmx) GLOBAL]

         add         rax,        rcx ; rax = HFilter
         mov         rsi,        arg(0) ;src_ptr

         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
         movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)

         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
         movsxd      rax,        dword ptr arg(3) ;yoffset

         pxor        mm0,        mm0                 ; mm0 = 0

         shl         rax,        5 ; yoffset * 32 bytes per table entry
         add         rax,        rcx ; rax = VFilter

         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8 * dst_pitch (loop end)
         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)


         ; horizontal pass over the first source row
         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
         movq        mm4,        mm3                 ; mm4 = copy of the row

         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words

         pmullw      mm3,        mm1                 ; *= HFilter tap 0
         pmullw      mm4,        mm1                 ;

         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
         movq        mm6,        mm5                 ; mm6 = copy

         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words

         pmullw      mm5,        mm2                 ; *= HFilter tap 1
         pmullw      mm6,        mm2                 ;

         paddw       mm3,        mm5                 ; mm3/mm4 = horizontal sums
         paddw       mm4,        mm6                 ;

         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         paddw       mm4,        [rd GLOBAL]                  ; mm4 += rounding constant
         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7

         movq        mm7,        mm3                 ; mm7 = first filtered row,
         packuswb    mm7,        mm4                 ; packed to 8 bytes

         add         rsi,        rdx                 ; next source row
 next_row_8x8:
         ; horizontal pass over the current source row
         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
         movq        mm4,        mm3                 ; mm4 = copy of the row

         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words

         pmullw      mm3,        mm1                 ; *= HFilter tap 0
         pmullw      mm4,        mm1                 ;

         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
         movq        mm6,        mm5                 ; mm6 = copy

         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words

         pmullw      mm5,        mm2                 ; *= HFilter tap 1
         pmullw      mm6,        mm2                 ;

         paddw       mm3,        mm5                 ; mm3/mm4 = horizontal sums
         paddw       mm4,        mm6                 ;

         ; previous filtered row * VFilter tap 0
         movq        mm5,        mm7                 ; mm5 = previous row (bytes)
         movq        mm6,        mm7                 ; mm6 = previous row (bytes)

         punpcklbw   mm5,        mm0                 ; mm5 = previous row, pixels 0..3 as words
         punpckhbw   mm6,        mm0                 ; mm6 = previous row, pixels 4..7 as words

         pmullw      mm5,        [rax]               ; *= VFilter tap 0
         pmullw      mm6,        [rax]               ;

         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         paddw       mm4,        [rd GLOBAL]                  ; mm4 += rounding constant
         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7

         movq        mm7,        mm3                 ; keep the current filtered row,
         packuswb    mm7,        mm4                 ; packed to bytes, for the next iteration


         ; current filtered row * VFilter tap 1, plus the previous-row product
         pmullw      mm3,        [rax+16]            ; *= VFilter tap 1
         pmullw      mm4,        [rax+16]            ;

         paddw       mm3,        mm5                 ; mm3/mm4 = vertical sums
         paddw       mm4,        mm6                 ;


         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         paddw       mm4,        [rd GLOBAL]                  ; mm4 += rounding constant
         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7

         packuswb    mm3,        mm4                 ; clamp to 0..255 and pack to 8 bytes

         movq        [rdi],      mm3                 ; store 8 bytes to the destination row

 %if ABI_IS_32BIT
         add         rsi,        rdx                 ; next source row
         add         rdi,        dword ptr arg(5) ;dst_pitch: next destination row
 %else
         movsxd      r8,         dword ptr arg(5) ;dst_pitch
         add         rsi,        rdx                 ; next source row
         add         rdi,        r8                  ; next destination row
 %endif
         cmp         rdi,        rcx                 ; reached dst_ptr + 8 * dst_pitch?
         jne         next_row_8x8

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;void vp8_bilinear_predict8x4_mmx
 ;(
 ;    unsigned char  *src_ptr,
 ;    int   src_pixels_per_line,
 ;    int  xoffset,
 ;    int  yoffset,
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
 ;
 ; Two-pass bilinear prediction of an 8-pixel-wide, 4-row block.
 ;
 ; HFilter and VFilter are the 32-byte entries xoffset and yoffset of
 ; vp8_bilinear_filters_mmx (tap 0 at +0, tap 1 at +16). With
 ;     h[r][i] = (src[r][i]*HFilter[0] + src[r][i+1]*HFilter[1] + 64) >> 7
 ; every stored byte is
 ;     (h[r][i]*VFilter[0] + h[r+1][i]*VFilter[1] + 64) >> 7
 ; clamped to 0..255. The previous row's h values are carried between
 ; iterations as packed bytes in mm7.
 ;
 ; Five source rows of nine bytes each are read. The loop ends when the
 ; destination pointer equals dst_ptr + 4*dst_pitch (compared with jne), so
 ; dst_pitch must be non-zero.
 ;
 ; Register roles in the loop:
 ;   mm0 = 0, mm1/mm2 = HFilter taps 0/1, mm7 = previous filtered row (bytes),
 ;   rax = VFilter, rsi = source row, rdx = source stride,
 ;   rdi = destination row, rcx = destination end address.
 global sym(vp8_bilinear_predict8x4_mmx)
 sym(vp8_bilinear_predict8x4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

     ;const short *HFilter = bilinear_filters_mmx[xoffset];
     ;const short *VFilter = bilinear_filters_mmx[yoffset];

         movsxd      rax,        dword ptr arg(2) ;xoffset
         mov         rdi,        arg(4) ;dst_ptr

         lea         rcx,        [sym(vp8_bilinear_filters_mmx) GLOBAL]
         shl         rax,        5                   ; xoffset * 32 bytes per table entry

         mov         rsi,        arg(0) ;src_ptr
         add         rax,        rcx                 ; rax = HFilter

         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
         movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)

         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
         movsxd      rax,        dword ptr arg(3) ;yoffset

         pxor        mm0,        mm0                 ; mm0 = 0
         shl         rax,        5                   ; yoffset * 32 bytes per table entry

         add         rax,        rcx                 ; rax = VFilter
         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4 * dst_pitch (loop end)

         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)

         ; horizontal pass over the first source row
         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
         movq        mm4,        mm3                 ; mm4 = copy of the row

         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words

         pmullw      mm3,        mm1                 ; *= HFilter tap 0
         pmullw      mm4,        mm1                 ;

         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
         movq        mm6,        mm5                 ; mm6 = copy

         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words

         pmullw      mm5,        mm2                 ; *= HFilter tap 1
         pmullw      mm6,        mm2                 ;

         paddw       mm3,        mm5                 ; mm3/mm4 = horizontal sums
         paddw       mm4,        mm6                 ;

         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         paddw       mm4,        [rd GLOBAL]                  ; mm4 += rounding constant
         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7

         movq        mm7,        mm3                 ; mm7 = first filtered row,
         packuswb    mm7,        mm4                 ; packed to 8 bytes

         add         rsi,        rdx                 ; next source row
 next_row_8x4:
         ; horizontal pass over the current source row
         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
         movq        mm4,        mm3                 ; mm4 = copy of the row

         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words

         pmullw      mm3,        mm1                 ; *= HFilter tap 0
         pmullw      mm4,        mm1                 ;

         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
         movq        mm6,        mm5                 ; mm6 = copy

         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words

         pmullw      mm5,        mm2                 ; *= HFilter tap 1
         pmullw      mm6,        mm2                 ;

         paddw       mm3,        mm5                 ; mm3/mm4 = horizontal sums
         paddw       mm4,        mm6                 ;

         ; previous filtered row * VFilter tap 0
         movq        mm5,        mm7                 ; mm5 = previous row (bytes)
         movq        mm6,        mm7                 ; mm6 = previous row (bytes)

         punpcklbw   mm5,        mm0                 ; mm5 = previous row, pixels 0..3 as words
         punpckhbw   mm6,        mm0                 ; mm6 = previous row, pixels 4..7 as words

         pmullw      mm5,        [rax]               ; *= VFilter tap 0
         pmullw      mm6,        [rax]               ;

         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         paddw       mm4,        [rd GLOBAL]                  ; mm4 += rounding constant
         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7

         movq        mm7,        mm3                 ; keep the current filtered row,
         packuswb    mm7,        mm4                 ; packed to bytes, for the next iteration


         ; current filtered row * VFilter tap 1, plus the previous-row product
         pmullw      mm3,        [rax+16]            ; *= VFilter tap 1
         pmullw      mm4,        [rax+16]            ;

         paddw       mm3,        mm5                 ; mm3/mm4 = vertical sums
         paddw       mm4,        mm6                 ;


         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         paddw       mm4,        [rd GLOBAL]                  ; mm4 += rounding constant
         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7

         packuswb    mm3,        mm4                 ; clamp to 0..255 and pack to 8 bytes

         movq        [rdi],      mm3                 ; store 8 bytes to the destination row

 %if ABI_IS_32BIT
         add         rsi,        rdx                 ; next source row
         add         rdi,        dword ptr arg(5) ;dst_pitch: next destination row
 %else
         movsxd      r8,         dword ptr arg(5) ;dst_pitch
         add         rsi,        rdx                 ; next source row
         add         rdi,        r8                  ; next destination row
 %endif
         cmp         rdi,        rcx                 ; reached dst_ptr + 4 * dst_pitch?
         jne         next_row_8x4

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;void vp8_bilinear_predict4x4_mmx
 ;(
 ;    unsigned char  *src_ptr,
 ;    int   src_pixels_per_line,
 ;    int  xoffset,
 ;    int  yoffset,
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
 ;
 ; Two-pass bilinear prediction of a 4-pixel-wide, 4-row block.
 ;
 ; HFilter and VFilter are the 32-byte entries xoffset and yoffset of
 ; vp8_bilinear_filters_mmx (tap 0 at +0, tap 1 at +16). With
 ;     h[r][i] = (src[r][i]*HFilter[0] + src[r][i+1]*HFilter[1] + 64) >> 7
 ; every stored byte is
 ;     (h[r][i]*VFilter[0] + h[r+1][i]*VFilter[1] + 64) >> 7
 ; clamped to 0..255. The previous row's h values are carried between
 ; iterations as packed bytes in the low half of mm7.
 ;
 ; Five source rows of five bytes each are read. The loop ends when the
 ; destination pointer equals dst_ptr + 4*dst_pitch (compared with jne), so
 ; dst_pitch must be non-zero.
 ;
 ; Register roles in the loop:
 ;   mm0 = 0, mm1/mm2 = HFilter taps 0/1, mm7 = previous filtered row (bytes),
 ;   rax = VFilter, rsi = source row, rdx = source stride,
 ;   rdi = destination row, rcx = destination end address.
 global sym(vp8_bilinear_predict4x4_mmx)
 sym(vp8_bilinear_predict4x4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

     ;const short *HFilter = bilinear_filters_mmx[xoffset];
     ;const short *VFilter = bilinear_filters_mmx[yoffset];

         movsxd      rax,        dword ptr arg(2) ;xoffset
         mov         rdi,        arg(4) ;dst_ptr

         lea         rcx,        [sym(vp8_bilinear_filters_mmx) GLOBAL]
         shl         rax,        5                   ; xoffset * 32 bytes per table entry

         add         rax,        rcx ; rax = HFilter
         mov         rsi,        arg(0) ;src_ptr

         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
         movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)

         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
         movsxd      rax,        dword ptr arg(3) ;yoffset

         pxor        mm0,        mm0                 ; mm0 = 0
         shl         rax,        5                   ; yoffset * 32 bytes per table entry

         add         rax,        rcx                 ; rax = VFilter
         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4 * dst_pitch (loop end)

         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)

         ; horizontal pass over the first source row
         movd        mm3,        [rsi]               ; mm3 = source bytes 0..3
         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words

         pmullw      mm3,        mm1                 ; *= HFilter tap 0
         movd        mm5,        [rsi+1]             ; mm5 = source bytes 1..4

         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
         pmullw      mm5,        mm2                 ; *= HFilter tap 1

         paddw       mm3,        mm5                 ; mm3 = horizontal sums
         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant

         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         movq        mm7,        mm3                 ; mm7 = first filtered row,
         packuswb    mm7,        mm0                 ; packed to 4 bytes

         add         rsi,        rdx                 ; next source row
 next_row_4x4:
         ; horizontal pass over the current source row
         movd        mm3,        [rsi]               ; mm3 = source bytes 0..3
         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words

         pmullw      mm3,        mm1                 ; *= HFilter tap 0
         movd        mm5,        [rsi+1]             ; mm5 = source bytes 1..4

         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
         pmullw      mm5,        mm2                 ; *= HFilter tap 1

         paddw       mm3,        mm5                 ; mm3 = horizontal sums

         ; previous filtered row * VFilter tap 0
         movq        mm5,        mm7                 ; mm5 = previous row (bytes)
         punpcklbw   mm5,        mm0                 ; mm5 = previous row as words

         pmullw      mm5,        [rax]               ; *= VFilter tap 0
         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant

         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
         movq        mm7,        mm3                 ; keep the current filtered row,

         packuswb    mm7,        mm0                 ; packed to bytes, for the next iteration

         ; current filtered row * VFilter tap 1, plus the previous-row product
         pmullw      mm3,        [rax+16]            ; *= VFilter tap 1
         paddw       mm3,        mm5                 ; mm3 = vertical sums


         paddw       mm3,        [rd GLOBAL]                  ; mm3 += rounding constant
         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

         packuswb    mm3,        mm0                 ; clamp to 0..255, results in the low 4 bytes
         movd        [rdi],      mm3                 ; store 4 bytes to the destination row

 %if ABI_IS_32BIT
         add         rsi,        rdx                 ; next source row
         add         rdi,        dword ptr arg(5) ;dst_pitch: next destination row
 %else
         movsxd      r8,         dword ptr arg(5) ;dst_pitch
         add         rsi,        rdx                 ; next source row
         add         rdi,        r8                  ; next destination row
 %endif

         cmp         rdi,        rcx                 ; reached dst_ptr + 4 * dst_pitch?
         jne         next_row_4x4

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret


 SECTION_RODATA
 ; Rounding constant added before every ">> VP8_FILTER_SHIFT" in this file:
 ; four words of 0x40 (64, i.e. half of 128).
 align 16
 rd:
     times 4 dw 0x40

 ; Six-tap filter table: 8 filters x 6 taps. Every tap value is replicated
 ; into 8 consecutive words (16 bytes), so one filter occupies 96 bytes and
 ; tap k of a filter sits at byte offset 16*k. The taps of each filter add up
 ; to 128.
 ; NOTE(review): the six-tap routines in this file take their filter through
 ; a pointer argument and read taps at a 16-byte stride, which matches this
 ; layout - confirm against the callers that they pass entries of this table.
 align 16
 global HIDDEN_DATA(sym(vp8_six_tap_mmx))
 sym(vp8_six_tap_mmx):
     ; filter 0
     times 8 dw 0
     times 8 dw 0
     times 8 dw 128
     times 8 dw 0
     times 8 dw 0
     times 8 dw 0

     ; filter 1
     times 8 dw 0
     times 8 dw -6
     times 8 dw 123
     times 8 dw 12
     times 8 dw -1
     times 8 dw 0

     ; filter 2
     times 8 dw 2
     times 8 dw -11
     times 8 dw 108
     times 8 dw 36
     times 8 dw -8
     times 8 dw 1

     ; filter 3
     times 8 dw 0
     times 8 dw -9
     times 8 dw 93
     times 8 dw 50
     times 8 dw -6
     times 8 dw 0

     ; filter 4
     times 8 dw 3
     times 8 dw -16
     times 8 dw 77
     times 8 dw 77
     times 8 dw -16
     times 8 dw 3

     ; filter 5
     times 8 dw 0
     times 8 dw -6
     times 8 dw 50
     times 8 dw 93
     times 8 dw -9
     times 8 dw 0

     ; filter 6
     times 8 dw 1
     times 8 dw -8
     times 8 dw 36
     times 8 dw 108
     times 8 dw -11
     times 8 dw 2

     ; filter 7
     times 8 dw 0
     times 8 dw -1
     times 8 dw 12
     times 8 dw 123
     times 8 dw -6
     times 8 dw 0


 ; Bilinear filter table: 8 filters x 2 taps, each tap replicated into 8 words
 ; (16 bytes), so one filter occupies 32 bytes. The bilinear routines in this
 ; file index it as table + offset*32 and read 4 words of tap 0 at +0 and of
 ; tap 1 at +16. Each pair of taps adds up to 128.
 align 16
 global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
 sym(vp8_bilinear_filters_mmx):
     ; offset 0
     times 8 dw 128
     times 8 dw 0

     ; offset 1
     times 8 dw 112
     times 8 dw 16

     ; offset 2
     times 8 dw 96
     times 8 dw 32

     ; offset 3
     times 8 dw 80
     times 8 dw 48

     ; offset 4
     times 8 dw 64
     times 8 dw 64

     ; offset 5
     times 8 dw 48
     times 8 dw 80

     ; offset 6
     times 8 dw 32
     times 8 dw 96

     ; offset 7
     times 8 dw 16
     times 8 dw 112
	;
	; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
	;
	; Use of this source code is governed by a BSD-style license
	; that can be found in the LICENSE file in the root of the source
	; tree. An additional intellectual property rights grant can be found
	; in the file PATENTS. All contributing project authors may
	; be found in the AUTHORS file in the root of the source tree.
	;


	%include "vpx_ports/x86_abi_support.asm"


	; Build-time constants. VP8_FILTER_SHIFT is the arithmetic right shift
	; applied after every filter sum below (2^7 = 128).
	; NOTE(review): these three definitions repeat, with identical values, the
	; ones made earlier in this file - the file content appears duplicated.
	%define BLOCK_HEIGHT_WIDTH 4
	%define vp8_filter_weight 128
	%define VP8_FILTER_SHIFT 7


	;void vp8_filter_block1d_h6_mmx
	;(
	; unsigned char *src_ptr,
	; unsigned short *output_ptr,
	; unsigned int src_pixels_per_line,
	; unsigned int pixel_step,
	; unsigned int output_height,
	; unsigned int output_width,
	; short * vp8_filter
	;)
	;
	; NOTE(review): this symbol and the label "nextrow" are already defined
	; earlier in this file; the content from here on looks like a second copy
	; of the same source and would clash if both copies were assembled.
	;
	; Horizontal 6-tap filter pass producing 4 output pixels per row.
	;
	; For every row, output pixel i (i = 0..3) is
	;     sum over k = 0..5 of  tap[k] * src_ptr[i + k - 2]
	; accumulated with signed-saturating 16-bit adds, plus the rounding
	; constant at rd, arithmetically shifted right by VP8_FILTER_SHIFT and
	; clamped to 0..255. The four results are stored as 16-bit words (8 bytes)
	; at output_ptr.
	;
	;   src_ptr              bytes src_ptr-2 .. src_ptr+6 are read on every
	;                        row; advanced by src_pixels_per_line per row.
	;   output_ptr           advanced by output_width bytes per row.
	;   pixel_step           not read by this routine.
	;   output_height        row count; must be non-zero (dec / jnz loop).
	;   vp8_filter           tap k is read as 4 words at byte offset 16*k.
	;
	; Register roles in the loop:
	;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4 (taps 0 and 5 are multiplied
	;   straight from memory), mm3 = accumulator, mm4/mm5 = scratch,
	;   rsi = source row, rdi = destination row, rcx = rows left,
	;   rax = destination stride, rdx = vp8_filter.
	global sym(vp8_filter_block1d_h6_mmx)
	sym(vp8_filter_block1d_h6_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 7
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	mov rdx, arg(6) ;vp8_filter

	movq mm1, [rdx + 16] ; mm1 = tap 1 (4 words)
	movq mm2, [rdx + 32] ; mm2 = tap 2
	movq mm6, [rdx + 48] ; mm6 = tap 3
	movq mm7, [rdx + 64] ; mm7 = tap 4

	mov rdi, arg(1) ;output_ptr
	mov rsi, arg(0) ;src_ptr
	movsxd rcx, dword ptr arg(4) ;output_height
	movsxd rax, dword ptr arg(5) ;output_width, used as the destination stride in bytes
	pxor mm0, mm0 ; mm0 = 0, the zero half for byte->word unpacking

	nextrow:
	movq mm3, [rsi-2] ; mm3 = bytes p-2..p5
	movq mm4, mm3 ; mm4 = p-2..p5 (copy)
	psrlq mm3, 8 ; drop p-2: mm3 = p-1..p5
	punpcklbw mm3, mm0 ; mm3 = p-1..p2 as words
	pmullw mm3, mm1 ; mm3 = tap 1 * p-1..p2

	movq mm5, mm4 ; mm5 = p-2..p5 (copy)
	punpckhbw mm4, mm0 ; mm4 = p2..p5 as words
	pmullw mm4, mm7 ; mm4 *= tap 4
	paddsw mm3, mm4 ; mm3 += mm4 (signed saturating)

	movq mm4, mm5 ; mm4 = p-2..p5 (copy)
	psrlq mm5, 16 ; mm5 = p0..p5
	punpcklbw mm5, mm0 ; mm5 = p0..p3 as words
	pmullw mm5, mm2 ; mm5 *= tap 2
	paddsw mm3, mm5 ; mm3 += mm5

	movq mm5, mm4 ; mm5 = p-2..p5 (kept for tap 0 below)
	psrlq mm4, 24 ; mm4 = p1..p5
	punpcklbw mm4, mm0 ; mm4 = p1..p4 as words
	pmullw mm4, mm6 ; mm4 *= tap 3
	paddsw mm3, mm4 ; mm3 += mm4

	; outer taps (5 and 0), multiplied directly from the filter in memory
	movd mm4, [rsi+3] ; mm4 = bytes p3..p6
	punpcklbw mm4, mm0 ; mm4 = p3..p6 as words
	pmullw mm4, [rdx+80] ; mm4 *= tap 5
	paddsw mm3, mm4 ; mm3 += mm4

	punpcklbw mm5, mm0 ; mm5 = p-2..p1 as words
	pmullw mm5, [rdx] ; mm5 *= tap 0
	paddsw mm3, mm5 ; mm3 += mm5

	paddsw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
	packuswb mm3, mm0 ; clamp each word to 0..255 ...
	punpcklbw mm3, mm0 ; ... and widen back to 4 words

	movq [rdi], mm3 ; store 4 words to the destination row

	%if ABI_IS_32BIT
	add rsi, dword ptr arg(2) ;src_pixels_per_line ; next source row
	add rdi, rax ; next destination row
	%else
	movsxd r8, dword ptr arg(2) ;src_pixels_per_line
	add rdi, rax ; next destination row

	add rsi, r8 ; next source row
	%endif

	dec rcx ; one row done
	jnz nextrow ; loop while rows remain

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret


	;
	; THIS FUNCTION APPEARS TO BE UNUSED
	;
	;void vp8_filter_block1d_v6_mmx
	;(
	; short *src_ptr,
	; unsigned char *output_ptr,
	; unsigned int pixels_per_line,
	; unsigned int pixel_step,
	; unsigned int output_height,
	; unsigned int output_width,
	; short * vp8_filter
	;)
	;
	; NOTE(review): this symbol and the label "nextrow_v" are already defined
	; earlier in this file; this looks like a second copy of the same source.
	;
	; Vertical 6-tap filter pass over 16-bit input, 4 output pixels per row.
	;
	; For every output row, 4 words are read from each of six consecutive
	; source rows (rows -2..+3 relative to the current one), multiplied by
	; taps 0..5, summed with signed-saturating adds, rounded with the constant
	; at rd, arithmetically shifted right by VP8_FILTER_SHIFT, clamped to
	; 0..255 and stored as 4 bytes at output_ptr.
	;
	;   pixels_per_line      added directly to the byte address of src_ptr,
	;                        i.e. used as the distance in bytes between rows.
	;   output_ptr           advanced by output_width bytes per row.
	;   pixel_step           not read by this routine.
	;   output_height        row count; must be non-zero (dec / jnz loop).
	;   vp8_filter           tap k is read as 4 words at byte offset 16*k.
	;
	; Register roles in the loop:
	;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = rounding constant,
	;   mm3 = accumulator, mm4 = scratch, rbx = vp8_filter, rdx = row stride,
	;   rsi = address of row -2, rdi = destination, rcx = rows left,
	;   rax = destination stride.
	global sym(vp8_filter_block1d_v6_mmx)
	sym(vp8_filter_block1d_v6_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 7
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	; The rounding constant is fetched before rbx is reused below.
	; NOTE(review): presumably because [rd GLOBAL] can address through the
	; register set up by GET_GOT rbx - confirm in x86_abi_support.asm.
	movq mm5, [rd GLOBAL]
	push rbx ; saved here, restored before the epilog
	mov rbx, arg(6) ;vp8_filter
	movq mm1, [rbx + 16] ; mm1 = tap 1 (4 words)
	movq mm2, [rbx + 32] ; mm2 = tap 2
	movq mm6, [rbx + 48] ; mm6 = tap 3
	movq mm7, [rbx + 64] ; mm7 = tap 4

	movsxd rdx, dword ptr arg(2) ;pixels_per_line
	mov rdi, arg(1) ;output_ptr
	mov rsi, arg(0) ;src_ptr
	sub rsi, rdx ; step back two rows so that
	sub rsi, rdx ; rsi addresses row -2
	movsxd rcx, DWORD PTR arg(4) ;output_height
	movsxd rax, DWORD PTR arg(5) ;output_width, used as the destination stride in bytes
	pxor mm0, mm0 ; mm0 = 0


	nextrow_v:
	movq mm3, [rsi+rdx] ; mm3 = 4 words of row -1
	pmullw mm3, mm1 ; mm3 *= tap 1


	movq mm4, [rsi + 4*rdx] ; mm4 = 4 words of row 2
	pmullw mm4, mm7 ; mm4 *= tap 4
	paddsw mm3, mm4 ; mm3 += mm4 (signed saturating)

	movq mm4, [rsi + 2*rdx] ; mm4 = 4 words of row 0
	pmullw mm4, mm2 ; mm4 *= tap 2
	paddsw mm3, mm4 ; mm3 += mm4

	movq mm4, [rsi] ; mm4 = 4 words of row -2
	pmullw mm4, [rbx] ; mm4 *= tap 0 (from memory)
	paddsw mm3, mm4 ; mm3 += mm4


	; Advancing rsi by one row here avoids needing 3*pitch and 5*pitch
	; addressing, and doubles as the per-iteration source advance.
	add rsi, rdx ; rsi now addresses row -1
	movq mm4, [rsi + 2*rdx] ; mm4 = 4 words of row 1
	pmullw mm4, mm6 ; mm4 *= tap 3
	paddsw mm3, mm4 ; mm3 += mm4

	movq mm4, [rsi + 4*rdx] ; mm4 = 4 words of row 3
	pmullw mm4, [rbx +80] ; mm4 *= tap 5 (from memory)
	paddsw mm3, mm4 ; mm3 += mm4


	paddsw mm3, mm5 ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
	packuswb mm3, mm0 ; clamp to 0..255, results in the low 4 bytes

	movd [rdi],mm3 ; store 4 bytes to the destination row

	add rdi,rax ; next destination row

	dec rcx ; one row done
	jnz nextrow_v ; loop while rows remain

	pop rbx ; matches the push after the prolog

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret


	;void vp8_filter_block1dc_v6_mmx
	;(
	; short *src_ptr,
	; unsigned char *output_ptr,
	; int output_pitch,
	; unsigned int pixels_per_line,
	; unsigned int pixel_step,
	; unsigned int output_height,
	; unsigned int output_width,
	; short * vp8_filter
	;)
	;
	; Vertical 6-tap filter pass over 16-bit samples, four columns wide,
	; with an explicit destination pitch.
	;
	; For each of output_height rows the loop loads four words from each of
	; six source rows (rows -2 .. +3 relative to the current row), multiplies
	; every row by its tap, accumulates with signed saturation, adds the
	; rounding constant rd, shifts right by VP8_FILTER_SHIFT, packs to
	; unsigned bytes with saturation and stores 4 bytes at the destination.
	;
	; Notes drawn from the code below:
	; - pixels_per_line is applied directly as a byte offset between
	;   source rows (it is not scaled by the sample size).
	; - output_pitch is the byte step between destination rows.
	; - pixel_step and output_width are never read.
	; - vp8_filter is read at byte offsets 0, 16, 32, 48, 64 and 80, i.e. one
	;   tap every 16 bytes; only the first four words of each tap are used.
	; - The row counter is tested after the first row has been written, so
	;   output_height must be non-zero.
	;
	; Register use inside the loop:
	;   rsi = source row -2    rdx = source row step (bytes)
	;   rdi = destination      rax = destination row step (bytes)
	;   rcx = rows remaining   rbx = vp8_filter
	;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = rounding constant
	global sym(vp8_filter_block1dc_v6_mmx)
	sym(vp8_filter_block1dc_v6_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 8
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	movq mm5, [rd GLOBAL] ; mm5 = rounding constant; loaded before rbx is reused below (presumably because GLOBAL may address through rbx in PIC builds - confirm in x86_abi_support.asm)
	push rbx ; rbx is about to be used as the filter pointer; restored after the loop
	mov rbx, arg(7) ; rbx = vp8_filter
	movq mm1, [rbx + 16] ; mm1 = tap 1 (four words)
	movq mm2, [rbx + 32] ; mm2 = tap 2
	movq mm6, [rbx + 48] ; mm6 = tap 3
	movq mm7, [rbx + 64] ; mm7 = tap 4; taps 0 and 5 are read from memory inside the loop

	movsxd rdx, dword ptr arg(3) ; rdx = pixels_per_line, used as the byte offset between source rows
	mov rdi, arg(1) ; rdi = output_ptr
	mov rsi, arg(0) ; rsi = src_ptr
	sub rsi, rdx ; back up one row ...
	sub rsi, rdx ; ... and another: rsi now addresses row -2
	movsxd rcx, DWORD PTR arg(5) ; rcx = output_height (loop counter)
	movsxd rax, DWORD PTR arg(2) ; rax = output_pitch (destination row step in bytes)
	pxor mm0, mm0 ; mm0 = 0, supplies the upper half for packuswb


	nextrow_cv:
	movq mm3, [rsi+rdx] ; mm3 = four words of row -1
	pmullw mm3, mm1 ; mm3 = row -1 * tap 1 (taps 1 and 4 go first; they are the non-positive entries in vp8_six_tap_mmx)


	movq mm4, [rsi + 4*rdx] ; mm4 = four words of row 2
	pmullw mm4, mm7 ; mm4 = row 2 * tap 4
	paddsw mm3, mm4 ; accumulate with signed saturation

	movq mm4, [rsi + 2*rdx] ; mm4 = four words of row 0
	pmullw mm4, mm2 ; mm4 = row 0 * tap 2
	paddsw mm3, mm4 ; accumulate

	movq mm4, [rsi] ; mm4 = four words of row -2
	pmullw mm4, [rbx] ; mm4 = row -2 * tap 0 (tap read from memory)
	paddsw mm3, mm4 ; accumulate


	add rsi, rdx ; advance one source row: avoids a 3*pitch address and is also the per-iteration source step
	movq mm4, [rsi + 2*rdx] ; mm4 = four words of row 1
	pmullw mm4, mm6 ; mm4 = row 1 * tap 3
	paddsw mm3, mm4 ; accumulate

	movq mm4, [rsi + 4*rdx] ; mm4 = four words of row 3
	pmullw mm4, [rbx +80] ; mm4 = row 3 * tap 5 (tap read from memory)
	paddsw mm3, mm4 ; accumulate


	paddsw mm3, mm5 ; add rounding constant (0x40 per word)
	psraw mm3, VP8_FILTER_SHIFT ; arithmetic shift right by 7, i.e. scale by 1/128
	packuswb mm3, mm0 ; clamp to 0..255 and pack; the four results land in the low dword

	movd [rdi],mm3 ; store four output bytes
	; Each iteration reads six source rows and the next iteration re-reads
	; five of them. The data should still be in cache, so this ought to be
	; cheap, but it is avoidable.
	lea rdi, [rdi+rax] ; step destination to the next row
	dec rcx ; one row done
	jnz nextrow_cv ; loop until the row count reaches zero

	pop rbx ; undo the push that preceded the filter-pointer load

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret


	;void vp8_bilinear_predict8x8_mmx
	;(
	; unsigned char *src_ptr,
	; int src_pixels_per_line,
	; int xoffset,
	; int yoffset,
	; unsigned char *dst_ptr,
	; int dst_pitch
	;)
	;
	; Two-pass bilinear prediction producing rows of 8 bytes until the
	; destination pointer reaches dst_ptr + 8 * dst_pitch.
	;
	; Horizontal pass, per source row:
	;   h[x] = (src[x] * HFilter[0] + src[x + 1] * HFilter[1] + 64) >> 7
	; Vertical pass, per output row:
	;   dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
	; where h_prev is the previous row's horizontal result after it has been
	; packed to unsigned bytes (kept in mm7) and h_cur is the current row's.
	;
	; xoffset / yoffset index vp8_bilinear_filters_mmx; each entry is 32
	; bytes (two taps of eight words), hence the shift by 5. Each row reads
	; 9 source bytes ([rsi] and [rsi+1], 8 bytes each).
	;
	; Register use inside the loop:
	;   rsi = current source row   rdx = src_pixels_per_line
	;   rdi = destination row      rcx = end-of-destination pointer
	;   rax = VFilter              mm0 = 0
	;   mm1/mm2 = HFilter taps 0/1 mm7 = previous horizontal row (8 bytes)
	global sym(vp8_bilinear_predict8x8_mmx)
	sym(vp8_bilinear_predict8x8_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 6
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	;const short *HFilter = bilinear_filters_mmx[xoffset];
	;const short *VFilter = bilinear_filters_mmx[yoffset];

	movsxd rax, dword ptr arg(2) ; rax = xoffset
	mov rdi, arg(4) ; rdi = dst_ptr

	shl rax, 5 ; xoffset * 32 = byte offset of the table entry
	lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] ; rcx = table base (kept for the VFilter lookup)

	add rax, rcx ; rax = HFilter
	mov rsi, arg(0) ; rsi = src_ptr

	movsxd rdx, dword ptr arg(5) ; rdx = dst_pitch (temporary; only needed for the end pointer)
	movq mm1, [rax] ; mm1 = HFilter tap 0 (four words)

	movq mm2, [rax+16] ; mm2 = HFilter tap 1 (four words)
	movsxd rax, dword ptr arg(3) ; rax = yoffset

	pxor mm0, mm0 ; mm0 = 0, used to zero-extend bytes to words

	shl rax, 5 ; yoffset * 32 = byte offset of the table entry
	add rax, rcx ; rax = VFilter

	lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch, the loop's stop address
	movsxd rdx, dword ptr arg(1) ; rdx = src_pixels_per_line (source row step)



	; Horizontal pass for the first source row; the result seeds mm7.
	movq mm3, [rsi] ; mm3 = source bytes p0..p7
	movq mm4, mm3 ; copy so low and high halves can be widened separately

	punpcklbw mm3, mm0 ; mm3 = p0..p3 as words
	punpckhbw mm4, mm0 ; mm4 = p4..p7 as words

	pmullw mm3, mm1 ; p0..p3 * HFilter tap 0
	pmullw mm4, mm1 ; p4..p7 * HFilter tap 0

	movq mm5, [rsi+1] ; mm5 = source bytes p1..p8
	movq mm6, mm5 ; copy for the high half

	punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
	punpckhbw mm6, mm0 ; mm6 = p5..p8 as words

	pmullw mm5, mm2 ; p1..p4 * HFilter tap 1
	pmullw mm6, mm2 ; p5..p8 * HFilter tap 1

	paddw mm3, mm5 ; low half: sum of both taps
	paddw mm4, mm6 ; high half: sum of both taps

	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7

	paddw mm4, [rd GLOBAL] ; mm4 += rounding constant
	psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7

	movq mm7, mm3 ; start building the packed row in mm7
	packuswb mm7, mm4 ; mm7 = first horizontal row as 8 unsigned bytes

	add rsi, rdx ; advance to the second source row
	next_row_8x8:
	movq mm3, [rsi] ; mm3 = source bytes p0..p7 of the current row
	movq mm4, mm3 ; copy so low and high halves can be widened separately

	punpcklbw mm3, mm0 ; mm3 = p0..p3 as words
	punpckhbw mm4, mm0 ; mm4 = p4..p7 as words

	pmullw mm3, mm1 ; p0..p3 * HFilter tap 0
	pmullw mm4, mm1 ; p4..p7 * HFilter tap 0

	movq mm5, [rsi+1] ; mm5 = source bytes p1..p8
	movq mm6, mm5 ; copy for the high half

	punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
	punpckhbw mm6, mm0 ; mm6 = p5..p8 as words

	pmullw mm5, mm2 ; p1..p4 * HFilter tap 1
	pmullw mm6, mm2 ; p5..p8 * HFilter tap 1

	paddw mm3, mm5 ; low half: horizontal sum (not yet rounded)
	paddw mm4, mm6 ; high half: horizontal sum (not yet rounded)

	movq mm5, mm7 ; mm5 = previous horizontal row (packed bytes)
	movq mm6, mm7 ; second copy for the high half

	punpcklbw mm5, mm0 ; previous row, columns 0..3 as words
	punpckhbw mm6, mm0 ; previous row, columns 4..7 as words

	pmullw mm5, [rax] ; previous row low half * VFilter tap 0
	pmullw mm6, [rax] ; previous row high half * VFilter tap 0

	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7: current horizontal row, low half

	paddw mm4, [rd GLOBAL] ; mm4 += rounding constant
	psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7: current horizontal row, high half

	movq mm7, mm3 ; save the current horizontal row ...
	packuswb mm7, mm4 ; ... packed to bytes, as "previous row" for the next iteration


	pmullw mm3, [rax+16] ; current row low half * VFilter tap 1
	pmullw mm4, [rax+16] ; current row high half * VFilter tap 1

	paddw mm3, mm5 ; low half: vertical sum of both taps
	paddw mm4, mm6 ; high half: vertical sum of both taps


	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7

	paddw mm4, [rd GLOBAL] ; mm4 += rounding constant
	psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7

	packuswb mm3, mm4 ; clamp to 0..255 and pack both halves into 8 bytes

	movq [rdi], mm3 ; store 8 output bytes

	%if ABI_IS_32BIT
	add rsi, rdx ; next source row
	add rdi, dword ptr arg(5) ; next destination row: dst_pitch is added straight from its stack slot
	%else
	movsxd r8, dword ptr arg(5) ; r8 = dst_pitch, sign-extended (reloaded on every iteration)
	add rsi, rdx ; next source row
	add rdi, r8 ; next destination row
	%endif
	cmp rdi, rcx ; reached dst_ptr + 8 * dst_pitch?
	jne next_row_8x8 ; no: produce another row

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret


	;void vp8_bilinear_predict8x4_mmx
	;(
	; unsigned char *src_ptr,
	; int src_pixels_per_line,
	; int xoffset,
	; int yoffset,
	; unsigned char *dst_ptr,
	; int dst_pitch
	;)
	;
	; Two-pass bilinear prediction producing rows of 8 bytes until the
	; destination pointer reaches dst_ptr + 4 * dst_pitch.
	;
	; Horizontal pass, per source row:
	;   h[x] = (src[x] * HFilter[0] + src[x + 1] * HFilter[1] + 64) >> 7
	; Vertical pass, per output row:
	;   dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
	; where h_prev is the previous row's horizontal result after it has been
	; packed to unsigned bytes (kept in mm7) and h_cur is the current row's.
	;
	; xoffset / yoffset index vp8_bilinear_filters_mmx; each entry is 32
	; bytes (two taps of eight words), hence the shift by 5. Each row reads
	; 9 source bytes ([rsi] and [rsi+1], 8 bytes each).
	;
	; Register use inside the loop:
	;   rsi = current source row   rdx = src_pixels_per_line
	;   rdi = destination row      rcx = end-of-destination pointer
	;   rax = VFilter              mm0 = 0
	;   mm1/mm2 = HFilter taps 0/1 mm7 = previous horizontal row (8 bytes)
	global sym(vp8_bilinear_predict8x4_mmx)
	sym(vp8_bilinear_predict8x4_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 6
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	;const short *HFilter = bilinear_filters_mmx[xoffset];
	;const short *VFilter = bilinear_filters_mmx[yoffset];

	movsxd rax, dword ptr arg(2) ; rax = xoffset
	mov rdi, arg(4) ; rdi = dst_ptr

	lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] ; rcx = table base (kept for the VFilter lookup)
	shl rax, 5 ; xoffset * 32 = byte offset of the table entry

	mov rsi, arg(0) ; rsi = src_ptr
	add rax, rcx ; rax = HFilter

	movsxd rdx, dword ptr arg(5) ; rdx = dst_pitch (temporary; only needed for the end pointer)
	movq mm1, [rax] ; mm1 = HFilter tap 0 (four words)

	movq mm2, [rax+16] ; mm2 = HFilter tap 1 (four words)
	movsxd rax, dword ptr arg(3) ; rax = yoffset

	pxor mm0, mm0 ; mm0 = 0, used to zero-extend bytes to words
	shl rax, 5 ; yoffset * 32 = byte offset of the table entry

	add rax, rcx ; rax = VFilter
	lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4 * dst_pitch, the loop's stop address

	movsxd rdx, dword ptr arg(1) ; rdx = src_pixels_per_line (source row step)

	; Horizontal pass for the first source row; the result seeds mm7.
	movq mm3, [rsi] ; mm3 = source bytes p0..p7
	movq mm4, mm3 ; copy so low and high halves can be widened separately

	punpcklbw mm3, mm0 ; mm3 = p0..p3 as words
	punpckhbw mm4, mm0 ; mm4 = p4..p7 as words

	pmullw mm3, mm1 ; p0..p3 * HFilter tap 0
	pmullw mm4, mm1 ; p4..p7 * HFilter tap 0

	movq mm5, [rsi+1] ; mm5 = source bytes p1..p8
	movq mm6, mm5 ; copy for the high half

	punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
	punpckhbw mm6, mm0 ; mm6 = p5..p8 as words

	pmullw mm5, mm2 ; p1..p4 * HFilter tap 1
	pmullw mm6, mm2 ; p5..p8 * HFilter tap 1

	paddw mm3, mm5 ; low half: sum of both taps
	paddw mm4, mm6 ; high half: sum of both taps

	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7

	paddw mm4, [rd GLOBAL] ; mm4 += rounding constant
	psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7

	movq mm7, mm3 ; start building the packed row in mm7
	packuswb mm7, mm4 ; mm7 = first horizontal row as 8 unsigned bytes

	add rsi, rdx ; advance to the second source row
	next_row_8x4:
	movq mm3, [rsi] ; mm3 = source bytes p0..p7 of the current row
	movq mm4, mm3 ; copy so low and high halves can be widened separately

	punpcklbw mm3, mm0 ; mm3 = p0..p3 as words
	punpckhbw mm4, mm0 ; mm4 = p4..p7 as words

	pmullw mm3, mm1 ; p0..p3 * HFilter tap 0
	pmullw mm4, mm1 ; p4..p7 * HFilter tap 0

	movq mm5, [rsi+1] ; mm5 = source bytes p1..p8
	movq mm6, mm5 ; copy for the high half

	punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
	punpckhbw mm6, mm0 ; mm6 = p5..p8 as words

	pmullw mm5, mm2 ; p1..p4 * HFilter tap 1
	pmullw mm6, mm2 ; p5..p8 * HFilter tap 1

	paddw mm3, mm5 ; low half: horizontal sum (not yet rounded)
	paddw mm4, mm6 ; high half: horizontal sum (not yet rounded)

	movq mm5, mm7 ; mm5 = previous horizontal row (packed bytes)
	movq mm6, mm7 ; second copy for the high half

	punpcklbw mm5, mm0 ; previous row, columns 0..3 as words
	punpckhbw mm6, mm0 ; previous row, columns 4..7 as words

	pmullw mm5, [rax] ; previous row low half * VFilter tap 0
	pmullw mm6, [rax] ; previous row high half * VFilter tap 0

	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7: current horizontal row, low half

	paddw mm4, [rd GLOBAL] ; mm4 += rounding constant
	psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7: current horizontal row, high half

	movq mm7, mm3 ; save the current horizontal row ...
	packuswb mm7, mm4 ; ... packed to bytes, as "previous row" for the next iteration


	pmullw mm3, [rax+16] ; current row low half * VFilter tap 1
	pmullw mm4, [rax+16] ; current row high half * VFilter tap 1

	paddw mm3, mm5 ; low half: vertical sum of both taps
	paddw mm4, mm6 ; high half: vertical sum of both taps


	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7

	paddw mm4, [rd GLOBAL] ; mm4 += rounding constant
	psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7

	packuswb mm3, mm4 ; clamp to 0..255 and pack both halves into 8 bytes

	movq [rdi], mm3 ; store 8 output bytes

	%if ABI_IS_32BIT
	add rsi, rdx ; next source row
	add rdi, dword ptr arg(5) ; next destination row: dst_pitch is added straight from its stack slot
	%else
	movsxd r8, dword ptr arg(5) ; r8 = dst_pitch, sign-extended (reloaded on every iteration)
	add rsi, rdx ; next source row
	add rdi, r8 ; next destination row
	%endif
	cmp rdi, rcx ; reached dst_ptr + 4 * dst_pitch?
	jne next_row_8x4 ; no: produce another row

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret


	;void vp8_bilinear_predict4x4_mmx
	;(
	; unsigned char *src_ptr,
	; int src_pixels_per_line,
	; int xoffset,
	; int yoffset,
	; unsigned char *dst_ptr,
	; int dst_pitch
	;)
	;
	; Two-pass bilinear prediction producing rows of 4 bytes until the
	; destination pointer reaches dst_ptr + 4 * dst_pitch.
	;
	; Horizontal pass, per source row:
	;   h[x] = (src[x] * HFilter[0] + src[x + 1] * HFilter[1] + 64) >> 7
	; Vertical pass, per output row:
	;   dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
	; where h_prev is the previous row's horizontal result after it has been
	; packed to unsigned bytes (kept in mm7) and h_cur is the current row's.
	;
	; xoffset / yoffset index vp8_bilinear_filters_mmx; each entry is 32
	; bytes (two taps of eight words), hence the shift by 5. Each row reads
	; 5 source bytes ([rsi] and [rsi+1], 4 bytes each).
	;
	; Register use inside the loop:
	;   rsi = current source row   rdx = src_pixels_per_line
	;   rdi = destination row      rcx = end-of-destination pointer
	;   rax = VFilter              mm0 = 0
	;   mm1/mm2 = HFilter taps 0/1 mm7 = previous horizontal row (4 bytes)
	global sym(vp8_bilinear_predict4x4_mmx)
	sym(vp8_bilinear_predict4x4_mmx):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 6
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	;const short *HFilter = bilinear_filters_mmx[xoffset];
	;const short *VFilter = bilinear_filters_mmx[yoffset];

	movsxd rax, dword ptr arg(2) ; rax = xoffset
	mov rdi, arg(4) ; rdi = dst_ptr

	lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL] ; rcx = table base (kept for the VFilter lookup)
	shl rax, 5 ; xoffset * 32 = byte offset of the table entry

	add rax, rcx ; rax = HFilter
	mov rsi, arg(0) ; rsi = src_ptr

	movsxd rdx, dword ptr arg(5) ; rdx = dst_pitch (temporary; only needed for the end pointer)
	movq mm1, [rax] ; mm1 = HFilter tap 0 (four words)

	movq mm2, [rax+16] ; mm2 = HFilter tap 1 (four words)
	movsxd rax, dword ptr arg(3) ; rax = yoffset

	pxor mm0, mm0 ; mm0 = 0, used to zero-extend bytes to words
	shl rax, 5 ; yoffset * 32 = byte offset of the table entry

	add rax, rcx ; rax = VFilter
	lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4 * dst_pitch, the loop's stop address

	movsxd rdx, dword ptr arg(1) ; rdx = src_pixels_per_line (source row step)

	; Horizontal pass for the first source row; the result seeds mm7.
	movd mm3, [rsi] ; mm3 = source bytes p0..p3
	punpcklbw mm3, mm0 ; mm3 = p0..p3 as words

	pmullw mm3, mm1 ; p0..p3 * HFilter tap 0
	movd mm5, [rsi+1] ; mm5 = source bytes p1..p4

	punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
	pmullw mm5, mm2 ; p1..p4 * HFilter tap 1

	paddw mm3, mm5 ; sum of both taps
	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant

	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7

	movq mm7, mm3 ; start building the packed row in mm7
	packuswb mm7, mm0 ; mm7 = first horizontal row as 4 unsigned bytes (low dword)

	add rsi, rdx ; advance to the second source row
	next_row_4x4:
	movd mm3, [rsi] ; mm3 = source bytes p0..p3 of the current row
	punpcklbw mm3, mm0 ; mm3 = p0..p3 as words

	pmullw mm3, mm1 ; p0..p3 * HFilter tap 0
	movd mm5, [rsi+1] ; mm5 = source bytes p1..p4

	punpcklbw mm5, mm0 ; mm5 = p1..p4 as words
	pmullw mm5, mm2 ; p1..p4 * HFilter tap 1

	paddw mm3, mm5 ; horizontal sum (not yet rounded)

	movq mm5, mm7 ; mm5 = previous horizontal row (packed bytes)
	punpcklbw mm5, mm0 ; previous row as words

	pmullw mm5, [rax] ; previous row * VFilter tap 0
	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant

	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7: current horizontal row
	movq mm7, mm3 ; save the current horizontal row ...

	packuswb mm7, mm0 ; ... packed to bytes, as "previous row" for the next iteration

	pmullw mm3, [rax+16] ; current row * VFilter tap 1
	paddw mm3, mm5 ; vertical sum of both taps


	paddw mm3, [rd GLOBAL] ; mm3 += rounding constant
	psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7

	packuswb mm3, mm0 ; clamp to 0..255 and pack; the four results land in the low dword
	movd [rdi], mm3 ; store 4 output bytes

	%if ABI_IS_32BIT
	add rsi, rdx ; next source row
	add rdi, dword ptr arg(5) ; next destination row: dst_pitch is added straight from its stack slot
	%else
	movsxd r8, dword ptr arg(5) ; r8 = dst_pitch, sign-extended (reloaded on every iteration)
	add rsi, rdx ; next source row
	add rdi, r8 ; next destination row
	%endif

	cmp rdi, rcx ; reached dst_ptr + 4 * dst_pitch?
	jne next_row_4x4 ; no: produce another row

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	UNSHADOW_ARGS
	pop rbp
	ret



	SECTION_RODATA
	; Rounding constant added before every shift by VP8_FILTER_SHIFT:
	; four words of 0x40 (64), i.e. half of the filter weight 128.
	align 16
	rd:
	times 4 dw 0x40

	; Six-tap filter table: 8 filters of 6 taps each. Every tap is
	; replicated into 8 words (16 bytes), so one filter occupies 96 bytes
	; and tap k of a filter sits at byte offset 16 * k. The taps of each
	; filter sum to 128.
	align 16
	global HIDDEN_DATA(sym(vp8_six_tap_mmx))
	sym(vp8_six_tap_mmx):
	; filter 0: { 0, 0, 128, 0, 0, 0 }
	times 8 dw 0
	times 8 dw 0
	times 8 dw 128
	times 8 dw 0
	times 8 dw 0
	times 8 dw 0

	; filter 1: { 0, -6, 123, 12, -1, 0 }
	times 8 dw 0
	times 8 dw -6
	times 8 dw 123
	times 8 dw 12
	times 8 dw -1
	times 8 dw 0

	; filter 2: { 2, -11, 108, 36, -8, 1 }
	times 8 dw 2
	times 8 dw -11
	times 8 dw 108
	times 8 dw 36
	times 8 dw -8
	times 8 dw 1

	; filter 3: { 0, -9, 93, 50, -6, 0 }
	times 8 dw 0
	times 8 dw -9
	times 8 dw 93
	times 8 dw 50
	times 8 dw -6
	times 8 dw 0

	; filter 4: { 3, -16, 77, 77, -16, 3 }
	times 8 dw 3
	times 8 dw -16
	times 8 dw 77
	times 8 dw 77
	times 8 dw -16
	times 8 dw 3

	; filter 5: { 0, -6, 50, 93, -9, 0 }
	times 8 dw 0
	times 8 dw -6
	times 8 dw 50
	times 8 dw 93
	times 8 dw -9
	times 8 dw 0

	; filter 6: { 1, -8, 36, 108, -11, 2 }
	times 8 dw 1
	times 8 dw -8
	times 8 dw 36
	times 8 dw 108
	times 8 dw -11
	times 8 dw 2

	; filter 7: { 0, -1, 12, 123, -6, 0 }
	times 8 dw 0
	times 8 dw -1
	times 8 dw 12
	times 8 dw 123
	times 8 dw -6
	times 8 dw 0


	; Bilinear filter table: 8 filters of 2 taps each. Every tap is
	; replicated into 8 words (16 bytes), so one filter occupies 32 bytes;
	; the bilinear predictors index it with offset << 5 and read tap 1 at
	; +16. The two taps of each filter sum to 128.
	align 16
	global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
	sym(vp8_bilinear_filters_mmx):
	; offset 0: { 128, 0 }
	times 8 dw 128
	times 8 dw 0

	; offset 1: { 112, 16 }
	times 8 dw 112
	times 8 dw 16

	; offset 2: { 96, 32 }
	times 8 dw 96
	times 8 dw 32

	; offset 3: { 80, 48 }
	times 8 dw 80
	times 8 dw 48

	; offset 4: { 64, 64 }
	times 8 dw 64
	times 8 dw 64

	; offset 5: { 48, 80 }
	times 8 dw 48
	times 8 dw 80

	; offset 6: { 32, 96 }
	times 8 dw 32
	times 8 dw 96

	; offset 7: { 16, 112 }
	times 8 dw 16
	times 8 dw 112