/* vp8/common/x86/postproc_mmx.c - platform/external/libvpx - Git at Google */

 /*
  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */


 #include <math.h>
 #include <stdlib.h>
 #include "vpx_scale/yv12config.h"
 #include "pragmas.h"

 /*
  * Fixed-point scale of the Blur kernel below: its five taps (16+16+64+16+16)
  * sum to 128, so a filtered sum is brought back to pixel range by a right
  * shift of VP8_FILTER_SHIFT (log2 of 128).  VP8_FILTER_WEIGHT is not
  * referenced by the code visible in this part of the file.
  */
 #define VP8_FILTER_WEIGHT 128
 #define VP8_FILTER_SHIFT  7


 /* static constants */
 /*
  * Blur kernel taps used by vp8_post_proc_down_and_across_mmx.
  *
  * Each row is one tap replicated into eight 16-bit words (16 bytes per row),
  * so the inline assembly can load a tap with `movq mm6, [ebx + 16*k]` and
  * multiply four pixels at once with pmullw.  Row k is at byte offset 16*k:
  *
  *   row 0 (offset  0): sample two positions before the centre
  *   row 1 (offset 16): sample one position before the centre
  *   row 2 (offset 32): centre sample
  *   row 3 (offset 48): sample one position after the centre
  *   row 4 (offset 64): sample two positions after the centre
  *   row 5 (offset 80): zeros; not referenced by the code visible here
  *
  * The five used taps sum to 128 (see VP8_FILTER_SHIFT).
  */
 __declspec(align(16))
 static const short  Blur[48] =
 {
     16, 16, 16, 16, 16, 16, 16, 16,
     16, 16, 16, 16, 16, 16, 16, 16,
     64, 64, 64, 64, 64, 64, 64, 64,
     16, 16, 16, 16, 16, 16, 16, 16,
     16, 16, 16, 16, 16, 16, 16, 16,
     0,  0,  0,  0,  0,  0,  0,  0,
 };
 /*
  * Rounding constants used by the down-and-across filters.
  *
  *   RD   defines `rd`:   four 16-bit words of 0x0040 (64), added before the
  *                        shift by VP8_FILTER_SHIFT in the MMX filter.
  *   R4D2 defines `rd42`: eight 16-bit words of 0x0004 (4), added before the
  *                        shift by 3 in the XMM filter.
  *
  * Each macro expands to a complete definition INCLUDING its trailing ';'.
  * When RELOCATEABLE is not defined the constants are instantiated once here
  * at file scope; when it is defined, the filter functions instantiate them
  * as locals by writing `RD` / `R4D2` (without a semicolon) at the top of the
  * function body.
  *
  * NOTE(review): because the macros already end in ';', `const static RD;`
  * below expands to a definition followed by a stray empty declaration.
  */
 #define RD  __declspec(align(16)) __int64 rd  = 0x0040004000400040;
 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};

 #ifndef RELOCATEABLE
 const static RD;
 const static R4D2;
 #endif


 /*
  * Symbols defined outside this file.
  *
  *   vp8_rv       - table of 16-bit values; the mbpost "down" filters index it
  *                  with (row & 127) and add it in before the final >> 4.
  *   vp8_q2mbl    - converts the flimit argument of the mbpost filters into
  *                  the threshold they compare against.
  *   vp8_gaussian - only referenced by the disabled (#if 0) noise routine in
  *                  the part of the file visible here.
  */
 extern short vp8_rv[];
 extern int vp8_q2mbl(int x);
 extern double vp8_gaussian(double sigma, double mu, double x);


 void vp8_post_proc_down_and_across_mmx
 (
     unsigned char *src_ptr,
     unsigned char *dst_ptr,
     int src_pixels_per_line,
     int dst_pixels_per_line,
     int rows,
     int cols,
     int flimit
 )
 {
 #ifdef RELOCATEABLE
     RD
     R4D2
 #endif

     __asm
     {
         push        ebx
         lea         ebx, Blur
         movd        mm2, flimit
         punpcklwd   mm2, mm2
         punpckldq   mm2, mm2

         mov         esi,        src_ptr
         mov         edi,        dst_ptr

         mov         ecx, DWORD PTR rows
         mov         eax, src_pixels_per_line ;
         destination pitch?
         pxor        mm0, mm0              ;
         mm0 = 00000000

         nextrow:

         xor         edx,        edx       ;

         clear out edx for use as loop counter
         nextcol:

         pxor        mm7, mm7              ;

     mm7 = 00000000
     movq        mm6, [ebx + 32 ]      ;
         mm6 = kernel 2 taps
         movq        mm3, [esi]            ;
         mm4 = r0 p0..p7
         punpcklbw   mm3, mm0              ;
         mm3 = p0..p3
         movq        mm1, mm3              ;
         mm1 = p0..p3
         pmullw      mm3, mm6              ;
         mm3 *= kernel 2 modifiers

         movq        mm6, [ebx + 48]       ;
         mm6 = kernel 3 taps
         movq        mm5, [esi + eax]      ;
         mm4 = r1 p0..p7
         punpcklbw   mm5, mm0              ;
         mm5 = r1 p0..p3
         pmullw      mm6, mm5              ;
         mm6 *= p0..p3 * kernel 3 modifiers
         paddusw     mm3, mm6              ;
         mm3 += mm6

         ;
         thresholding
         movq        mm7, mm1              ;
         mm7 = r0 p0..p3
         psubusw     mm7, mm5              ;
         mm7 = r0 p0..p3 - r1 p0..p3
         psubusw     mm5, mm1              ;
         mm5 = r1 p0..p3 - r0 p0..p3
         paddusw     mm7, mm5              ;
         mm7 = abs(r0 p0..p3 - r1 p0..p3)
         pcmpgtw     mm7, mm2

         movq        mm6, [ebx + 64 ]      ;
         mm6 = kernel 4 modifiers
         movq        mm5, [esi + 2*eax]    ;
         mm4 = r2 p0..p7
         punpcklbw   mm5, mm0              ;
         mm5 = r2 p0..p3
         pmullw      mm6, mm5              ;
         mm5 *= kernel 4 modifiers
         paddusw     mm3, mm6              ;
         mm3 += mm5

         ;
         thresholding
         movq        mm6, mm1              ;
         mm6 = r0 p0..p3
         psubusw     mm6, mm5              ;
         mm6 = r0 p0..p3 - r2 p0..p3
         psubusw     mm5, mm1              ;
         mm5 = r2 p0..p3 - r2 p0..p3
         paddusw     mm6, mm5              ;
         mm6 = abs(r0 p0..p3 - r2 p0..p3)
         pcmpgtw     mm6, mm2
         por         mm7, mm6              ;
         accumulate thresholds


         neg         eax
         movq        mm6, [ebx ]           ;
         kernel 0 taps
         movq        mm5, [esi+2*eax]      ;
         mm4 = r-2 p0..p7
         punpcklbw   mm5, mm0              ;
         mm5 = r-2 p0..p3
         pmullw      mm6, mm5              ;
         mm5 *= kernel 0 modifiers
         paddusw     mm3, mm6              ;
         mm3 += mm5

         ;
         thresholding
         movq        mm6, mm1              ;
         mm6 = r0 p0..p3
         psubusw     mm6, mm5              ;
         mm6 = p0..p3 - r-2 p0..p3
         psubusw     mm5, mm1              ;
         mm5 = r-2 p0..p3 - p0..p3
         paddusw     mm6, mm5              ;
         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
         pcmpgtw     mm6, mm2
         por         mm7, mm6              ;
         accumulate thresholds

         movq        mm6, [ebx + 16]       ;
         kernel 1 taps
         movq        mm4, [esi+eax]        ;
         mm4 = r-1 p0..p7
         punpcklbw   mm4, mm0              ;
         mm4 = r-1 p0..p3
         pmullw      mm6, mm4              ;
         mm4 *= kernel 1 modifiers.
         paddusw     mm3, mm6              ;
         mm3 += mm5

         ;
         thresholding
         movq        mm6, mm1              ;
         mm6 = r0 p0..p3
         psubusw     mm6, mm4              ;
         mm6 = p0..p3 - r-2 p0..p3
         psubusw     mm4, mm1              ;
         mm5 = r-1 p0..p3 - p0..p3
         paddusw     mm6, mm4              ;
         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
         pcmpgtw     mm6, mm2
         por         mm7, mm6              ;
         accumulate thresholds


         paddusw     mm3, rd               ;
         mm3 += round value
         psraw       mm3, VP8_FILTER_SHIFT     ;
         mm3 /= 128

         pand        mm1, mm7              ;
         mm1 select vals > thresh from source
         pandn       mm7, mm3              ;
         mm7 select vals < thresh from blurred result
         paddusw     mm1, mm7              ;
         combination

         packuswb    mm1, mm0              ;
         pack to bytes

         movd        [edi], mm1            ;
         neg         eax                   ;
         pitch is positive


         add         esi, 4
         add         edi, 4
         add         edx, 4

         cmp         edx, cols
         jl          nextcol
         // done with the all cols, start the across filtering in place
         sub         esi, edx
         sub         edi, edx


         push        eax
         xor         edx,    edx
         mov         eax,    [edi-4];

         acrossnextcol:
         pxor        mm7, mm7              ;
         mm7 = 00000000
         movq        mm6, [ebx + 32 ]      ;
         movq        mm4, [edi+edx]        ;
         mm4 = p0..p7
         movq        mm3, mm4              ;
         mm3 = p0..p7
         punpcklbw   mm3, mm0              ;
         mm3 = p0..p3
         movq        mm1, mm3              ;
         mm1 = p0..p3
         pmullw      mm3, mm6              ;
         mm3 *= kernel 2 modifiers

         movq        mm6, [ebx + 48]
         psrlq       mm4, 8                ;
         mm4 = p1..p7
         movq        mm5, mm4              ;
         mm5 = p1..p7
         punpcklbw   mm5, mm0              ;
         mm5 = p1..p4
         pmullw      mm6, mm5              ;
         mm6 *= p1..p4 * kernel 3 modifiers
         paddusw     mm3, mm6              ;
         mm3 += mm6

         ;
         thresholding
         movq        mm7, mm1              ;
         mm7 = p0..p3
         psubusw     mm7, mm5              ;
         mm7 = p0..p3 - p1..p4
         psubusw     mm5, mm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     mm7, mm5              ;
         mm7 = abs(p0..p3 - p1..p4)
         pcmpgtw     mm7, mm2

         movq        mm6, [ebx + 64 ]
         psrlq       mm4, 8                ;
         mm4 = p2..p7
         movq        mm5, mm4              ;
         mm5 = p2..p7
         punpcklbw   mm5, mm0              ;
         mm5 = p2..p5
         pmullw      mm6, mm5              ;
         mm5 *= kernel 4 modifiers
         paddusw     mm3, mm6              ;
         mm3 += mm5

         ;
         thresholding
         movq        mm6, mm1              ;
         mm6 = p0..p3
         psubusw     mm6, mm5              ;
         mm6 = p0..p3 - p1..p4
         psubusw     mm5, mm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     mm6, mm5              ;
         mm6 = abs(p0..p3 - p1..p4)
         pcmpgtw     mm6, mm2
         por         mm7, mm6              ;
         accumulate thresholds


         movq        mm6, [ebx ]
         movq        mm4, [edi+edx-2]      ;
         mm4 = p-2..p5
         movq        mm5, mm4              ;
         mm5 = p-2..p5
         punpcklbw   mm5, mm0              ;
         mm5 = p-2..p1
         pmullw      mm6, mm5              ;
         mm5 *= kernel 0 modifiers
         paddusw     mm3, mm6              ;
         mm3 += mm5

         ;
         thresholding
         movq        mm6, mm1              ;
         mm6 = p0..p3
         psubusw     mm6, mm5              ;
         mm6 = p0..p3 - p1..p4
         psubusw     mm5, mm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     mm6, mm5              ;
         mm6 = abs(p0..p3 - p1..p4)
         pcmpgtw     mm6, mm2
         por         mm7, mm6              ;
         accumulate thresholds

         movq        mm6, [ebx + 16]
         psrlq       mm4, 8                ;
         mm4 = p-1..p5
         punpcklbw   mm4, mm0              ;
         mm4 = p-1..p2
         pmullw      mm6, mm4              ;
         mm4 *= kernel 1 modifiers.
         paddusw     mm3, mm6              ;
         mm3 += mm5

         ;
         thresholding
         movq        mm6, mm1              ;
         mm6 = p0..p3
         psubusw     mm6, mm4              ;
         mm6 = p0..p3 - p1..p4
         psubusw     mm4, mm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     mm6, mm4              ;
         mm6 = abs(p0..p3 - p1..p4)
         pcmpgtw     mm6, mm2
         por         mm7, mm6              ;
         accumulate thresholds

         paddusw     mm3, rd               ;
         mm3 += round value
         psraw       mm3, VP8_FILTER_SHIFT     ;
         mm3 /= 128

         pand        mm1, mm7              ;
         mm1 select vals > thresh from source
         pandn       mm7, mm3              ;
         mm7 select vals < thresh from blurred result
         paddusw     mm1, mm7              ;
         combination

         packuswb    mm1, mm0              ;
         pack to bytes
         mov         DWORD PTR [edi+edx-4],  eax   ;
         store previous four bytes
         movd        eax,    mm1

         add         edx, 4
         cmp         edx, cols
         jl          acrossnextcol;

         mov         DWORD PTR [edi+edx-4],  eax
         pop         eax

         // done with this rwo
         add         esi, eax               ;
         next line
         mov         eax, dst_pixels_per_line ;
         destination pitch?
         add         edi, eax               ;
         next destination
         mov         eax, src_pixels_per_line ;
         destination pitch?

         dec         ecx                   ;
         decrement count
         jnz         nextrow               ;
         next row
         pop         ebx

     }
 }


 void vp8_post_proc_down_and_across_xmm
 (
     unsigned char *src_ptr,
     unsigned char *dst_ptr,
     int src_pixels_per_line,
     int dst_pixels_per_line,
     int rows,
     int cols,
     int flimit
 )
 {
 #ifdef RELOCATEABLE
     R4D2
 #endif

     __asm
     {
         movd        xmm2,       flimit
         punpcklwd   xmm2,       xmm2
         punpckldq   xmm2,       xmm2
         punpcklqdq  xmm2,       xmm2

         mov         esi,        src_ptr
         mov         edi,        dst_ptr

         mov         ecx,        DWORD PTR rows
         mov         eax,        src_pixels_per_line ;
         destination pitch?
         pxor        xmm0,       xmm0              ;
         mm0 = 00000000

         nextrow:

         xor         edx,        edx       ;

         clear out edx for use as loop counter
         nextcol:
         movq        xmm3,       QWORD PTR [esi]         ;

         mm4 = r0 p0..p7
         punpcklbw   xmm3,       xmm0                    ;
         mm3 = p0..p3
         movdqa      xmm1,       xmm3                    ;
         mm1 = p0..p3
         psllw       xmm3,       2                       ;

         movq        xmm5,       QWORD PTR [esi + eax]   ;
         mm4 = r1 p0..p7
         punpcklbw   xmm5,       xmm0                    ;
         mm5 = r1 p0..p3
         paddusw     xmm3,       xmm5                    ;
         mm3 += mm6

         ;
         thresholding
         movdqa      xmm7,       xmm1                    ;
         mm7 = r0 p0..p3
         psubusw     xmm7,       xmm5                    ;
         mm7 = r0 p0..p3 - r1 p0..p3
         psubusw     xmm5,       xmm1                    ;
         mm5 = r1 p0..p3 - r0 p0..p3
         paddusw     xmm7,       xmm5                    ;
         mm7 = abs(r0 p0..p3 - r1 p0..p3)
         pcmpgtw     xmm7,       xmm2

         movq        xmm5,       QWORD PTR [esi + 2*eax] ;
         mm4 = r2 p0..p7
         punpcklbw   xmm5,       xmm0                    ;
         mm5 = r2 p0..p3
         paddusw     xmm3,       xmm5                    ;
         mm3 += mm5

         ;
         thresholding
         movdqa      xmm6,       xmm1                    ;
         mm6 = r0 p0..p3
         psubusw     xmm6,       xmm5                    ;
         mm6 = r0 p0..p3 - r2 p0..p3
         psubusw     xmm5,       xmm1                    ;
         mm5 = r2 p0..p3 - r2 p0..p3
         paddusw     xmm6,       xmm5                    ;
         mm6 = abs(r0 p0..p3 - r2 p0..p3)
         pcmpgtw     xmm6,       xmm2
         por         xmm7,       xmm6                    ;
         accumulate thresholds


         neg         eax
         movq        xmm5,       QWORD PTR [esi+2*eax]   ;
         mm4 = r-2 p0..p7
         punpcklbw   xmm5,       xmm0                    ;
         mm5 = r-2 p0..p3
         paddusw     xmm3,       xmm5                    ;
         mm3 += mm5

         ;
         thresholding
         movdqa      xmm6,       xmm1                    ;
         mm6 = r0 p0..p3
         psubusw     xmm6,       xmm5                    ;
         mm6 = p0..p3 - r-2 p0..p3
         psubusw     xmm5,       xmm1                    ;
         mm5 = r-2 p0..p3 - p0..p3
         paddusw     xmm6,       xmm5                    ;
         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
         pcmpgtw     xmm6,       xmm2
         por         xmm7,       xmm6                    ;
         accumulate thresholds

         movq        xmm4,       QWORD PTR [esi+eax]     ;
         mm4 = r-1 p0..p7
         punpcklbw   xmm4,       xmm0                    ;
         mm4 = r-1 p0..p3
         paddusw     xmm3,       xmm4                    ;
         mm3 += mm5

         ;
         thresholding
         movdqa      xmm6,       xmm1                    ;
         mm6 = r0 p0..p3
         psubusw     xmm6,       xmm4                    ;
         mm6 = p0..p3 - r-2 p0..p3
         psubusw     xmm4,       xmm1                    ;
         mm5 = r-1 p0..p3 - p0..p3
         paddusw     xmm6,       xmm4                    ;
         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
         pcmpgtw     xmm6,       xmm2
         por         xmm7,       xmm6                    ;
         accumulate thresholds


         paddusw     xmm3,       rd42                    ;
         mm3 += round value
         psraw       xmm3,       3                       ;
         mm3 /= 8

         pand        xmm1,       xmm7                    ;
         mm1 select vals > thresh from source
         pandn       xmm7,       xmm3                    ;
         mm7 select vals < thresh from blurred result
         paddusw     xmm1,       xmm7                    ;
         combination

         packuswb    xmm1,       xmm0                    ;
         pack to bytes
         movq        QWORD PTR [edi], xmm1             ;

         neg         eax                   ;
         pitch is positive
         add         esi,        8
         add         edi,        8

         add         edx,        8
         cmp         edx,        cols

         jl          nextcol

         // done with the all cols, start the across filtering in place
         sub         esi,        edx
         sub         edi,        edx

         xor         edx,        edx
         movq        mm0,        QWORD PTR [edi-8];

         acrossnextcol:
         movq        xmm7,       QWORD PTR [edi +edx -2]
         movd        xmm4,       DWORD PTR [edi +edx +6]

         pslldq      xmm4,       8
         por         xmm4,       xmm7

         movdqa      xmm3,       xmm4
         psrldq      xmm3,       2
         punpcklbw   xmm3,       xmm0              ;
         mm3 = p0..p3
         movdqa      xmm1,       xmm3              ;
         mm1 = p0..p3
         psllw       xmm3,       2


         movdqa      xmm5,       xmm4
         psrldq      xmm5,       3
         punpcklbw   xmm5,       xmm0              ;
         mm5 = p1..p4
         paddusw     xmm3,       xmm5              ;
         mm3 += mm6

         ;
         thresholding
         movdqa      xmm7,       xmm1              ;
         mm7 = p0..p3
         psubusw     xmm7,       xmm5              ;
         mm7 = p0..p3 - p1..p4
         psubusw     xmm5,       xmm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     xmm7,       xmm5              ;
         mm7 = abs(p0..p3 - p1..p4)
         pcmpgtw     xmm7,       xmm2

         movdqa      xmm5,       xmm4
         psrldq      xmm5,       4
         punpcklbw   xmm5,       xmm0              ;
         mm5 = p2..p5
         paddusw     xmm3,       xmm5              ;
         mm3 += mm5

         ;
         thresholding
         movdqa      xmm6,       xmm1              ;
         mm6 = p0..p3
         psubusw     xmm6,       xmm5              ;
         mm6 = p0..p3 - p1..p4
         psubusw     xmm5,       xmm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     xmm6,       xmm5              ;
         mm6 = abs(p0..p3 - p1..p4)
         pcmpgtw     xmm6,       xmm2
         por         xmm7,       xmm6              ;
         accumulate thresholds


         movdqa      xmm5,       xmm4              ;
         mm5 = p-2..p5
         punpcklbw   xmm5,       xmm0              ;
         mm5 = p-2..p1
         paddusw     xmm3,       xmm5              ;
         mm3 += mm5

         ;
         thresholding
         movdqa      xmm6,       xmm1              ;
         mm6 = p0..p3
         psubusw     xmm6,       xmm5              ;
         mm6 = p0..p3 - p1..p4
         psubusw     xmm5,       xmm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     xmm6,       xmm5              ;
         mm6 = abs(p0..p3 - p1..p4)
         pcmpgtw     xmm6,       xmm2
         por         xmm7,       xmm6              ;
         accumulate thresholds

         psrldq      xmm4,       1                   ;
         mm4 = p-1..p5
         punpcklbw   xmm4,       xmm0              ;
         mm4 = p-1..p2
         paddusw     xmm3,       xmm4              ;
         mm3 += mm5

         ;
         thresholding
         movdqa      xmm6,       xmm1              ;
         mm6 = p0..p3
         psubusw     xmm6,       xmm4              ;
         mm6 = p0..p3 - p1..p4
         psubusw     xmm4,       xmm1              ;
         mm5 = p1..p4 - p0..p3
         paddusw     xmm6,       xmm4              ;
         mm6 = abs(p0..p3 - p1..p4)
         pcmpgtw     xmm6,       xmm2
         por         xmm7,       xmm6              ;
         accumulate thresholds

         paddusw     xmm3,       rd42              ;
         mm3 += round value
         psraw       xmm3,       3                 ;
         mm3 /= 8

         pand        xmm1,       xmm7              ;
         mm1 select vals > thresh from source
         pandn       xmm7,       xmm3              ;
         mm7 select vals < thresh from blurred result
         paddusw     xmm1,       xmm7              ;
         combination

         packuswb    xmm1,       xmm0              ;
         pack to bytes
         movq        QWORD PTR [edi+edx-8],  mm0   ;
         store previous four bytes
         movdq2q     mm0,        xmm1

         add         edx,        8
         cmp         edx,        cols
         jl          acrossnextcol;

         // last 8 pixels
         movq        QWORD PTR [edi+edx-8],  mm0

         // done with this rwo
         add         esi, eax               ;
         next line
         mov         eax, dst_pixels_per_line ;
         destination pitch?
         add         edi, eax               ;
         next destination
         mov         eax, src_pixels_per_line ;
         destination pitch?

         dec         ecx                   ;
         decrement count
         jnz         nextrow               ;
         next row
     }
 }


 /*
  * Vertical macroblock post-filter, MMX version.  Works in place on dst, one
  * 4-pixel-wide column strip per pass of the C loop.
  *
  * For each strip a running sum and running sum of squares are kept over a
  * 15-row window (rows -7..+7 around the current row).  Where
  *     15 * sumsq - sum * sum < vp8_q2mbl(flimit)
  * the pixel is replaced by (pixel + sum + vp8_rv[row & 127 ...]) >> 4;
  * elsewhere it is left unchanged.  Results are written back 8 rows late,
  * through the ring buffer d, so that rows still inside the window are read
  * unfiltered.
  *
  *   dst     top-left pixel of the plane (filtered in place)
  *   pitch   distance in bytes between rows
  *   rows    number of rows; the loop runs rows + 8 times to flush the delay
  *   cols    number of columns, processed in strips of 4
  *   flimit  passed through vp8_q2mbl() to obtain the threshold
  *
  * The code reads from 8 rows above dst and, because of the extra 8
  * iterations, well below the last row.  NOTE(review): callers presumably
  * provide a border of that size - confirm.
  *
  * NOTE(review): during the first 8 iterations of every strip the slot read
  * back from d has not been written in this strip (for c == 0 it is
  * uninitialised stack), and it is stored to the 8 rows above dst.
  *
  * No `emms` is executed here.
  */
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
 {
     int c, i;
     /* threshold replicated into both dwords of an MMX register */
     __declspec(align(16))
     int flimit2[2];
     /* ring buffer of pending output rows; the asm below addresses it as
        16 slots of 4 bytes (d[ecx*4] is an unscaled byte offset) */
     __declspec(align(16))
     unsigned char d[16][8];

     flimit = vp8_q2mbl(flimit);

     for (i = 0; i < 2; i++)
         flimit2[i] = flimit;

     /* 8 extra iterations flush the 8-row write-back delay */
     rows += 8;

     for (c = 0; c < cols; c += 4)
     {
         unsigned char *s = &dst[c];

         __asm
         {
             mov         esi,        s           ;
             pxor        mm0,        mm0     ;

             mov         eax,        pitch       ;
             neg         eax                                     // eax = -pitch

             lea         esi,        [esi + eax*8];              // esi = &s[-pitch*8]
             neg         eax


             // mm5 = running sum (4 words); mm6/mm7 = running sum of squares
             // for pixels 0..1 / 2..3 (2 dwords each)
             pxor        mm5,        mm5
             pxor        mm6,        mm6     ;

             pxor        mm7,        mm7     ;
             mov         edi,        esi

             // prime the window with the 15 rows s[-8*pitch] .. s[6*pitch]
             mov         ecx,        15          ;

             loop_initvar:
             movd        mm1,        DWORD PTR [edi];
             punpcklbw   mm1,        mm0     ;

             paddw       mm5,        mm1     ;
             pmullw      mm1,        mm1     ;

             movq        mm2,        mm1     ;
             punpcklwd   mm1,        mm0     ;

             punpckhwd   mm2,        mm0     ;
             paddd       mm6,        mm1     ;

             paddd       mm7,        mm2     ;
             lea         edi,        [edi+eax]   ;

             dec         ecx
             jne         loop_initvar
             //save the var and sum
             // edx = row counter; on entry edi = &s[7*pitch]
             xor         edx,        edx
             loop_row:
             // mm1 = row leaving the window, mm2 = row entering it
             movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
             movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]

             punpcklbw   mm1,        mm0
             punpcklbw   mm2,        mm0

             // sum += entering - leaving
             paddw       mm5,        mm2
             psubw       mm5,        mm1

             // sumsq += entering^2
             pmullw      mm2,        mm2
             movq        mm4,        mm2

             punpcklwd   mm2,        mm0
             punpckhwd   mm4,        mm0

             paddd       mm6,        mm2
             paddd       mm7,        mm4

             // sumsq -= leaving^2
             pmullw      mm1,        mm1
             movq        mm2,        mm1

             punpcklwd   mm1,        mm0
             psubd       mm6,        mm1

             punpckhwd   mm2,        mm0
             psubd       mm7,        mm2


             // mm3 = 16*sumsq - sumsq = 15*sumsq (pixels 0..1)
             movq        mm3,        mm6
             pslld       mm3,        4

             psubd       mm3,        mm6
             movq        mm1,        mm5

             // mm1/mm2 = sum*sum as 32-bit values (low and high halves of the
             // 16x16 products interleaved)
             movq        mm4,        mm5
             pmullw      mm1,        mm1

             pmulhw      mm4,        mm4
             movq        mm2,        mm1

             punpcklwd   mm1,        mm4
             punpckhwd   mm2,        mm4

             // mm4 = 15*sumsq (pixels 2..3)
             movq        mm4,        mm7
             pslld       mm4,        4

             psubd       mm4,        mm7

             psubd       mm3,        mm1
             psubd       mm4,        mm2

             psubd       mm3,        flimit2
             psubd       mm4,        flimit2

             // sign bit -> all-ones where 15*sumsq - sum*sum - flimit < 0
             psrad       mm3,        31
             psrad       mm4,        31

             // narrow the four dword masks to four byte masks
             packssdw    mm3,        mm4
             packsswb    mm3,        mm0

             // current (centre) row: esi + 8*pitch
             movd        mm1,        DWORD PTR [esi+eax*8]

             movq        mm2,        mm1
             punpcklbw   mm1,        mm0

             paddw       mm1,        mm5
             mov         ecx,        edx

             // four consecutive vp8_rv entries starting at index (row & 127)
             and         ecx,        127
             movq        mm4,        vp8_rv[ecx*2]

             paddw       mm1,        mm4
             //paddw     xmm1,       eight8s
             psraw       mm1,        4

             // filtered bytes where the mask is set, original bytes elsewhere
             packuswb    mm1,        mm0
             pand        mm1,        mm3

             pandn       mm3,        mm2
             por         mm1,        mm3

             // park the result in ring slot (row & 15)
             and         ecx,        15
             movd        DWORD PTR  d[ecx*4], mm1

             // fetch the slot parked 8 iterations ago ...
             mov         ecx,        edx
             sub         ecx,        8

             and         ecx,        15
             movd        mm1,        DWORD PTR d[ecx*4]

             // ... and write it to the row that just left the window
             movd        [esi],      mm1
             lea         esi,        [esi+eax]

             lea         edi,        [edi+eax]
             add         edx,        1

             cmp         edx,        rows
             jl          loop_row

         }

     }
 }

 /*
  * Vertical macroblock post-filter, SSE2 version.  Works in place on dst, one
  * 8-pixel-wide column strip per pass of the C loop.
  *
  * For each strip a running sum and running sum of squares are kept over a
  * 15-row window (rows -7..+7 around the current row).  Where
  *     15 * sumsq - sum * sum < vp8_q2mbl(flimit)
  * the pixel is replaced by (pixel + sum + vp8_rv[row & 127 ...]) >> 4;
  * elsewhere it is left unchanged.  Results are written back 8 rows late,
  * through the ring buffer d, so that rows still inside the window are read
  * unfiltered.
  *
  *   dst     top-left pixel of the plane (filtered in place)
  *   pitch   distance in bytes between rows
  *   rows    number of rows; the loop runs rows + 8 times to flush the delay
  *   cols    number of columns, processed in strips of 8
  *   flimit  passed through vp8_q2mbl() to obtain the threshold
  *
  * The code reads from 8 rows above dst and, because of the extra 8
  * iterations, well below the last row.  NOTE(review): callers presumably
  * provide a border of that size - confirm.
  *
  * NOTE(review): during the first 8 iterations of every strip the slot read
  * back from d has not been written in this strip (for c == 0 it is
  * uninitialised stack), and it is stored to the 8 rows above dst.
  *
  * MMX register mm0 is used for the write-back and no `emms` is executed here.
  */
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
 {
     int c, i;
     /* threshold replicated into all four dwords of an XMM register */
     __declspec(align(16))
     int flimit4[4];
     /* ring buffer of pending output rows: 16 slots of 8 bytes
        (d[ecx*8] in the asm below is an unscaled byte offset) */
     __declspec(align(16))
     unsigned char d[16][8];

     flimit = vp8_q2mbl(flimit);

     for (i = 0; i < 4; i++)
         flimit4[i] = flimit;

     /* 8 extra iterations flush the 8-row write-back delay */
     rows += 8;

     for (c = 0; c < cols; c += 8)
     {
         unsigned char *s = &dst[c];

         __asm
         {
             mov         esi,        s           ;
             pxor        xmm0,       xmm0        ;

             mov         eax,        pitch       ;
             neg         eax                                     // eax = -pitch

             lea         esi,        [esi + eax*8];              // esi = &s[-pitch*8]
             neg         eax


             // xmm5 = running sum (8 words); xmm6/xmm7 = running sum of
             // squares for pixels 0..3 / 4..7 (4 dwords each)
             pxor        xmm5,       xmm5
             pxor        xmm6,       xmm6        ;

             pxor        xmm7,       xmm7        ;
             mov         edi,        esi

             // prime the window with the 15 rows s[-8*pitch] .. s[6*pitch]
             mov         ecx,        15          ;

             loop_initvar:
             movq        xmm1,       QWORD PTR [edi];
             punpcklbw   xmm1,       xmm0        ;

             paddw       xmm5,       xmm1        ;
             pmullw      xmm1,       xmm1        ;

             movdqa      xmm2,       xmm1        ;
             punpcklwd   xmm1,       xmm0        ;

             punpckhwd   xmm2,       xmm0        ;
             paddd       xmm6,       xmm1        ;

             paddd       xmm7,       xmm2        ;
             lea         edi,        [edi+eax]   ;

             dec         ecx
             jne         loop_initvar
             //save the var and sum
             // edx = row counter; on entry edi = &s[7*pitch]
             xor         edx,        edx
             loop_row:
             // xmm1 = row leaving the window, xmm2 = row entering it
             movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]
             movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]

             punpcklbw   xmm1,       xmm0
             punpcklbw   xmm2,       xmm0

             // sum += entering - leaving
             paddw       xmm5,       xmm2
             psubw       xmm5,       xmm1

             // sumsq += entering^2
             pmullw      xmm2,       xmm2
             movdqa      xmm4,       xmm2

             punpcklwd   xmm2,       xmm0
             punpckhwd   xmm4,       xmm0

             paddd       xmm6,       xmm2
             paddd       xmm7,       xmm4

             // sumsq -= leaving^2
             pmullw      xmm1,       xmm1
             movdqa      xmm2,       xmm1

             punpcklwd   xmm1,       xmm0
             psubd       xmm6,       xmm1

             punpckhwd   xmm2,       xmm0
             psubd       xmm7,       xmm2


             // xmm3 = 16*sumsq - sumsq = 15*sumsq (pixels 0..3)
             movdqa      xmm3,       xmm6
             pslld       xmm3,       4

             psubd       xmm3,       xmm6
             movdqa      xmm1,       xmm5

             // xmm1/xmm2 = sum*sum as 32-bit values (low and high halves of
             // the 16x16 products interleaved)
             movdqa      xmm4,       xmm5
             pmullw      xmm1,       xmm1

             pmulhw      xmm4,       xmm4
             movdqa      xmm2,       xmm1

             punpcklwd   xmm1,       xmm4
             punpckhwd   xmm2,       xmm4

             // xmm4 = 15*sumsq (pixels 4..7)
             movdqa      xmm4,       xmm7
             pslld       xmm4,       4

             psubd       xmm4,       xmm7

             psubd       xmm3,       xmm1
             psubd       xmm4,       xmm2

             psubd       xmm3,       flimit4
             psubd       xmm4,       flimit4

             // sign bit -> all-ones where 15*sumsq - sum*sum - flimit < 0
             psrad       xmm3,       31
             psrad       xmm4,       31

             // narrow the eight dword masks to eight byte masks
             packssdw    xmm3,       xmm4
             packsswb    xmm3,       xmm0

             // current (centre) row: esi + 8*pitch
             movq        xmm1,       QWORD PTR [esi+eax*8]

             movq        xmm2,       xmm1
             punpcklbw   xmm1,       xmm0

             paddw       xmm1,       xmm5
             mov         ecx,        edx

             // eight consecutive vp8_rv entries starting at index (row & 127)
             and         ecx,        127
             movdqu      xmm4,       vp8_rv[ecx*2]

             paddw       xmm1,       xmm4
             //paddw     xmm1,       eight8s
             psraw       xmm1,       4

             // filtered bytes where the mask is set, original bytes elsewhere
             packuswb    xmm1,       xmm0
             pand        xmm1,       xmm3

             pandn       xmm3,       xmm2
             por         xmm1,       xmm3

             // park the result in ring slot (row & 15)
             and         ecx,        15
             movq        QWORD PTR  d[ecx*8], xmm1

             // fetch the slot parked 8 iterations ago ...
             mov         ecx,        edx
             sub         ecx,        8

             and         ecx,        15
             movq        mm0,        d[ecx*8]

             // ... and write it to the row that just left the window
             movq        [esi],      mm0
             lea         esi,        [esi+eax]

             lea         edi,        [edi+eax]
             add         edx,        1

             cmp         edx,        rows
             jl          loop_row

         }

     }
 }
 #if 0
 /****************************************************************************
  *
  *  ROUTINE       : plane_add_noise_wmt
  *
  *  NOTE          : this routine sits inside an `#if 0` region and is not
  *                  compiled.
  *
  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
  *                                  noise to
  *                  unsigned int Width    width of plane
  *                  unsigned int Height   height of plane
  *                  int  Pitch    distance between subsequent lines of frame
  *                  int  q        quantizer used to determine amount of noise
  *                                  to add
  *                  int  a        constant offset added to the noise sigma
  *
  *  OUTPUTS       : None.
  *
  *  RETURNS       : void.
  *
  *  FUNCTION      : adds gaussian noise to a plane of pixels, in place
  *
  *  SPECIAL NOTES : Each row is processed 16 bytes at a time, so up to 15
  *                  bytes past Width may be modified.
  *                  NOTE(review): each row reads up to
  *                  255 + Width (rounded up to 16) bytes from the 2048-byte
  *                  Rand table; nothing here bounds Width against that.
  *
  ****************************************************************************/
 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
 {
     unsigned int i;

     __declspec(align(16)) unsigned char blackclamp[16];
     __declspec(align(16)) unsigned char whiteclamp[16];
     __declspec(align(16)) unsigned char bothclamp[16];
     char char_dist[300];   /* noise values distributed per the gaussian */
     char Rand[2048];       /* random draws from char_dist */
     double sigma;
 //    return;
     /* clear MMX state before the floating-point maths below */
     __asm emms
     sigma = a + .5 + .6 * (63 - q) / 63.0;

     // set up a lookup table of 256 entries that matches
     // a gaussian distribution with sigma determined by q.
     //
     {
         /* NOTE(review): `i` here shadows the outer unsigned loop counter */
         double i;
         int next, j;

         next = 0;

         for (i = -32; i < 32; i++)
         {
             double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
             /* NOTE(review): `a` here shadows the function parameter;
                it is the number of table entries that get value i */
             int a = (int)(g + .5);

             if (a)
             {
                 for (j = 0; j < a; j++)
                 {
                     char_dist[next+j] = (char) i;
                 }

                 next = next + j;
             }

         }

         /* zero-fill whatever remains of the first 256 entries */
         for (next = next; next < 256; next++)
             char_dist[next] = 0;

     }

     /* fill the noise table with random draws from the distribution */
     for (i = 0; i < 2048; i++)
     {
         Rand[i] = char_dist[rand() & 0xff];
     }

     /* char_dist is filled in ascending order, so char_dist[0] is its lowest
        value; the clamps are built from its negation */
     for (i = 0; i < 16; i++)
     {
         blackclamp[i] = -char_dist[0];
         whiteclamp[i] = -char_dist[0];
         bothclamp[i] = -2 * char_dist[0];
     }

     for (i = 0; i < Height; i++)
     {
         unsigned char *Pos = Start + i * Pitch;
         /* start each row at a random offset (0..255) into the noise table */
         char  *Ref = Rand + (rand() & 0xff);

         __asm
         {
             mov ecx, [Width]
             mov esi, Pos
             mov edi, Ref
             xor         eax, eax

             nextset:
             movdqu      xmm1, [esi+eax]        // get the source

             psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
             paddusb     xmm1, bothclamp
             psubusb     xmm1, whiteclamp

             movdqu      xmm2, [edi+eax]        // get the noise for this line
             paddb       xmm1, xmm2             // add it in
             movdqu      [esi+eax], xmm1        // store the result

             add         eax, 16                // move to the next 16 pixels of this row

             cmp         eax, ecx
             jl          nextset


         }

     }
 }
 #endif
 /* Rounding term: 8 in each of the four dword lanes.  It is added with paddd
  * right before the arithmetic shift right by 4 in
  * vp8_mbpost_proc_across_ip_xmm, where it is used as a 128-bit memory operand. */
 __declspec(align(16))
 static const int four8s[4] = { 8, 8, 8, 8};
 /*
  * In-place horizontal smoothing filter applied to `rows` rows of `src`.
  *
  * src     first pixel of the first row.  On every row the bytes at
  *         s[-8..-1] and bytes beyond s[cols] are read as well, and
  *         s[-8..-1] is overwritten (see the delayed store below).
  * pitch   byte distance between consecutive rows
  * rows    number of rows to process
  * cols    pixels per row; the column loop advances 4 pixels at a time and
  *         continues while the column index is below cols + 8
  * flimit  converted with vp8_q2mbl() to the threshold actually used
  *
  * Per row a running sum and sum of squares is kept over a sliding 15-pixel
  * window: it is seeded from s[-8..6], and every column adds the pixel 7 to
  * the right and drops the pixel 8 to the left.  Where
  * 15 * sumsq - sum * sum is below the threshold, the pixel is replaced by
  * (pixel + sum + 8) >> 4; otherwise it is kept unchanged.
  *
  * NOTE(review): mm0/mm1 are used and no emms is issued in this function;
  * presumably the caller resets the MMX/x87 state -- confirm.
  */
 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
 {
     int r, i;
     /* threshold replicated into four dword lanes; used as a psubd memory operand */
     __declspec(align(16))
     int flimit4[4];
     unsigned char *s = src;
     int sumsq;
     int sum;


     flimit = vp8_q2mbl(flimit);
     flimit4[0] =
         flimit4[1] =
             flimit4[2] =
                 flimit4[3] = flimit;

     for (r = 0; r < rows; r++)
     {


         /* seed the window totals from the 15 pixels s[-8] .. s[6] */
         sumsq = 0;
         sum = 0;

         for (i = -8; i <= 6; i++)
         {
             sumsq += s[i] * s[i];
             sum   += s[i];
         }

         __asm
         {
             mov         eax,    sumsq
             movd        xmm7,   eax                     // xmm7 lane 0 = running sum of squares

             mov         eax,    sum
             movd        xmm6,   eax                     // xmm6 lane 0 = running sum

             mov         esi,    s
             xor         ecx,    ecx                     // ecx = column index

             mov         edx,    cols
             add         edx,    8                       // loop bound: cols + 8
             pxor        mm0,    mm0                     // mm0/mm1: two-stage delay line
             pxor        mm1,    mm1                     // for the filtered output

             pxor        xmm0,   xmm0                    // xmm0 = 0, used for unpack/pack
             nextcol4:

             movd        xmm1,   DWORD PTR [esi+ecx-8]   // 4 pixels leaving the window: -8 -7 -6 -5
             movd        xmm2,   DWORD PTR [esi+ecx+7]   // 4 pixels entering the window: +7 +8 +9 +10

             punpcklbw   xmm1,   xmm0                    // bytes -> words
             punpcklbw   xmm2,   xmm0                    // bytes -> words

             punpcklwd   xmm1,   xmm0                    // words -> dwords
             punpcklwd   xmm2,   xmm0                    // words -> dwords

             psubd       xmm2,   xmm1                    // entering - leaving, per lane
             paddd       xmm1,   xmm1                    // 2 * leaving

             paddd       xmm1,   xmm2                    // entering + leaving
             pmaddwd     xmm1,   xmm2                    // (entering + leaving) * (entering - leaving)
                                                         //   = entering^2 - leaving^2 per lane

             paddd       xmm6,   xmm2                    // lane 0: sum for the first of the 4 columns
             paddd       xmm7,   xmm1                    // lane 0: sumsq for the first of the 4 columns

             pshufd      xmm6,   xmm6,   0               // broadcast lane 0 to all four lanes
             pshufd      xmm7,   xmm7,   0               // broadcast lane 0 to all four lanes

             psrldq      xmm1,       4                   // drop lane 0 of the squared deltas: d1 d2 d3 0
             psrldq      xmm2,       4                   // drop lane 0 of the deltas:         d1 d2 d3 0

             // the three shuffle/add pairs below turn the per-lane deltas into
             // prefix sums, so lane k ends up holding the totals for column ecx+k
             pshufd      xmm3,   xmm1,   3               // 0  d1  d1  d1  (squared deltas)
             pshufd      xmm4,   xmm2,   3               // 0  d1  d1  d1  (deltas)

             paddd       xmm6,   xmm4
             paddd       xmm7,   xmm3

             pshufd      xmm3,   xmm1,   01011111b       // 0  0  d2  d2  (squared deltas)
             pshufd      xmm4,   xmm2,   01011111b       // 0  0  d2  d2  (deltas)

             paddd       xmm7,   xmm3
             paddd       xmm6,   xmm4

             pshufd      xmm3,   xmm1,   10111111b       // 0  0  0  d3  (squared deltas)
             pshufd      xmm4,   xmm2,   10111111b       // 0  0  0  d3  (deltas)

             paddd       xmm7,   xmm3
             paddd       xmm6,   xmm4

             movdqa      xmm3,   xmm6
             pmaddwd     xmm3,   xmm3                    // sum * sum per lane

             movdqa      xmm5,   xmm7
             pslld       xmm5,   4                       // 16 * sumsq

             psubd       xmm5,   xmm7                    // 15 * sumsq
             psubd       xmm5,   xmm3                    // 15 * sumsq - sum * sum

             psubd       xmm5,   flimit4                 // minus the threshold
             psrad       xmm5,   31                      // all ones where the result is negative

             packssdw    xmm5,   xmm0                    // narrow the dword masks to words
             packsswb    xmm5,   xmm0                    // ... and to bytes (0x00 / 0xFF)

             movd        xmm1,   DWORD PTR [esi+ecx]     // the 4 pixels being filtered
             movq        xmm2,   xmm1                    // keep an unmodified copy

             punpcklbw   xmm1,   xmm0                    // bytes -> words
             punpcklwd   xmm1,   xmm0                    // words -> dwords

             paddd       xmm1,   xmm6                    // pixel + sum
             paddd       xmm1,   four8s                  // + 8 (rounding)

             psrad       xmm1,   4                       // >> 4
             packssdw    xmm1,   xmm0                    // dwords -> words

             packuswb    xmm1,   xmm0                    // words -> bytes
             pand        xmm1,   xmm5                    // filtered value where the mask is set

             pandn       xmm5,   xmm2                    // original pixel where it is not
             por         xmm5,   xmm1                    // merged 4-pixel result

             // delayed store: the result written here was computed two
             // iterations (8 pixels) earlier, and this iteration's load from
             // [esi+ecx-8] has already happened above.  In the first two
             // iterations mm0 still holds zero, so s[-8..-1] receives zeros.
             movd        [esi+ecx-8],  mm0
             movq        mm0,    mm1                     // advance the delay line

             movdq2q     mm1,    xmm5                    // newest result enters the delay line
             psrldq      xmm7,   12                      // carry lane 3 (latest sumsq) into lane 0

             psrldq      xmm6,   12                      // carry lane 3 (latest sum) into lane 0
             add         ecx,    4                       // next group of 4 columns

             cmp         ecx,    edx
             jl          nextcol4

         }
         s += pitch;
     }
 }

 #if 0

 /****************************************************************************
  *
  *  ROUTINE       : vp8_plane_add_noise_mmx
  *
  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
  *                                  noise to
  *                  unsigned int Width    width of plane
  *                  unsigned int Height   height of plane
  *                  int  Pitch    distance between subsequent lines of frame
  *                  int  q        quantizer used to determine amount of noise
  *                                  to add
  *                  int  a        base value added into the gaussian sigma
  *                                  (sigma = a + .5 + .6 * (63 - q) / 63.0)
  *
  *  OUTPUTS       : None.
  *
  *  RETURNS       : void.
  *
  *  FUNCTION      : adds gaussian noise to a plane of pixels
  *
  *  SPECIAL NOTES : This definition is the #if 0 branch and is not compiled.
  *                  It rebuilds its noise tables on every call.  Each row is
  *                  processed 8 bytes per iteration and the loop body always
  *                  runs at least once, so bytes past Width (up to the next
  *                  multiple of 8) are also modified.
  *
  ****************************************************************************/
 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
 {
     unsigned int i;
     // Pitch4, noise_amount and noise_adder are not referenced again in this function
     int Pitch4 = Pitch * 4;
     const int noise_amount = 2;
     const int noise_adder = 2 * noise_amount + 1;

     // clamp vectors used as memory operands by the MMX code below
     // (only the first 8 bytes of each are read by the 64-bit instructions)
     __declspec(align(16)) unsigned char blackclamp[16];
     __declspec(align(16)) unsigned char whiteclamp[16];
     __declspec(align(16)) unsigned char bothclamp[16];

     // char_dist: 256-entry distribution table (declared with some slack)
     char char_dist[300];
     // Rand: noise values drawn from char_dist, indexed per row at a random offset
     char Rand[2048];

     double sigma;
     // emms is executed before the floating-point math that follows
     __asm emms
     sigma = a + .5 + .6 * (63 - q) / 63.0;

     // set up a lookup table of 256 entries that matches
     // a gaussian distribution with sigma determined by q and a.
     //
     {
         // NOTE: this `i` (double) shadows the outer unsigned `i`;
         // `sum` is initialised but never used
         double i, sum = 0;
         int next, j;

         next = 0;

         for (i = -32; i < 32; i++)
         {
             // NOTE: this `a` shadows the parameter `a`; it is the number of
             // table slots given to the value i (rounded 256 * gaussian)
             int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));

             if (a)
             {
                 for (j = 0; j < a; j++)
                 {
                     char_dist[next+j] = (char) i;
                 }

                 next = next + j;
             }

         }

         // pad whatever is left of the first 256 entries with zero noise
         for (next = next; next < 256; next++)
             char_dist[next] = 0;

     }

     // fill the noise buffer with random picks from the distribution table
     for (i = 0; i < 2048; i++)
     {
         Rand[i] = char_dist[rand() & 0xff];
     }

     // char_dist is filled in increasing order of i, so char_dist[0] is its
     // smallest entry; the clamp vectors hold its negation c (and 2 * c)
     for (i = 0; i < 16; i++)
     {
         blackclamp[i] = -char_dist[0];
         whiteclamp[i] = -char_dist[0];
         bothclamp[i] = -2 * char_dist[0];
     }

     for (i = 0; i < Height; i++)
     {
         unsigned char *Pos = Start + i * Pitch;
         // each row starts reading noise at a random offset of 0..255
         // NOTE(review): rows wider than about 2048 - 255 bytes would read
         // past the end of Rand -- confirm the intended maximum width
         char  *Ref = Rand + (rand() & 0xff);

         __asm
         {
             mov ecx, [Width]
             mov esi, Pos
             mov edi, Ref
             xor         eax, eax

             nextset:
             movq        mm1, [esi+eax]        // load 8 source pixels

             // saturating -c, +2c, -c: limits each byte to [c, 255 - c]
             // (for 2c <= 255) so the wrapping add below cannot over/underflow
             // for noise values in [-c, c]
             psubusb     mm1, blackclamp
             paddusb     mm1, bothclamp
             psubusb     mm1, whiteclamp

             movq        mm2, [edi+eax]        // load 8 noise bytes for this row
             paddb       mm1, mm2             // add the noise (wrapping byte add)
             movq        [esi+eax], mm1        // store the result in place

             add         eax, 8                // advance to the next 8 bytes of the row

             cmp         eax, ecx              // signed compare of offset against Width
             jl          nextset


         }

     }
 }
 #else
 /* Noise bytes, defined elsewhere.  The noise functions below read a row's
  * worth of bytes from an[a][q], starting at a random offset of 0..255. */
 extern char an[8][64][3072];
 /* Defined elsewhere.  The negated value of cd[a][q] (and twice that) fills
  * the clamp vectors in the noise functions below. */
 extern int cd[8][64];

 /*
  * Adds table-driven noise to a plane of pixels in place, 8 bytes at a time
  * using MMX.
  *
  * Start   first byte of the plane
  * Width   bytes to process per row; rows are handled in 8-byte steps and the
  *         first step always runs, so bytes up to the next multiple of 8 are
  *         modified as well
  * Height  number of rows
  * Pitch   byte distance between consecutive rows
  * q, a    select the noise row an[a][q] and the clamp value cd[a][q]
  *
  * NOTE(review): MMX registers are used in the row loop and no emms follows
  * it; presumably the caller resets the MMX/x87 state -- confirm.
  */
 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
 {
     __declspec(align(16)) unsigned char blackclamp[16];
     __declspec(align(16)) unsigned char whiteclamp[16];
     __declspec(align(16)) unsigned char bothclamp[16];
     unsigned int lane;
     unsigned int row;

     __asm emms

     /* replicate the negated table value (and twice that) into every byte lane */
     for (lane = 0; lane < 16; lane++)
     {
         const int level = cd[a][q];

         blackclamp[lane] = -level;
         whiteclamp[lane] = -level;
         bothclamp[lane] = -2 * level;
     }

     for (row = 0; row < Height; row++)
     {
         unsigned char *row_ptr = Start + row * Pitch;
         /* every row starts at a random offset of 0..255 into the noise table */
         char *noise_ptr = an[a][q] + (rand() & 0xff);

         __asm
         {
             mov         ecx, [Width]
             mov         esi, row_ptr
             mov         edi, noise_ptr
             xor         eax, eax                  // eax = byte offset within the row

             noise_step8:
             movq        mm1, [esi+eax]            // load 8 pixels

             // saturating subtract / add / subtract keeps each byte far enough
             // from 0 and 255 that the wrapping add below stays in range
             psubusb     mm1, blackclamp
             paddusb     mm1, bothclamp
             psubusb     mm1, whiteclamp

             movq        mm2, [edi+eax]            // load 8 noise bytes
             paddb       mm1, mm2                  // apply the noise
             movq        [esi+eax], mm1            // write the pixels back

             add         eax, 8                    // next 8 bytes of this row

             cmp         eax, ecx
             jl          noise_step8
         }
     }
 }


 /*
  * Adds table-driven noise to a plane of pixels in place, 16 bytes at a time
  * using SSE2 (unaligned loads and stores on the pixel and noise data).
  *
  * Start   first byte of the plane
  * Width   bytes to process per row; rows are handled in 16-byte steps and
  *         the first step always runs, so bytes up to the next multiple of 16
  *         are modified as well
  * Height  number of rows
  * Pitch   byte distance between consecutive rows
  * q, a    select the noise row an[a][q] and the clamp value cd[a][q]
  */
 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
 {
     __declspec(align(16)) unsigned char blackclamp[16];
     __declspec(align(16)) unsigned char whiteclamp[16];
     __declspec(align(16)) unsigned char bothclamp[16];
     unsigned int lane;
     unsigned int row;

     __asm emms

     /* replicate the negated table value (and twice that) into every byte lane */
     for (lane = 0; lane < 16; lane++)
     {
         const int level = cd[a][q];

         blackclamp[lane] = -level;
         whiteclamp[lane] = -level;
         bothclamp[lane] = -2 * level;
     }

     for (row = 0; row < Height; row++)
     {
         unsigned char *row_ptr = Start + row * Pitch;
         /* every row starts at a random offset of 0..255 into the noise table */
         char *noise_ptr = an[a][q] + (rand() & 0xff);

         __asm
         {
             mov         ecx, [Width]
             mov         esi, row_ptr
             mov         edi, noise_ptr
             xor         eax, eax                  // eax = byte offset within the row

             noise_step16:
             movdqu      xmm1, [esi+eax]           // load 16 pixels

             // saturating subtract / add / subtract keeps each byte far enough
             // from 0 and 255 that the wrapping add below stays in range
             psubusb     xmm1, blackclamp
             paddusb     xmm1, bothclamp
             psubusb     xmm1, whiteclamp

             movdqu      xmm2, [edi+eax]           // load 16 noise bytes
             paddb       xmm1, xmm2                // apply the noise
             movdqu      [esi+eax], xmm1           // write the pixels back

             add         eax, 16                   // next 16 bytes of this row

             cmp         eax, ecx
             jl          noise_step16
         }
     }
 }

 #endif