| /* |
| * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| |
| #include <math.h> |
| #include <stdlib.h> |
| #include "vpx_scale/yv12config.h" |
| #include "pragmas.h" |
| |
| #define VP8_FILTER_WEIGHT 128 |
| #define VP8_FILTER_SHIFT 7 |
| |
| |
| |
| /* static constants */ |
| __declspec(align(16)) |
| const static short Blur[48] = |
| { |
| |
| 16, 16, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16, 16, |
| 64, 64, 64, 64, 64, 64, 64, 64, |
| 16, 16, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16, 16, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| |
| }; |
| #define RD __declspec(align(16)) __int64 rd = 0x0040004000400040; |
| #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004}; |
| |
| #ifndef RELOCATEABLE |
| const static RD; |
| const static R4D2; |
| #endif |
| |
| |
| /* external references */ |
| extern double vp8_gaussian(double sigma, double mu, double x); |
| extern short vp8_rv[]; |
| extern int vp8_q2mbl(int x) ; |
| |
| |
| |
| void vp8_post_proc_down_and_across_mmx |
| ( |
| unsigned char *src_ptr, |
| unsigned char *dst_ptr, |
| int src_pixels_per_line, |
| int dst_pixels_per_line, |
| int rows, |
| int cols, |
| int flimit |
| ) |
| { |
| #ifdef RELOCATEABLE |
| RD |
| R4D2 |
| #endif |
| |
| __asm |
| { |
| push ebx |
| lea ebx, Blur |
| movd mm2, flimit |
| punpcklwd mm2, mm2 |
| punpckldq mm2, mm2 |
| |
| mov esi, src_ptr |
| mov edi, dst_ptr |
| |
| mov ecx, DWORD PTR rows |
| mov eax, src_pixels_per_line ; |
| destination pitch? |
| pxor mm0, mm0 ; |
| mm0 = 00000000 |
| |
| nextrow: |
| |
| xor edx, edx ; |
| |
| clear out edx for use as loop counter |
| nextcol: |
| |
| pxor mm7, mm7 ; |
| |
| mm7 = 00000000 |
| movq mm6, [ebx + 32 ] ; |
| mm6 = kernel 2 taps |
| movq mm3, [esi] ; |
| mm4 = r0 p0..p7 |
| punpcklbw mm3, mm0 ; |
| mm3 = p0..p3 |
| movq mm1, mm3 ; |
| mm1 = p0..p3 |
| pmullw mm3, mm6 ; |
| mm3 *= kernel 2 modifiers |
| |
| movq mm6, [ebx + 48] ; |
| mm6 = kernel 3 taps |
| movq mm5, [esi + eax] ; |
| mm4 = r1 p0..p7 |
| punpcklbw mm5, mm0 ; |
| mm5 = r1 p0..p3 |
| pmullw mm6, mm5 ; |
| mm6 *= p0..p3 * kernel 3 modifiers |
| paddusw mm3, mm6 ; |
| mm3 += mm6 |
| |
| ; |
| thresholding |
| movq mm7, mm1 ; |
| mm7 = r0 p0..p3 |
| psubusw mm7, mm5 ; |
| mm7 = r0 p0..p3 - r1 p0..p3 |
| psubusw mm5, mm1 ; |
| mm5 = r1 p0..p3 - r0 p0..p3 |
| paddusw mm7, mm5 ; |
| mm7 = abs(r0 p0..p3 - r1 p0..p3) |
| pcmpgtw mm7, mm2 |
| |
| movq mm6, [ebx + 64 ] ; |
| mm6 = kernel 4 modifiers |
| movq mm5, [esi + 2*eax] ; |
| mm4 = r2 p0..p7 |
| punpcklbw mm5, mm0 ; |
| mm5 = r2 p0..p3 |
| pmullw mm6, mm5 ; |
| mm5 *= kernel 4 modifiers |
| paddusw mm3, mm6 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movq mm6, mm1 ; |
| mm6 = r0 p0..p3 |
| psubusw mm6, mm5 ; |
| mm6 = r0 p0..p3 - r2 p0..p3 |
| psubusw mm5, mm1 ; |
| mm5 = r2 p0..p3 - r2 p0..p3 |
| paddusw mm6, mm5 ; |
| mm6 = abs(r0 p0..p3 - r2 p0..p3) |
| pcmpgtw mm6, mm2 |
| por mm7, mm6 ; |
| accumulate thresholds |
| |
| |
| neg eax |
| movq mm6, [ebx ] ; |
| kernel 0 taps |
| movq mm5, [esi+2*eax] ; |
| mm4 = r-2 p0..p7 |
| punpcklbw mm5, mm0 ; |
| mm5 = r-2 p0..p3 |
| pmullw mm6, mm5 ; |
| mm5 *= kernel 0 modifiers |
| paddusw mm3, mm6 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movq mm6, mm1 ; |
| mm6 = r0 p0..p3 |
| psubusw mm6, mm5 ; |
| mm6 = p0..p3 - r-2 p0..p3 |
| psubusw mm5, mm1 ; |
| mm5 = r-2 p0..p3 - p0..p3 |
| paddusw mm6, mm5 ; |
| mm6 = abs(r0 p0..p3 - r-2 p0..p3) |
| pcmpgtw mm6, mm2 |
| por mm7, mm6 ; |
| accumulate thresholds |
| |
| movq mm6, [ebx + 16] ; |
| kernel 1 taps |
| movq mm4, [esi+eax] ; |
| mm4 = r-1 p0..p7 |
| punpcklbw mm4, mm0 ; |
| mm4 = r-1 p0..p3 |
| pmullw mm6, mm4 ; |
| mm4 *= kernel 1 modifiers. |
| paddusw mm3, mm6 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movq mm6, mm1 ; |
| mm6 = r0 p0..p3 |
| psubusw mm6, mm4 ; |
| mm6 = p0..p3 - r-2 p0..p3 |
| psubusw mm4, mm1 ; |
| mm5 = r-1 p0..p3 - p0..p3 |
| paddusw mm6, mm4 ; |
| mm6 = abs(r0 p0..p3 - r-1 p0..p3) |
| pcmpgtw mm6, mm2 |
| por mm7, mm6 ; |
| accumulate thresholds |
| |
| |
| paddusw mm3, rd ; |
| mm3 += round value |
| psraw mm3, VP8_FILTER_SHIFT ; |
| mm3 /= 128 |
| |
| pand mm1, mm7 ; |
| mm1 select vals > thresh from source |
| pandn mm7, mm3 ; |
| mm7 select vals < thresh from blurred result |
| paddusw mm1, mm7 ; |
| combination |
| |
| packuswb mm1, mm0 ; |
| pack to bytes |
| |
| movd [edi], mm1 ; |
| neg eax ; |
| pitch is positive |
| |
| |
| add esi, 4 |
| add edi, 4 |
| add edx, 4 |
| |
| cmp edx, cols |
| jl nextcol |
| // done with the all cols, start the across filtering in place |
| sub esi, edx |
| sub edi, edx |
| |
| |
| push eax |
| xor edx, edx |
| mov eax, [edi-4]; |
| |
| acrossnextcol: |
| pxor mm7, mm7 ; |
| mm7 = 00000000 |
| movq mm6, [ebx + 32 ] ; |
| movq mm4, [edi+edx] ; |
| mm4 = p0..p7 |
| movq mm3, mm4 ; |
| mm3 = p0..p7 |
| punpcklbw mm3, mm0 ; |
| mm3 = p0..p3 |
| movq mm1, mm3 ; |
| mm1 = p0..p3 |
| pmullw mm3, mm6 ; |
| mm3 *= kernel 2 modifiers |
| |
| movq mm6, [ebx + 48] |
| psrlq mm4, 8 ; |
| mm4 = p1..p7 |
| movq mm5, mm4 ; |
| mm5 = p1..p7 |
| punpcklbw mm5, mm0 ; |
| mm5 = p1..p4 |
| pmullw mm6, mm5 ; |
| mm6 *= p1..p4 * kernel 3 modifiers |
| paddusw mm3, mm6 ; |
| mm3 += mm6 |
| |
| ; |
| thresholding |
| movq mm7, mm1 ; |
| mm7 = p0..p3 |
| psubusw mm7, mm5 ; |
| mm7 = p0..p3 - p1..p4 |
| psubusw mm5, mm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw mm7, mm5 ; |
| mm7 = abs(p0..p3 - p1..p4) |
| pcmpgtw mm7, mm2 |
| |
| movq mm6, [ebx + 64 ] |
| psrlq mm4, 8 ; |
| mm4 = p2..p7 |
| movq mm5, mm4 ; |
| mm5 = p2..p7 |
| punpcklbw mm5, mm0 ; |
| mm5 = p2..p5 |
| pmullw mm6, mm5 ; |
| mm5 *= kernel 4 modifiers |
| paddusw mm3, mm6 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movq mm6, mm1 ; |
| mm6 = p0..p3 |
| psubusw mm6, mm5 ; |
| mm6 = p0..p3 - p1..p4 |
| psubusw mm5, mm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw mm6, mm5 ; |
| mm6 = abs(p0..p3 - p1..p4) |
| pcmpgtw mm6, mm2 |
| por mm7, mm6 ; |
| accumulate thresholds |
| |
| |
| movq mm6, [ebx ] |
| movq mm4, [edi+edx-2] ; |
| mm4 = p-2..p5 |
| movq mm5, mm4 ; |
| mm5 = p-2..p5 |
| punpcklbw mm5, mm0 ; |
| mm5 = p-2..p1 |
| pmullw mm6, mm5 ; |
| mm5 *= kernel 0 modifiers |
| paddusw mm3, mm6 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movq mm6, mm1 ; |
| mm6 = p0..p3 |
| psubusw mm6, mm5 ; |
| mm6 = p0..p3 - p1..p4 |
| psubusw mm5, mm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw mm6, mm5 ; |
| mm6 = abs(p0..p3 - p1..p4) |
| pcmpgtw mm6, mm2 |
| por mm7, mm6 ; |
| accumulate thresholds |
| |
| movq mm6, [ebx + 16] |
| psrlq mm4, 8 ; |
| mm4 = p-1..p5 |
| punpcklbw mm4, mm0 ; |
| mm4 = p-1..p2 |
| pmullw mm6, mm4 ; |
| mm4 *= kernel 1 modifiers. |
| paddusw mm3, mm6 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movq mm6, mm1 ; |
| mm6 = p0..p3 |
| psubusw mm6, mm4 ; |
| mm6 = p0..p3 - p1..p4 |
| psubusw mm4, mm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw mm6, mm4 ; |
| mm6 = abs(p0..p3 - p1..p4) |
| pcmpgtw mm6, mm2 |
| por mm7, mm6 ; |
| accumulate thresholds |
| |
| paddusw mm3, rd ; |
| mm3 += round value |
| psraw mm3, VP8_FILTER_SHIFT ; |
| mm3 /= 128 |
| |
| pand mm1, mm7 ; |
| mm1 select vals > thresh from source |
| pandn mm7, mm3 ; |
| mm7 select vals < thresh from blurred result |
| paddusw mm1, mm7 ; |
| combination |
| |
| packuswb mm1, mm0 ; |
| pack to bytes |
| mov DWORD PTR [edi+edx-4], eax ; |
| store previous four bytes |
| movd eax, mm1 |
| |
| add edx, 4 |
| cmp edx, cols |
| jl acrossnextcol; |
| |
| mov DWORD PTR [edi+edx-4], eax |
| pop eax |
| |
| // done with this rwo |
| add esi, eax ; |
| next line |
| mov eax, dst_pixels_per_line ; |
| destination pitch? |
| add edi, eax ; |
| next destination |
| mov eax, src_pixels_per_line ; |
| destination pitch? |
| |
| dec ecx ; |
| decrement count |
| jnz nextrow ; |
| next row |
| pop ebx |
| |
| } |
| } |
| |
| |
| |
| void vp8_post_proc_down_and_across_xmm |
| ( |
| unsigned char *src_ptr, |
| unsigned char *dst_ptr, |
| int src_pixels_per_line, |
| int dst_pixels_per_line, |
| int rows, |
| int cols, |
| int flimit |
| ) |
| { |
| #ifdef RELOCATEABLE |
| R4D2 |
| #endif |
| |
| __asm |
| { |
| movd xmm2, flimit |
| punpcklwd xmm2, xmm2 |
| punpckldq xmm2, xmm2 |
| punpcklqdq xmm2, xmm2 |
| |
| mov esi, src_ptr |
| mov edi, dst_ptr |
| |
| mov ecx, DWORD PTR rows |
| mov eax, src_pixels_per_line ; |
| destination pitch? |
| pxor xmm0, xmm0 ; |
| mm0 = 00000000 |
| |
| nextrow: |
| |
| xor edx, edx ; |
| |
| clear out edx for use as loop counter |
| nextcol: |
| movq xmm3, QWORD PTR [esi] ; |
| |
| mm4 = r0 p0..p7 |
| punpcklbw xmm3, xmm0 ; |
| mm3 = p0..p3 |
| movdqa xmm1, xmm3 ; |
| mm1 = p0..p3 |
| psllw xmm3, 2 ; |
| |
| movq xmm5, QWORD PTR [esi + eax] ; |
| mm4 = r1 p0..p7 |
| punpcklbw xmm5, xmm0 ; |
| mm5 = r1 p0..p3 |
| paddusw xmm3, xmm5 ; |
| mm3 += mm6 |
| |
| ; |
| thresholding |
| movdqa xmm7, xmm1 ; |
| mm7 = r0 p0..p3 |
| psubusw xmm7, xmm5 ; |
| mm7 = r0 p0..p3 - r1 p0..p3 |
| psubusw xmm5, xmm1 ; |
| mm5 = r1 p0..p3 - r0 p0..p3 |
| paddusw xmm7, xmm5 ; |
| mm7 = abs(r0 p0..p3 - r1 p0..p3) |
| pcmpgtw xmm7, xmm2 |
| |
| movq xmm5, QWORD PTR [esi + 2*eax] ; |
| mm4 = r2 p0..p7 |
| punpcklbw xmm5, xmm0 ; |
| mm5 = r2 p0..p3 |
| paddusw xmm3, xmm5 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movdqa xmm6, xmm1 ; |
| mm6 = r0 p0..p3 |
| psubusw xmm6, xmm5 ; |
| mm6 = r0 p0..p3 - r2 p0..p3 |
| psubusw xmm5, xmm1 ; |
| mm5 = r2 p0..p3 - r2 p0..p3 |
| paddusw xmm6, xmm5 ; |
| mm6 = abs(r0 p0..p3 - r2 p0..p3) |
| pcmpgtw xmm6, xmm2 |
| por xmm7, xmm6 ; |
| accumulate thresholds |
| |
| |
| neg eax |
| movq xmm5, QWORD PTR [esi+2*eax] ; |
| mm4 = r-2 p0..p7 |
| punpcklbw xmm5, xmm0 ; |
| mm5 = r-2 p0..p3 |
| paddusw xmm3, xmm5 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movdqa xmm6, xmm1 ; |
| mm6 = r0 p0..p3 |
| psubusw xmm6, xmm5 ; |
| mm6 = p0..p3 - r-2 p0..p3 |
| psubusw xmm5, xmm1 ; |
| mm5 = r-2 p0..p3 - p0..p3 |
| paddusw xmm6, xmm5 ; |
| mm6 = abs(r0 p0..p3 - r-2 p0..p3) |
| pcmpgtw xmm6, xmm2 |
| por xmm7, xmm6 ; |
| accumulate thresholds |
| |
| movq xmm4, QWORD PTR [esi+eax] ; |
| mm4 = r-1 p0..p7 |
| punpcklbw xmm4, xmm0 ; |
| mm4 = r-1 p0..p3 |
| paddusw xmm3, xmm4 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movdqa xmm6, xmm1 ; |
| mm6 = r0 p0..p3 |
| psubusw xmm6, xmm4 ; |
| mm6 = p0..p3 - r-2 p0..p3 |
| psubusw xmm4, xmm1 ; |
| mm5 = r-1 p0..p3 - p0..p3 |
| paddusw xmm6, xmm4 ; |
| mm6 = abs(r0 p0..p3 - r-1 p0..p3) |
| pcmpgtw xmm6, xmm2 |
| por xmm7, xmm6 ; |
| accumulate thresholds |
| |
| |
| paddusw xmm3, rd42 ; |
| mm3 += round value |
| psraw xmm3, 3 ; |
| mm3 /= 8 |
| |
| pand xmm1, xmm7 ; |
| mm1 select vals > thresh from source |
| pandn xmm7, xmm3 ; |
| mm7 select vals < thresh from blurred result |
| paddusw xmm1, xmm7 ; |
| combination |
| |
| packuswb xmm1, xmm0 ; |
| pack to bytes |
| movq QWORD PTR [edi], xmm1 ; |
| |
| neg eax ; |
| pitch is positive |
| add esi, 8 |
| add edi, 8 |
| |
| add edx, 8 |
| cmp edx, cols |
| |
| jl nextcol |
| |
| // done with the all cols, start the across filtering in place |
| sub esi, edx |
| sub edi, edx |
| |
| xor edx, edx |
| movq mm0, QWORD PTR [edi-8]; |
| |
| acrossnextcol: |
| movq xmm7, QWORD PTR [edi +edx -2] |
| movd xmm4, DWORD PTR [edi +edx +6] |
| |
| pslldq xmm4, 8 |
| por xmm4, xmm7 |
| |
| movdqa xmm3, xmm4 |
| psrldq xmm3, 2 |
| punpcklbw xmm3, xmm0 ; |
| mm3 = p0..p3 |
| movdqa xmm1, xmm3 ; |
| mm1 = p0..p3 |
| psllw xmm3, 2 |
| |
| |
| movdqa xmm5, xmm4 |
| psrldq xmm5, 3 |
| punpcklbw xmm5, xmm0 ; |
| mm5 = p1..p4 |
| paddusw xmm3, xmm5 ; |
| mm3 += mm6 |
| |
| ; |
| thresholding |
| movdqa xmm7, xmm1 ; |
| mm7 = p0..p3 |
| psubusw xmm7, xmm5 ; |
| mm7 = p0..p3 - p1..p4 |
| psubusw xmm5, xmm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw xmm7, xmm5 ; |
| mm7 = abs(p0..p3 - p1..p4) |
| pcmpgtw xmm7, xmm2 |
| |
| movdqa xmm5, xmm4 |
| psrldq xmm5, 4 |
| punpcklbw xmm5, xmm0 ; |
| mm5 = p2..p5 |
| paddusw xmm3, xmm5 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movdqa xmm6, xmm1 ; |
| mm6 = p0..p3 |
| psubusw xmm6, xmm5 ; |
| mm6 = p0..p3 - p1..p4 |
| psubusw xmm5, xmm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw xmm6, xmm5 ; |
| mm6 = abs(p0..p3 - p1..p4) |
| pcmpgtw xmm6, xmm2 |
| por xmm7, xmm6 ; |
| accumulate thresholds |
| |
| |
| movdqa xmm5, xmm4 ; |
| mm5 = p-2..p5 |
| punpcklbw xmm5, xmm0 ; |
| mm5 = p-2..p1 |
| paddusw xmm3, xmm5 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movdqa xmm6, xmm1 ; |
| mm6 = p0..p3 |
| psubusw xmm6, xmm5 ; |
| mm6 = p0..p3 - p1..p4 |
| psubusw xmm5, xmm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw xmm6, xmm5 ; |
| mm6 = abs(p0..p3 - p1..p4) |
| pcmpgtw xmm6, xmm2 |
| por xmm7, xmm6 ; |
| accumulate thresholds |
| |
| psrldq xmm4, 1 ; |
| mm4 = p-1..p5 |
| punpcklbw xmm4, xmm0 ; |
| mm4 = p-1..p2 |
| paddusw xmm3, xmm4 ; |
| mm3 += mm5 |
| |
| ; |
| thresholding |
| movdqa xmm6, xmm1 ; |
| mm6 = p0..p3 |
| psubusw xmm6, xmm4 ; |
| mm6 = p0..p3 - p1..p4 |
| psubusw xmm4, xmm1 ; |
| mm5 = p1..p4 - p0..p3 |
| paddusw xmm6, xmm4 ; |
| mm6 = abs(p0..p3 - p1..p4) |
| pcmpgtw xmm6, xmm2 |
| por xmm7, xmm6 ; |
| accumulate thresholds |
| |
| paddusw xmm3, rd42 ; |
| mm3 += round value |
| psraw xmm3, 3 ; |
| mm3 /= 8 |
| |
| pand xmm1, xmm7 ; |
| mm1 select vals > thresh from source |
| pandn xmm7, xmm3 ; |
| mm7 select vals < thresh from blurred result |
| paddusw xmm1, xmm7 ; |
| combination |
| |
| packuswb xmm1, xmm0 ; |
| pack to bytes |
| movq QWORD PTR [edi+edx-8], mm0 ; |
| store previous four bytes |
| movdq2q mm0, xmm1 |
| |
| add edx, 8 |
| cmp edx, cols |
| jl acrossnextcol; |
| |
| // last 8 pixels |
| movq QWORD PTR [edi+edx-8], mm0 |
| |
| // done with this rwo |
| add esi, eax ; |
| next line |
| mov eax, dst_pixels_per_line ; |
| destination pitch? |
| add edi, eax ; |
| next destination |
| mov eax, src_pixels_per_line ; |
| destination pitch? |
| |
| dec ecx ; |
| decrement count |
| jnz nextrow ; |
| next row |
| } |
| } |
| |
| |
| void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit) |
| { |
| int c, i; |
| __declspec(align(16)) |
| int flimit2[2]; |
| __declspec(align(16)) |
| unsigned char d[16][8]; |
| |
| flimit = vp8_q2mbl(flimit); |
| |
| for (i = 0; i < 2; i++) |
| flimit2[i] = flimit; |
| |
| rows += 8; |
| |
| for (c = 0; c < cols; c += 4) |
| { |
| unsigned char *s = &dst[c]; |
| |
| __asm |
| { |
| mov esi, s ; |
| pxor mm0, mm0 ; |
| |
| mov eax, pitch ; |
| neg eax // eax = -pitch |
| |
| lea esi, [esi + eax*8]; // edi = s[-pitch*8] |
| neg eax |
| |
| |
| pxor mm5, mm5 |
| pxor mm6, mm6 ; |
| |
| pxor mm7, mm7 ; |
| mov edi, esi |
| |
| mov ecx, 15 ; |
| |
| loop_initvar: |
| movd mm1, DWORD PTR [edi]; |
| punpcklbw mm1, mm0 ; |
| |
| paddw mm5, mm1 ; |
| pmullw mm1, mm1 ; |
| |
| movq mm2, mm1 ; |
| punpcklwd mm1, mm0 ; |
| |
| punpckhwd mm2, mm0 ; |
| paddd mm6, mm1 ; |
| |
| paddd mm7, mm2 ; |
| lea edi, [edi+eax] ; |
| |
| dec ecx |
| jne loop_initvar |
| //save the var and sum |
| xor edx, edx |
| loop_row: |
| movd mm1, DWORD PTR [esi] // [s-pitch*8] |
| movd mm2, DWORD PTR [edi] // [s+pitch*7] |
| |
| punpcklbw mm1, mm0 |
| punpcklbw mm2, mm0 |
| |
| paddw mm5, mm2 |
| psubw mm5, mm1 |
| |
| pmullw mm2, mm2 |
| movq mm4, mm2 |
| |
| punpcklwd mm2, mm0 |
| punpckhwd mm4, mm0 |
| |
| paddd mm6, mm2 |
| paddd mm7, mm4 |
| |
| pmullw mm1, mm1 |
| movq mm2, mm1 |
| |
| punpcklwd mm1, mm0 |
| psubd mm6, mm1 |
| |
| punpckhwd mm2, mm0 |
| psubd mm7, mm2 |
| |
| |
| movq mm3, mm6 |
| pslld mm3, 4 |
| |
| psubd mm3, mm6 |
| movq mm1, mm5 |
| |
| movq mm4, mm5 |
| pmullw mm1, mm1 |
| |
| pmulhw mm4, mm4 |
| movq mm2, mm1 |
| |
| punpcklwd mm1, mm4 |
| punpckhwd mm2, mm4 |
| |
| movq mm4, mm7 |
| pslld mm4, 4 |
| |
| psubd mm4, mm7 |
| |
| psubd mm3, mm1 |
| psubd mm4, mm2 |
| |
| psubd mm3, flimit2 |
| psubd mm4, flimit2 |
| |
| psrad mm3, 31 |
| psrad mm4, 31 |
| |
| packssdw mm3, mm4 |
| packsswb mm3, mm0 |
| |
| movd mm1, DWORD PTR [esi+eax*8] |
| |
| movq mm2, mm1 |
| punpcklbw mm1, mm0 |
| |
| paddw mm1, mm5 |
| mov ecx, edx |
| |
| and ecx, 127 |
| movq mm4, vp8_rv[ecx*2] |
| |
| paddw mm1, mm4 |
| //paddw xmm1, eight8s |
| psraw mm1, 4 |
| |
| packuswb mm1, mm0 |
| pand mm1, mm3 |
| |
| pandn mm3, mm2 |
| por mm1, mm3 |
| |
| and ecx, 15 |
| movd DWORD PTR d[ecx*4], mm1 |
| |
| mov ecx, edx |
| sub ecx, 8 |
| |
| and ecx, 15 |
| movd mm1, DWORD PTR d[ecx*4] |
| |
| movd [esi], mm1 |
| lea esi, [esi+eax] |
| |
| lea edi, [edi+eax] |
| add edx, 1 |
| |
| cmp edx, rows |
| jl loop_row |
| |
| } |
| |
| } |
| } |
| |
| void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit) |
| { |
| int c, i; |
| __declspec(align(16)) |
| int flimit4[4]; |
| __declspec(align(16)) |
| unsigned char d[16][8]; |
| |
| flimit = vp8_q2mbl(flimit); |
| |
| for (i = 0; i < 4; i++) |
| flimit4[i] = flimit; |
| |
| rows += 8; |
| |
| for (c = 0; c < cols; c += 8) |
| { |
| unsigned char *s = &dst[c]; |
| |
| __asm |
| { |
| mov esi, s ; |
| pxor xmm0, xmm0 ; |
| |
| mov eax, pitch ; |
| neg eax // eax = -pitch |
| |
| lea esi, [esi + eax*8]; // edi = s[-pitch*8] |
| neg eax |
| |
| |
| pxor xmm5, xmm5 |
| pxor xmm6, xmm6 ; |
| |
| pxor xmm7, xmm7 ; |
| mov edi, esi |
| |
| mov ecx, 15 ; |
| |
| loop_initvar: |
| movq xmm1, QWORD PTR [edi]; |
| punpcklbw xmm1, xmm0 ; |
| |
| paddw xmm5, xmm1 ; |
| pmullw xmm1, xmm1 ; |
| |
| movdqa xmm2, xmm1 ; |
| punpcklwd xmm1, xmm0 ; |
| |
| punpckhwd xmm2, xmm0 ; |
| paddd xmm6, xmm1 ; |
| |
| paddd xmm7, xmm2 ; |
| lea edi, [edi+eax] ; |
| |
| dec ecx |
| jne loop_initvar |
| //save the var and sum |
| xor edx, edx |
| loop_row: |
| movq xmm1, QWORD PTR [esi] // [s-pitch*8] |
| movq xmm2, QWORD PTR [edi] // [s+pitch*7] |
| |
| punpcklbw xmm1, xmm0 |
| punpcklbw xmm2, xmm0 |
| |
| paddw xmm5, xmm2 |
| psubw xmm5, xmm1 |
| |
| pmullw xmm2, xmm2 |
| movdqa xmm4, xmm2 |
| |
| punpcklwd xmm2, xmm0 |
| punpckhwd xmm4, xmm0 |
| |
| paddd xmm6, xmm2 |
| paddd xmm7, xmm4 |
| |
| pmullw xmm1, xmm1 |
| movdqa xmm2, xmm1 |
| |
| punpcklwd xmm1, xmm0 |
| psubd xmm6, xmm1 |
| |
| punpckhwd xmm2, xmm0 |
| psubd xmm7, xmm2 |
| |
| |
| movdqa xmm3, xmm6 |
| pslld xmm3, 4 |
| |
| psubd xmm3, xmm6 |
| movdqa xmm1, xmm5 |
| |
| movdqa xmm4, xmm5 |
| pmullw xmm1, xmm1 |
| |
| pmulhw xmm4, xmm4 |
| movdqa xmm2, xmm1 |
| |
| punpcklwd xmm1, xmm4 |
| punpckhwd xmm2, xmm4 |
| |
| movdqa xmm4, xmm7 |
| pslld xmm4, 4 |
| |
| psubd xmm4, xmm7 |
| |
| psubd xmm3, xmm1 |
| psubd xmm4, xmm2 |
| |
| psubd xmm3, flimit4 |
| psubd xmm4, flimit4 |
| |
| psrad xmm3, 31 |
| psrad xmm4, 31 |
| |
| packssdw xmm3, xmm4 |
| packsswb xmm3, xmm0 |
| |
| movq xmm1, QWORD PTR [esi+eax*8] |
| |
| movq xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| |
| paddw xmm1, xmm5 |
| mov ecx, edx |
| |
| and ecx, 127 |
| movdqu xmm4, vp8_rv[ecx*2] |
| |
| paddw xmm1, xmm4 |
| //paddw xmm1, eight8s |
| psraw xmm1, 4 |
| |
| packuswb xmm1, xmm0 |
| pand xmm1, xmm3 |
| |
| pandn xmm3, xmm2 |
| por xmm1, xmm3 |
| |
| and ecx, 15 |
| movq QWORD PTR d[ecx*8], xmm1 |
| |
| mov ecx, edx |
| sub ecx, 8 |
| |
| and ecx, 15 |
| movq mm0, d[ecx*8] |
| |
| movq [esi], mm0 |
| lea esi, [esi+eax] |
| |
| lea edi, [edi+eax] |
| add edx, 1 |
| |
| cmp edx, rows |
| jl loop_row |
| |
| } |
| |
| } |
| } |
| #if 0 |
| /**************************************************************************** |
| * |
| * ROUTINE : plane_add_noise_wmt |
| * |
| * INPUTS : unsigned char *Start starting address of buffer to add gaussian |
| * noise to |
| * unsigned int Width width of plane |
| * unsigned int Height height of plane |
| * int Pitch distance between subsequent lines of frame |
| * int q quantizer used to determine amount of noise |
| * to add |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void. |
| * |
| * FUNCTION : adds gaussian noise to a plane of pixels |
| * |
| * SPECIAL NOTES : None. |
| * |
| ****************************************************************************/ |
| void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) |
| { |
| unsigned int i; |
| |
| __declspec(align(16)) unsigned char blackclamp[16]; |
| __declspec(align(16)) unsigned char whiteclamp[16]; |
| __declspec(align(16)) unsigned char bothclamp[16]; |
| char char_dist[300]; |
| char Rand[2048]; |
| double sigma; |
| // return; |
| __asm emms |
| sigma = a + .5 + .6 * (63 - q) / 63.0; |
| |
| // set up a lookup table of 256 entries that matches |
| // a gaussian distribution with sigma determined by q. |
| // |
| { |
| double i; |
| int next, j; |
| |
| next = 0; |
| |
| for (i = -32; i < 32; i++) |
| { |
| double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i); |
| int a = (int)(g + .5); |
| |
| if (a) |
| { |
| for (j = 0; j < a; j++) |
| { |
| char_dist[next+j] = (char) i; |
| } |
| |
| next = next + j; |
| } |
| |
| } |
| |
| for (next = next; next < 256; next++) |
| char_dist[next] = 0; |
| |
| } |
| |
| for (i = 0; i < 2048; i++) |
| { |
| Rand[i] = char_dist[rand() & 0xff]; |
| } |
| |
| for (i = 0; i < 16; i++) |
| { |
| blackclamp[i] = -char_dist[0]; |
| whiteclamp[i] = -char_dist[0]; |
| bothclamp[i] = -2 * char_dist[0]; |
| } |
| |
| for (i = 0; i < Height; i++) |
| { |
| unsigned char *Pos = Start + i * Pitch; |
| char *Ref = Rand + (rand() & 0xff); |
| |
| __asm |
| { |
| mov ecx, [Width] |
| mov esi, Pos |
| mov edi, Ref |
| xor eax, eax |
| |
| nextset: |
| movdqu xmm1, [esi+eax] // get the source |
| |
| psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise |
| paddusb xmm1, bothclamp |
| psubusb xmm1, whiteclamp |
| |
| movdqu xmm2, [edi+eax] // get the noise for this line |
| paddb xmm1, xmm2 // add it in |
| movdqu [esi+eax], xmm1 // store the result |
| |
| add eax, 16 // move to the next line |
| |
| cmp eax, ecx |
| jl nextset |
| |
| |
| } |
| |
| } |
| } |
| #endif |
| __declspec(align(16)) |
| static const int four8s[4] = { 8, 8, 8, 8}; |
| void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit) |
| { |
| int r, i; |
| __declspec(align(16)) |
| int flimit4[4]; |
| unsigned char *s = src; |
| int sumsq; |
| int sum; |
| |
| |
| flimit = vp8_q2mbl(flimit); |
| flimit4[0] = |
| flimit4[1] = |
| flimit4[2] = |
| flimit4[3] = flimit; |
| |
| for (r = 0; r < rows; r++) |
| { |
| |
| |
| sumsq = 0; |
| sum = 0; |
| |
| for (i = -8; i <= 6; i++) |
| { |
| sumsq += s[i] * s[i]; |
| sum += s[i]; |
| } |
| |
| __asm |
| { |
| mov eax, sumsq |
| movd xmm7, eax |
| |
| mov eax, sum |
| movd xmm6, eax |
| |
| mov esi, s |
| xor ecx, ecx |
| |
| mov edx, cols |
| add edx, 8 |
| pxor mm0, mm0 |
| pxor mm1, mm1 |
| |
| pxor xmm0, xmm0 |
| nextcol4: |
| |
| movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5 |
| movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10 |
| |
| punpcklbw xmm1, xmm0 // expanding |
| punpcklbw xmm2, xmm0 // expanding |
| |
| punpcklwd xmm1, xmm0 // expanding to dwords |
| punpcklwd xmm2, xmm0 // expanding to dwords |
| |
| psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5 |
| paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2 |
| |
| paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5 |
| pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5 |
| |
| paddd xmm6, xmm2 |
| paddd xmm7, xmm1 |
| |
| pshufd xmm6, xmm6, 0 // duplicate the last ones |
| pshufd xmm7, xmm7, 0 // duplicate the last ones |
| |
| psrldq xmm1, 4 // 8--7 9--6 10--5 0000 |
| psrldq xmm2, 4 // 8--7 9--6 10--5 0000 |
| |
| pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared |
| pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared |
| |
| paddd xmm6, xmm4 |
| paddd xmm7, xmm3 |
| |
| pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared |
| pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared |
| |
| paddd xmm7, xmm3 |
| paddd xmm6, xmm4 |
| |
| pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared |
| pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared |
| |
| paddd xmm7, xmm3 |
| paddd xmm6, xmm4 |
| |
| movdqa xmm3, xmm6 |
| pmaddwd xmm3, xmm3 |
| |
| movdqa xmm5, xmm7 |
| pslld xmm5, 4 |
| |
| psubd xmm5, xmm7 |
| psubd xmm5, xmm3 |
| |
| psubd xmm5, flimit4 |
| psrad xmm5, 31 |
| |
| packssdw xmm5, xmm0 |
| packsswb xmm5, xmm0 |
| |
| movd xmm1, DWORD PTR [esi+ecx] |
| movq xmm2, xmm1 |
| |
| punpcklbw xmm1, xmm0 |
| punpcklwd xmm1, xmm0 |
| |
| paddd xmm1, xmm6 |
| paddd xmm1, four8s |
| |
| psrad xmm1, 4 |
| packssdw xmm1, xmm0 |
| |
| packuswb xmm1, xmm0 |
| pand xmm1, xmm5 |
| |
| pandn xmm5, xmm2 |
| por xmm5, xmm1 |
| |
| movd [esi+ecx-8], mm0 |
| movq mm0, mm1 |
| |
| movdq2q mm1, xmm5 |
| psrldq xmm7, 12 |
| |
| psrldq xmm6, 12 |
| add ecx, 4 |
| |
| cmp ecx, edx |
| jl nextcol4 |
| |
| } |
| s += pitch; |
| } |
| } |
| |
| #if 0 |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : plane_add_noise_mmx |
| * |
| * INPUTS : unsigned char *Start starting address of buffer to add gaussian |
| * noise to |
| * unsigned int Width width of plane |
| * unsigned int Height height of plane |
| * int Pitch distance between subsequent lines of frame |
| * int q quantizer used to determine amount of noise |
| * to add |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void. |
| * |
| * FUNCTION : adds gaussian noise to a plane of pixels |
| * |
| * SPECIAL NOTES : None. |
| * |
| ****************************************************************************/ |
| void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) |
| { |
| unsigned int i; |
| int Pitch4 = Pitch * 4; |
| const int noise_amount = 2; |
| const int noise_adder = 2 * noise_amount + 1; |
| |
| __declspec(align(16)) unsigned char blackclamp[16]; |
| __declspec(align(16)) unsigned char whiteclamp[16]; |
| __declspec(align(16)) unsigned char bothclamp[16]; |
| |
| char char_dist[300]; |
| char Rand[2048]; |
| |
| double sigma; |
| __asm emms |
| sigma = a + .5 + .6 * (63 - q) / 63.0; |
| |
| // set up a lookup table of 256 entries that matches |
| // a gaussian distribution with sigma determined by q. |
| // |
| { |
| double i, sum = 0; |
| int next, j; |
| |
| next = 0; |
| |
| for (i = -32; i < 32; i++) |
| { |
| int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); |
| |
| if (a) |
| { |
| for (j = 0; j < a; j++) |
| { |
| char_dist[next+j] = (char) i; |
| } |
| |
| next = next + j; |
| } |
| |
| } |
| |
| for (next = next; next < 256; next++) |
| char_dist[next] = 0; |
| |
| } |
| |
| for (i = 0; i < 2048; i++) |
| { |
| Rand[i] = char_dist[rand() & 0xff]; |
| } |
| |
| for (i = 0; i < 16; i++) |
| { |
| blackclamp[i] = -char_dist[0]; |
| whiteclamp[i] = -char_dist[0]; |
| bothclamp[i] = -2 * char_dist[0]; |
| } |
| |
| for (i = 0; i < Height; i++) |
| { |
| unsigned char *Pos = Start + i * Pitch; |
| char *Ref = Rand + (rand() & 0xff); |
| |
| __asm |
| { |
| mov ecx, [Width] |
| mov esi, Pos |
| mov edi, Ref |
| xor eax, eax |
| |
| nextset: |
| movq mm1, [esi+eax] // get the source |
| |
| psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise |
| paddusb mm1, bothclamp |
| psubusb mm1, whiteclamp |
| |
| movq mm2, [edi+eax] // get the noise for this line |
| paddb mm1, mm2 // add it in |
| movq [esi+eax], mm1 // store the result |
| |
| add eax, 8 // move to the next line |
| |
| cmp eax, ecx |
| jl nextset |
| |
| |
| } |
| |
| } |
| } |
| #else |
| extern char an[8][64][3072]; |
| extern int cd[8][64]; |
| |
| void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) |
| { |
| unsigned int i; |
| __declspec(align(16)) unsigned char blackclamp[16]; |
| __declspec(align(16)) unsigned char whiteclamp[16]; |
| __declspec(align(16)) unsigned char bothclamp[16]; |
| |
| |
| __asm emms |
| |
| for (i = 0; i < 16; i++) |
| { |
| blackclamp[i] = -cd[a][q]; |
| whiteclamp[i] = -cd[a][q]; |
| bothclamp[i] = -2 * cd[a][q]; |
| } |
| |
| for (i = 0; i < Height; i++) |
| { |
| unsigned char *Pos = Start + i * Pitch; |
| char *Ref = an[a][q] + (rand() & 0xff); |
| |
| __asm |
| { |
| mov ecx, [Width] |
| mov esi, Pos |
| mov edi, Ref |
| xor eax, eax |
| |
| nextset: |
| movq mm1, [esi+eax] // get the source |
| |
| psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise |
| paddusb mm1, bothclamp |
| psubusb mm1, whiteclamp |
| |
| movq mm2, [edi+eax] // get the noise for this line |
| paddb mm1, mm2 // add it in |
| movq [esi+eax], mm1 // store the result |
| |
| add eax, 8 // move to the next line |
| |
| cmp eax, ecx |
| jl nextset |
| } |
| } |
| } |
| |
| |
| void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) |
| { |
| unsigned int i; |
| |
| __declspec(align(16)) unsigned char blackclamp[16]; |
| __declspec(align(16)) unsigned char whiteclamp[16]; |
| __declspec(align(16)) unsigned char bothclamp[16]; |
| |
| __asm emms |
| |
| for (i = 0; i < 16; i++) |
| { |
| blackclamp[i] = -cd[a][q]; |
| whiteclamp[i] = -cd[a][q]; |
| bothclamp[i] = -2 * cd[a][q]; |
| } |
| |
| for (i = 0; i < Height; i++) |
| { |
| unsigned char *Pos = Start + i * Pitch; |
| char *Ref = an[a][q] + (rand() & 0xff); |
| |
| __asm |
| { |
| mov ecx, [Width] |
| mov esi, Pos |
| mov edi, Ref |
| xor eax, eax |
| |
| nextset: |
| movdqu xmm1, [esi+eax] // get the source |
| |
| psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise |
| paddusb xmm1, bothclamp |
| psubusb xmm1, whiteclamp |
| |
| movdqu xmm2, [edi+eax] // get the noise for this line |
| paddb xmm1, xmm2 // add it in |
| movdqu [esi+eax], xmm1 // store the result |
| |
| add eax, 16 // move to the next line |
| |
| cmp eax, ecx |
| jl nextset |
| } |
| } |
| } |
| |
| #endif |