| /* |
| * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| |
| /**************************************************************************** |
| * |
| * Module Title : scaleopt.cpp |
| * |
| * Description : Optimized scaling functions |
| * |
| ****************************************************************************/ |
| #include "pragmas.h" |
| |
| /**************************************************************************** |
| * Module Statics |
| * |
| * The tables below are compiled out by the "#if 0" guard. The routines that |
| * follow declare the tables they need as aligned locals with the same names |
| * and values, so nothing refers to these file-scope copies. |
| * |
| * The multipliers are fifths scaled by 256 (51 = 1/5, 102 = 2/5, 154 = 3/5, |
| * 205 = 4/5); 128 is the rounding term added before the shift right by 8. |
| ****************************************************************************/ |
| #if 0 |
| __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; |
| __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; |
| __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; |
| __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; |
| __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; |
| __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; |
| __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; |
| __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; |
| __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; |
| __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; |
| __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; |
| #endif |
| |
| #include "vpx_scale/vpxscale.h" |
| #include "vpx_mem/vpx_mem.h" |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : horizontal_line_3_5_scale_mmx |
| * |
| * INPUTS : const unsigned char *source : First pixel of the source line. |
| * unsigned int source_width : Source pixel count; the main loop runs |
| * while the read pointer is below source + source_width - 3. |
| * unsigned char *dest : First byte of the output line. |
| * unsigned int dest_width : Unused (cast to void). |
| * |
| * OUTPUTS : 5 bytes are stored to dest for every 3 source pixels consumed. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. |
| * With p0, p1, p2 the pixels of one group and p3 the first pixel of |
| * the following group: |
| * out0 = p0 |
| * out1 = (102 * p0 + 154 * p1 + 128) >> 8 |
| * out2 = (205 * p1 + 51 * p2 + 128) >> 8 |
| * out3 = ( 51 * p1 + 205 * p2 + 128) >> 8 |
| * out4 = (154 * p2 + 102 * p3 + 128) >> 8 |
| * The final group substitutes p2 for p3, so its out4 equals p2. |
| * |
| * SPECIAL NOTES : MSVC-style x86 inline assembly using MMX registers. |
| * Register diagrams in the comments list bytes in memory order (lowest |
| * address first); "xx" marks a zeroed or don't-care byte. |
| * The loop is bottom-tested and the final group always follows it, so at |
| * least two groups (6 source pixels, 10 output bytes) are processed. |
| * NOTE(review): callers presumably pass a multiple of 3 that is >= 6 - confirm. |
| * The final group is fetched with a 4-byte load, so one byte past its |
| * three pixels is read and then discarded. |
| * The end-of-line test is a signed compare (jl) on addresses. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void horizontal_line_3_5_scale_mmx |
| ( |
| const unsigned char *source, |
| unsigned int source_width, |
| unsigned char *dest, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16)) unsigned short const35_2[] = { 154, 51, 205, 102 }; |
| __declspec(align(16)) unsigned short const35_1[] = { 102, 205, 51, 154 }; |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; |
| |
| (void) dest_width; |
| |
| __asm |
| { |
| |
| push ebx |
| |
| mov esi, source |
| mov edi, dest |
| |
| mov ecx, source_width |
| lea edx, [esi+ecx-3]; |
| |
| movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx (102, 205, 51, 154) |
| movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx (154, 51, 205, 102) |
| |
| movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx (rounding term 128) |
| pxor mm7, mm7 // clear mm7; it supplies the zero bytes for unpacking |
| |
| horiz_line_3_5_loop: |
| |
| mov eax, DWORD PTR [esi] // eax = 00 01 02 03 |
| mov ebx, eax |
| |
| and ebx, 0xffff00 // ebx = xx 01 02 xx |
| mov ecx, eax // ecx = 00 01 02 03 |
| |
| and eax, 0xffff0000 // eax = xx xx 02 03 |
| xor ecx, eax // ecx = 00 01 xx xx |
| |
| shr ebx, 8 // ebx = 01 02 xx xx |
| or eax, ebx // eax = 01 02 02 03 (right-hand neighbours) |
| |
| shl ebx, 16 // ebx = xx xx 01 02 |
| movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx |
| |
| or ebx, ecx // ebx = 00 01 01 02 (left-hand neighbours) |
| punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx |
| |
| movd mm0, ebx // mm0 = 00 01 01 02 |
| pmullw mm1, mm6 // 01*154 02* 51 02*205 03*102 |
| |
| punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx |
| pmullw mm0, mm5 // 00*102 01*205 01* 51 02*154 |
| |
| mov [edi], ebx // write output 00; bytes 1..3 are overwritten by the movd below |
| add esi, 3 |
| |
| add edi, 5 |
| paddw mm0, mm1 |
| |
| paddw mm0, mm4 |
| psrlw mm0, 8 |
| |
| cmp esi, edx |
| packuswb mm0, mm7 |
| |
| movd DWORD Ptr [edi-4], mm0 |
| jl horiz_line_3_5_loop |
| |
| //Exit: final group, the missing right-hand neighbour 03 is replaced by 02 |
| mov eax, DWORD PTR [esi] // eax = 00 01 02 03 (03 lies past the last pixel) |
| mov ebx, eax |
| |
| and ebx, 0xffff00 // ebx = xx 01 02 xx |
| mov ecx, eax // ecx = 00 01 02 03 |
| |
| and eax, 0xffff0000 // eax = xx xx 02 03 |
| xor ecx, eax // ecx = 00 01 xx xx |
| |
| shr ebx, 8 // ebx = 01 02 xx xx |
| or eax, ebx // eax = 01 02 02 03 |
| |
| shl eax, 8 // eax = xx 01 02 02 |
| and eax, 0xffff0000 // eax = xx xx 02 02 |
| |
| or eax, ebx // eax = 01 02 02 02 |
| |
| shl ebx, 16 // ebx = xx xx 01 02 |
| movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx |
| |
| or ebx, ecx // ebx = 00 01 01 02 |
| punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx |
| |
| movd mm0, ebx // mm0 = 00 01 01 02 |
| pmullw mm1, mm6 // 01*154 02* 51 02*205 02*102 |
| |
| punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx |
| pmullw mm0, mm5 // 00*102 01*205 01* 51 02*154 |
| |
| mov [edi], ebx // write output 00; bytes 1..3 are overwritten by the movd below |
| paddw mm0, mm1 |
| |
| paddw mm0, mm4 |
| psrlw mm0, 8 |
| |
| packuswb mm0, mm7 |
| movd DWORD Ptr [edi+1], mm0 |
| |
| pop ebx |
| |
| } |
| |
| /* |
| Disabled C rendering of the same arithmetic, kept as a reference: |
| |
| const unsigned char *src = source; |
| unsigned char *des = dest; |
| unsigned int a, b, c ; |
| unsigned int i; |
| (void) dest_width; |
| |
| for ( i=0; i<source_width-3; i+=3 ) |
| { |
| a = src[0]; |
| b = src[1]; |
| des [0] = (UINT8) (a); |
| // 2 * left + 3 * right /5 |
| des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8); |
| c = src[2] ; |
| // 4 * left + 1 * right /5 |
| des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8); |
| // 1 * left + 4 * right /5 |
| des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8); |
| |
| a = src[3]; |
| // 3 * left + 2 * right /5 |
| des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8); |
| |
| src += 3; |
| des += 5; |
| } |
| |
| a = src[0]; |
| b = src[1]; |
| des [0] = (UINT8) (a); |
| // 2 * left + 3 * right /5 |
| des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8); |
| c = src[2] ; |
| // 4 * left + 1 * right /5 |
| des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8); |
| // 1 * left + 4 * right /5 |
| des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8); |
| |
| des [4] = (UINT8) (c); |
| */ |
| } |
| |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : horizontal_line_4_5_scale_mmx |
| * |
| * INPUTS : const unsigned char *source : First pixel of the source line. |
| * unsigned int source_width : Source pixel count; the main loop runs |
| * while the read pointer is below source + source_width - 8. |
| * unsigned char *dest : First byte of the output line. |
| * unsigned int dest_width : Unused (cast to void). |
| * |
| * OUTPUTS : 10 bytes are stored to dest for every 8 source pixels consumed. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. |
| * Two groups of four pixels are handled per pass. With p0..p3 one group |
| * and p4 the first pixel of the next group: |
| * out0 = p0 |
| * out1 = ( 51 * p0 + 205 * p1 + 128) >> 8 |
| * out2 = (102 * p1 + 154 * p2 + 128) >> 8 |
| * out3 = (154 * p2 + 102 * p3 + 128) >> 8 |
| * out4 = (205 * p3 + 51 * p4 + 128) >> 8 |
| * In the final pass the pixel following 07 is replaced by 07 itself, so |
| * the last output byte equals the last source pixel. |
| * |
| * SPECIAL NOTES : MSVC-style x86 inline assembly using MMX registers. |
| * Register diagrams in the comments list bytes in memory order (lowest |
| * address first); "xx" marks a zero byte produced by unpacking. |
| * Each loop pass loads 9 source bytes ([esi] and [esi+1] as quadwords). |
| * The loop is bottom-tested and the final pass always follows it, so at |
| * least 16 source pixels are consumed and 20 output bytes written. |
| * NOTE(review): callers presumably pass a multiple of 8 that is >= 16 - confirm. |
| * The end-of-line test is a signed compare (jl) on addresses. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void horizontal_line_4_5_scale_mmx |
| ( |
| const unsigned char *source, |
| unsigned int source_width, |
| unsigned char *dest, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; |
| __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102, 51 }; |
| __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 }; |
| __declspec(align(16)) unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; |
| |
| (void)dest_width; |
| |
| __asm |
| { |
| |
| mov esi, source |
| mov edi, dest |
| |
| mov ecx, source_width |
| lea edx, [esi+ecx-8]; |
| |
| movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx ( 51, 102, 154, 205) |
| movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx (205, 154, 102, 51) |
| |
| movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx (rounding term 128) |
| pxor mm7, mm7 // clear mm7; it supplies the zero bytes for unpacking |
| |
| horiz_line_4_5_loop: |
| |
| movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07 |
| movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08 |
| |
| movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 |
| movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 |
| |
| movd DWORD PTR [edi], mm0 // write output byte 0 (= pixel 00); bytes 1..3 are overwritten below |
| punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx |
| |
| punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx |
| pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 |
| |
| pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 |
| punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx |
| |
| movd DWORD PTR [edi+5], mm2 // write output byte 5 (= pixel 04); bytes 6..8 are overwritten below |
| pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 |
| |
| punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx |
| pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 |
| |
| paddw mm0, mm1 // sum the two weighted taps (rounding is added next) |
| paddw mm0, mm4 |
| |
| psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx |
| packuswb mm0, mm7 |
| |
| movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 |
| add edi, 10 |
| |
| add esi, 8 |
| paddw mm2, mm3 // sum the two weighted taps |
| |
| paddw mm2, mm4 // added round values |
| cmp esi, edx |
| |
| psrlw mm2, 8 |
| packuswb mm2, mm7 |
| |
| movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09 |
| jl horiz_line_4_5_loop |
| |
| //Exit: final eight pixels, pixel 07 stands in for the missing pixel 08 |
| movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07 |
| movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07 |
| |
| movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 |
| psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 |
| |
| movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00 |
| pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 |
| |
| psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 |
| por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 |
| |
| movq mm3, mm1 |
| |
| movd DWORD PTR [edi], mm0 // write output byte 0 (= pixel 00); bytes 1..3 are overwritten below |
| punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx |
| |
| punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx |
| pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 |
| |
| pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 |
| punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx |
| |
| movd DWORD PTR [edi+5], mm2 // write output byte 5 (= pixel 04); bytes 6..8 are overwritten below |
| pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 |
| |
| punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx |
| pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 |
| |
| paddw mm0, mm1 // sum the two weighted taps (rounding is added next) |
| paddw mm0, mm4 |
| |
| psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx |
| packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx |
| |
| movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 |
| paddw mm2, mm3 // sum the two weighted taps |
| |
| paddw mm2, mm4 // added round values |
| psrlw mm2, 8 |
| |
| packuswb mm2, mm7 |
| movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09 |
| |
| |
| } |
| /* |
| Disabled C rendering of the same arithmetic (one group of four per pass), |
| kept as a reference: |
| |
| const unsigned char *src = source; |
| unsigned char *des = dest; |
| unsigned int a, b, c ; |
| unsigned i; |
| (void) dest_width; |
| |
| for ( i=0; i<source_width-4; i+=4 ) |
| { |
| a = src[0]; |
| b = src[1]; |
| des [0] = (UINT8) a; |
| des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8); |
| c = src[2] * 154; |
| a = src[3]; |
| des [2] = (UINT8) (( b * 102 + c + 128) >> 8); |
| des [3] = (UINT8) (( c + 102 * a + 128) >> 8); |
| b = src[4]; |
| des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8); |
| |
| src += 4; |
| des += 5; |
| } |
| |
| a = src[0]; |
| b = src[1]; |
| des [0] = (UINT8) (a); |
| des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8); |
| c = src[2] * 154; |
| a = src[3]; |
| des [2] = (UINT8) (( b * 102 + c + 128) >> 8); |
| des [3] = (UINT8) (( c + 102 * a + 128) >> 8); |
| des [4] = (UINT8) (a); |
| */ |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_4_5_scale_mmx |
| * |
| * INPUTS : unsigned char *dest : First row of the band; rows are addressed as |
| * dest + k * dest_pitch. |
| * unsigned int dest_pitch : Byte distance between consecutive rows. |
| * unsigned int dest_width : Column counter; reduced by 8 per pass. |
| * |
| * OUTPUTS : Rows 1, 2, 3 and 4 of the band are rewritten in place. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. |
| * On entry rows 0..3 hold the source lines a, b, c, d and row 5 holds |
| * the first line (an) of the band below. Per column: |
| * row 1 = ( 51 * a + 205 * b + 128) >> 8 |
| * row 2 = (102 * b + 154 * c + 128) >> 8 |
| * row 3 = (154 * c + 102 * d + 128) >> 8 |
| * row 4 = (205 * d + 51 * an + 128) >> 8 |
| * Row 0 is left untouched. Each source row is loaded before the output |
| * row that replaces it is stored. |
| * |
| * SPECIAL NOTES : The routine uses the first line of the band below |
| * the current band. The function also has a "C" only |
| * version. |
| * MSVC-style x86 inline assembly using MMX registers. Columns are handled |
| * eight at a time with a bottom-tested loop (sub edx, 8 / jg), so the |
| * number of columns touched is dest_width rounded up to a multiple of 8, |
| * and at least 8. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void vertical_band_4_5_scale_mmx |
| ( |
| unsigned char *dest, |
| unsigned int dest_pitch, |
| unsigned int dest_width |
| ) |
| { |
| |
| __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; |
| __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; |
| __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; |
| __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; |
| |
| __asm |
| { |
| |
| mov esi, dest // Get the source and destination pointer |
| mov ecx, dest_pitch // Get the pitch size |
| |
| lea edi, [esi+ecx*2] // two lines below |
| add edi, ecx // three lines below |
| |
| pxor mm7, mm7 // clear out mm7 |
| mov edx, dest_width // Loop counter |
| |
| vs_4_5_loop: |
| |
| movq mm0, QWORD ptr [esi] // src[0]; |
| movq mm1, QWORD ptr [esi+ecx] // src[1]; |
| |
| movq mm2, mm0 // Make a copy |
| punpcklbw mm0, mm7 // unpack low to word |
| |
| movq mm5, one_fifth |
| punpckhbw mm2, mm7 // unpack high to word |
| |
| pmullw mm0, mm5 // a * 1/5 |
| |
| movq mm3, mm1 // make a copy |
| punpcklbw mm1, mm7 // unpack low to word |
| |
| pmullw mm2, mm5 // a * 1/5 |
| movq mm6, four_fifths // constant |
| |
| movq mm4, mm1 // copy of low b |
| pmullw mm4, mm6 // b * 4/5 |
| |
| punpckhbw mm3, mm7 // unpack high to word |
| movq mm5, mm3 // copy of high b |
| |
| pmullw mm5, mm6 // b * 4/5 |
| paddw mm0, mm4 // a * 1/5 + b * 4/5 |
| |
| paddw mm2, mm5 // a * 1/5 + b * 4/5 |
| paddw mm0, round_values // + 128 |
| |
| paddw mm2, round_values // + 128 |
| psrlw mm0, 8 |
| |
| psrlw mm2, 8 |
| packuswb mm0, mm2 // des [1] |
| |
| movq QWORD ptr [esi+ecx], mm0 // write des[1] |
| movq mm0, [esi+ecx*2] // mm0 = src[2] |
| |
| // mm1, mm3 --- Src[1] |
| // mm0 --- Src[2] |
| // mm7 for unpacking |
| |
| movq mm5, two_fifths |
| movq mm2, mm0 // make a copy |
| |
| pmullw mm1, mm5 // b * 2/5 |
| movq mm6, three_fifths |
| |
| |
| punpcklbw mm0, mm7 // unpack low to word |
| pmullw mm3, mm5 // b * 2/5 |
| |
| movq mm4, mm0 // make copy of c |
| punpckhbw mm2, mm7 // unpack high to word |
| |
| pmullw mm4, mm6 // c * 3/5 |
| movq mm5, mm2 |
| |
| pmullw mm5, mm6 // c * 3/5 |
| paddw mm1, mm4 // b * 2/5 + c * 3/5 |
| |
| paddw mm3, mm5 // b * 2/5 + c * 3/5 |
| paddw mm1, round_values // + 128 |
| |
| paddw mm3, round_values // + 128 |
| psrlw mm1, 8 |
| |
| psrlw mm3, 8 |
| packuswb mm1, mm3 // des[2] |
| |
| movq QWORD ptr [esi+ecx*2], mm1 // write des[2] |
| movq mm1, [edi] // mm1=Src[3]; |
| |
| // mm0, mm2 --- Src[2] |
| // mm1 --- Src[3] |
| // mm6 --- 3/5 |
| // mm7 for unpacking |
| |
| pmullw mm0, mm6 // c * 3/5 |
| movq mm5, two_fifths // mm5 = 2/5 |
| |
| movq mm3, mm1 // make a copy |
| pmullw mm2, mm6 // c * 3/5 |
| |
| punpcklbw mm1, mm7 // unpack low |
| movq mm4, mm1 // make a copy |
| |
| punpckhbw mm3, mm7 // unpack high |
| pmullw mm4, mm5 // d * 2/5 |
| |
| movq mm6, mm3 // make a copy |
| pmullw mm6, mm5 // d * 2/5 |
| |
| paddw mm0, mm4 // c * 3/5 + d * 2/5 |
| paddw mm2, mm6 // c * 3/5 + d * 2/5 |
| |
| paddw mm0, round_values // + 128 |
| paddw mm2, round_values // + 128 |
| |
| psrlw mm0, 8 |
| psrlw mm2, 8 |
| |
| packuswb mm0, mm2 // des[3] |
| movq QWORD ptr [edi], mm0 // write des[3] |
| |
| // mm1, mm3 --- Src[3] |
| // mm7 -- cleared for unpacking |
| |
| movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group |
| |
| movq mm5, four_fifths // mm5 = 4/5 |
| pmullw mm1, mm5 // d * 4/5 |
| |
| movq mm6, one_fifth // mm6 = 1/5 |
| movq mm2, mm0 // make a copy |
| |
| pmullw mm3, mm5 // d * 4/5 |
| punpcklbw mm0, mm7 // unpack low |
| |
| pmullw mm0, mm6 // an * 1/5 |
| punpckhbw mm2, mm7 // unpack high |
| |
| paddw mm1, mm0 // d * 4/5 + an * 1/5 |
| pmullw mm2, mm6 // an * 1/5 |
| |
| paddw mm3, mm2 // d * 4/5 + an * 1/5 |
| paddw mm1, round_values // + 128 |
| |
| paddw mm3, round_values // + 128 |
| psrlw mm1, 8 |
| |
| psrlw mm3, 8 |
| packuswb mm1, mm3 // des[4] |
| |
| movq QWORD ptr [edi+ecx], mm1 // write des[4] |
| |
| add edi, 8 |
| add esi, 8 |
| |
| sub edx, 8 |
| jg vs_4_5_loop |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : last_vertical_band_4_5_scale_mmx |
| * |
| * INPUTS : unsigned char *dest : First row of the band; rows are addressed as |
| * dest + k * dest_pitch. |
| * unsigned int dest_pitch : Byte distance between consecutive rows. |
| * unsigned int dest_width : Column counter; reduced by 8 per pass. |
| * |
| * OUTPUTS : Rows 1, 2, 3 and 4 of the band are rewritten in place. |
| * |
| * RETURNS : None |
| * |
| * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image. |
| * On entry rows 0..3 hold the source lines a, b, c, d. Per column: |
| * row 1 = ( 51 * a + 205 * b + 128) >> 8 |
| * row 2 = (102 * b + 154 * c + 128) >> 8 |
| * row 3 = (154 * c + 102 * d + 128) >> 8 |
| * row 4 = d (plain copy, stored before row 3 is overwritten) |
| * Row 0 is left untouched. |
| * |
| * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does not |
| * read any row below the band. The original notes state that |
| * a "C" only version also exists. |
| * MSVC-style x86 inline assembly using MMX registers. Columns are handled |
| * eight at a time with a bottom-tested loop (sub edx, 8 / jg), so the |
| * number of columns touched is dest_width rounded up to a multiple of 8, |
| * and at least 8. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void last_vertical_band_4_5_scale_mmx |
| ( |
| unsigned char *dest, |
| unsigned int dest_pitch, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; |
| __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; |
| __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; |
| __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; |
| |
| __asm |
| { |
| mov esi, dest // Get the source and destination pointer |
| mov ecx, dest_pitch // Get the pitch size |
| |
| lea edi, [esi+ecx*2] // two lines below |
| add edi, ecx // three lines below |
| |
| pxor mm7, mm7 // clear out mm7 |
| mov edx, dest_width // Loop counter |
| |
| last_vs_4_5_loop: |
| |
| movq mm0, QWORD ptr [esi] // src[0]; |
| movq mm1, QWORD ptr [esi+ecx] // src[1]; |
| |
| movq mm2, mm0 // Make a copy |
| punpcklbw mm0, mm7 // unpack low to word |
| |
| movq mm5, one_fifth |
| punpckhbw mm2, mm7 // unpack high to word |
| |
| pmullw mm0, mm5 // a * 1/5 |
| |
| movq mm3, mm1 // make a copy |
| punpcklbw mm1, mm7 // unpack low to word |
| |
| pmullw mm2, mm5 // a * 1/5 |
| movq mm6, four_fifths // constant |
| |
| movq mm4, mm1 // copy of low b |
| pmullw mm4, mm6 // b * 4/5 |
| |
| punpckhbw mm3, mm7 // unpack high to word |
| movq mm5, mm3 // copy of high b |
| |
| pmullw mm5, mm6 // b * 4/5 |
| paddw mm0, mm4 // a * 1/5 + b * 4/5 |
| |
| paddw mm2, mm5 // a * 1/5 + b * 4/5 |
| paddw mm0, round_values // + 128 |
| |
| paddw mm2, round_values // + 128 |
| psrlw mm0, 8 |
| |
| psrlw mm2, 8 |
| packuswb mm0, mm2 // des [1] |
| |
| movq QWORD ptr [esi+ecx], mm0 // write des[1] |
| movq mm0, [esi+ecx*2] // mm0 = src[2] |
| |
| // mm1, mm3 --- Src[1] |
| // mm0 --- Src[2] |
| // mm7 for unpacking |
| |
| movq mm5, two_fifths |
| movq mm2, mm0 // make a copy |
| |
| pmullw mm1, mm5 // b * 2/5 |
| movq mm6, three_fifths |
| |
| |
| punpcklbw mm0, mm7 // unpack low to word |
| pmullw mm3, mm5 // b * 2/5 |
| |
| movq mm4, mm0 // make copy of c |
| punpckhbw mm2, mm7 // unpack high to word |
| |
| pmullw mm4, mm6 // c * 3/5 |
| movq mm5, mm2 |
| |
| pmullw mm5, mm6 // c * 3/5 |
| paddw mm1, mm4 // b * 2/5 + c * 3/5 |
| |
| paddw mm3, mm5 // b * 2/5 + c * 3/5 |
| paddw mm1, round_values // + 128 |
| |
| paddw mm3, round_values // + 128 |
| psrlw mm1, 8 |
| |
| psrlw mm3, 8 |
| packuswb mm1, mm3 // des[2] |
| |
| movq QWORD ptr [esi+ecx*2], mm1 // write des[2] |
| movq mm1, [edi] // mm1=Src[3]; |
| |
| movq QWORD ptr [edi+ecx], mm1 // write des[4]; last band, so it is a plain copy of Src[3] |
| |
| // mm0, mm2 --- Src[2] |
| // mm1 --- Src[3] |
| // mm6 --- 3/5 |
| // mm7 for unpacking |
| |
| pmullw mm0, mm6 // c * 3/5 |
| movq mm5, two_fifths // mm5 = 2/5 |
| |
| movq mm3, mm1 // make a copy |
| pmullw mm2, mm6 // c * 3/5 |
| |
| punpcklbw mm1, mm7 // unpack low |
| movq mm4, mm1 // make a copy |
| |
| punpckhbw mm3, mm7 // unpack high |
| pmullw mm4, mm5 // d * 2/5 |
| |
| movq mm6, mm3 // make a copy |
| pmullw mm6, mm5 // d * 2/5 |
| |
| paddw mm0, mm4 // c * 3/5 + d * 2/5 |
| paddw mm2, mm6 // c * 3/5 + d * 2/5 |
| |
| paddw mm0, round_values // + 128 |
| paddw mm2, round_values // + 128 |
| |
| psrlw mm0, 8 |
| psrlw mm2, 8 |
| |
| packuswb mm0, mm2 // des[3] |
| movq QWORD ptr [edi], mm0 // write des[3] |
| |
| // mm1, mm3 --- Src[3] |
| // mm7 -- cleared for unpacking |
| add edi, 8 |
| add esi, 8 |
| |
| sub edx, 8 |
| jg last_vs_4_5_loop |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_3_5_scale_mmx |
| * |
| * INPUTS : unsigned char *dest : First row of the band; rows are addressed as |
| * dest + k * dest_pitch. |
| * unsigned int dest_pitch : Byte distance between consecutive rows. |
| * unsigned int dest_width : Column counter; reduced by 8 per pass. |
| * |
| * OUTPUTS : Rows 1, 2, 3 and 4 of the band are rewritten in place. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. |
| * On entry rows 0..2 hold the source lines a, b, c and row 5 holds the |
| * first line (an) of the band below. Per column: |
| * row 1 = (102 * a + 154 * b + 128) >> 8 |
| * row 2 = (205 * b + 51 * c + 128) >> 8 |
| * row 3 = ( 51 * b + 205 * c + 128) >> 8 |
| * row 4 = (154 * c + 102 * an + 128) >> 8 |
| * Row 0 is left untouched. Rows 1 and 2 are loaded before they are |
| * overwritten. |
| * |
| * SPECIAL NOTES : The routine uses the first line of the band below |
| * the current band. The function also has an "C" only |
| * version. |
| * MSVC-style x86 inline assembly using MMX registers. mm7 is the zero |
| * register for unpacking but is borrowed as scratch in the middle of the |
| * loop body and cleared again afterwards. |
| * Columns are handled eight at a time with a bottom-tested loop |
| * (sub edx, 8 / jg), so the number of columns touched is dest_width |
| * rounded up to a multiple of 8, and at least 8. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void vertical_band_3_5_scale_mmx |
| ( |
| unsigned char *dest, |
| unsigned int dest_pitch, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; |
| __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; |
| __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; |
| __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; |
| |
| __asm |
| { |
| mov esi, dest // Get the source and destination pointer |
| mov ecx, dest_pitch // Get the pitch size |
| |
| lea edi, [esi+ecx*2] // two lines below |
| add edi, ecx // three lines below |
| |
| pxor mm7, mm7 // clear out mm7 |
| mov edx, dest_width // Loop counter |
| |
| vs_3_5_loop: |
| |
| movq mm0, QWORD ptr [esi] // src[0]; |
| movq mm1, QWORD ptr [esi+ecx] // src[1]; |
| |
| movq mm2, mm0 // Make a copy |
| punpcklbw mm0, mm7 // unpack low to word |
| |
| movq mm5, two_fifths // mm5 = 2/5 |
| punpckhbw mm2, mm7 // unpack high to word |
| |
| pmullw mm0, mm5 // a * 2/5 |
| |
| movq mm3, mm1 // make a copy |
| punpcklbw mm1, mm7 // unpack low to word |
| |
| pmullw mm2, mm5 // a * 2/5 |
| movq mm6, three_fifths // mm6 = 3/5 |
| |
| movq mm4, mm1 // copy of low b |
| pmullw mm4, mm6 // b * 3/5 |
| |
| punpckhbw mm3, mm7 // unpack high to word |
| movq mm5, mm3 // copy of high b |
| |
| pmullw mm5, mm6 // b * 3/5 |
| paddw mm0, mm4 // a * 2/5 + b * 3/5 |
| |
| paddw mm2, mm5 // a * 2/5 + b * 3/5 |
| paddw mm0, round_values // + 128 |
| |
| paddw mm2, round_values // + 128 |
| psrlw mm0, 8 |
| |
| psrlw mm2, 8 |
| packuswb mm0, mm2 // des [1] |
| |
| movq QWORD ptr [esi+ecx], mm0 // write des[1] |
| movq mm0, [esi+ecx*2] // mm0 = src[2] |
| |
| // mm1, mm3 --- Src[1] |
| // mm0 --- Src[2] |
| // mm7 for unpacking |
| |
| movq mm4, mm1 // b low |
| pmullw mm1, four_fifths // b * 4/5 low |
| |
| movq mm5, mm3 // b high |
| pmullw mm3, four_fifths // b * 4/5 high |
| |
| movq mm2, mm0 // c |
| pmullw mm4, one_fifth // b * 1/5 |
| |
| punpcklbw mm0, mm7 // c low |
| pmullw mm5, one_fifth // b * 1/5 |
| |
| movq mm6, mm0 // make copy of c low |
| punpckhbw mm2, mm7 // c high |
| |
| pmullw mm6, one_fifth // c * 1/5 low |
| movq mm7, mm2 // make copy of c high (mm7 is scratch from here until it is cleared below) |
| |
| pmullw mm7, one_fifth // c * 1/5 high |
| paddw mm1, mm6 // b * 4/5 + c * 1/5 low |
| |
| paddw mm3, mm7 // b * 4/5 + c * 1/5 high |
| movq mm6, mm0 // make copy of c low |
| |
| pmullw mm6, four_fifths // c * 4/5 low |
| movq mm7, mm2 // make copy of c high |
| |
| pmullw mm7, four_fifths // c * 4/5 high |
| |
| paddw mm4, mm6 // b * 1/5 + c * 4/5 low |
| paddw mm5, mm7 // b * 1/5 + c * 4/5 high |
| |
| paddw mm1, round_values // + 128 |
| paddw mm3, round_values // + 128 |
| |
| psrlw mm1, 8 |
| psrlw mm3, 8 |
| |
| packuswb mm1, mm3 // des[2] |
| movq QWORD ptr [esi+ecx*2], mm1 // write des[2] |
| |
| paddw mm4, round_values // + 128 |
| paddw mm5, round_values // + 128 |
| |
| psrlw mm4, 8 |
| psrlw mm5, 8 |
| |
| packuswb mm4, mm5 // des[3] |
| movq QWORD ptr [edi], mm4 // write des[3] |
| |
| // mm0, mm2 --- Src[2] (c, unpacked low / high) |
| |
| pxor mm7, mm7 // clear mm7 for unpacking |
| movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group |
| |
| movq mm5, three_fifths // mm5 = 3/5 |
| pmullw mm0, mm5 // c * 3/5 |
| |
| movq mm6, two_fifths // mm6 = 2/5 |
| movq mm3, mm1 // make a copy |
| |
| pmullw mm2, mm5 // c * 3/5 |
| punpcklbw mm1, mm7 // unpack low |
| |
| pmullw mm1, mm6 // an * 2/5 |
| punpckhbw mm3, mm7 // unpack high |
| |
| paddw mm0, mm1 // c * 3/5 + an * 2/5 |
| pmullw mm3, mm6 // an * 2/5 |
| |
| paddw mm2, mm3 // c * 3/5 + an * 2/5 |
| paddw mm0, round_values // + 128 |
| |
| paddw mm2, round_values // + 128 |
| psrlw mm0, 8 |
| |
| psrlw mm2, 8 |
| packuswb mm0, mm2 // des[4] |
| |
| movq QWORD ptr [edi+ecx], mm0 // write des[4] |
| |
| add edi, 8 |
| add esi, 8 |
| |
| sub edx, 8 |
| jg vs_3_5_loop |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : last_vertical_band_3_5_scale_mmx |
| * |
| * INPUTS : unsigned char *dest : First row of the band; rows are addressed as |
| * dest + k * dest_pitch. |
| * unsigned int dest_pitch : Byte distance between consecutive rows. |
| * unsigned int dest_width : Column counter; reduced by 8 per pass. |
| * |
| * OUTPUTS : Rows 1, 2, 3 and 4 of the band are rewritten in place. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image. |
| * On entry rows 0..2 hold the source lines a, b, c. Per column: |
| * row 1 = (102 * a + 154 * b + 128) >> 8 |
| * row 2 = (205 * b + 51 * c + 128) >> 8 |
| * row 3 = ( 51 * b + 205 * c + 128) >> 8 |
| * row 4 = c (plain copy) |
| * Row 0 is left untouched. Rows 1 and 2 are loaded before they are |
| * overwritten. |
| * |
| * SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, this routine does not |
| * read any row below the band. The original notes state that |
| * an "C" only version also exists. |
| * MSVC-style x86 inline assembly using MMX registers. mm7 is the zero |
| * register for unpacking but is borrowed as scratch in the middle of the |
| * loop body; it is cleared again at the bottom of every pass so that the |
| * unpacks of the next eight columns see zero. |
| * Columns are handled eight at a time with a bottom-tested loop |
| * (sub edx, 8 / jg), so the number of columns touched is dest_width |
| * rounded up to a multiple of 8, and at least 8. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void last_vertical_band_3_5_scale_mmx |
| ( |
| unsigned char *dest, |
| unsigned int dest_pitch, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; |
| __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; |
| __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; |
| __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; |
| __asm |
| { |
| mov esi, dest // Get the source and destination pointer |
| mov ecx, dest_pitch // Get the pitch size |
| |
| lea edi, [esi+ecx*2] // two lines below |
| add edi, ecx // three lines below |
| |
| pxor mm7, mm7 // clear out mm7 |
| mov edx, dest_width // Loop counter |
| |
| |
| last_vs_3_5_loop: |
| |
| movq mm0, QWORD ptr [esi] // src[0]; |
| movq mm1, QWORD ptr [esi+ecx] // src[1]; |
| |
| movq mm2, mm0 // Make a copy |
| punpcklbw mm0, mm7 // unpack low to word |
| |
| movq mm5, two_fifths // mm5 = 2/5 |
| punpckhbw mm2, mm7 // unpack high to word |
| |
| pmullw mm0, mm5 // a * 2/5 |
| |
| movq mm3, mm1 // make a copy |
| punpcklbw mm1, mm7 // unpack low to word |
| |
| pmullw mm2, mm5 // a * 2/5 |
| movq mm6, three_fifths // mm6 = 3/5 |
| |
| movq mm4, mm1 // copy of low b |
| pmullw mm4, mm6 // b * 3/5 |
| |
| punpckhbw mm3, mm7 // unpack high to word |
| movq mm5, mm3 // copy of high b |
| |
| pmullw mm5, mm6 // b * 3/5 |
| paddw mm0, mm4 // a * 2/5 + b * 3/5 |
| |
| paddw mm2, mm5 // a * 2/5 + b * 3/5 |
| paddw mm0, round_values // + 128 |
| |
| paddw mm2, round_values // + 128 |
| psrlw mm0, 8 |
| |
| psrlw mm2, 8 |
| packuswb mm0, mm2 // des [1] |
| |
| movq QWORD ptr [esi+ecx], mm0 // write des[1] |
| movq mm0, [esi+ecx*2] // mm0 = src[2] |
| |
| |
| |
| // mm1, mm3 --- Src[1] |
| // mm0 --- Src[2] |
| // mm7 for unpacking |
| |
| movq mm4, mm1 // b low |
| pmullw mm1, four_fifths // b * 4/5 low |
| |
| movq QWORD ptr [edi+ecx], mm0 // write des[4]; last band, so it is a plain copy of Src[2] |
| |
| movq mm5, mm3 // b high |
| pmullw mm3, four_fifths // b * 4/5 high |
| |
| movq mm2, mm0 // c |
| pmullw mm4, one_fifth // b * 1/5 |
| |
| punpcklbw mm0, mm7 // c low |
| pmullw mm5, one_fifth // b * 1/5 |
| |
| movq mm6, mm0 // make copy of c low |
| punpckhbw mm2, mm7 // c high |
| |
| pmullw mm6, one_fifth // c * 1/5 low |
| movq mm7, mm2 // make copy of c high (mm7 is scratch from here until it is cleared below) |
| |
| pmullw mm7, one_fifth // c * 1/5 high |
| paddw mm1, mm6 // b * 4/5 + c * 1/5 low |
| |
| paddw mm3, mm7 // b * 4/5 + c * 1/5 high |
| movq mm6, mm0 // make copy of c low |
| |
| pmullw mm6, four_fifths // c * 4/5 low |
| movq mm7, mm2 // make copy of c high |
| |
| pmullw mm7, four_fifths // c * 4/5 high |
| |
| paddw mm4, mm6 // b * 1/5 + c * 4/5 low |
| paddw mm5, mm7 // b * 1/5 + c * 4/5 high |
| |
| paddw mm1, round_values // + 128 |
| paddw mm3, round_values // + 128 |
| |
| psrlw mm1, 8 |
| psrlw mm3, 8 |
| |
| packuswb mm1, mm3 // des[2] |
| movq QWORD ptr [esi+ecx*2], mm1 // write des[2] |
| |
| paddw mm4, round_values // + 128 |
| paddw mm5, round_values // + 128 |
| |
| psrlw mm4, 8 |
| psrlw mm5, 8 |
| |
| packuswb mm4, mm5 // des[3] |
| movq QWORD ptr [edi], mm4 // write des[3] |
| |
| // mm0, mm2 --- Src[2] (c, unpacked low / high) |
| |
| pxor mm7, mm7 // mm7 held c * 4/5 above; zero it again or the next pass unpacks against garbage |
| |
| add edi, 8 |
| add esi, 8 |
| |
| sub edx, 8 |
| jg last_vs_3_5_loop |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_1_2_scale_mmx |
| * |
| * INPUTS : unsigned char *dest : First row of the band; rows are addressed as |
| * dest + k * dest_pitch. |
| * unsigned int dest_pitch : Byte distance between consecutive rows. |
| * unsigned int dest_width : Column counter; reduced by 8 per pass. |
| * |
| * OUTPUTS : Row 1 of the band is written. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 1 to 2 up-scaling of a band of pixels. |
| * Row 0 holds the source line a and row 2 holds the next source line b. |
| * Per column: row 1 = (a + b + 1) >> 1. Rows 0 and 2 are only read. |
| * |
| * SPECIAL NOTES : The routine uses the first line of the band below |
| * the current band. The function also has an "C" only |
| * version. |
| * MSVC-style x86 inline assembly using MMX registers. Columns are handled |
| * eight at a time with a bottom-tested loop (sub edx, 8 / jg), so the |
| * number of columns touched is dest_width rounded up to a multiple of 8, |
| * and at least 8. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void vertical_band_1_2_scale_mmx |
| ( |
| unsigned char *dest, |
| unsigned int dest_pitch, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1}; |
| |
| __asm |
| { |
| |
| mov esi, dest // Get the source and destination pointer |
| mov ecx, dest_pitch // Get the pitch size |
| |
| pxor mm7, mm7 // clear out mm7 |
| mov edx, dest_width // Loop counter |
| |
| vs_1_2_loop: |
| |
| movq mm0, [esi] // get Src[0] (row 0) |
| movq mm1, [esi + ecx * 2] // get Src[1] (row 2) |
| |
| movq mm2, mm0 // make copy before unpack |
| movq mm3, mm1 // make copy before unpack |
| |
| punpcklbw mm0, mm7 // low Src[0] |
| movq mm6, four_ones // mm6= 1, 1, 1, 1 |
| |
| punpcklbw mm1, mm7 // low Src[1] |
| paddw mm0, mm1 // low (a + b) |
| |
| punpckhbw mm2, mm7 // high Src[0] |
| paddw mm0, mm6 // low (a + b + 1) |
| |
| punpckhbw mm3, mm7 |
| paddw mm2, mm3 // high (a + b ) |
| |
| psraw mm0, 1 // low (a + b +1 )/2 |
| paddw mm2, mm6 // high (a + b + 1) |
| |
| psraw mm2, 1 // high (a + b + 1)/2 |
| packuswb mm0, mm2 // pack results |
| |
| movq [esi+ecx], mm0 // write out eight bytes to row 1 |
| add esi, 8 |
| |
| sub edx, 8 |
| jg vs_1_2_loop |
| } |
| |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : last_vertical_band_1_2_scale_mmx |
| * |
| * INPUTS : unsigned char *dest : Row 0 of the band; row 1 is addressed as |
| * dest + dest_pitch. |
| * unsigned int dest_pitch : Byte distance between consecutive rows. |
| * unsigned int dest_width : Column counter; reduced by 8 per pass. |
| * |
| * OUTPUTS : Row 1 of the band is written. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 1 to 2 up-scaling of the last band of pixels: row 1 becomes a |
| * plain copy of row 0. |
| * |
| * SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx, this routine does not |
| * read any row below the band. The original notes state that |
| * an "C" only version also exists. |
| * MSVC-style x86 inline assembly; the copy goes through MMX register mm0, |
| * eight bytes per pass, with a bottom-tested loop (sub edx, 8 / jg), so |
| * the number of bytes copied is dest_width rounded up to a multiple of 8, |
| * and at least 8. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void last_vertical_band_1_2_scale_mmx |
| ( |
| unsigned char *dest, |
| unsigned int dest_pitch, |
| unsigned int dest_width |
| ) |
| { |
| __asm |
| { |
| mov esi, dest // Get the source and destination pointer |
| mov ecx, dest_pitch // Get the pitch size |
| |
| mov edx, dest_width // Loop counter |
| |
| last_vs_1_2_loop: |
| |
| movq mm0, [esi] // get Src[0] (row 0) |
| movq [esi+ecx], mm0 // write out eight bytes to row 1 |
| |
| add esi, 8 |
| sub edx, 8 |
| |
| jg last_vs_1_2_loop |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : horizontal_line_1_2_scale_mmx |
| * |
| * INPUTS : const unsigned char *source : First pixel of the source line. |
| * unsigned int source_width : Source pixel count; used as the loop |
| * counter, reduced by 8 per pass. |
| * unsigned char *dest : First byte of the output line. |
| * unsigned int dest_width : Unused (cast to void). |
| * |
| * OUTPUTS : 16 bytes are stored to dest for every 8 source pixels consumed. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. |
| * Every source pixel p[i] produces two output bytes: |
| * out[2i] = p[i] |
| * out[2i + 1] = (p[i] + p[i + 1] + 1) >> 1 |
| * The originals and the averages are interleaved with punpcklbw and |
| * punpckhbw. In the final group of eight the last pixel is paired with |
| * itself, so the last output byte equals the last source pixel. |
| * |
| * SPECIAL NOTES : MSVC-style x86 inline assembly using MMX registers. |
| * Each loop pass loads 9 source bytes ([esi] and [esi+1] as quadwords). |
| * The loop is bottom-tested (sub ecx, 8 / cmp ecx, 8 / jg) and the final |
| * group always follows it, so at least 16 source pixels are consumed and |
| * 32 output bytes written. |
| * NOTE(review): callers presumably pass a multiple of 8 that is >= 16 - confirm. |
| * No EMMS is executed here. NOTE(review): confirm the caller clears the |
| * MMX state before any x87 floating-point code runs. |
| * |
| ****************************************************************************/ |
| static |
| void horizontal_line_1_2_scale_mmx |
| ( |
| const unsigned char *source, |
| unsigned int source_width, |
| unsigned char *dest, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1}; |
| |
| (void) dest_width; |
| |
| __asm |
| { |
| mov esi, source |
| mov edi, dest |
| |
| pxor mm7, mm7 |
| movq mm6, four_ones |
| |
| mov ecx, source_width |
| |
| hs_1_2_loop: |
| |
| movq mm0, [esi] |
| movq mm1, [esi+1] |
| |
| movq mm2, mm0 |
| movq mm3, mm1 |
| |
| movq mm4, mm0 |
| punpcklbw mm0, mm7 |
| |
| punpcklbw mm1, mm7 |
| paddw mm0, mm1 |
| |
| paddw mm0, mm6 |
| punpckhbw mm2, mm7 |
| |
| punpckhbw mm3, mm7 |
| paddw mm2, mm3 |
| |
| paddw mm2, mm6 |
| psraw mm0, 1 |
| |
| psraw mm2, 1 |
| packuswb mm0, mm2 |
| |
| movq mm2, mm4 |
| punpcklbw mm2, mm0 |
| |
| movq [edi], mm2 |
| punpckhbw mm4, mm0 |
| |
| movq [edi+8], mm4 |
| add esi, 8 |
| |
| add edi, 16 |
| sub ecx, 8 |
| |
| cmp ecx, 8 |
| jg hs_1_2_loop |
| |
| // last eight pixels: the right-hand neighbour of the final pixel is the pixel itself |
| |
| movq mm0, [esi] |
| movq mm1, mm0 |
| |
| movq mm2, mm0 |
| movq mm3, mm1 |
| |
| psrlq mm1, 8 |
| psrlq mm3, 56 |
| |
| psllq mm3, 56 |
| por mm1, mm3 |
| |
| movq mm3, mm1 |
| movq mm4, mm0 |
| |
| punpcklbw mm0, mm7 |
| punpcklbw mm1, mm7 |
| |
| paddw mm0, mm1 |
| paddw mm0, mm6 |
| |
| punpckhbw mm2, mm7 |
| punpckhbw mm3, mm7 |
| |
| paddw mm2, mm3 |
| paddw mm2, mm6 |
| |
| psraw mm0, 1 |
| psraw mm2, 1 |
| |
| packuswb mm0, mm2 |
| movq mm2, mm4 |
| |
| punpcklbw mm2, mm0 |
| movq [edi], mm2 |
| |
| punpckhbw mm4, mm0 |
| movq [edi+8], mm4 |
| } |
| } |
| |
| |
| |
| |
| |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : horizontal_line_5_4_scale_mmx |
| * |
| * INPUTS : const unsigned char *source : Pointer to source data. |
| * unsigned int source_width : Number of source bytes; the loop ends once |
| * the read pointer reaches source + source_width. |
| * unsigned char *dest : Pointer to destination data. |
| * unsigned int dest_width : NOT USED. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Copies a horizontal line of pixels from source to |
| * destination scaling down 5 to 4: every five source |
| * pixels produce four destination pixels. |
| * |
| * SPECIAL NOTES : Each pass loads eight source bytes but advances by five. |
| * The loop condition is tested after the pass with a signed |
| * compare (jl), so at least one group is processed. |
| * No emms is executed here. |
| * NOTE(review): the bare lines of the form "00 01 02 ..." |
| * inside the __asm block look like the text of the ';' |
| * comments on the lines above them (register byte layouts) |
| * that ended up on lines of their own; as written they are |
| * not inside any comment. They are left untouched here. |
| * |
| ****************************************************************************/ |
| static |
| void horizontal_line_5_4_scale_mmx |
| ( |
| const unsigned char *source, |
| unsigned int source_width, |
| unsigned char *dest, |
| unsigned int dest_width |
| ) |
| { |
| |
| __declspec(align(16)) const unsigned short const54_2[] = { 0, 64, 128, 192 }; /* weights applied to pixels b, c, d, e */ |
| __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128, 64 }; /* weights applied to pixels a, b, c, d */ |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; /* +128 so the >> 8 rounds to nearest */ |
| /* Reference C form of the arithmetic performed by the assembly below |
| (documentation only, not compiled): |
| |
| unsigned i; |
| unsigned int a, b, c, d, e; |
| unsigned char *des = dest; |
| const unsigned char *src = source; |
| |
| (void) dest_width; |
| |
| for ( i=0; i<source_width; i+=5 ) |
| { |
| a = src[0]; |
| b = src[1]; |
| c = src[2]; |
| d = src[3]; |
| e = src[4]; |
| |
| des[0] = a; |
| des[1] = ((b*192 + c* 64 + 128)>>8); |
| des[2] = ((c*128 + d*128 + 128)>>8); |
| des[3] = ((d* 64 + e*192 + 128)>>8); |
| |
| src += 5; |
| des += 4; |
| } |
| */ |
| __asm |
| { |
| |
| mov esi, source ; /* esi = read pointer */ |
| mov edi, dest ; /* edi = write pointer */ |
| |
| mov ecx, source_width ; /* ecx = source byte count */ |
| movq mm5, const54_1 ; /* mm5 = 256, 192, 128, 64 */ |
| |
| pxor mm7, mm7 ; /* mm7 = 0 for byte-to-word unpacking */ |
| movq mm6, const54_2 ; /* mm6 = 0, 64, 128, 192 */ |
| |
| movq mm4, round_values ; /* mm4 = 128 in each lane */ |
| lea edx, [esi+ecx] ; /* edx = source + source_width, the end marker */ |
| horizontal_line_5_4_loop: |
| |
| movq mm0, QWORD PTR [esi] ; /* load eight source bytes */ |
| 00 01 02 03 04 05 06 07 |
| movq mm1, mm0 ; /* second copy of the same bytes */ |
| 00 01 02 03 04 05 06 07 |
| |
| psrlq mm0, 8 ; /* drop byte 0 so mm0 starts at pixel b */ |
| 01 02 03 04 05 06 07 xx |
| punpcklbw mm1, mm7 ; /* mm1 = a, b, c, d as words */ |
| xx 00 xx 01 xx 02 xx 03 |
| |
| punpcklbw mm0, mm7 ; /* mm0 = b, c, d, e as words */ |
| xx 01 xx 02 xx 03 xx 04 |
| pmullw mm1, mm5 /* 256*a, 192*b, 128*c, 64*d */ |
| |
| pmullw mm0, mm6 /* 0, 64*c, 128*d, 192*e */ |
| add esi, 5 /* five source bytes consumed */ |
| |
| add edi, 4 /* four destination bytes produced */ |
| paddw mm1, mm0 /* weighted sums for the four outputs */ |
| |
| paddw mm1, mm4 /* + 128 */ |
| psrlw mm1, 8 /* >> 8 */ |
| |
| cmp esi, edx /* sets the flags for the jl below; the MMX instructions in between leave them alone */ |
| packuswb mm1, mm7 /* four result bytes in the low dword */ |
| |
| movd DWORD PTR [edi-4], mm1 /* edi was already advanced, hence the -4 */ |
| |
| jl horizontal_line_5_4_loop /* loop while esi < edx (signed compare) */ |
| |
| } |
| |
| } |
| |
| static /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_5_4_scale_mmx |
| * |
| * INPUTS : unsigned char *source : First of the five source rows read. |
| * unsigned int src_pitch : Byte offset between source rows. |
| * unsigned char *dest : First of the four destination rows written. |
| * unsigned int dest_pitch : Byte offset between destination rows. |
| * unsigned int dest_width : Number of columns; loop counter. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Vertical 5 to 4 down-scaling of one band. With s0..s4 the |
| * five source rows, the four destination rows are |
| * d0 = s0 |
| * d1 = (192*s1 + 64*s2 + 128) >> 8 |
| * d2 = (128*s2 + 128*s3 + 128) >> 8 |
| * d3 = ( 64*s3 + 192*s4 + 128) >> 8 |
| * |
| * SPECIAL NOTES : Four columns are handled per pass and the counter is |
| * tested after the pass, so at least four columns are |
| * written. ebx is saved and restored around the loop. |
| * No emms is executed here. |
| * |
| ****************************************************************************/ |
| void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) |
| { |
| |
| __declspec(align(16)) const unsigned short one_fourths[] = { 64, 64, 64, 64 }; /* 64/256 weight */ |
| __declspec(align(16)) const unsigned short two_fourths[] = { 128, 128, 128, 128 }; /* 128/256 weight */ |
| __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 }; /* 192/256 weight */ |
| |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; /* +128 so the >> 8 rounds to nearest */ |
| __asm |
| { |
| push ebx /* ebx serves as the column counter; save the incoming value */ |
| |
| mov esi, source // esi = source row 0 |
| mov ecx, src_pitch // ecx = source pitch |
| |
| mov edi, dest // edi = destination row 0 |
| pxor mm7, mm7 // mm7 = 0 for byte-to-word unpacking |
| |
| mov edx, dest_pitch // edx = destination pitch |
| mov ebx, dest_width /* ebx = columns remaining */ |
| |
| vs_5_4_loop: |
| |
| movd mm0, DWORD ptr [esi] // four bytes of source row 0 |
| movd mm1, DWORD ptr [esi+ecx] // four bytes of source row 1 |
| |
| movd mm2, DWORD ptr [esi+ecx*2] /* four bytes of source row 2 */ |
| lea eax, [esi+ecx*2] // eax = address of source row 2 |
| |
| punpcklbw mm1, mm7 /* row 1 as words */ |
| punpcklbw mm2, mm7 /* row 2 as words */ |
| |
| movq mm3, mm2 /* second copy of row 2 */ |
| pmullw mm1, three_fourths /* 192 * row 1 */ |
| |
| pmullw mm2, one_fourths /* 64 * row 2 */ |
| movd mm4, [eax+ecx] /* four bytes of source row 3 */ |
| |
| pmullw mm3, two_fourths /* 128 * row 2 */ |
| punpcklbw mm4, mm7 /* row 3 as words */ |
| |
| movq mm5, mm4 /* second copy of row 3 */ |
| pmullw mm4, two_fourths /* 128 * row 3 */ |
| |
| paddw mm1, mm2 /* 192*row1 + 64*row2 */ |
| movd mm6, [eax+ecx*2] /* four bytes of source row 4 */ |
| |
| pmullw mm5, one_fourths /* 64 * row 3 */ |
| paddw mm1, round_values; /* + 128 */ |
| |
| paddw mm3, mm4 /* 128*row2 + 128*row3 */ |
| psrlw mm1, 8 /* destination row 1 values */ |
| |
| punpcklbw mm6, mm7 /* row 4 as words */ |
| paddw mm3, round_values /* + 128 */ |
| |
| pmullw mm6, three_fourths /* 192 * row 4 */ |
| psrlw mm3, 8 /* destination row 2 values */ |
| |
| packuswb mm1, mm7 /* row 1 results back to bytes */ |
| packuswb mm3, mm7 /* row 2 results back to bytes */ |
| |
| movd DWORD PTR [edi], mm0 /* destination row 0 = source row 0 unchanged */ |
| movd DWORD PTR [edi+edx], mm1 /* destination row 1 */ |
| |
| |
| paddw mm5, mm6 /* 64*row3 + 192*row4 */ |
| movd DWORD PTR [edi+edx*2], mm3 /* destination row 2 */ |
| |
| lea eax, [edi+edx*2] /* eax = address of destination row 2 */ |
| paddw mm5, round_values /* + 128 */ |
| |
| psrlw mm5, 8 /* destination row 3 values */ |
| add edi, 4 /* next four destination columns */ |
| |
| packuswb mm5, mm7 /* row 3 results back to bytes */ |
| movd DWORD PTR [eax+edx], mm5 /* destination row 3; eax was taken before edi advanced */ |
| |
| add esi, 4 /* next four source columns */ |
| sub ebx, 4 /* four fewer columns to go; sets the flags for jg */ |
| |
| jg vs_5_4_loop /* loop while columns remain */ |
| |
| pop ebx /* restore the saved ebx */ |
| } |
| } |
| |
| |
| |
| static /**************************************************************************** |
| * |
| * ROUTINE : horizontal_line_5_3_scale_mmx |
| * |
| * INPUTS : const unsigned char *source : Pointer to source data. |
| * unsigned int source_width : Number of source bytes; sets the loop bound. |
| * unsigned char *dest : Pointer to destination data. |
| * unsigned int dest_width : Not referenced. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Horizontal 5 to 3 down-scaling. For every five source |
| * pixels a, b, c, d, e the three destination pixels are |
| * des[0] = a |
| * des[1] = ( 85*b + 171*c + 128) >> 8 |
| * des[2] = (171*d + 85*e + 128) >> 8 |
| * |
| * SPECIAL NOTES : Each pass loads eight source bytes but advances by five. |
| * Inside the loop four bytes are stored per group; the |
| * fourth is zero and is overwritten by the next store. |
| * The final group, handled after the loop, stores exactly |
| * three bytes. The loop body runs before its test and the |
| * final group always runs, so at least two groups are |
| * processed. |
| * NOTE(review): confirm callers never pass a source_width |
| * below 10. No emms is executed here. |
| * NOTE(review): the bare lines of the form "00 01 02 ..." |
| * inside the __asm block look like the text of the ';' |
| * comments on the lines above them (register byte layouts) |
| * that ended up on lines of their own; as written they are |
| * not inside any comment. They are left untouched here. |
| * |
| ****************************************************************************/ |
| void horizontal_line_5_3_scale_mmx |
| ( |
| const unsigned char *source, |
| unsigned int source_width, |
| unsigned char *dest, |
| unsigned int dest_width |
| ) |
| { |
| __declspec(align(16)) const unsigned short const53_1[] = { 0, 85, 171, 0 }; /* weights for the odd-indexed bytes after they are shifted up one lane */ |
| __declspec(align(16)) const unsigned short const53_2[] = {256, 171, 85, 0 }; /* weights for the even-indexed bytes */ |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; /* +128 so the >> 8 rounds to nearest */ |
| __asm |
| { |
| |
| mov esi, source ; /* esi = read pointer */ |
| mov edi, dest ; /* edi = write pointer */ |
| |
| mov ecx, source_width ; /* ecx = source byte count */ |
| movq mm5, const53_1 ; /* mm5 = 0, 85, 171, 0 */ |
| |
| pxor mm7, mm7 ; /* mm7 = 0, second operand of the packs */ |
| movq mm6, const53_2 ; /* mm6 = 256, 171, 85, 0 */ |
| |
| movq mm4, round_values ; /* mm4 = 128 in each lane */ |
| lea edx, [esi+ecx-5] ; /* edx = source + source_width - 5, the loop bound */ |
| horizontal_line_5_3_loop: |
| |
| movq mm0, QWORD PTR [esi] ; /* load eight source bytes */ |
| 00 01 02 03 04 05 06 07 |
| movq mm1, mm0 ; /* second copy of the same bytes */ |
| 00 01 02 03 04 05 06 07 |
| |
| psllw mm0, 8 ; /* even-indexed bytes moved to the high half of each word */ |
| xx 00 xx 02 xx 04 xx 06 |
| psrlw mm1, 8 ; /* mm1 words = bytes 1, 3, 5, 7 */ |
| 01 xx 03 xx 05 xx 07 xx |
| |
| psrlw mm0, 8 ; /* mm0 words = bytes 0, 2, 4, 6 */ |
| 00 xx 02 xx 04 xx 06 xx |
| psllq mm1, 16 ; /* mm1 words = 0, byte 1, byte 3, byte 5 */ |
| xx xx 01 xx 03 xx 05 xx |
| |
| pmullw mm0, mm6 /* 256*a, 171*c, 85*e, 0 */ |
| |
| pmullw mm1, mm5 /* 0, 85*b, 171*d, 0 */ |
| add esi, 5 /* five source bytes consumed */ |
| |
| add edi, 3 /* three destination bytes produced */ |
| paddw mm1, mm0 /* 256*a, 85*b + 171*c, 171*d + 85*e, 0 */ |
| |
| paddw mm1, mm4 /* + 128 */ |
| psrlw mm1, 8 /* >> 8 */ |
| |
| cmp esi, edx /* sets the flags for the jl below */ |
| packuswb mm1, mm7 /* three result bytes followed by a zero byte */ |
| |
| movd DWORD PTR [edi-3], mm1 /* four-byte store; the fourth byte lands where the next group starts */ |
| jl horizontal_line_5_3_loop /* loop while esi < edx (signed compare) */ |
| |
| // Final group: same arithmetic, but only three bytes are stored |
| movq mm0, QWORD PTR [esi] ; /* load eight source bytes */ |
| 00 01 02 03 04 05 06 07 |
| movq mm1, mm0 ; /* second copy of the same bytes */ |
| 00 01 02 03 04 05 06 07 |
| |
| psllw mm0, 8 ; /* even-indexed bytes moved to the high half of each word */ |
| xx 00 xx 02 xx 04 xx 06 |
| psrlw mm1, 8 ; /* mm1 words = bytes 1, 3, 5, 7 */ |
| 01 xx 03 xx 05 xx 07 xx |
| |
| psrlw mm0, 8 ; /* mm0 words = bytes 0, 2, 4, 6 */ |
| 00 xx 02 xx 04 xx 06 xx |
| psllq mm1, 16 ; /* mm1 words = 0, byte 1, byte 3, byte 5 */ |
| xx xx 01 xx 03 xx 05 xx |
| |
| pmullw mm0, mm6 /* 256*a, 171*c, 85*e, 0 */ |
| |
| pmullw mm1, mm5 /* 0, 85*b, 171*d, 0 */ |
| paddw mm1, mm0 /* weighted sums */ |
| |
| paddw mm1, mm4 /* + 128 */ |
| psrlw mm1, 8 /* >> 8 */ |
| |
| packuswb mm1, mm7 /* result bytes in the low dword */ |
| movd eax, mm1 /* eax = the packed result bytes */ |
| |
| mov edx, eax /* edx is free again: the loop bound is no longer needed */ |
| shr edx, 16 /* dl = third result byte */ |
| |
| mov WORD PTR[edi], ax /* first two result bytes */ |
| mov BYTE PTR[edi+2], dl /* third result byte */ |
| |
| } |
| |
| } |
| |
| |
| static /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_5_3_scale_mmx |
| * |
| * INPUTS : unsigned char *source : First of the five source rows read. |
| * unsigned int src_pitch : Byte offset between source rows. |
| * unsigned char *dest : First of the three destination rows written. |
| * unsigned int dest_pitch : Byte offset between destination rows. |
| * unsigned int dest_width : Number of columns; loop counter. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Vertical 5 to 3 down-scaling of one band. With s0..s4 the |
| * five source rows, the three destination rows are |
| * d0 = s0 |
| * d1 = ( 85*s1 + 171*s2 + 128) >> 8 |
| * d2 = (171*s3 + 85*s4 + 128) >> 8 |
| * |
| * SPECIAL NOTES : Four columns are handled per pass and the counter is |
| * tested after the pass, so at least four columns are |
| * written. ebx is saved and restored around the loop. |
| * No emms is executed here. |
| * |
| ****************************************************************************/ |
| void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) |
| { |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; /* +128 so the >> 8 rounds to nearest */ |
| __declspec(align(16)) const unsigned short one_thirds[] = { 85, 85, 85, 85 }; /* 85/256 weight */ |
| __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 }; /* 171/256 weight */ |
| |
| __asm |
| { |
| push ebx /* ebx serves as the column counter; save the incoming value */ |
| |
| mov esi, source // esi = source row 0 |
| mov ecx, src_pitch // ecx = source pitch |
| |
| mov edi, dest // edi = destination row 0 |
| pxor mm7, mm7 // mm7 = 0 for byte-to-word unpacking |
| |
| mov edx, dest_pitch // edx = destination pitch |
| movq mm5, one_thirds /* mm5 = 85 in each lane */ |
| |
| movq mm6, two_thirds /* mm6 = 171 in each lane */ |
| mov ebx, dest_width; /* ebx = columns remaining */ |
| |
| vs_5_3_loop: |
| |
| movd mm0, DWORD ptr [esi] // four bytes of source row 0 |
| movd mm1, DWORD ptr [esi+ecx] // four bytes of source row 1 |
| |
| movd mm2, DWORD ptr [esi+ecx*2] /* four bytes of source row 2 */ |
| lea eax, [esi+ecx*2] // eax = address of source row 2 |
| |
| punpcklbw mm1, mm7 /* row 1 as words */ |
| punpcklbw mm2, mm7 /* row 2 as words */ |
| |
| pmullw mm1, mm5 /* 85 * row 1 */ |
| pmullw mm2, mm6 /* 171 * row 2 */ |
| |
| movd mm3, DWORD ptr [eax+ecx] /* four bytes of source row 3 */ |
| movd mm4, DWORD ptr [eax+ecx*2] /* four bytes of source row 4 */ |
| |
| punpcklbw mm3, mm7 /* row 3 as words */ |
| punpcklbw mm4, mm7 /* row 4 as words */ |
| |
| pmullw mm3, mm6 /* 171 * row 3 */ |
| pmullw mm4, mm5 /* 85 * row 4 */ |
| |
| |
| movd DWORD PTR [edi], mm0 /* destination row 0 = source row 0 unchanged */ |
| paddw mm1, mm2 /* 85*row1 + 171*row2 */ |
| |
| paddw mm1, round_values /* + 128 */ |
| psrlw mm1, 8 /* destination row 1 values */ |
| |
| packuswb mm1, mm7 /* row 1 results back to bytes */ |
| paddw mm3, mm4 /* 171*row3 + 85*row4 */ |
| |
| paddw mm3, round_values /* + 128 */ |
| movd DWORD PTR [edi+edx], mm1 /* destination row 1 */ |
| |
| psrlw mm3, 8 /* destination row 2 values */ |
| packuswb mm3, mm7 /* row 2 results back to bytes */ |
| |
| movd DWORD PTR [edi+edx*2], mm3 /* destination row 2 */ |
| |
| |
| add edi, 4 /* next four destination columns */ |
| add esi, 4 /* next four source columns */ |
| |
| sub ebx, 4 /* four fewer columns to go; sets the flags for jg */ |
| jg vs_5_3_loop /* loop while columns remain */ |
| |
| pop ebx /* restore the saved ebx */ |
| } |
| } |
| |
| |
| |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : horizontal_line_2_1_scale_mmx |
| * |
| * INPUTS : const unsigned char *source : Source pixels, read as bytes. |
| * unsigned int source_width : Not referenced. |
| * unsigned char *dest : Receives every second source byte. |
| * unsigned int dest_width : Number of destination bytes; loop bound. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels: |
| * dest[i] = source[2*i]. Even-indexed source bytes are kept, |
| * odd-indexed bytes are dropped; no filtering is applied. |
| * |
| * SPECIAL NOTES : Each pass reads eight source bytes and writes four |
| * destination bytes. The bound is tested after the pass with |
| * a signed compare (jl), so at least four bytes are written. |
| * No emms is executed here. |
| * NOTE(review): the (void) cast below names dest_width, which |
| * the assembly does use; source_width is the parameter that |
| * is never referenced. |
| * |
| ****************************************************************************/ |
| static |
| void horizontal_line_2_1_scale_mmx |
| ( |
| const unsigned char *source, |
| unsigned int source_width, |
| unsigned char *dest, |
| unsigned int dest_width |
| ) |
| { |
| (void) dest_width; /* see the NOTE(review) above: dest_width is read by the assembly */ |
| |
| __asm |
| { |
| mov esi, source /* esi = source base */ |
| mov edi, dest /* edi = destination base */ |
| |
| pxor mm7, mm7 /* mm7 = 0, second operand of the pack */ |
| mov ecx, dest_width /* ecx = number of destination bytes */ |
| |
| xor edx, edx /* edx = destination index, starting at 0 */ |
| hs_2_1_loop: |
| |
| movq mm0, [esi+edx*2] /* eight source bytes starting at source[2*edx] */ |
| psllw mm0, 8 /* shift the odd-indexed byte out of each word */ |
| |
| psrlw mm0, 8 /* each word now holds only its even-indexed byte */ |
| packuswb mm0, mm7 /* the four kept bytes in the low dword */ |
| |
| movd DWORD Ptr [edi+edx], mm0; /* write dest[edx .. edx+3] */ |
| add edx, 4 /* four destination bytes done */ |
| |
| cmp edx, ecx /* reached dest_width? */ |
| jl hs_2_1_loop /* no: loop while edx < ecx (signed compare) */ |
| |
| } |
| } |
| |
| |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_2_1_scale_mmx |
| * |
| * INPUTS : unsigned char *source : Row to copy from. |
| * unsigned int src_pitch : Not used. |
| * unsigned char *dest : Row to copy to. |
| * unsigned int dest_pitch : Not used. |
| * unsigned int dest_width : Number of bytes to copy. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Vertical 2 to 1 step without filtering: copies dest_width |
| * bytes from source to dest with vpx_memcpy. |
| * |
| * SPECIAL NOTES : None. |
| * |
| ****************************************************************************/ |
| static |
| void vertical_band_2_1_scale_mmx |
| ( |
|     unsigned char *source, |
|     unsigned int src_pitch, |
|     unsigned char *dest, |
|     unsigned int dest_pitch, |
|     unsigned int dest_width |
| ) |
| { |
|     /* Neither pitch takes part in a single-row copy. */ |
|     (void) src_pitch; |
|     (void) dest_pitch; |
| |
|     vpx_memcpy(dest, source, dest_width); |
| } |
| |
| |
| |
| static /**************************************************************************** |
| * |
| * ROUTINE : vertical_band_2_1_scale_i_mmx |
| * |
| * INPUTS : unsigned char *source : Centre row of the three rows read. |
| * unsigned int src_pitch : Byte offset between source rows. |
| * unsigned char *dest : Row that is written. |
| * unsigned int dest_pitch : Not referenced. |
| * unsigned int dest_width : Number of bytes to produce; sets the loop bound. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Produces one destination row as a weighted sum of the |
| * source row and the rows directly above and below it: |
| * d = (48*above + 160*centre + 48*below + 128) >> 8 |
| * |
| * SPECIAL NOTES : The row at source - src_pitch is read, so that row must |
| * be readable. Four bytes are handled per pass and the |
| * bound is tested after the pass with a signed compare |
| * (jl), so at least four bytes are written. |
| * mm6 is loaded with round_values but not read afterwards. |
| * No emms is executed here. |
| * |
| ****************************************************************************/ |
| void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) |
| { |
| |
| __declspec(align(16)) const unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; /* 48/256 = 3/16 */ |
| __declspec(align(16)) const unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; /* 160/256 = 10/16 */ |
| __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; /* +128 so the >> 8 rounds to nearest */ |
| __asm |
| { |
| mov esi, source /* esi = centre row for now */ |
| mov edi, dest /* edi = write pointer */ |
| |
| mov eax, src_pitch /* eax = source pitch */ |
| mov edx, dest_width /* edx = number of bytes to produce */ |
| |
| pxor mm7, mm7 /* mm7 = 0 for byte-to-word unpacking */ |
| sub esi, eax // esi = row above the centre row |
| |
| |
| lea ecx, [esi+edx]; /* ecx = end marker: row above + dest_width */ |
| movq mm6, round_values; /* not read again; the add below uses the memory operand */ |
| |
| movq mm5, three_sixteenths; /* mm5 = 48 in each lane */ |
| movq mm4, ten_sixteenths; /* mm4 = 160 in each lane */ |
| |
| vs_2_1_i_loop: |
| movd mm0, [esi] // four bytes from the row above |
| movd mm1, [esi+eax] // four bytes from the centre row |
| |
| movd mm2, [esi+eax*2] // four bytes from the row below |
| punpcklbw mm0, mm7 /* row above as words */ |
| |
| pmullw mm0, mm5 /* 48 * above */ |
| punpcklbw mm1, mm7 /* centre row as words */ |
| |
| pmullw mm1, mm4 /* 160 * centre */ |
| punpcklbw mm2, mm7 /* row below as words */ |
| |
| pmullw mm2, mm5 /* 48 * below */ |
| paddw mm0, round_values /* 48*above + 128 */ |
| |
| paddw mm1, mm2 /* 160*centre + 48*below */ |
| paddw mm0, mm1 /* full weighted sum plus rounding */ |
| |
| psrlw mm0, 8 /* >> 8 */ |
| packuswb mm0, mm7 /* four result bytes in the low dword */ |
| |
| movd DWORD PTR [edi], mm0 /* write four destination bytes */ |
| add esi, 4 /* next four source columns */ |
| |
| add edi, 4; /* next four destination columns */ |
| cmp esi, ecx /* reached the end marker? */ |
| jl vs_2_1_i_loop /* no: loop while esi < ecx (signed compare) */ |
| |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * ROUTINE : register_mmxscalers |
| * |
| * INPUTS : None. |
| * |
| * OUTPUTS : None. |
| * |
| * RETURNS : void |
| * |
| * FUNCTION : Points the vp8_* scaler function pointers at the routines |
| * to use: the *_mmx routines of this file for the 1:2, 3:5, |
| * 4:5, 5:4, 5:3 and 2:1 ratios, and the vp8cx_*_c routines |
| * for the 3:4 and 2:3 ratios. |
| * |
| * SPECIAL NOTES : Every assignment targets a different pointer, so their |
| * order carries no meaning. |
| * |
| ****************************************************************************/ |
| void |
| register_mmxscalers(void) |
| { |
|     /* 1:2 up-scaling */ |
|     vp8_horizontal_line_1_2_scale    = horizontal_line_1_2_scale_mmx; |
|     vp8_vertical_band_1_2_scale      = vertical_band_1_2_scale_mmx; |
|     vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; |
| |
|     /* 3:5 up-scaling */ |
|     vp8_horizontal_line_3_5_scale    = horizontal_line_3_5_scale_mmx; |
|     vp8_vertical_band_3_5_scale      = vertical_band_3_5_scale_mmx; |
|     vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; |
| |
|     /* 4:5 up-scaling */ |
|     vp8_horizontal_line_4_5_scale    = horizontal_line_4_5_scale_mmx; |
|     vp8_vertical_band_4_5_scale      = vertical_band_4_5_scale_mmx; |
|     vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; |
| |
|     /* 3:4 up-scaling: vp8cx_*_c routines */ |
|     vp8_horizontal_line_3_4_scale    = vp8cx_horizontal_line_3_4_scale_c; |
|     vp8_vertical_band_3_4_scale      = vp8cx_vertical_band_3_4_scale_c; |
|     vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; |
| |
|     /* 2:3 up-scaling: vp8cx_*_c routines */ |
|     vp8_horizontal_line_2_3_scale    = vp8cx_horizontal_line_2_3_scale_c; |
|     vp8_vertical_band_2_3_scale      = vp8cx_vertical_band_2_3_scale_c; |
|     vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; |
| |
|     /* 5:4 down-scaling */ |
|     vp8_horizontal_line_5_4_scale    = horizontal_line_5_4_scale_mmx; |
|     vp8_vertical_band_5_4_scale      = vertical_band_5_4_scale_mmx; |
| |
|     /* 5:3 down-scaling */ |
|     vp8_horizontal_line_5_3_scale    = horizontal_line_5_3_scale_mmx; |
|     vp8_vertical_band_5_3_scale      = vertical_band_5_3_scale_mmx; |
| |
|     /* 2:1 down-scaling, plain and interpolating vertical variants */ |
|     vp8_horizontal_line_2_1_scale    = horizontal_line_2_1_scale_mmx; |
|     vp8_vertical_band_2_1_scale      = vertical_band_2_1_scale_mmx; |
|     vp8_vertical_band_2_1_scale_i    = vertical_band_2_1_scale_i_mmx; |
| } |