| /* |
| * Copyright (C) 2008 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* ---- includes ----------------------------------------------------------- */ |
| |
| #include "b_BasicEm/Math.h" |
| #include "b_BasicEm/Functions.h" |
| |
| /* ---- related objects --------------------------------------------------- */ |
| |
| /* ---- typedefs ----------------------------------------------------------- */ |
| |
| /* ---- constants ---------------------------------------------------------- */ |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /* ========================================================================= */ |
| /* */ |
| /* ---- \ghd{ external functions } ----------------------------------------- */ |
| /* */ |
| /* ========================================================================= */ |
| |
| #if defined( HW_SSE2 ) |
| extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ); |
| extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ); |
| #endif |
| |
| #if defined( HW_FR71 ) |
| int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA ); |
| #endif |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint16 bbs_sqrt32( uint32 valA ) |
| { |
| uint32 rootL = 0; |
| uint32 expL = 0; |
| expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; |
| expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; |
| expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; |
| expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; |
| switch( expL >> 1 ) |
| { |
| case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15; |
| case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14; |
| case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13; |
| case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12; |
| case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11; |
| case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10; |
| case 9: rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9; |
| case 8: rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8; |
| case 7: rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7; |
| case 6: rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6; |
| case 5: rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5; |
| case 4: rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4; |
| case 3: rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3; |
| case 2: rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2; |
| case 1: rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1; |
| case 0: rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0; |
| } |
| |
| return ( uint16 )rootL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint8 bbs_sqrt16( uint16 valA ) |
| { |
| uint16 rootL = 0; |
| rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7; |
| rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6; |
| rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5; |
| rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4; |
| rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3; |
| rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2; |
| rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1; |
| rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0; |
| return ( uint8 )rootL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /* table of sqrt and slope values */ |
| const uint32 bbs_fastSqrt32_tableG[] = |
| { |
| 268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972, |
| 284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922, |
| 300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878, |
| 314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840, |
| 328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807, |
| 342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777, |
| 355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751, |
| 367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727, |
| 379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705, |
| 391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686, |
| 402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666, |
| 413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650, |
| 424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634, |
| 434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620, |
| 445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605, |
| 455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593, |
| 464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581, |
| 474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569, |
| 483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559, |
| 493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548, |
| 502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539, |
| 511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529, |
| 519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521, |
| 528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514 |
| }; |
| |
| uint16 bbs_fastSqrt32( uint32 valA ) |
| { |
| uint32 expL = 0; |
| uint32 valL; |
| uint32 offsL; |
| uint32 indexL = 0; |
| |
| if( valA == 0 ) return 0; |
| |
| /* compute closest even size exponent of valA */ |
| expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; |
| expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; |
| expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; |
| expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; |
| |
| valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */ |
| offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13; |
| indexL = ( valL >> 24 ) & 0xFE; |
| |
| return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /* table of 1/sqrt (1.31) and negative slope (.15) values |
| referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */ |
| const uint32 bbs_invSqrt32_tableG[] = |
| { |
| 2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877, |
| 2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746, |
| 1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646, |
| 1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567, |
| 1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502, |
| 1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448, |
| 1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404, |
| 1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367, |
| 1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334, |
| 1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307, |
| 1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282, |
| 1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262, |
| 1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243, |
| 1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227, |
| 1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211, |
| 1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198, |
| 1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187, |
| 1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176, |
| 1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166, |
| 1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157, |
| 1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149, |
| 1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142, |
| 1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135, |
| 1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128 |
| }; |
| |
| uint32 bbs_invSqrt32( uint32 valA ) |
| { |
| |
| uint32 expL = 0; |
| uint32 valL; |
| uint32 offsL; |
| uint32 indexL = 0; |
| |
| if( valA == 0U ) return 0x80000000; |
| |
| /* compute closest even size exponent of valA */ |
| expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4; |
| expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3; |
| expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2; |
| expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1; |
| |
| valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */ |
| offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10; |
| indexL = ( valL >> 24 ) & 0xFE; |
| |
| return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /* table of 1/( x + 1 ) (2.30) and negative slope (.14) values |
| referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */ |
| const int32 bbs_inv32_tableG[] = |
| { |
| 1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664, |
| 954433536, 1575, 928628736, 1491, 904200192, 1415, 881016832, 1345, |
| 858980352, 1278, 838041600, 1218, 818085888, 1162, 799047680, 1108, |
| 780894208, 1059, 763543552, 1013, 746946560, 970, 731054080, 930, |
| 715816960, 891, 701218816, 856, 687194112, 823, 673710080, 791, |
| 660750336, 761, 648282112, 732, 636289024, 706, 624721920, 681, |
| 613564416, 657, 602800128, 635, 592396288, 613, 582352896, 592, |
| 572653568, 573, 563265536, 554, 554188800, 537, 545390592, 520, |
| }; |
| |
| int32 bbs_inv32( int32 valA ) |
| { |
| |
| uint32 expL = 0; |
| int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1; |
| int32 valL = signL * valA; |
| int32 offsL; |
| uint32 indexL = 0; |
| |
| if( valL <= ( int32 ) 1 ) return 0x40000000 * signL; |
| |
| /* compute size exponent of valL */ |
| expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4; |
| expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3; |
| expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2; |
| expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1; |
| expL += ( ( valL >> ( expL + 0x01 ) ) != 0 ); |
| |
| valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/ |
| offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11; |
| indexL = ( valL >> 24 ) & 0xFE; |
| |
| return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint32 bbs_intLog2( uint32 valA ) |
| { |
| uint32 expL = 0; |
| expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 ); |
| expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 ); |
| expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 ); |
| expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 ); |
| expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 ); |
| return expL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| const uint32 bbs_pow2M1_tableG[] = |
| { |
| 0, 713, 46769127, 721, 94047537, 729, 141840775, 737, |
| 190154447, 745, 238994221, 753, 288365825, 761, 338275050, 769, |
| 388727751, 778, 439729846, 786, 491287318, 795, 543406214, 803, |
| 596092647, 812, 649352798, 821, 703192913, 830, 757619310, 839, |
| 812638371, 848, 868256550, 857, 924480371, 867, 981316430, 876, |
| 1038771393, 886, 1096851999, 895, 1155565062, 905, 1214917468, 915, |
| 1274916179, 925, 1335568233, 935, 1396880745, 945, 1458860907, 956, |
| 1521515988, 966, 1584853338, 976, 1648880387, 987, 1713604645, 998, |
| 1779033703, 1009, 1845175238, 1020, 1912037006, 1031, 1979626852, 1042, |
| 2047952702, 1053, 2117022572, 1065, 2186844564u, 1077, 2257426868u, 1088, |
| 2328777762u, 1100, 2400905617u, 1112, 2473818892u, 1124, 2547526141u, 1136, |
| 2622036010u, 1149, 2697357237u, 1161, 2773498659u, 1174, 2850469207u, 1187, |
| 2928277909u, 1200, 3006933892u, 1213, 3086446383u, 1226, 3166824708u, 1239, |
| 3248078296u, 1253, 3330216677u, 1266, 3413249486u, 1280, 3497186464u, 1294, |
| 3582037455u, 1308, 3667812413u, 1323, 3754521399u, 1337, 3842174584u, 1352, |
| 3930782250u, 1366, 4020354790u, 1381, 4110902710u, 1396, 4202436633u, 1411 |
| }; |
| |
| uint32 bbs_pow2M1( uint32 valA ) |
| { |
| uint32 offsL = ( valA & 0x03FFFFFF ) >> 10; |
| uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1; |
| return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ]; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint32 bbs_pow2( int32 valA ) |
| { |
| int32 shiftL = 16 - ( valA >> 27 ); |
| uint32 offsL = ( uint32 )( valA << 5 ); |
| if( shiftL == 32 ) return 1; |
| return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint32 bbs_exp( int32 valA ) |
| { |
| int32 adjustedL; |
| int32 shiftL; |
| int32 offsL; |
| |
| /* check boundaries to avoid overflow */ |
| if( valA < -1488522236 ) |
| { |
| return 0; |
| } |
| else if( valA > 1488522236 ) |
| { |
| return 0xFFFFFFFF; |
| } |
| |
| /* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */ |
| adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 ); |
| |
| shiftL = 16 - ( adjustedL >> 27 ); |
| if( shiftL == 32 ) return 1; |
| offsL = ( uint32 )( adjustedL << 5 ); |
| return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| int16 bbs_satS16( int32 valA ) |
| { |
| if( valA > 32767 ) return 32767; |
| if( valA < -32768 ) return -32768; |
| return valA; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| #if defined( HW_i586 ) || defined( HW_i686 ) |
| |
| /* Windows section */ |
| #if defined( WIN32 ) && !defined( WIN64 ) |
| |
| /* disable warning "no return value"*/ |
| #pragma warning( disable : 4035 ) |
| |
| /** |
| * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 |
| */ |
| int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __asm |
| { |
| push esi |
| push edi |
| |
| mov eax, vec1A |
| mov ebx, vec2A |
| |
| mov ecx, sizeA |
| |
| pxor mm4, mm4 |
| pxor mm6, mm6 |
| |
| pxor mm7, mm7 |
| shr ecx, 4 |
| |
| inner_loop_start: |
| movq mm0, 0[eax] |
| paddd mm7, mm4 |
| |
| movq mm1, 0[ebx] |
| paddd mm7, mm6 |
| |
| movq mm2, 8[eax] |
| pmaddwd mm0, mm1 |
| |
| movq mm3, 8[ebx] |
| |
| movq mm4, 16[eax] |
| pmaddwd mm2, mm3 |
| |
| movq mm5, 16[ebx] |
| paddd mm7, mm0 |
| |
| movq mm6, 24[eax] |
| pmaddwd mm4, mm5 |
| |
| pmaddwd mm6, 24[ebx] |
| paddd mm7, mm2 |
| |
| add eax, 32 |
| add ebx, 32 |
| |
| dec ecx |
| jnz inner_loop_start |
| |
| paddd mm7, mm4 |
| |
| paddd mm7, mm6 |
| |
| movq mm0, mm7 |
| |
| psrlq mm0, 32 |
| |
| paddd mm7, mm0 |
| |
| movd eax, mm7 |
| |
| emms |
| pop edi |
| pop esi |
| } |
| } |
| |
| #pragma warning( default : 4035 ) |
| |
| /* gcc compiler section */ |
| #elif defined( epl_LINUX ) || defined( CYGWIN ) |
| |
| /** |
| * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0 |
| */ |
| int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| int32 resultL; |
| |
| __asm__ __volatile__( |
| |
| "movl %1,%%eax\n\t" |
| "movl %2,%%ebx\n\t" |
| |
| "movl %3,%%ecx\n\t" |
| |
| "pxor %%mm4,%%mm4\n\t" |
| "pxor %%mm6,%%mm6\n\t" |
| |
| "pxor %%mm7, %%mm7\n\t" |
| "shrl $4, %%ecx\n\t" |
| |
| "\n1:\t" |
| "movq 0(%%eax),%%mm0\n\t" |
| "paddd %%mm4,%%mm7\n\t" |
| |
| "movq 0( %%ebx ),%%mm1\n\t" |
| "paddd %%mm6,%%mm7\n\t" |
| |
| "movq 8( %%eax ),%%mm2\n\t" |
| "pmaddwd %%mm1,%%mm0\n\t" |
| |
| "movq 8( %%ebx ),%%mm3\n\t" |
| |
| "movq 16( %%eax ),%%mm4\n\t" |
| "pmaddwd %%mm3,%%mm2\n\t" |
| |
| "movq 16( %%ebx ),%%mm5\n\t" |
| "paddd %%mm0,%%mm7\n\t" |
| |
| "movq 24( %%eax ),%%mm6\n\t" |
| "pmaddwd %%mm5,%%mm4\n\t" |
| |
| "pmaddwd 24( %%ebx ),%%mm6\n\t" |
| "paddd %%mm2,%%mm7\n\t" |
| |
| "addl $32,%%eax\n\t" |
| "addl $32,%%ebx\n\t" |
| |
| "decl %%ecx\n\t" |
| "jnz 1b\n\t" |
| |
| "paddd %%mm4,%%mm7\n\t" |
| "paddd %%mm6,%%mm7\n\t" |
| |
| "movq %%mm7,%%mm0\n\t" |
| |
| "psrlq $32,%%mm0\n\t" |
| |
| "paddd %%mm0,%%mm7\n\t" |
| |
| "movd %%mm7,%0\n\t" |
| |
| "emms\n\t" |
| |
| : "=&g" ( resultL ) |
| : "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA ) |
| : "si", "di", "ax", "bx", "cx", "st", "memory" ); |
| |
| return resultL; |
| } |
| |
| #endif /* epl_LINUX, CYGWIN */ |
| |
| #endif /* HW_i586 || HW_i686 */ |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| #ifdef HW_TMS320C6x |
| /** |
| * Calls fast assembler version of dotproduct for DSP. |
| * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors |
| * of even length. |
| */ |
| int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| if( sizeA & 1 ) |
| { |
| int32 resultL; |
| resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 ); |
| return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 ); |
| } |
| else |
| { |
| return dotProduct_C62x( vec1A, vec2A, sizeA ); |
| } |
| } |
| #endif /* HW_TMS320C6x */ |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /* 16 dot product for the PS2/EE processor */ |
| /* input vectors MUST be 128 bit aligned ! */ |
| |
| #if defined( epl_LINUX ) && defined( HW_EE ) |
| |
| int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| int32 resultL = 0, |
| iL = sizeA >> 3, |
| jL = sizeA - ( iL << 3 ); |
| |
| if( iL > 0 ) |
| { |
| /* multiply-add elements of input vectors in sets of 8 */ |
| int32 accL[ 4 ], t1L, t2L, t3L; |
| asm volatile ( |
| "pxor %4, %2, %2\n\t" /* reset 8 accumulators (LO and HI register) to 0 */ |
| "pmtlo %4\n\t" |
| "pmthi %4\n\t" |
| |
| "\n__begin_loop:\t" |
| |
| "lq %2,0(%0)\n\t" /* load 8 pairs of int16 */ |
| "lq %3,0(%1)\n\t" |
| |
| "addi %0,%0,16\n\t" /* vec1L += 16 */ |
| "addi %1,%1,16\n\t" /* vec2L += 16 */ |
| "addi %7,%7,-1\n\t" /* iL-- */ |
| |
| "pmaddh %4,%2,%3\n\t" /* parallel multiply-add of 8 pairs of int16 */ |
| |
| "bgtzl %7,__begin_loop\n\t" /* if iL > 0 goto _begin_loop */ |
| |
| "pmflo %2\n\t" /* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */ |
| "pmfhi %3\n\t" |
| "paddw %4,%2,%3\n\t" |
| "sq %4,0(%8)\n\t" |
| : "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L ) |
| : "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL ) |
| : "memory" ); |
| |
| /* add 4 parallel accumulators */ |
| resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ]; |
| } |
| |
| /* multiply-add remaining elements of input vectors */ |
| for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++; |
| |
| return resultL; |
| } |
| |
| #endif |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| #if defined( HW_ARMv5TE ) |
| |
| /* fast 16 dot product for ARM9E cores (DSP extensions). |
| * input vectors must be 32 bit aligned |
| */ |
| int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| int32 accuL = 0; |
| |
| int32* v1PtrL = ( int32* )vec1A; |
| int32* v2PtrL = ( int32* )vec2A; |
| |
| for( ; sizeA >= 4; sizeA -= 4 ) |
| { |
| __asm { |
| smlabb accuL, *v1PtrL, *v2PtrL, accuL; |
| smlatt accuL, *v1PtrL, *v2PtrL, accuL; |
| } |
| v1PtrL++; v2PtrL++; |
| __asm { |
| smlabb accuL, *v1PtrL, *v2PtrL, accuL; |
| smlatt accuL, *v1PtrL, *v2PtrL, accuL; |
| } |
| v1PtrL++; v2PtrL++; |
| } |
| |
| vec1A = ( int16* )v1PtrL; |
| vec2A = ( int16* )v2PtrL; |
| |
| /* multiply-add remaining elements of input vectors */ |
| for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++; |
| |
| return accuL; |
| } |
| |
| #endif |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /** |
| * Computes a fast dot product using standard C |
| */ |
| int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| int32 accuL = 0; |
| |
| for( ; sizeA >= 8; sizeA -= 8 ) |
| { |
| accuL += ( int32 ) *vec1A * *vec2A; |
| accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 ); |
| accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 ); |
| accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 ); |
| |
| accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 ); |
| accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 ); |
| accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 ); |
| accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 ); |
| |
| vec1A += 8; |
| vec2A += 8; |
| } |
| |
| for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++; |
| |
| return accuL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| /* PC */ |
| #if ( defined( HW_i586 ) || defined( HW_i686 ) ) |
| |
| #if defined( HW_SSE2 ) |
| uint32 size16L = sizeA & 0xfffffff0; |
| if( size16L ) |
| { |
| if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) |
| { |
| return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA ); |
| } |
| else |
| { |
| return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA ); |
| } |
| } |
| #elif !defined( WIN64 ) |
| /* MMX version (not supported by 64-bit compiler) */ |
| uint32 size16L = sizeA & 0xfffffff0; |
| if( size16L ) |
| { |
| if( sizeA == size16L ) |
| { |
| return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L ); |
| } |
| return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L ) |
| + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L ); |
| } /* if( size16L ) */ |
| #endif |
| |
| return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); |
| |
| /* Playstation 2 */ |
| #elif defined( HW_EE ) && defined( epl_LINUX ) |
| |
| if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 ) |
| { |
| return bbs_dotProduct_EE( vec1A, vec2A, sizeA ); |
| } |
| return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); |
| |
| /* ARM9E */ |
| #elif defined( HW_ARMv5TE ) |
| |
| return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA ); |
| |
| /* TI C6000 */ |
| #elif defined( HW_TMS320C6x ) |
| |
| return bbs_dotProduct_dsp( vec1A, vec2A, sizeA ); |
| |
| #elif defined( HW_FR71 ) |
| |
| uint32 size16L = sizeA & 0xfffffff0; |
| if( size16L ) |
| { |
| if( sizeA == size16L ) |
| { |
| return bbs_dotProduct_fr71( vec1A, vec2A, size16L ); |
| } |
| return bbs_dotProduct_fr71( vec1A, vec2A, size16L ) |
| + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L ); |
| } |
| |
| return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); |
| |
| #endif |
| |
| return bbs_dotProduct_stdc( vec1A, vec2A, sizeA ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /* table of fermi and slope values (result: 2.30; offset: .12) |
| referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */ |
| const uint32 bbs_fermi_tableG[] = |
| { |
| 45056, 8, 77824, 13, 131072, 21, 217088, 34, |
| 356352, 57, 589824, 94, 974848, 155, 1609728, 255, |
| 2654208, 418, 4366336, 688, 7184384, 1126, 11796480, 1834, |
| 19308544, 2970, 31473664, 4748, 50921472, 7453, 81448960, 11363, |
| 127991808, 16573, 195874816, 22680, 288772096, 28469, 405381120, 32102, |
| 536870912, 32101, 668356608, 28469, 784965632, 22680, 877862912, 16573, |
| 945745920, 11363, 992288768, 7453, 1022816256, 4748, 1042264064, 2970, |
| 1054429184, 1834, 1061941248, 1126, 1066553344, 688, 1069371392, 418, |
| 1071083520, 255, 1072128000, 155, 1072762880, 94, 1073147904, 57, |
| 1073381376, 34, 1073520640, 21, 1073606656, 13, 1073659904, 8, |
| }; |
| |
| int32 bbs_fermi( int32 valA ) |
| { |
| uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1; |
| uint32 offsL = ( ( valA & 0x00007FFF ) + 4 ) >> 3; |
| if( valA < -655360 ) return 1; |
| if( valA >= 655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */ |
| return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA ) |
| { |
| int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1; |
| int32 shiftL = posHighestBitL - nBitsA; |
| if( shiftL > 0 ) |
| { |
| ( *argPtrA ) >>= shiftL; |
| ( *bbpPtrA ) -= shiftL; |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA ) |
| { |
| int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1; |
| int32 shiftL = posHighestBitL - nBitsA; |
| if( shiftL > 0 ) |
| { |
| ( *argPtrA ) >>= shiftL; |
| ( *bbpPtrA ) -= shiftL; |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA ) |
| { |
| if( dstBbpA >= srcBbpA ) |
| { |
| uint32 shiftL = dstBbpA - srcBbpA; |
| if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) ) |
| { |
| /* overflow */ |
| return ( uint32 )0xFFFFFFFF; |
| } |
| else |
| { |
| return srcA << shiftL; |
| } |
| } |
| else |
| { |
| uint32 shiftL = srcBbpA - dstBbpA; |
| uint32 addL = 1L << ( shiftL - 1 ); |
| if( srcA + addL < addL ) |
| { |
| /* rounding would cause overflow */ |
| return srcA >> shiftL; |
| } |
| else |
| { |
| return ( srcA + addL ) >> shiftL; |
| } |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA ) |
| { |
| if( dstBbpA >= srcBbpA ) |
| { |
| uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA ); |
| if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) ) |
| { |
| /* overflow */ |
| return ( uint32 )0x7FFFFFFF; |
| } |
| else if( srcA < ( ( int32 )0x80000000 >> shiftL ) ) |
| { |
| /* underflow */ |
| return ( int32 )0x80000000; |
| } |
| else |
| { |
| return srcA << shiftL; |
| } |
| } |
| else |
| { |
| uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA ); |
| int32 addL = 1L << ( shiftL - 1 ); |
| if( srcA + addL < addL ) |
| { |
| /* rounding would cause overflow */ |
| return srcA >> shiftL; |
| } |
| else |
| { |
| return ( srcA + addL ) >> shiftL; |
| } |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA ) |
| { |
| /* #if defined( HW_TMS320C5x ) |
| uint32 rL; |
| power( ( int16* ) xA, ( int32* ) &rL, nxA ); // does not work properly in DSPLib version 2.20.02 |
| return ( rL >> 1 ); |
| #else*/ |
| /* needs to be optimized */ |
| int32 rL = 0; |
| for( ; nxA--; ) |
| { |
| rL += ( int32 ) *xA * *xA; |
| xA++; |
| } |
| return rL; |
| /* #endif */ |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA ) |
| { |
| uint32 log1L = bbs_intLog2( v1A ); |
| uint32 log2L = bbs_intLog2( v2A ); |
| |
| if( log1L + log2L < 32 ) |
| { |
| *manPtrA = v1A * v2A; |
| *expPtrA = 0; |
| } |
| else |
| { |
| uint32 v1L = v1A; |
| uint32 v2L = v2A; |
| uint32 exp1L = 0; |
| uint32 exp2L = 0; |
| if( log1L > 15 && log2L > 15 ) |
| { |
| exp1L = log1L - 15; |
| exp2L = log2L - 15; |
| v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; |
| v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; |
| } |
| else if( log1L > 15 ) |
| { |
| exp1L = log1L + log2L - 31; |
| v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; |
| } |
| else |
| { |
| exp2L = log1L + log2L - 31; |
| v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; |
| } |
| |
| *manPtrA = v1L * v2L; |
| *expPtrA = exp1L + exp2L; |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA ) |
| { |
| uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A ); |
| uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A ); |
| |
| if( log1L + log2L < 30 ) |
| { |
| *manPtrA = v1A * v2A; |
| *expPtrA = 0; |
| } |
| else |
| { |
| int32 v1L = v1A; |
| int32 v2L = v2A; |
| int32 exp1L = 0; |
| int32 exp2L = 0; |
| if( log1L > 14 && log2L > 14 ) |
| { |
| exp1L = log1L - 14; |
| exp2L = log2L - 14; |
| v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; |
| v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; |
| } |
| else if( log1L > 14 ) |
| { |
| exp1L = log1L + log2L - 29; |
| v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1; |
| } |
| else |
| { |
| exp2L = log1L + log2L - 29; |
| v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1; |
| } |
| |
| *manPtrA = v1L * v2L; |
| *expPtrA = exp1L + exp2L; |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA ) |
| { |
| uint32 sumL = 0; |
| int32 sumExpL = 0; |
| |
| uint32 iL; |
| for( iL = 0; iL < sizeA; iL++ ) |
| { |
| int32 vL = vecA[ iL ]; |
| int32 logL = bbs_intLog2( vL > 0 ? vL : -vL ); |
| int32 expL = ( logL > 14 ) ? logL - 14 : 0; |
| uint32 prdL; |
| |
| if( expL >= 1 ) |
| { |
| vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1; |
| } |
| else |
| { |
| vL = vL >> expL; |
| } |
| |
| prdL = vL * vL; |
| expL <<= 1; /* now exponent of product */ |
| |
| if( sumExpL > expL ) |
| { |
| uint32 shrL = sumExpL - expL; |
| prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1; |
| } |
| else if( expL > sumExpL ) |
| { |
| uint32 shrL = expL - sumExpL; |
| sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1; |
| sumExpL += shrL; |
| } |
| |
| sumL += prdL; |
| |
| if( sumL > 0x80000000 ) |
| { |
| sumL = ( sumL + 1 ) >> 1; |
| sumExpL++; |
| } |
| } |
| |
| /* make exponent even */ |
| if( ( sumExpL & 1 ) != 0 ) |
| { |
| sumL = ( sumL + 1 ) >> 1; |
| sumExpL++; |
| } |
| |
| if( manPtrA != NULL ) *manPtrA = sumL; |
| if( expPtrA != NULL ) *expPtrA = sumExpL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA ) |
| { |
| uint32 sumL = 0; |
| int32 sumExpL = 0; |
| |
| uint32 iL; |
| for( iL = 0; iL < sizeA; iL++ ) |
| { |
| int32 vL = vecA[ iL ]; |
| uint32 prdL = vL * vL; |
| |
| if( sumExpL > 0 ) |
| { |
| uint32 shrL = sumExpL; |
| prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1; |
| } |
| |
| sumL += prdL; |
| |
| if( sumL > 0x80000000 ) |
| { |
| sumL = ( sumL + 1 ) >> 1; |
| sumExpL++; |
| } |
| } |
| |
| /* make exponent even */ |
| if( ( sumExpL & 1 ) != 0 ) |
| { |
| sumL = ( sumL + 1 ) >> 1; |
| sumExpL++; |
| } |
| |
| if( manPtrA != NULL ) *manPtrA = sumL; |
| if( expPtrA != NULL ) *expPtrA = sumExpL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA ) |
| { |
| uint32 manL; |
| uint32 expL; |
| bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL ); |
| manL = bbs_sqrt32( manL ); |
| return manL << ( expL >> 1 ); |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA ) |
| { |
| #if defined( HW_TMS320C5x ) |
| /* operands need to be in internal memory for mmul*/ |
| if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE || |
| x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ) |
| { |
| int16 iL,jL,kL; |
| int16 *ptr1L, *ptr2L; |
| int32 sumL; |
| |
| for( iL = 0; iL < row1A; iL++ ) |
| { |
| for( jL = 0; jL < col2A; jL++ ) |
| { |
| ptr1L = ( int16* ) x1A + iL * col1A; |
| ptr2L = ( int16* ) x2A + jL; |
| sumL = 0; |
| for( kL = 0; kL < col1A; kL++ ) |
| { |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); |
| ptr2L += col2A; |
| } |
| *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ |
| } |
| } |
| } |
| else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA ); |
| |
| #elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE ) |
| |
| int32 iL, jL, kL; |
| int16 *ptr1L, *ptr2L; |
| int32 sumL; |
| for( iL = 0; iL < row1A; iL++ ) |
| { |
| for( jL = 0; jL < col2A; jL++ ) |
| { |
| ptr1L = ( int16* ) x1A + iL * col1A; |
| ptr2L = ( int16* ) x2A + jL; |
| sumL = 0; |
| for( kL = col1A; kL >= 4; kL -= 4 ) |
| { |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); |
| sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); |
| sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); |
| sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) ); |
| ptr2L += col2A; |
| } |
| for( ; kL > 0; kL-- ) |
| { |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); |
| ptr2L += col2A; |
| } |
| *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ |
| } |
| } |
| #else |
| /* needs to be optimized */ |
| int16 iL,jL,kL; |
| int16 *ptr1L, *ptr2L; |
| int32 sumL; |
| |
| for( iL = 0; iL < row1A; iL++ ) |
| { |
| for( jL = 0; jL < col2A; jL++ ) |
| { |
| ptr1L = ( int16* ) x1A + iL * col1A; |
| ptr2L = ( int16* ) x2A + jL; |
| sumL = 0; |
| for( kL = 0; kL < col1A; kL++ ) |
| { |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L ); |
| ptr2L += col2A; |
| } |
| *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ |
| } |
| } |
| #endif |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A, |
| const int16 *x2A, int16 col2A, int16 *rA ) |
| { |
| const int16* ptr1L = x1A; |
| |
| int32 iL; |
| for( iL = row1A; iL > 0 ; iL-- ) |
| { |
| int32 jL; |
| const int16* ptr2L = x2A; |
| for( jL = col2A; jL > 0 ; jL-- ) |
| { |
| int32 kL; |
| int32 sumL = 0; |
| for( kL = col1A >> 2; kL > 0; kL-- ) |
| { |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); |
| } |
| for( kL = col1A & 3; kL > 0; kL-- ) |
| { |
| sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ ); |
| } |
| *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */ |
| ptr1L -= col1A; |
| } |
| ptr1L += col1A; |
| } |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| |
| #ifndef mtrans |
| uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA ) |
| { |
| /* needs to be optimized */ |
| int16 iL; |
| for( iL = colA; iL--; ) |
| { |
| int16* sL = xA++; |
| int16 jL; |
| for( jL = rowA; jL--; ) |
| { |
| *rA++ = *sL; |
| sL += colA; |
| } |
| } |
| return 0; |
| } |
| #endif |
| |
| /* ------------------------------------------------------------------------- */ |
| #ifndef atan2_16 |
| int16 bbs_atan2( int16 nomA, int16 denomA ) |
| { |
| int16 phL, argL; |
| |
| if( nomA == denomA ) return 8192; |
| argL = ( ( int32 ) nomA << 15 ) / denomA; |
| |
| /* 0.318253*2 x 20857 .15 |
| +0.003314*2 x^2 217 .15 |
| -0.130908*2 x^3 -8580 .15 |
| +0.068542*2 x^4 4491 .15 |
| -0.009159*2 x^5 -600 .15 */ |
| |
| phL = -600; |
| phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481; |
| phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580; |
| phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217; |
| phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857; |
| phL = ( ( int32 ) phL * argL ) >> 15; |
| |
| return phL >> 1; /* /2 */ |
| } |
| |
| /* needs to be optimized */ |
| uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA ) |
| { |
| for( ; sizeA--; ) |
| { |
| int16 reL = *reA++; |
| int16 imL = *imA++; |
| int16 phL = 0; |
| |
| if( reL < 0 ) |
| { |
| reL = -reL; |
| if( imL < 0 ) |
| { |
| imL = -imL; |
| if( reL > imL ) |
| { |
| phL = -32768 + bbs_atan2( imL, reL ); |
| } |
| else |
| { |
| phL = -16384 - bbs_atan2( reL, imL ); |
| } |
| } |
| else |
| { |
| if( reL > imL ) |
| { |
| phL = -( -32768 + bbs_atan2( imL, reL ) ); |
| } |
| else |
| { |
| if( imL == 0 ) phL = 0; |
| else phL = 16384 + bbs_atan2( reL, imL ); |
| } |
| } |
| } |
| else |
| { |
| if( imL < 0 ) |
| { |
| imL = -imL; |
| if( reL > imL ) |
| { |
| phL = -bbs_atan2( imL, reL ); |
| } |
| else |
| { |
| phL = -16384 + bbs_atan2( reL, imL ); |
| } |
| } |
| else |
| { |
| if( reL > imL ) |
| { |
| phL = bbs_atan2( imL, reL ); |
| } |
| else |
| { |
| if( imL == 0 ) phL = 0; |
| else phL = 16384 - bbs_atan2( reL, imL ); |
| } |
| } |
| } |
| |
| *phaseA++ = phL; |
| } |
| return 0; |
| } |
| |
| #endif |
| |
| /* ------------------------------------------------------------------------- */ |