cxcore/src/cxsvd.cpp - platform/external/opencv - Git at Google

 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                        Intel License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of Intel Corporation may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/

 #include "_cxcore.h"
 #include <float.h>

 /////////////////////////////////////////////////////////////////////////////////////////

 /* Applies a plane (Givens-type) rotation in place to the first n elements
    of the double arrays x and y:
        x[i] <-  c*x[i] + s*y[i]
        y[i] <- -s*x[i] + c*y[i]
    Every macro argument is evaluated exactly once, so expressions such as
    "p + q" may be passed for n, c or s.  The expansion is a single
    do/while(0) statement and therefore must be followed by ';' - this makes
    it safe inside an unbraced if/else. */
 #define icvGivens_64f( n, x, y, c, s )     \
 do                                         \
 {                                          \
     int _i, _n = (n);                      \
     double* _x = (x);                      \
     double* _y = (y);                      \
     double _c = (c), _s = (s);             \
                                            \
     for( _i = 0; _i < _n; _i++ )           \
     {                                      \
         double _t0 = _x[_i];               \
         double _t1 = _y[_i];               \
         _x[_i] = _t0*_c + _t1*_s;          \
         _y[_i] = -_t0*_s + _t1*_c;         \
     }                                      \
 }                                          \
 while(0)


 /* y[0:m,0:n] += diag(a[0:m]) * x[0:m,0:n]

    Row i of x (n elements) is scaled by a[i] and added to row i of y.
    After every row x advances by dx elements and y by dy elements; a zero
    step keeps the pointer on the same n elements for all rows (with
    dy == 0 all the scaled rows of x are summed into one vector). */
 static  void
 icvMatrAXPY_64f( int m, int n, const double* x, int dx,
                  const double* a, double* y, int dy )
 {
     const double* src = x;
     double* dst = y;
     int row = 0;

     while( row < m )
     {
         const double coef = a[row];
         int col = 0;

         /* main part: four columns per pass, processed as two pairs */
         while( col <= n - 4 )
         {
             const double* sp = src + col;
             double* dp = dst + col;
             double v0, v1;

             v0 = dp[0] + coef*sp[0];
             v1 = dp[1] + coef*sp[1];
             dp[0] = v0;
             dp[1] = v1;

             v0 = dp[2] + coef*sp[2];
             v1 = dp[3] + coef*sp[3];
             dp[2] = v0;
             dp[3] = v1;

             col += 4;
         }

         /* the remaining (at most three) columns */
         while( col < n )
         {
             dst[col] += coef*src[col];
             col++;
         }

         row++;
         src += dx;
         dst += dy;
     }
 }


 /* y[1:m,-1] = h*y[1:m,0:n]*x[0:1,0:n]'*x[-1]  (this is used for U&V reconstruction)
    y[1:m,0:n] += h*y[1:m,0:n]*x[0:1,0:n]'*x[0:1,0:n]

    In other words, for each of the rows 1..m-1 of y (consecutive rows are
    l elements apart, row 0 is not touched):
        s         = h * dot( row[0:n], x[0:n] )
        row[-1]   = s * x[-1]
        row[0:n] += s * x[0:n]
    Both x[-1] and row[-1] are accessed, so x and every processed row of y
    must have a valid element in front of them. */
 static void
 icvMatrAXPY3_64f( int m, int n, const double* x, int l, double* y, double h )
 {
     double* row = y;
     int r, c;

     for( r = 1; r < m; r++ )
     {
         double dot = 0;

         row += l;

         /* dot product of the row with x, accumulated in groups of four */
         c = 0;
         while( c <= n - 4 )
         {
             dot += x[c]*row[c] + x[c+1]*row[c+1] + x[c+2]*row[c+2] + x[c+3]*row[c+3];
             c += 4;
         }
         while( c < n )
         {
             dot += x[c]*row[c];
             c++;
         }

         dot *= h;
         row[-1] = dot*x[-1];

         /* row += dot * x, two pairs of elements per pass */
         c = 0;
         while( c <= n - 4 )
         {
             double v0 = row[c]   + dot*x[c];
             double v1 = row[c+1] + dot*x[c+1];
             row[c]   = v0;
             row[c+1] = v1;
             v0 = row[c+2] + dot*x[c+2];
             v1 = row[c+3] + dot*x[c+3];
             row[c+2] = v0;
             row[c+3] = v1;
             c += 4;
         }
         while( c < n )
         {
             row[c] += dot*x[c];
             c++;
         }
     }
 }


 /* Applies a plane (Givens-type) rotation in place to the first n elements
    of the float arrays x and y:
        x[i] <-  c*x[i] + s*y[i]
        y[i] <- -s*x[i] + c*y[i]
    The arithmetic is done in double precision and the results are rounded
    back to float.  Every macro argument is evaluated exactly once, so
    expressions such as "p + q" may be passed for n, c or s.  The expansion
    is a single do/while(0) statement and therefore must be followed by ';' -
    this makes it safe inside an unbraced if/else. */
 #define icvGivens_32f( n, x, y, c, s )      \
 do                                          \
 {                                           \
     int _i, _n = (n);                       \
     float* _x = (x);                        \
     float* _y = (y);                        \
     double _c = (c), _s = (s);              \
                                             \
     for( _i = 0; _i < _n; _i++ )            \
     {                                       \
         double _t0 = _x[_i];                \
         double _t1 = _y[_i];                \
         _x[_i] = (float)(_t0*_c + _t1*_s);  \
         _y[_i] = (float)(-_t0*_s + _t1*_c); \
     }                                       \
 }                                           \
 while(0)

 /* y[0:m,0:n] += diag(a[0:m]) * x[0:m,0:n]   (single-precision storage)

    Row i of x (n elements) is scaled by a[i] and added to row i of y; the
    sums are formed in double precision and rounded to float on store.
    After every row x advances by dx elements and y by dy elements; a zero
    step keeps the pointer on the same n elements for all rows. */
 static  void
 icvMatrAXPY_32f( int m, int n, const float* x, int dx,
                  const float* a, float* y, int dy )
 {
     const float* src = x;
     float* dst = y;
     int row = 0;

     while( row < m )
     {
         const double coef = a[row];
         int col = 0;

         /* main part: four columns per pass, processed as two pairs */
         while( col <= n - 4 )
         {
             const float* sp = src + col;
             float* dp = dst + col;
             double v0, v1;

             v0 = dp[0] + coef*sp[0];
             v1 = dp[1] + coef*sp[1];
             dp[0] = (float)v0;
             dp[1] = (float)v1;

             v0 = dp[2] + coef*sp[2];
             v1 = dp[3] + coef*sp[3];
             dp[2] = (float)v0;
             dp[3] = (float)v1;

             col += 4;
         }

         /* the remaining (at most three) columns */
         while( col < n )
         {
             dst[col] = (float)(dst[col] + coef*src[col]);
             col++;
         }

         row++;
         src += dx;
         dst += dy;
     }
 }


 /* Single-precision counterpart of icvMatrAXPY3_64f.

    For each of the rows 1..m-1 of y (consecutive rows are l elements
    apart, row 0 is not touched):
        s         = h * dot( row[0:n], x[0:n] )
        row[-1]   = s * x[-1]
        row[0:n] += s * x[0:n]
    s is kept in double precision; stored values are rounded to float.
    Both x[-1] and row[-1] are accessed, so x and every processed row of y
    must have a valid element in front of them. */
 static void
 icvMatrAXPY3_32f( int m, int n, const float* x, int l, float* y, double h )
 {
     float* row = y;
     int r, c;

     for( r = 1; r < m; r++ )
     {
         double dot = 0;

         row += l;

         /* dot product of the row with x, accumulated in groups of four */
         c = 0;
         while( c <= n - 4 )
         {
             dot += x[c]*row[c] + x[c+1]*row[c+1] + x[c+2]*row[c+2] + x[c+3]*row[c+3];
             c += 4;
         }
         while( c < n )
         {
             dot += x[c]*row[c];
             c++;
         }

         dot *= h;
         row[-1] = (float)(dot*x[-1]);

         /* row += dot * x, two pairs of elements per pass */
         c = 0;
         while( c <= n - 4 )
         {
             double v0 = row[c]   + dot*x[c];
             double v1 = row[c+1] + dot*x[c+1];
             row[c]   = (float)v0;
             row[c+1] = (float)v1;
             v0 = row[c+2] + dot*x[c+2];
             v1 = row[c+3] + dot*x[c+3];
             row[c+2] = (float)v0;
             row[c+3] = (float)v1;
             c += 4;
         }
         while( c < n )
         {
             row[c] = (float)(row[c] + dot*x[c]);
             c++;
         }
     }
 }

 /* Hypotenuse sqrt(a*a + b*b), computed without squaring the larger
    magnitude: the smaller |value| is divided by the larger one first, so
    the intermediate square stays at or below 1.
    Returns 0 when both arguments are zero. */
 static double
 pythag( double a, double b )
 {
     double p = fabs( a );
     double q = fabs( b );
     double ratio;

     if( p > q )
     {
         ratio = q / p;
         return p * sqrt( 1. + ratio * ratio );
     }

     if( q != 0 )
     {
         ratio = p / q;
         return q * sqrt( 1. + ratio * ratio );
     }

     return p;
 }

 /****************************************************************************************/
 /****************************************************************************************/

 /* Upper limit on the number of QR iterations that icvSVD_64f / icvSVD_32f
    spend on a single singular value before giving up. */
 #define MAX_ITERS  30

 /* Singular value decomposition of a double-precision matrix, in place.

    a, lda   - input matrix, m rows by n columns, consecutive rows are lda
               elements apart; its contents are overwritten.
    w        - output: n values, sorted in descending order at the end.
    uT, lduT - optional (may be 0).  The left transformations are
               accumulated here row-wise: nu rows of m elements, rows are
               lduT elements apart.
    nu       - number of rows of uT to produce.
    vT, ldvT - optional (may be 0).  The right transformations are
               accumulated here row-wise: n rows of n elements, rows are
               ldvT elements apart.
    buffer   - scratch space of at least 2*n elements: the first n hold the
               off-diagonal values e[], the following n a temporary row.

    Stages:
      1. reduce a to bidiagonal form with alternating column/row
         reflections (hv is the reflection vector, h = -2/(hv'*hv));
      2. rebuild uT and vT from the stored reflection vectors;
      3. diagonalize the bidiagonal matrix (diagonal w[], off-diagonal e[])
         with shifted QR iterations, at most MAX_ITERS per value;
      4. sort w[] and the corresponding rows of uT / vT.

    If some value does not converge within MAX_ITERS iterations, stage 3 is
    abandoned silently (no error is reported) and stage 4 still runs. */
 static void
 icvSVD_64f( double* a, int lda, int m, int n,
             double* w,
             double* uT, int lduT, int nu,
             double* vT, int ldvT,
             double* buffer )
 {
     double* e;
     double* temp;
     double *w1, *e1;
     double *hv;
     /* h of the very first left/right reflection; the other reflections keep
        their h in hv[-1], a slot that does not exist for the first one */
     double ku0 = 0, kv0 = 0;
     double anorm = 0;
     double *a1, *u0 = uT, *v0 = vT;
     double scale, h;
     int i, j, k, l;
     int nm, m1, n1;
     int nv = n;
     int iters = 0;
     /* scratch reflection vector, used when the vector is not stored in
        uT / vT; the "+ 1" makes hv0[-1] a valid element */
     double* hv0 = (double*)cvStackAlloc( (m+2)*sizeof(hv0[0])) + 1;

     e = buffer;
     w1 = w;
     /* e[0] is not written by the bidiagonalization and stays 0 */
     e1 = e + 1;
     nm = n;

     temp = buffer + nm;

     memset( w, 0, nm * sizeof( w[0] ));
     memset( e, 0, nm * sizeof( e[0] ));

     /* m1 x n1 is the size of the not yet reduced lower-right part of a */
     m1 = m;
     n1 = n;

     /* transform a to bi-diagonal form */
     for( ;; )
     {
         int update_u;
         int update_v;

         if( m1 == 0 )
             break;

         /* ---- column step: reflection built from the first column of the
            remaining block ---- */
         scale = h = 0;
         update_u = uT && m1 > m - nu;
         hv = update_u ? uT : hv0;

         /* copy the column into hv and sum the absolute values */
         for( j = 0, a1 = a; j < m1; j++, a1 += lda )
         {
             double t = a1[0];
             scale += fabs( hv[j] = t );
         }

         if( scale != 0 )
         {
             double f = 1./scale, g, s = 0;

             /* normalize hv by scale and compute its squared length */
             for( j = 0; j < m1; j++ )
             {
                 double t = (hv[j] *= f);
                 s += t * t;
             }

             /* g gets the sign opposite to hv[0] */
             g = sqrt( s );
             f = hv[0];
             if( f >= 0 )
                 g = -g;
             hv[0] = f - g;
             h = 1. / (f * g - s);

             memset( temp, 0, n1 * sizeof( temp[0] ));

             /* calc temp[0:n-i] = a[i:m,i:n]'*hv[0:m-i] */
             icvMatrAXPY_64f( m1, n1 - 1, a + 1, lda, hv, temp + 1, 0 );
             for( k = 1; k < n1; k++ ) temp[k] *= h;

             /* modify a: a[i:m,i:n] = a[i:m,i:n] + hv[0:m-i]*temp[0:n-i]' */
             icvMatrAXPY_64f( m1, n1 - 1, temp + 1, 0, hv, a + 1, lda );
             /* diagonal element of the bidiagonal form */
             *w1 = g*scale;
         }
         w1++;

         /* store -2/(hv'*hv) */
         if( update_u )
         {
             if( m1 == m )
                 ku0 = h;
             else
                 hv[-1] = h;
         }

         /* step one column to the right (vT moves down-right by one) */
         a++;
         n1--;
         if( vT )
             vT += ldvT + 1;

         if( n1 == 0 )
             break;

         /* ---- row step: reflection built from the first row of the
            remaining block ---- */
         scale = h = 0;
         update_v = vT && n1 > n - nv;

         hv = update_v ? vT : hv0;

         /* copy the row into hv and sum the absolute values */
         for( j = 0; j < n1; j++ )
         {
             double t = a[j];
             scale += fabs( hv[j] = t );
         }

         if( scale != 0 )
         {
             double f = 1./scale, g, s = 0;

             for( j = 0; j < n1; j++ )
             {
                 double t = (hv[j] *= f);
                 s += t * t;
             }

             g = sqrt( s );
             f = hv[0];
             if( f >= 0 )
                 g = -g;
             hv[0] = f - g;
             h = 1. / (f * g - s);
             /* icvMatrAXPY3_64f multiplies by hv[-1] when it writes the
                element in front of every processed row */
             hv[-1] = 0.;

             /* update a[i:m:i+1:n] = a[i:m,i+1:n] + (a[i:m,i+1:n]*hv[0:m-i])*... */
             icvMatrAXPY3_64f( m1, n1, hv, lda, a, h );

             /* off-diagonal element of the bidiagonal form */
             *e1 = g*scale;
         }
         e1++;

         /* store -2/(hv'*hv) */
         /* NOTE(review): n1 has already been decremented in this pass, so
            n1 == n looks unreachable here and kv0 stays 0 - confirm */
         if( update_v )
         {
             if( n1 == n )
                 kv0 = h;
             else
                 hv[-1] = h;
         }

         /* step one row down (uT moves down-right by one) */
         a += lda;
         m1--;
         if( uT )
             uT += lduT + 1;
     }

     /* whichever counter is still non-zero is reduced once more; m - m1 and
        n - n1 below are the numbers of leading uT / vT rows handled by the
        accumulation loops */
     m1 -= m1 != 0;
     n1 -= n1 != 0;

     /* accumulate left transformations */
     if( uT )
     {
         m1 = m - m1;
         uT = u0 + m1 * lduT;
         /* rows m1..nu-1: zero from column m1 on, 1 on the diagonal */
         for( i = m1; i < nu; i++, uT += lduT )
         {
             memset( uT + m1, 0, (m - m1) * sizeof( uT[0] ));
             uT[i] = 1.;
         }

         /* apply the stored reflections from the last one back to the first */
         for( i = m1 - 1; i >= 0; i-- )
         {
             double s;
             int lh = nu - i;

             l = m - i;

             /* the i-th vector starts at the diagonal element (i,i) */
             hv = u0 + (lduT + 1) * i;
             h = i == 0 ? ku0 : hv[-1];

             assert( h <= 0 );

             if( h != 0 )
             {
                 uT = hv;
                 icvMatrAXPY3_64f( lh, l-1, hv+1, lduT, uT+1, h );

                 s = hv[0] * h;
                 for( k = 0; k < l; k++ ) hv[k] *= s;
                 hv[0] += 1;
             }
             else
             {
                 /* h == 0: no reflection was stored, put the unit
                    row/column here */
                 for( j = 1; j < l; j++ )
                     hv[j] = 0;
                 for( j = 1; j < lh; j++ )
                     hv[j * lduT] = 0;
                 hv[0] = 1;
             }
         }
         uT = u0;
     }

     /* accumulate right transformations */
     if( vT )
     {
         n1 = n - n1;
         vT = v0 + n1 * ldvT;
         /* rows n1..nv-1: zero from column n1 on, 1 on the diagonal */
         for( i = n1; i < nv; i++, vT += ldvT )
         {
             memset( vT + n1, 0, (n - n1) * sizeof( vT[0] ));
             vT[i] = 1.;
         }

         /* apply the stored reflections from the last one back to the first */
         for( i = n1 - 1; i >= 0; i-- )
         {
             double s;
             int lh = nv - i;

             l = n - i;
             hv = v0 + (ldvT + 1) * i;
             h = i == 0 ? kv0 : hv[-1];

             assert( h <= 0 );

             if( h != 0 )
             {
                 vT = hv;
                 icvMatrAXPY3_64f( lh, l-1, hv+1, ldvT, vT+1, h );

                 s = hv[0] * h;
                 for( k = 0; k < l; k++ ) hv[k] *= s;
                 hv[0] += 1;
             }
             else
             {
                 /* h == 0: no reflection was stored, put the unit
                    row/column here */
                 for( j = 1; j < l; j++ )
                     hv[j] = 0;
                 for( j = 1; j < lh; j++ )
                     hv[j * ldvT] = 0;
                 hv[0] = 1;
             }
         }
         vT = v0;
     }

     /* anorm = max over i of |w[i]| + |e[i]|, scaled by the machine epsilon
        below; it is the "negligible" threshold of the iterations */
     for( i = 0; i < nm; i++ )
     {
         double tnorm = fabs( w[i] );
         tnorm += fabs( e[i] );

         if( anorm < tnorm )
             anorm = tnorm;
     }

     anorm *= DBL_EPSILON;

     /* diagonalization of the bidiagonal form */
     for( k = nm - 1; k >= 0; k-- )
     {
         double z = 0;
         iters = 0;

         for( ;; )               /* do iterations */
         {
             double c, s, f, g, x, y;
             int flag = 0;

             /* test for splitting */
             /* flag == 1: e[l] is negligible;
                flag == 0: w[l-1] is negligible and e[l..k] is cancelled
                below */
             for( l = k; l >= 0; l-- )
             {
                 if( fabs(e[l]) <= anorm )
                 {
                     flag = 1;
                     break;
                 }
                 assert( l > 0 );
                 if( fabs(w[l - 1]) <= anorm )
                     break;
             }

             if( !flag )
             {
                 c = 0;
                 s = 1;

                 /* cancel e[l..k] with rotations against row l-1 */
                 for( i = l; i <= k; i++ )
                 {
                     f = s * e[i];

                     e[i] *= c;

                     if( anorm + fabs( f ) == anorm )
                         break;

                     g = w[i];
                     h = pythag( f, g );
                     w[i] = h;
                     c = g / h;
                     s = -f / h;

                     if( uT )
                         icvGivens_64f( m, uT + lduT * (l - 1), uT + lduT * i, c, s );
                 }
             }

             z = w[k];
             /* stop when w[k] has been split off (l == k) or when the
                iteration limit is reached; in the latter case iters ends
                up as MAX_ITERS + 1 */
             if( l == k || iters++ == MAX_ITERS )
                 break;

             /* shift from bottom 2x2 minor */
             x = w[l];
             y = w[k - 1];
             g = e[k - 1];
             h = e[k];
             f = 0.5 * (((g + z) / h) * ((g - z) / y) + y / h - h / y);
             g = pythag( f, 1 );
             if( f < 0 )
                 g = -g;
             f = x - (z / x) * z + (h / x) * (y / (f + g) - h);
             /* next QR transformation */
             c = s = 1;

             for( i = l + 1; i <= k; i++ )
             {
                 g = e[i];
                 y = w[i];
                 h = s * g;
                 g *= c;
                 z = pythag( f, h );
                 e[i - 1] = z;
                 c = f / z;
                 s = h / z;
                 f = x * c + g * s;
                 g = -x * s + g * c;
                 h = y * s;
                 y *= c;

                 /* the same rotation is applied to rows i-1 and i of vT */
                 if( vT )
                     icvGivens_64f( n, vT + ldvT * (i - 1), vT + ldvT * i, c, s );

                 z = pythag( f, h );
                 w[i - 1] = z;

                 /* rotation can be arbitrary if z == 0 */
                 if( z != 0 )
                 {
                     c = f / z;
                     s = h / z;
                 }
                 f = c * g + s * y;
                 x = -s * g + c * y;

                 /* and the second rotation to rows i-1 and i of uT */
                 if( uT )
                     icvGivens_64f( m, uT + lduT * (i - 1), uT + lduT * i, c, s );
             }

             e[l] = 0;
             e[k] = f;
             w[k] = x;
         }                       /* end of iteration loop */

         /* iteration limit exceeded: the remaining values are left as they
            are, nothing is reported to the caller */
         if( iters > MAX_ITERS )
             break;

         /* make the value non-negative; row k of vT changes its sign
            accordingly */
         if( z < 0 )
         {
             w[k] = -z;
             if( vT )
             {
                 for( j = 0; j < n; j++ )
                     vT[j + k * ldvT] = -vT[j + k * ldvT];
             }
         }
     }                           /* end of diagonalization loop */

     /* sort singular values (descending, selection sort) together with the
        corresponding rows of vT and uT */
     for( i = 0; i < nm; i++ )
     {
         k = i;
         for( j = i + 1; j < nm; j++ )
             if( w[k] < w[j] )
                 k = j;

         if( k != i )
         {
             double t;
             CV_SWAP( w[i], w[k], t );

             if( vT )
                 for( j = 0; j < n; j++ )
                     CV_SWAP( vT[j + ldvT*k], vT[j + ldvT*i], t );

             if( uT )
                 for( j = 0; j < m; j++ )
                     CV_SWAP( uT[j + lduT*k], uT[j + lduT*i], t );
         }
     }
 }


 /* Singular value decomposition of a single-precision matrix, in place.
    Scalars are computed in double precision and rounded to float whenever
    they are stored.

    a, lda   - input matrix, m rows by n columns, consecutive rows are lda
               elements apart; its contents are overwritten.
    w        - output: n values, sorted in descending order at the end.
    uT, lduT - optional (may be 0).  The left transformations are
               accumulated here row-wise: nu rows of m elements, rows are
               lduT elements apart.
    nu       - number of rows of uT to produce.
    vT, ldvT - optional (may be 0).  The right transformations are
               accumulated here row-wise: n rows of n elements, rows are
               ldvT elements apart.
    buffer   - scratch space of at least 2*n elements: the first n hold the
               off-diagonal values e[], the following n a temporary row.

    Stages:
      1. reduce a to bidiagonal form with alternating column/row
         reflections (hv is the reflection vector, h = -2/(hv'*hv));
      2. rebuild uT and vT from the stored reflection vectors;
      3. diagonalize the bidiagonal matrix (diagonal w[], off-diagonal e[])
         with shifted QR iterations, at most MAX_ITERS per value;
      4. sort w[] and the corresponding rows of uT / vT.

    If some value does not converge within MAX_ITERS iterations, stage 3 is
    abandoned silently (no error is reported) and stage 4 still runs. */
 static void
 icvSVD_32f( float* a, int lda, int m, int n,
             float* w,
             float* uT, int lduT, int nu,
             float* vT, int ldvT,
             float* buffer )
 {
     float* e;
     float* temp;
     float *w1, *e1;
     float *hv;
     /* h of the very first left/right reflection; the other reflections keep
        their h in hv[-1], a slot that does not exist for the first one */
     double ku0 = 0, kv0 = 0;
     double anorm = 0;
     float *a1, *u0 = uT, *v0 = vT;
     double scale, h;
     int i, j, k, l;
     int nm, m1, n1;
     int nv = n;
     int iters = 0;
     /* scratch reflection vector, used when the vector is not stored in
        uT / vT; the "+ 1" makes hv0[-1] a valid element */
     float* hv0 = (float*)cvStackAlloc( (m+2)*sizeof(hv0[0])) + 1;

     e = buffer;

     w1 = w;
     /* e[0] is not written by the bidiagonalization and stays 0 */
     e1 = e + 1;
     nm = n;

     temp = buffer + nm;

     memset( w, 0, nm * sizeof( w[0] ));
     memset( e, 0, nm * sizeof( e[0] ));

     /* m1 x n1 is the size of the not yet reduced lower-right part of a */
     m1 = m;
     n1 = n;

     /* transform a to bi-diagonal form */
     for( ;; )
     {
         int update_u;
         int update_v;

         if( m1 == 0 )
             break;

         /* ---- column step: reflection built from the first column of the
            remaining block ---- */
         scale = h = 0;

         update_u = uT && m1 > m - nu;
         hv = update_u ? uT : hv0;

         /* copy the column into hv and sum the absolute values */
         for( j = 0, a1 = a; j < m1; j++, a1 += lda )
         {
             double t = a1[0];
             scale += fabs( hv[j] = (float)t );
         }

         if( scale != 0 )
         {
             double f = 1./scale, g, s = 0;

             /* normalize hv by scale and compute its squared length */
             for( j = 0; j < m1; j++ )
             {
                 double t = (hv[j] = (float)(hv[j]*f));
                 s += t * t;
             }

             /* g gets the sign opposite to hv[0] */
             g = sqrt( s );
             f = hv[0];
             if( f >= 0 )
                 g = -g;
             hv[0] = (float)(f - g);
             h = 1. / (f * g - s);

             memset( temp, 0, n1 * sizeof( temp[0] ));

             /* calc temp[0:n-i] = a[i:m,i:n]'*hv[0:m-i] */
             icvMatrAXPY_32f( m1, n1 - 1, a + 1, lda, hv, temp + 1, 0 );

             for( k = 1; k < n1; k++ ) temp[k] = (float)(temp[k]*h);

             /* modify a: a[i:m,i:n] = a[i:m,i:n] + hv[0:m-i]*temp[0:n-i]' */
             icvMatrAXPY_32f( m1, n1 - 1, temp + 1, 0, hv, a + 1, lda );
             /* diagonal element of the bidiagonal form */
             *w1 = (float)(g*scale);
         }
         w1++;

         /* store -2/(hv'*hv) */
         if( update_u )
         {
             if( m1 == m )
                 ku0 = h;
             else
                 hv[-1] = (float)h;
         }

         /* step one column to the right (vT moves down-right by one) */
         a++;
         n1--;
         if( vT )
             vT += ldvT + 1;

         if( n1 == 0 )
             break;

         /* ---- row step: reflection built from the first row of the
            remaining block ---- */
         scale = h = 0;
         update_v = vT && n1 > n - nv;
         hv = update_v ? vT : hv0;

         /* copy the row into hv and sum the absolute values */
         for( j = 0; j < n1; j++ )
         {
             double t = a[j];
             scale += fabs( hv[j] = (float)t );
         }

         if( scale != 0 )
         {
             double f = 1./scale, g, s = 0;

             for( j = 0; j < n1; j++ )
             {
                 double t = (hv[j] = (float)(hv[j]*f));
                 s += t * t;
             }

             g = sqrt( s );
             f = hv[0];
             if( f >= 0 )
                 g = -g;
             hv[0] = (float)(f - g);
             h = 1. / (f * g - s);
             /* icvMatrAXPY3_32f multiplies by hv[-1] when it writes the
                element in front of every processed row */
             hv[-1] = 0.f;

             /* update a[i:m:i+1:n] = a[i:m,i+1:n] + (a[i:m,i+1:n]*hv[0:m-i])*... */
             icvMatrAXPY3_32f( m1, n1, hv, lda, a, h );

             /* off-diagonal element of the bidiagonal form */
             *e1 = (float)(g*scale);
         }
         e1++;

         /* store -2/(hv'*hv) */
         /* NOTE(review): n1 has already been decremented in this pass, so
            n1 == n looks unreachable here and kv0 stays 0 - confirm */
         if( update_v )
         {
             if( n1 == n )
                 kv0 = h;
             else
                 hv[-1] = (float)h;
         }

         /* step one row down (uT moves down-right by one) */
         a += lda;
         m1--;
         if( uT )
             uT += lduT + 1;
     }

     /* whichever counter is still non-zero is reduced once more; m - m1 and
        n - n1 below are the numbers of leading uT / vT rows handled by the
        accumulation loops */
     m1 -= m1 != 0;
     n1 -= n1 != 0;

     /* accumulate left transformations */
     if( uT )
     {
         m1 = m - m1;
         uT = u0 + m1 * lduT;
         /* rows m1..nu-1: zero from column m1 on, 1 on the diagonal */
         for( i = m1; i < nu; i++, uT += lduT )
         {
             memset( uT + m1, 0, (m - m1) * sizeof( uT[0] ));
             uT[i] = 1.;
         }

         /* apply the stored reflections from the last one back to the first */
         for( i = m1 - 1; i >= 0; i-- )
         {
             double s;
             int lh = nu - i;

             l = m - i;

             /* the i-th vector starts at the diagonal element (i,i) */
             hv = u0 + (lduT + 1) * i;
             h = i == 0 ? ku0 : hv[-1];

             assert( h <= 0 );

             if( h != 0 )
             {
                 uT = hv;
                 icvMatrAXPY3_32f( lh, l-1, hv+1, lduT, uT+1, h );

                 s = hv[0] * h;
                 for( k = 0; k < l; k++ ) hv[k] = (float)(hv[k]*s);
                 hv[0] += 1;
             }
             else
             {
                 /* h == 0: no reflection was stored, put the unit
                    row/column here */
                 for( j = 1; j < l; j++ )
                     hv[j] = 0;
                 for( j = 1; j < lh; j++ )
                     hv[j * lduT] = 0;
                 hv[0] = 1;
             }
         }
         uT = u0;
     }

     /* accumulate right transformations */
     if( vT )
     {
         n1 = n - n1;
         vT = v0 + n1 * ldvT;
         /* rows n1..nv-1: zero from column n1 on, 1 on the diagonal */
         for( i = n1; i < nv; i++, vT += ldvT )
         {
             memset( vT + n1, 0, (n - n1) * sizeof( vT[0] ));
             vT[i] = 1.;
         }

         /* apply the stored reflections from the last one back to the first */
         for( i = n1 - 1; i >= 0; i-- )
         {
             double s;
             int lh = nv - i;

             l = n - i;
             hv = v0 + (ldvT + 1) * i;
             h = i == 0 ? kv0 : hv[-1];

             assert( h <= 0 );

             if( h != 0 )
             {
                 vT = hv;
                 icvMatrAXPY3_32f( lh, l-1, hv+1, ldvT, vT+1, h );

                 s = hv[0] * h;
                 for( k = 0; k < l; k++ ) hv[k] = (float)(hv[k]*s);
                 hv[0] += 1;
             }
             else
             {
                 /* h == 0: no reflection was stored, put the unit
                    row/column here */
                 for( j = 1; j < l; j++ )
                     hv[j] = 0;
                 for( j = 1; j < lh; j++ )
                     hv[j * ldvT] = 0;
                 hv[0] = 1;
             }
         }
         vT = v0;
     }

     /* anorm = max over i of |w[i]| + |e[i]|, scaled by the machine epsilon
        below; it is the "negligible" threshold of the iterations */
     for( i = 0; i < nm; i++ )
     {
         double tnorm = fabs( w[i] );
         tnorm += fabs( e[i] );

         if( anorm < tnorm )
             anorm = tnorm;
     }

     anorm *= FLT_EPSILON;

     /* diagonalization of the bidiagonal form */
     for( k = nm - 1; k >= 0; k-- )
     {
         double z = 0;
         iters = 0;

         for( ;; )               /* do iterations */
         {
             double c, s, f, g, x, y;
             int flag = 0;

             /* test for splitting */
             /* flag == 1: e[l] is negligible;
                flag == 0: w[l-1] is negligible and e[l..k] is cancelled
                below */
             for( l = k; l >= 0; l-- )
             {
                 if( fabs( e[l] ) <= anorm )
                 {
                     flag = 1;
                     break;
                 }
                 assert( l > 0 );
                 if( fabs( w[l - 1] ) <= anorm )
                     break;
             }

             if( !flag )
             {
                 c = 0;
                 s = 1;

                 /* cancel e[l..k] with rotations against row l-1 */
                 for( i = l; i <= k; i++ )
                 {
                     f = s * e[i];
                     e[i] = (float)(e[i]*c);

                     if( anorm + fabs( f ) == anorm )
                         break;

                     g = w[i];
                     h = pythag( f, g );
                     w[i] = (float)h;
                     c = g / h;
                     s = -f / h;

                     if( uT )
                         icvGivens_32f( m, uT + lduT * (l - 1), uT + lduT * i, c, s );
                 }
             }

             z = w[k];
             /* stop when w[k] has been split off (l == k) or when the
                iteration limit is reached; in the latter case iters ends
                up as MAX_ITERS + 1 */
             if( l == k || iters++ == MAX_ITERS )
                 break;

             /* shift from bottom 2x2 minor */
             x = w[l];
             y = w[k - 1];
             g = e[k - 1];
             h = e[k];
             f = 0.5 * (((g + z) / h) * ((g - z) / y) + y / h - h / y);
             g = pythag( f, 1 );
             if( f < 0 )
                 g = -g;
             f = x - (z / x) * z + (h / x) * (y / (f + g) - h);
             /* next QR transformation */
             c = s = 1;

             for( i = l + 1; i <= k; i++ )
             {
                 g = e[i];
                 y = w[i];
                 h = s * g;
                 g *= c;
                 z = pythag( f, h );
                 e[i - 1] = (float)z;
                 c = f / z;
                 s = h / z;
                 f = x * c + g * s;
                 g = -x * s + g * c;
                 h = y * s;
                 y *= c;

                 /* the same rotation is applied to rows i-1 and i of vT */
                 if( vT )
                     icvGivens_32f( n, vT + ldvT * (i - 1), vT + ldvT * i, c, s );

                 z = pythag( f, h );
                 w[i - 1] = (float)z;

                 /* rotation can be arbitrary if z == 0 */
                 if( z != 0 )
                 {
                     c = f / z;
                     s = h / z;
                 }
                 f = c * g + s * y;
                 x = -s * g + c * y;

                 /* and the second rotation to rows i-1 and i of uT */
                 if( uT )
                     icvGivens_32f( m, uT + lduT * (i - 1), uT + lduT * i, c, s );
             }

             e[l] = 0;
             e[k] = (float)f;
             w[k] = (float)x;
         }                       /* end of iteration loop */

         /* iteration limit exceeded: the remaining values are left as they
            are, nothing is reported to the caller */
         if( iters > MAX_ITERS )
             break;

         /* make the value non-negative; row k of vT changes its sign
            accordingly */
         if( z < 0 )
         {
             w[k] = (float)(-z);
             if( vT )
             {
                 for( j = 0; j < n; j++ )
                     vT[j + k * ldvT] = -vT[j + k * ldvT];
             }
         }
     }                           /* end of diagonalization loop */

     /* sort singular values (descending, selection sort) together with the
        corresponding rows of vT and uT */
     for( i = 0; i < nm; i++ )
     {
         k = i;
         for( j = i + 1; j < nm; j++ )
             if( w[k] < w[j] )
                 k = j;

         if( k != i )
         {
             float t;
             CV_SWAP( w[i], w[k], t );

             if( vT )
                 for( j = 0; j < n; j++ )
                     CV_SWAP( vT[j + ldvT*k], vT[j + ldvT*i], t );

             if( uT )
                 for( j = 0; j < m; j++ )
                     CV_SWAP( uT[j + lduT*k], uT[j + lduT*i], t );
         }
     }
 }


 /* Back substitution through a previously computed SVD (double precision):

        x = sum over i of  vT[i]' * (1/w[i]) * (uT[i] * b)

    taken over the first min(m,n) values w[i] that are greater than
    2*DBL_EPSILON*sum(w); the other terms are dropped.

    m, n     - uT rows have m elements, vT rows have n elements.
    w        - singular values (the first min(m,n) are read).
    uT, lduT - rows of uT, lduT elements apart.
    vT, ldvT - rows of vT, ldvT elements apart.
    b, ldb   - right-hand side, m rows of nb elements, rows ldb elements
               apart.  May be 0: then nb is replaced by m and the row uT[i]
               itself is used in place of uT[i]*b.
    nb       - number of columns of b and x.
    x, ldx   - output, n rows of nb elements, rows ldx elements apart; it is
               cleared first and then accumulated.
    buffer   - scratch space for nb elements, used only when nb != 1. */
 static void
 icvSVBkSb_64f( int m, int n, const double* w,
                const double* uT, int lduT,
                const double* vT, int ldvT,
                const double* b, int ldb, int nb,
                double* x, int ldx, double* buffer )
 {
     const int count = MIN( m, n );
     double cutoff = 0;
     int r, c;

     if( !b )
         nb = m;

     /* x is built up term by term, so clear its n rows first */
     for( r = 0; r < n; r++ )
         memset( x + r*ldx, 0, nb*sizeof(x[0]));

     for( r = 0; r < count; r++ )
         cutoff += w[r];
     cutoff *= 2*DBL_EPSILON;

     /* vT * inv(w) * uT * b */
     for( r = 0; r < count; r++, uT += lduT, vT += ldvT )
     {
         double inv_w = w[r];

         /* values that do not exceed the cutoff contribute nothing */
         if( !(inv_w > cutoff) )
             continue;

         inv_w = 1./inv_w;

         if( nb != 1 )
         {
             /* general case: buffer = (uT[r]*b)/w[r], then x += vT[r]'*buffer */
             if( !b )
             {
                 for( c = 0; c < nb; c++ )
                     buffer[c] = uT[c]*inv_w;
             }
             else
             {
                 memset( buffer, 0, nb*sizeof(buffer[0]));
                 icvMatrAXPY_64f( m, nb, b, ldb, uT, buffer, 0 );
                 for( c = 0; c < nb; c++ )
                     buffer[c] *= inv_w;
             }
             icvMatrAXPY_64f( n, nb, buffer, 0, vT, x, ldx );
         }
         else
         {
             /* single column: the projection uT[r]*b is a scalar */
             double proj = 0;

             if( !b )
                 proj = uT[0];
             else if( ldb != 1 )
             {
                 for( c = 0; c < m; c++ )
                     proj += uT[c]*b[c*ldb];
             }
             else
             {
                 for( c = 0; c <= m - 4; c += 4 )
                     proj += uT[c]*b[c] + uT[c+1]*b[c+1] + uT[c+2]*b[c+2] + uT[c+3]*b[c+3];
                 for( ; c < m; c++ )
                     proj += uT[c]*b[c];
             }
             proj *= inv_w;

             if( ldx != 1 )
             {
                 for( c = 0; c < n; c++ )
                     x[c*ldx] += proj*vT[c];
             }
             else
             {
                 for( c = 0; c <= n - 4; c += 4 )
                 {
                     double v0 = x[c] + proj*vT[c];
                     double v1 = x[c+1] + proj*vT[c+1];
                     x[c] = v0;
                     x[c+1] = v1;
                     v0 = x[c+2] + proj*vT[c+2];
                     v1 = x[c+3] + proj*vT[c+3];
                     x[c+2] = v0;
                     x[c+3] = v1;
                 }

                 for( ; c < n; c++ )
                     x[c] += proj*vT[c];
             }
         }
     }
 }


 /* Back substitution through a previously computed SVD (single precision
    storage; per-term scalars are kept in double and rounded on store):

        x = sum over i of  vT[i]' * (1/w[i]) * (uT[i] * b)

    taken over the first min(m,n) values w[i] that are greater than
    2*FLT_EPSILON*sum(w); the other terms are dropped.

    m, n     - uT rows have m elements, vT rows have n elements.
    w        - singular values (the first min(m,n) are read).
    uT, lduT - rows of uT, lduT elements apart.
    vT, ldvT - rows of vT, ldvT elements apart.
    b, ldb   - right-hand side, m rows of nb elements, rows ldb elements
               apart.  May be 0: then nb is replaced by m and the row uT[i]
               itself is used in place of uT[i]*b.
    nb       - number of columns of b and x.
    x, ldx   - output, n rows of nb elements, rows ldx elements apart; it is
               cleared first and then accumulated.
    buffer   - scratch space for nb elements, used only when nb != 1. */
 static void
 icvSVBkSb_32f( int m, int n, const float* w,
                const float* uT, int lduT,
                const float* vT, int ldvT,
                const float* b, int ldb, int nb,
                float* x, int ldx, float* buffer )
 {
     const int count = MIN( m, n );
     float cutoff = 0.f;
     int r, c;

     if( !b )
         nb = m;

     /* x is built up term by term, so clear its n rows first */
     for( r = 0; r < n; r++ )
         memset( x + r*ldx, 0, nb*sizeof(x[0]));

     for( r = 0; r < count; r++ )
         cutoff += w[r];
     cutoff *= 2*FLT_EPSILON;

     /* vT * inv(w) * uT * b */
     for( r = 0; r < count; r++, uT += lduT, vT += ldvT )
     {
         double inv_w = w[r];

         /* values that do not exceed the cutoff contribute nothing */
         if( !(inv_w > cutoff) )
             continue;

         inv_w = 1./inv_w;

         if( nb != 1 )
         {
             /* general case: buffer = (uT[r]*b)/w[r], then x += vT[r]'*buffer */
             if( !b )
             {
                 for( c = 0; c < nb; c++ )
                     buffer[c] = (float)(uT[c]*inv_w);
             }
             else
             {
                 memset( buffer, 0, nb*sizeof(buffer[0]));
                 icvMatrAXPY_32f( m, nb, b, ldb, uT, buffer, 0 );
                 for( c = 0; c < nb; c++ )
                     buffer[c] = (float)(buffer[c]*inv_w);
             }
             icvMatrAXPY_32f( n, nb, buffer, 0, vT, x, ldx );
         }
         else
         {
             /* single column: the projection uT[r]*b is a scalar */
             double proj = 0;

             if( !b )
                 proj = uT[0];
             else if( ldb != 1 )
             {
                 for( c = 0; c < m; c++ )
                     proj += uT[c]*b[c*ldb];
             }
             else
             {
                 for( c = 0; c <= m - 4; c += 4 )
                     proj += uT[c]*b[c] + uT[c+1]*b[c+1] + uT[c+2]*b[c+2] + uT[c+3]*b[c+3];
                 for( ; c < m; c++ )
                     proj += uT[c]*b[c];
             }
             proj *= inv_w;

             if( ldx != 1 )
             {
                 for( c = 0; c < n; c++ )
                     x[c*ldx] = (float)(x[c*ldx] + proj*vT[c]);
             }
             else
             {
                 for( c = 0; c <= n - 4; c += 4 )
                 {
                     double v0 = x[c] + proj*vT[c];
                     double v1 = x[c+1] + proj*vT[c+1];
                     x[c] = (float)v0;
                     x[c+1] = (float)v1;
                     v0 = x[c+2] + proj*vT[c+2];
                     v1 = x[c+3] + proj*vT[c+3];
                     x[c+2] = (float)v0;
                     x[c+3] = (float)v1;
                 }

                 for( ; c < n; c++ )
                     x[c] = (float)(x[c] + proj*vT[c]);
             }
         }
     }
 }


 /*
  * cvSVD: singular value decomposition front-end for CV_32FC1 / CV_64FC1 matrices.
  *
  *   aarr  - input matrix A. Unless CV_SVD_MODIFY_A is set it is copied into
  *           scratch memory before the kernel runs.
  *   warr  - output for the singular values. Must have the same type as A and
  *           be either a vector or a matrix; for a matrix the values are stored
  *           on its diagonal and the remaining elements are zeroed.
  *   uarr  - optional U output (may be NULL), same type as A.
  *   varr  - optional V output (may be NULL), same type as A.
  *   flags - combination of CV_SVD_MODIFY_A, CV_SVD_U_T and CV_SVD_V_T.
  *
  * With m = max(rows, cols) and n = min(rows, cols) of A, the kernel always
  * works on an m x n matrix. When A has fewer rows than columns, A is
  * transposed into scratch memory, U and V (and their *_T flags) swap roles,
  * and CV_SVD_MODIFY_A is ignored.
  *
  * Errors are reported through CV_ERROR / CV_CALL: mismatching element types,
  * wrongly sized U / V / W, and element types other than CV_32FC1 / CV_64FC1.
  */
 CV_IMPL  void
 cvSVD( CvArr* aarr, CvArr* warr, CvArr* uarr, CvArr* varr, int flags )
 {
     uchar* buffer = 0;
     int local_alloc = 0;

     CV_FUNCNAME( "cvSVD" );

     __BEGIN__;

     CvMat astub, *a = (CvMat*)aarr;
     CvMat wstub, *w = (CvMat*)warr;
     CvMat ustub, *u;
     CvMat vstub, *v;
     CvMat tmat;
     uchar* tw = 0;
     int type;
     int a_buf_offset = 0, u_buf_offset = 0, buf_size, pix_size;
     int temp_u = 0, /* temporary storage for U is needed */
         t_svd; /* special case: a->rows < a->cols */
     int m, n;
     int w_rows, w_cols;
     int u_rows = 0, u_cols = 0;
     int w_is_mat = 0;

     if( !CV_IS_MAT( a ))
         CV_CALL( a = cvGetMat( a, &astub ));

     if( !CV_IS_MAT( w ))
         CV_CALL( w = cvGetMat( w, &wstub ));

     if( !CV_ARE_TYPES_EQ( a, w ))
         CV_ERROR( CV_StsUnmatchedFormats, "" );

     if( a->rows >= a->cols )
     {
         m = a->rows;
         n = a->cols;
         w_rows = w->rows;
         w_cols = w->cols;
         t_svd = 0;
     }
     else
     {
         /* Wide matrix: decompose the transpose instead, so U and V
            (and the corresponding transposition flags) exchange roles.
            CV_SVD_MODIFY_A is intentionally not carried over: the
            transposed copy of A always lives in scratch memory. */
         CvArr* t;
         CV_SWAP( uarr, varr, t );

         flags = (flags & CV_SVD_U_T ? CV_SVD_V_T : 0)|
                 (flags & CV_SVD_V_T ? CV_SVD_U_T : 0);
         m = a->cols;
         n = a->rows;
         w_rows = w->cols;
         w_cols = w->rows;
         t_svd = 1;
     }

     u = (CvMat*)uarr;
     v = (CvMat*)varr;

     w_is_mat = w_cols > 1 && w_rows > 1;

     /* A continuous vector of exactly n elements can receive the singular
        values directly from the kernel; anything else is filled afterwards. */
     if( !w_is_mat && CV_IS_MAT_CONT(w->type) && w_cols + w_rows - 1 == n )
         tw = w->data.ptr;

     if( u )
     {
         if( !CV_IS_MAT( u ))
             CV_CALL( u = cvGetMat( u, &ustub ));

         if( !(flags & CV_SVD_U_T) )
         {
             u_rows = u->rows;
             u_cols = u->cols;
         }
         else
         {
             u_rows = u->cols;
             u_cols = u->rows;
         }

         if( !CV_ARE_TYPES_EQ( a, u ))
             CV_ERROR( CV_StsUnmatchedFormats, "" );

         if( u_rows != m || (u_cols != m && u_cols != n))
             CV_ERROR( CV_StsUnmatchedSizes, !t_svd ? "U matrix has unappropriate size" :
                                                      "V matrix has unappropriate size" );

         /* U needs a temporary when it cannot be transposed in place
            (non-square, non-transposed output) or when it shares storage with A. */
         temp_u = (u_rows != u_cols && !(flags & CV_SVD_U_T)) || u->data.ptr==a->data.ptr;

         if( w_is_mat && u_cols != w_rows )
             CV_ERROR( CV_StsUnmatchedSizes, !t_svd ? "U and W have incompatible sizes" :
                                                      "V and W have incompatible sizes" );
     }
     else
     {
         /* U not requested: hand the kernel a header with a null data pointer. */
         u = &ustub;
         u->data.ptr = 0;
         u->step = 0;
     }

     if( v )
     {
         int v_rows, v_cols;

         if( !CV_IS_MAT( v ))
             CV_CALL( v = cvGetMat( v, &vstub ));

         if( !(flags & CV_SVD_V_T) )
         {
             v_rows = v->rows;
             v_cols = v->cols;
         }
         else
         {
             v_rows = v->cols;
             v_cols = v->rows;
         }

         if( !CV_ARE_TYPES_EQ( a, v ))
             CV_ERROR( CV_StsUnmatchedFormats, "" );

         if( v_rows != n || v_cols != n )
             CV_ERROR( CV_StsUnmatchedSizes, t_svd ? "U matrix has unappropriate size" :
                                                     "V matrix has unappropriate size" );

         if( w_is_mat && w_cols != v_cols )
             CV_ERROR( CV_StsUnmatchedSizes, t_svd ? "U and W have incompatible sizes" :
                                                     "V and W have incompatible sizes" );
     }
     else
     {
         /* V not requested: hand the kernel a header with a null data pointer. */
         v = &vstub;
         v->data.ptr = 0;
         v->step = 0;
     }

     /* The write-back step below stores n values along W's diagonal (matrix)
        or along its single row/column (vector). Reject a W that cannot hold
        them; without this check the write-back would run past the end of W,
        e.g. when neither U nor V is supplied to constrain W's size. */
     if( w_is_mat ? (w_rows < n || w_cols < n) : (w_cols + w_rows - 1 < n) )
         CV_ERROR( CV_StsUnmatchedSizes, "W is too small to hold all the singular values" );

     type = CV_MAT_TYPE( a->type );

     /* Fail before allocating or copying anything for unsupported types. */
     if( type != CV_32FC1 && type != CV_64FC1 )
         CV_ERROR( CV_StsUnsupportedFormat, "" );

     pix_size = CV_ELEM_SIZE(type);

     /* Scratch layout (in elements): [0, n*2 + m) work area whose last n
        elements double as singular-value storage, then an optional copy of A,
        then an optional temporary U. */
     buf_size = n*2 + m;

     if( !(flags & CV_SVD_MODIFY_A) )
     {
         a_buf_offset = buf_size;
         buf_size += a->rows*a->cols;
     }

     if( temp_u )
     {
         u_buf_offset = buf_size;
         buf_size += u->rows*u->cols;
     }

     buf_size *= pix_size;

     if( buf_size <= CV_MAX_LOCAL_SIZE )
     {
         buffer = (uchar*)cvStackAlloc( buf_size );
         local_alloc = 1;
     }
     else
     {
         CV_CALL( buffer = (uchar*)cvAlloc( buf_size ));
     }

     if( !(flags & CV_SVD_MODIFY_A) )
     {
         cvInitMatHeader( &tmat, m, n, type,
                          buffer + a_buf_offset*pix_size );
         if( !t_svd )
             cvCopy( a, &tmat );
         else
             cvT( a, &tmat );
         a = &tmat;
     }

     if( temp_u )
     {
         cvInitMatHeader( &ustub, u_cols, u_rows, type, buffer + u_buf_offset*pix_size );
         u = &ustub;
     }

     if( !tw )
         tw = buffer + (n + m)*pix_size;

     if( type == CV_32FC1 )
     {
         icvSVD_32f( a->data.fl, a->step/sizeof(float), a->rows, a->cols,
                    (float*)tw, u->data.fl, u->step/sizeof(float), u_cols,
                    v->data.fl, v->step/sizeof(float), (float*)buffer );
     }
     else /* CV_64FC1, guaranteed by the type check above */
     {
         icvSVD_64f( a->data.db, a->step/sizeof(double), a->rows, a->cols,
                     (double*)tw, u->data.db, u->step/sizeof(double), u_cols,
                     v->data.db, v->step/sizeof(double), (double*)buffer );
     }

     /* Singular values were produced in scratch memory: clear W and scatter
        them along its diagonal (shift == 1) or down its single column. */
     if( tw != w->data.ptr )
     {
         int shift = w->cols != 1;
         cvSetZero( w );
         if( type == CV_32FC1 )
             for( int i = 0; i < n; i++ )
                 ((float*)(w->data.ptr + i*w->step))[i*shift] = ((float*)tw)[i];
         else
             for( int i = 0; i < n; i++ )
                 ((double*)(w->data.ptr + i*w->step))[i*shift] = ((double*)tw)[i];
     }

     /* The kernel result is transposed into the caller's matrix unless the
        caller asked for the transposed form; in that case only a temporary
        needs to be copied back. */
     if( uarr )
     {
         if( !(flags & CV_SVD_U_T))
             cvT( u, uarr );
         else if( temp_u )
             cvCopy( u, uarr );
         /*CV_CHECK_NANS( uarr );*/
     }

     if( varr )
     {
         if( !(flags & CV_SVD_V_T))
             cvT( v, varr );
         /*CV_CHECK_NANS( varr );*/
     }

     CV_CHECK_NANS( w );

     __END__;

     /* Stack-allocated scratch is released automatically; only heap memory is freed. */
     if( buffer && !local_alloc )
         cvFree( &buffer );
 }


 /*
  * cvSVBkSb: back-substitution front-end that combines a previously computed
  * decomposition (w, u, v) with an optional right-hand side b and stores the
  * result in x. The arithmetic is delegated to icvSVBkSb_32f / icvSVBkSb_64f.
  *
  *   warr  - singular values: a vector of MIN(m,n) elements, or a matrix
  *           sized to match U and V whose diagonal is used.
  *   uarr  - U matrix (m rows); pass CV_SVD_U_T in flags if it is stored transposed.
  *   varr  - V matrix (n rows); pass CV_SVD_V_T in flags if it is stored transposed.
  *   barr  - optional right-hand side with m rows and x->cols columns; may be
  *           NULL, in which case x must be n x m.
  *   xarr  - output matrix with n rows.
  *   flags - combination of CV_SVD_U_T and CV_SVD_V_T.
  *
  * All matrices must share one element type (CV_32FC1 or CV_64FC1). Errors
  * are reported through CV_ERROR / CV_CALL.
  */
 CV_IMPL void
 cvSVBkSb( const CvArr* warr, const CvArr* uarr,
           const CvArr* varr, const CvArr* barr,
           CvArr* xarr, int flags )
 {
     uchar* buffer = 0;
     int local_alloc = 0;

     CV_FUNCNAME( "cvSVBkSb" );

     __BEGIN__;

     CvMat wstub, *w = (CvMat*)warr;
     CvMat bstub, *b = (CvMat*)barr;
     CvMat xstub, *x = (CvMat*)xarr;
     CvMat ustub, ustub2, *u = (CvMat*)uarr;
     CvMat vstub, vstub2, *v = (CvMat*)varr;
     uchar* tw = 0;
     int type;
     int temp_u = 0, temp_v = 0;
     int u_buf_offset = 0, v_buf_offset = 0, w_buf_offset = 0, t_buf_offset = 0;
     int buf_size = 0, pix_size;
     int m, n, nm;
     int u_rows, u_cols;
     int v_rows, v_cols;

     if( !CV_IS_MAT( w ))
         CV_CALL( w = cvGetMat( w, &wstub ));

     if( !CV_IS_MAT( u ))
         CV_CALL( u = cvGetMat( u, &ustub ));

     if( !CV_IS_MAT( v ))
         CV_CALL( v = cvGetMat( v, &vstub ));

     if( !CV_IS_MAT( x ))
         CV_CALL( x = cvGetMat( x, &xstub ));

     if( !CV_ARE_TYPES_EQ( w, u ) || !CV_ARE_TYPES_EQ( w, v ) || !CV_ARE_TYPES_EQ( w, x ))
         CV_ERROR( CV_StsUnmatchedFormats, "All matrices must have the same type" );

     type = CV_MAT_TYPE( w->type );
     pix_size = CV_ELEM_SIZE(type);

     /* The kernels consume U and V in transposed form; a matrix supplied
        non-transposed gets a transposed copy in the scratch buffer.
        Offsets and sizes in this function are in bytes. */
     if( !(flags & CV_SVD_U_T) )
     {
         temp_u = 1;
         u_buf_offset = buf_size;
         buf_size += u->cols*u->rows*pix_size;
         u_rows = u->rows;
         u_cols = u->cols;
     }
     else
     {
         u_rows = u->cols;
         u_cols = u->rows;
     }

     if( !(flags & CV_SVD_V_T) )
     {
         temp_v = 1;
         v_buf_offset = buf_size;
         buf_size += v->cols*v->rows*pix_size;
         v_rows = v->rows;
         v_cols = v->cols;
     }
     else
     {
         v_rows = v->cols;
         v_cols = v->rows;
     }

     m = u_rows;
     n = v_rows;
     nm = MIN(n,m);

     if( u_rows != u_cols && v_rows != v_cols )
         CV_ERROR( CV_StsBadSize, "V or U matrix must be square" );

     /* Reported separately so the message names the actual problem. */
     if( x->rows != v_rows )
         CV_ERROR( CV_StsBadSize, "x matrix must have as many rows as V" );

     if( (w->rows == 1 || w->cols == 1) && w->rows + w->cols - 1 == nm )
     {
         /* 1d W: use it in place when continuous, otherwise gather it below. */
         if( CV_IS_MAT_CONT(w->type) )
             tw = w->data.ptr;
         else
         {
             w_buf_offset = buf_size;
             buf_size += nm*pix_size;
         }
     }
     else
     {
         if( w->cols != v_cols || w->rows != u_cols )
             CV_ERROR( CV_StsBadSize, "W must be 1d array of MIN(m,n) elements or "
                                     "matrix which size matches to U and V" );
         /* 2d W: its diagonal is gathered into the scratch buffer below. */
         w_buf_offset = buf_size;
         buf_size += nm*pix_size;
     }

     if( b )
     {
         if( !CV_IS_MAT( b ))
             CV_CALL( b = cvGetMat( b, &bstub ));
         if( !CV_ARE_TYPES_EQ( w, b ))
             CV_ERROR( CV_StsUnmatchedFormats, "All matrices must have the same type" );
         if( b->cols != x->cols || b->rows != m )
             CV_ERROR( CV_StsUnmatchedSizes, "b matrix must have (m x x->cols) size" );
     }
     else
     {
         /* With no right-hand side the kernel fills m columns of x (it acts
            as if b were the m x m identity), so x must be n x m; a narrower
            x would be overrun. */
         if( x->cols != m )
             CV_ERROR( CV_StsUnmatchedSizes, "x matrix must have (n x m) size when b is omitted" );

         /* Zeroed header: null data pointer, zero step and zero columns. */
         b = &bstub;
         memset( b, 0, sizeof(*b));
     }

     /* Fail before allocating or transposing anything for unsupported types. */
     if( type != CV_32FC1 && type != CV_64FC1 )
         CV_ERROR( CV_StsUnsupportedFormat, "" );

     /* Kernel work area. */
     t_buf_offset = buf_size;
     buf_size += (MAX(m,n) + b->cols)*pix_size;

     if( buf_size <= CV_MAX_LOCAL_SIZE )
     {
         buffer = (uchar*)cvStackAlloc( buf_size );
         local_alloc = 1;
     }
     else
         CV_CALL( buffer = (uchar*)cvAlloc( buf_size ));

     if( temp_u )
     {
         cvInitMatHeader( &ustub2, u_cols, u_rows, type, buffer + u_buf_offset );
         cvT( u, &ustub2 );
         u = &ustub2;
     }

     if( temp_v )
     {
         cvInitMatHeader( &vstub2, v_cols, v_rows, type, buffer + v_buf_offset );
         cvT( v, &vstub2 );
         v = &vstub2;
     }

     if( !tw )
     {
         /* Gather the nm singular values into contiguous storage: step by one
            row per element, plus one column when W has more than one column
            (i.e. walk the diagonal). */
         int i, shift = w->cols > 1 ? pix_size : 0;
         tw = buffer + w_buf_offset;
         for( i = 0; i < nm; i++ )
             memcpy( tw + i*pix_size, w->data.ptr + i*(w->step + shift), pix_size );
     }

     if( type == CV_32FC1 )
     {
         icvSVBkSb_32f( m, n, (float*)tw, u->data.fl, u->step/sizeof(float),
                        v->data.fl, v->step/sizeof(float),
                        b->data.fl, b->step/sizeof(float), b->cols,
                        x->data.fl, x->step/sizeof(float),
                        (float*)(buffer + t_buf_offset) );
     }
     else /* CV_64FC1, guaranteed by the type check above */
     {
         icvSVBkSb_64f( m, n, (double*)tw, u->data.db, u->step/sizeof(double),
                        v->data.db, v->step/sizeof(double),
                        b->data.db, b->step/sizeof(double), b->cols,
                        x->data.db, x->step/sizeof(double),
                        (double*)(buffer + t_buf_offset) );
     }

     __END__;

     /* Stack-allocated scratch is released automatically; only heap memory is freed. */
     if( buffer && !local_alloc )
         cvFree( &buffer );
 }

 /* End of file. */