vpx_scale/generic/bicubic_scaler.c - platform/external/libvpx - Git at Google

 /*
  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */


 #include <float.h>
 #include <math.h>
 #include <stdio.h>
 #include "vpx_mem/vpx_mem.h"
 #include "vpxscale_arbitrary.h"

 // Selects the integer coefficient path (c0_fixed..c3_fixed).  When this is
 //  not defined the float C0..C3 functions are compiled instead.
 #define FIXED_POINT

 // Size limits.  MAX_OUT_DIMENSION (the larger of the two output limits)
 //  sizes the scratch row buffers declared below.
 #define MAX_IN_WIDTH        800
 #define MAX_IN_HEIGHT       600
 #define MAX_OUT_WIDTH       800
 #define MAX_OUT_HEIGHT      600
 #define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                              MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

 // Scaler state (dimensions, lookup tables, scratch pointers) shared by the
 //  functions in this file.
 BICUBIC_SCALER_STRUCT g_b_scaler;
 // Nonzero until bicubic_coefficient_init() has zeroed g_b_scaler.
 static int g_first_time = 1;

 // Scratch row; bicubic_coefficient_setup() stores its address in
 //  g_b_scaler.hbuf and bicubic_scale() writes the vertically filtered line
 //  into it.
 // NOTE(review): the DATA_SECTION / DATA_ALIGN pragmas look like
 //  toolchain-specific placement and alignment directives -- confirm
 //  against the target compiler.
 #pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
 #pragma DATA_ALIGN (g_hbuf, 32);
 unsigned char g_hbuf[MAX_OUT_DIMENSION];

 // Second scratch row; its address is stored in g_b_scaler.hbuf_uv.
 #pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
 #pragma DATA_ALIGN (g_hbuf_uv, 32);
 unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


 // Cubic sharpness parameter.  The fixed-point build keeps the magnitude
 //  0.6 scaled by 65536 (Q16); the float build uses a = -0.6 directly.
 #ifdef FIXED_POINT
 static int a_i = 0.6 * 65536;
 #else
 static float a = -0.6;
 #endif

 #ifdef FIXED_POINT
 //         3     2
 // C0 = a*t - a*t
 //
 static short c0_fixed(unsigned int t)
 {
     // Build a*t, a*t^2 and a*t^3 one multiply at a time.  Each product is
     //  shifted back down by 16 and stored in 16 bits before the next step.
     //  t is taken to be a Q16 fraction.
     const unsigned short a_t1 = (a_i * t) >> 16;
     const unsigned short a_t2 = (a_t1 * t) >> 16;
     const unsigned short a_t3 = (a_t2 * t) >> 16;

     // Drop four fractional bits (Q16 -> Q12) and flip the sign.
     return -((a_t2 - a_t3) >> 4);
 }

 //                     2          3
 // C1 = a*t + (3-2*a)*t  - (2-a)*t
 //
 static short c1_fixed(unsigned int t)
 {
     const unsigned short two = 2 << 13;     // 2.0 in Q13
     const unsigned short three = 3 << 13;   // 3.0 in Q13
     unsigned short linear, quadratic, cubic;
     int pass;

     // a*t, Q16
     linear = (a_i * t) >> 16;

     // (2 - a) in Q13, then multiplied by t three times
     cubic = two - (a_i >> 3);

     for (pass = 0; pass < 3; pass++)
         cubic = (cubic * t) >> 16;

     // (3 - 2*a) in Q13, then multiplied by t twice
     quadratic = three - (2 * (a_i >> 3));

     for (pass = 0; pass < 2; pass++)
         quadratic = (quadratic * t) >> 16;

     // Bring the linear term to Q13, combine, and halve to reach Q12.
     return (((linear >> 3) - cubic + quadratic) >> 1);
 }

 //                 2          3
 // C2 = 1 - (3-a)*t  + (2-a)*t
 //
 static short c2_fixed(unsigned int t)
 {
     const unsigned short one = 1 << 13;     // 1.0 in Q13
     const unsigned short two = 2 << 13;     // 2.0 in Q13
     const unsigned short three = 3 << 13;   // 3.0 in Q13
     unsigned short quadratic, cubic;
     int pass;

     // (3 - a) in Q13, then multiplied by t twice
     quadratic = three - (a_i >> 3);

     for (pass = 0; pass < 2; pass++)
         quadratic = (quadratic * t) >> 16;

     // (2 - a) in Q13, then multiplied by t three times
     cubic = two - (a_i >> 3);

     for (pass = 0; pass < 3; pass++)
         cubic = (cubic * t) >> 16;

     // Combine in Q13 and halve to reach Q12.
     return (one - quadratic + cubic) >> 1;
 }

 //                 2      3
 // C3 = a*t - 2*a*t  + a*t
 //
 static short c3_fixed(unsigned int t)
 {
     int linear, quadratic, cubic;

     // a*t, Q16
     linear = (a_i * t) >> 16;

     // 2*a held in Q15, then multiplied by t twice
     quadratic = 2 * (a_i >> 1);
     quadratic = (quadratic * t) >> 16;
     quadratic = (quadratic * t) >> 16;

     // a*t^3, Q16: start from a*t and multiply by t twice more
     cubic = linear;
     cubic = (cubic * t) >> 16;
     cubic = (cubic * t) >> 16;

     // Align the Q16 terms with the Q15 one, combine, then shift to Q12.
     return ((quadratic - (linear >> 1) - (cubic >> 1)) >> 3);
 }
 #else
 //          3     2
 // C0 = -a*t + a*t
 //
 // Float form of the C0 weight, -a*t^3 + a*t^2, using the file-level
 //  parameter a.  Only compiled when FIXED_POINT is not defined.
 float C0(float t)
 {
     return -a * t * t * t + a * t * t;
 }

 //                      2          3
 // C1 = -a*t + (2*a+3)*t  - (a+2)*t
 //
 // Float form of the C1 weight, -(a+2)*t^3 + (2*a+3)*t^2 - a*t, using the
 //  file-level parameter a.  Only compiled when FIXED_POINT is not defined.
 float C1(float t)
 {
     return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
 }

 //                 2          3
 // C2 = 1 - (a+3)*t  + (a+2)*t
 //
 // Float form of the C2 weight, (a+2)*t^3 - (a+3)*t^2 + 1, using the
 //  file-level parameter a.  Only compiled when FIXED_POINT is not defined.
 float C2(float t)
 {
     return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
 }

 //                 2      3
 // C3 = a*t - 2*a*t  + a*t
 //
 // Float form of the C3 weight, a*t^3 - 2*a*t^2 + a*t, using the
 //  file-level parameter a.  Only compiled when FIXED_POINT is not defined.
 float C3(float t)
 {
     return a * t * t * t - 2.0f * a * t * t + a * t;
 }
 #endif

 #if 0
 // Self-check that compares the fixed-point weights against the float ones
 //  at 10000 evenly spaced phases and returns the number of mismatches.
 //  A mismatch is counted when the magnitudes differ by more than 1 (in
 //  units of 1/4096), or when the four weights of a phase do not sum to
 //  roughly 4096.
 // NOTE(review): this sits inside "#if 0" and calls both the c?_fixed and
 //  the C? functions, which live in opposite branches of the FIXED_POINT
 //  #ifdef, so it cannot be enabled without also building both sets.
 int compare_real_fixed()
 {
     int i, errors = 0;
     float mult = 1.0 / 10000.0;
     // step size scaled by 2^32 so that (fixed_mult * i) >> 16 is in Q16
     unsigned int fixed_mult = mult * 4294967296;//65536;
     unsigned int phase_offset_int;
     float phase_offset_real;

     for (i = 0; i < 10000; i++)
     {
         int fixed0, fixed1, fixed2, fixed3, fixed_total;
         int real0, real1, real2, real3, real_total;

         // the same phase as a float and as a Q16 integer
         phase_offset_real = (float)i * mult;
         phase_offset_int = (fixed_mult * i) >> 16;
 //      phase_offset_int = phase_offset_real * 65536;

         fixed0 = c0_fixed(phase_offset_int);
         real0 = C0(phase_offset_real) * 4096.0;

         if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
             errors++;

         fixed1 = c1_fixed(phase_offset_int);
         real1 = C1(phase_offset_real) * 4096.0;

         if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
             errors++;

         fixed2 = c2_fixed(phase_offset_int);
         real2 = C2(phase_offset_real) * 4096.0;

         if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
             errors++;

         fixed3 = c3_fixed(phase_offset_int);
         real3 = C3(phase_offset_real) * 4096.0;

         if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
             errors++;

         // the four weights of one phase should add up to about 1.0 (4096)
         fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
         real_total = real0 + real1 + real2 + real3;

         if ((fixed_total > 4097) || (fixed_total < 4094))
             errors ++;

         if ((real_total > 4097) || (real_total < 4095))
             errors ++;
     }

     return errors;
 }
 #endif

 // Find the greatest common divisor of two integers.  The method used here
 //  is slow compared to the division-based Euclidean algorithm, but it does
 //  not require any division.
 int gcd(int a, int b)
 {
     // The subtraction loop below never terminates when either value is
     //  zero (or negative).  Returning 0 would make callers that divide by
     //  the result fail, so such inputs yield 1 instead.
     if (a <= 0 || b <= 0)
         return 1;

     // Keep taking the smaller value away from the larger one until the
     //  two meet; the value they meet at is the common divisor.
     while (a != b)
     {
         if (a > b)
             a -= b;
         else
             b -= a;
     }

     return a;
 }

 // Clear the shared scaler state to all zero bytes and record that the
 //  one-time initialisation has been done.
 void bicubic_coefficient_init()
 {
     g_first_time = 0;
     vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
 }

 // Release every table owned by the shared scaler state and zero the state.
 //  Does nothing if the state was never initialised.
 void bicubic_coefficient_destroy()
 {
     if (g_first_time)
         return;

     // index tables
     if (g_b_scaler.l_w)
     {
         vpx_free(g_b_scaler.l_w);
     }

     if (g_b_scaler.l_h)
     {
         vpx_free(g_b_scaler.l_h);
     }

     if (g_b_scaler.l_h_uv)
     {
         vpx_free(g_b_scaler.l_h_uv);
     }

     // coefficient tables
     if (g_b_scaler.c_w)
     {
         vpx_free(g_b_scaler.c_w);
     }

     if (g_b_scaler.c_h)
     {
         vpx_free(g_b_scaler.c_h);
     }

     if (g_b_scaler.c_h_uv)
     {
         vpx_free(g_b_scaler.c_h_uv);
     }

     // Leaves all pointers null and all cached dimensions zero.
     vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
 }

 // Create the coefficients that will be used for the cubic interpolation.
 //  Because scaling does not have to be equal in the vertical and horizontal
 //  directions, the phase offsets will be different.  There are 4
 //  coefficients for each point, two on each side.  The array holds the
 //  4 coefficients for one phase, followed by those for the next phase.
 // Free every table held in g_b_scaler and null the pointers so that a
 //  later free or use cannot touch stale memory.
 static void bicubic_release_tables(void)
 {
     if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);

     if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);

     if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);

     if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);

     if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);

     if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);

     g_b_scaler.l_w = NULL;
     g_b_scaler.l_h = NULL;
     g_b_scaler.l_h_uv = NULL;
     g_b_scaler.c_w = NULL;
     g_b_scaler.c_h = NULL;
     g_b_scaler.c_h_uv = NULL;
 }

 // Fill taps[0 .. 4*n-1] with the four Q12 weights of each of the n phases
 //  of an n:d (output:input) ratio.  Phase i sits at the fractional position
 //  ((i * d) mod n) / n.  With reversed == 0 the weights are stored in the
 //  order c0,c1,c2,c3; otherwise in the order c3,c2,c1,c0.
 static void bicubic_fill_taps(short *taps, int n, int d, int reversed)
 {
     int i;
 #ifdef FIXED_POINT
     unsigned int fixed_mult = 0;
     int product_val = 0;

     // 2^32 / n as an integer.  For n == 1 that value does not fit in 32
     //  bits, and it is not needed either: the only phase is 0.
     if (n > 1)
         fixed_mult = (1.0 / (float)n) * 4294967296;

 #endif

     for (i = 0; i < n; i++)
     {
         short k0, k1, k2, k3;
 #ifdef FIXED_POINT
         // Q16 fraction for this phase
         int phase_offset_int = (fixed_mult * product_val) >> 16;

         k0 = c0_fixed(phase_offset_int);
         k1 = c1_fixed(phase_offset_int);
         k2 = c2_fixed(phase_offset_int);
         k3 = c3_fixed(phase_offset_int);

         // Step to the next phase and wrap back into [0, n).  A loop is
         //  used because d can exceed 2*n when shrinking, where a single
         //  subtraction would leave a fraction of 1.0 or more.  Repeated
         //  subtraction also avoids a division.
         product_val += d;

         while (product_val >= n)
             product_val -= n;

 #else
         float phase_offset = (float)((i * d) % n) / (float)n;

         k0 = (C0(phase_offset) * 4096.0);
         k1 = (C1(phase_offset) * 4096.0);
         k2 = (C2(phase_offset) * 4096.0);
         k3 = (C3(phase_offset) * 4096.0);
 #endif

         if (reversed)
         {
             taps[i*4]   = k3;
             taps[i*4+1] = k2;
             taps[i*4+2] = k1;
             taps[i*4+3] = k0;
         }
         else
         {
             taps[i*4]   = k0;
             taps[i*4+1] = k1;
             taps[i*4+2] = k2;
             taps[i*4+3] = k3;
         }
     }
 }

 // Build the per-phase filter weights and the output-to-input index tables
 //  for the given input and output dimensions, storing them in g_b_scaler.
 //
 //  Returns 0 when the tables are ready (including when they were already
 //  built for exactly these dimensions), or -1 when the output size is not
 //  positive or a table could not be allocated.
 int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
 {
     int i;
     int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

     if (g_first_time)
         bicubic_coefficient_init();


     // check to see if the coefficients have already been set up correctly
     if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
         && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
         return 0;

     g_b_scaler.in_width = in_width;
     g_b_scaler.in_height = in_height;
     g_b_scaler.out_width = out_width;
     g_b_scaler.out_height = out_height;

     // Don't want to allow crazy scaling, just try and prevent a catastrophic
     //  failure here.  The dimensions are recorded before failing, as the
     //  original code did.
     if (out_width <= 0 || out_height <= 0)
         return -1;

     // reduce in/out width and height ratios using the gcd
     gcd_w = gcd(out_width, in_width);
     gcd_h = gcd(out_height, in_height);
     gcd_h_uv = gcd(out_height, in_height / 2);

     // the numerators are kept in the shared state so the scaling loop can
     //  use them without recalculating.
     g_b_scaler.nw = out_width / gcd_w;
     d_w = in_width / gcd_w;

     g_b_scaler.nh = out_height / gcd_h;
     d_h = in_height / gcd_h;

     g_b_scaler.nh_uv = out_height / gcd_h_uv;
     d_h_uv = (in_height / 2) / gcd_h_uv;

     // drop the tables of any previous configuration
     bicubic_release_tables();

     // The two height tables get one extra entry because the fill loop at
     //  the end of this function writes indices 0..out_height inclusive.
     g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * sizeof(short));
     g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
     g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));

     g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * sizeof(short));
     g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * sizeof(short));
     g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * sizeof(short));

     if (!g_b_scaler.l_w || !g_b_scaler.l_h || !g_b_scaler.l_h_uv
         || !g_b_scaler.c_w || !g_b_scaler.c_h || !g_b_scaler.c_h_uv)
     {
         bicubic_release_tables();

         // Forget the cached dimensions so that a later call with the same
         //  sizes tries again instead of reporting success with no tables.
         g_b_scaler.in_width = 0;
         g_b_scaler.in_height = 0;
         g_b_scaler.out_width = 0;
         g_b_scaler.out_height = 0;
         return -1;
     }

     g_b_scaler.hbuf = g_hbuf;
     g_b_scaler.hbuf_uv = g_hbuf_uv;

     // Set up polyphase filter taps.  The coefficients are scaled by 2^12
     //  so that integer math can be used in the main scaling loop.  The
     //  horizontal table is stored in reverse tap order.
     bicubic_fill_taps(g_b_scaler.c_w, g_b_scaler.nw, d_w, 1);
     bicubic_fill_taps(g_b_scaler.c_h, g_b_scaler.nh, d_h, 0);
     bicubic_fill_taps(g_b_scaler.c_h_uv, g_b_scaler.nh_uv, d_h_uv, 0);

     // Create the arrays that map each output column / line to an input
     //  column / line.  This needs a division per entry, which is why it is
     //  done once here rather than in the scaling loop.
     for (i = 0; i < out_width; i++)
     {
         g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

         if ((g_b_scaler.l_w[i] + 2) <= in_width)
             g_b_scaler.max_usable_out_width = i;

     }

     for (i = 0; i < out_height + 1; i++)
     {
         g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
         g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
     }

     return 0;
 }

 // Saturate a filter result to the 0..255 range of an 8-bit sample.  The
 //  cubic weights have negative lobes, so sums can fall outside that range.
 static unsigned char bicubic_clamp_pixel(int value)
 {
     if (value < 0)
         return 0;

     if (value > 255)
         return 255;

     return (unsigned char)value;
 }

 // Scale one 8-bit plane using the tables held in g_b_scaler (built by
 //  bicubic_coefficient_setup).  Each output line is produced by filtering
 //  four input lines vertically into a scratch row and then filtering that
 //  row horizontally.
 //
 //  in_width/in_height/in_stride    geometry of input_image
 //  out_width/out_height/out_stride geometry of output_image
 //
 //  Returns 0 on success, -1 if the tables have not been set up.
 //
 //  NOTE(review): the scratch row written here is g_b_scaler.hbuf; this
 //  function cannot see its capacity, so in_width must not exceed it.
 int bicubic_scale(int in_width, int in_height, int in_stride,
                   int out_width, int out_height, int out_stride,
                   unsigned char *input_image, unsigned char *output_image)
 {
     short *RESTRICT l_w, * RESTRICT l_h;
     short *RESTRICT c_w, * RESTRICT c_h;
     unsigned char *RESTRICT ip, * RESTRICT op;
     unsigned char *RESTRICT hbuf;
     unsigned char *above;
     int h, w, lw, lh;
     int temp_sum;
     int phase_offset_w, phase_offset_h;

     c_w = g_b_scaler.c_w;
     c_h = g_b_scaler.c_h;

     op = output_image;

     l_w = g_b_scaler.l_w;
     l_h = g_b_scaler.l_h;

     // Without tables there is nothing sensible to do.
     if (!c_w || !c_h || !l_w || !l_h || !g_b_scaler.hbuf)
         return -1;

     phase_offset_h = 0;

     for (h = 0; h < out_height; h++)
     {
         // select the row to work on
         lh = l_h[h];
         ip = input_image + (in_stride * lh);

         // Filter the row vertically into the scratch buffer.  A phase of 0
         //  selects the input row itself, so the scratch pointer is simply
         //  aimed at the input.  The same is done for the last lines, which
         //  have no two rows below them.
         if (phase_offset_h && (lh < in_height - 2))
         {
             hbuf = g_b_scaler.hbuf;

             // Row 0 has no row above it; reuse row 0 for that tap rather
             //  than reading in front of the image.
             above = (lh > 0) ? (ip - in_stride) : ip;

             for (w = 0; w < in_width; w++)
             {
                 temp_sum =  c_h[phase_offset_h*4+3] * above[w];
                 temp_sum += c_h[phase_offset_h*4+2] * ip[w];
                 temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
                 temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];

                 hbuf[w] = bicubic_clamp_pixel(temp_sum >> 12);
             }
         }
         else
             hbuf = ip;

         // increase the phase offset for the next time around.
         if (++phase_offset_h >= g_b_scaler.nh)
             phase_offset_h = 0;

         // now filter and expand it horizontally into the final
         //  output buffer
         phase_offset_w = 0;

         for (w = 0; w < out_width; w++)
         {
             // get the index to use to expand the image
             lw = l_w[w];

             // Boundary conditions are decided before any sample is read,
             //  so the taps at lw - 1 and lw + 2 are never fetched from
             //  outside the row.
             if (lw == 0)
                 temp_sum = hbuf[0];
             else if ((lw + 2) >= in_width)
                 temp_sum = hbuf[lw];
             else
             {
                 temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
                 temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
                 temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
                 temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
                 temp_sum = bicubic_clamp_pixel(temp_sum >> 12);
             }

             if (++phase_offset_w >= g_b_scaler.nw)
                 phase_offset_w = 0;

             op[w] = temp_sum;
         }

         op += out_stride;
     }

     return 0;
 }

 // Zero the cached output dimensions.  bicubic_coefficient_setup() compares
 //  its arguments against these cached values to decide whether the tables
 //  are already built, so a following call with a positive output size will
 //  rebuild them.
 void bicubic_scale_frame_reset()
 {
     g_b_scaler.out_height = 0;
     g_b_scaler.out_width = 0;
 }

 // Scale all three planes of src into dst at new_width x new_height.  The
 //  chroma planes are scaled to half that size in each direction.  The
 //  width, height and stride fields of dst are overwritten; its strides are
 //  set equal to its widths.  This function does not build the coefficient
 //  tables itself.
 void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                          int new_width, int new_height)
 {
     const int chroma_width = new_width / 2;
     const int chroma_height = new_height / 2;

     // describe the destination geometry first
     dst->y_width = new_width;
     dst->y_height = new_height;
     dst->uv_width = chroma_width;
     dst->uv_height = chroma_height;

     dst->y_stride = dst->y_width;
     dst->uv_stride = dst->uv_width;

     // Y plane
     bicubic_scale(src->y_width, src->y_height, src->y_stride,
                   new_width, new_height, dst->y_stride,
                   src->y_buffer, dst->y_buffer);

     // U plane
     bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                   chroma_width, chroma_height, dst->uv_stride,
                   src->u_buffer, dst->u_buffer);

     // V plane
     bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                   chroma_width, chroma_height, dst->uv_stride,
                   src->v_buffer, dst->v_buffer);
 }
	/*
	* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/


	#include <float.h>
	#include <math.h>
	#include <stdio.h>
	#include "vpx_mem/vpx_mem.h"
	#include "vpxscale_arbitrary.h"

	// Selects the integer coefficient path (c0_fixed..c3_fixed).  When this is
	// not defined the float C0..C3 functions are compiled instead.
	#define FIXED_POINT

	// Size limits.  MAX_OUT_DIMENSION (the larger of the two output limits)
	// sizes the scratch row buffers declared below.
	#define MAX_IN_WIDTH 800
	#define MAX_IN_HEIGHT 600
	#define MAX_OUT_WIDTH 800
	#define MAX_OUT_HEIGHT 600
	#define MAX_OUT_DIMENSION ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
	MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

	// Scaler state (dimensions, lookup tables, scratch pointers) shared by the
	// functions in this file.
	BICUBIC_SCALER_STRUCT g_b_scaler;
	// Nonzero until bicubic_coefficient_init() has zeroed g_b_scaler.
	static int g_first_time = 1;

	// Scratch row; bicubic_coefficient_setup() stores its address in
	// g_b_scaler.hbuf and bicubic_scale() writes the vertically filtered line
	// into it.
	// NOTE(review): the DATA_SECTION / DATA_ALIGN pragmas look like
	// toolchain-specific placement and alignment directives -- confirm
	// against the target compiler.
	#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
	#pragma DATA_ALIGN (g_hbuf, 32);
	unsigned char g_hbuf[MAX_OUT_DIMENSION];

	// Second scratch row; its address is stored in g_b_scaler.hbuf_uv.
	#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
	#pragma DATA_ALIGN (g_hbuf_uv, 32);
	unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


	// Cubic sharpness parameter.  The fixed-point build keeps the magnitude
	// 0.6 scaled by 65536 (Q16); the float build uses a = -0.6 directly.
	#ifdef FIXED_POINT
	static int a_i = 0.6 * 65536;
	#else
	static float a = -0.6;
	#endif

	#ifdef FIXED_POINT
	// C0 = a*t^3 - a*t^2
	//
	static short c0_fixed(unsigned int t)
	{
	    // Build a*t, a*t^2 and a*t^3 one multiply at a time.  Each product is
	    // shifted back down by 16 and stored in 16 bits before the next step.
	    // t is taken to be a Q16 fraction.
	    const unsigned short a_t1 = (a_i * t) >> 16;
	    const unsigned short a_t2 = (a_t1 * t) >> 16;
	    const unsigned short a_t3 = (a_t2 * t) >> 16;

	    // Drop four fractional bits (Q16 -> Q12) and flip the sign.
	    return -((a_t2 - a_t3) >> 4);
	}

	// C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3
	//
	static short c1_fixed(unsigned int t)
	{
	    const unsigned short two = 2 << 13;     // 2.0 in Q13
	    const unsigned short three = 3 << 13;   // 3.0 in Q13
	    unsigned short linear, quadratic, cubic;
	    int pass;

	    // a*t, Q16
	    linear = (a_i * t) >> 16;

	    // (2 - a) in Q13, then multiplied by t three times
	    cubic = two - (a_i >> 3);

	    for (pass = 0; pass < 3; pass++)
	        cubic = (cubic * t) >> 16;

	    // (3 - 2*a) in Q13, then multiplied by t twice
	    quadratic = three - (2 * (a_i >> 3));

	    for (pass = 0; pass < 2; pass++)
	        quadratic = (quadratic * t) >> 16;

	    // Bring the linear term to Q13, combine, and halve to reach Q12.
	    return (((linear >> 3) - cubic + quadratic) >> 1);
	}

	// C2 = 1 - (3-a)*t^2 + (2-a)*t^3
	//
	static short c2_fixed(unsigned int t)
	{
	    const unsigned short one = 1 << 13;     // 1.0 in Q13
	    const unsigned short two = 2 << 13;     // 2.0 in Q13
	    const unsigned short three = 3 << 13;   // 3.0 in Q13
	    unsigned short quadratic, cubic;
	    int pass;

	    // (3 - a) in Q13, then multiplied by t twice
	    quadratic = three - (a_i >> 3);

	    for (pass = 0; pass < 2; pass++)
	        quadratic = (quadratic * t) >> 16;

	    // (2 - a) in Q13, then multiplied by t three times
	    cubic = two - (a_i >> 3);

	    for (pass = 0; pass < 3; pass++)
	        cubic = (cubic * t) >> 16;

	    // Combine in Q13 and halve to reach Q12.
	    return (one - quadratic + cubic) >> 1;
	}

	// C3 = a*t - 2*a*t^2 + a*t^3
	//
	static short c3_fixed(unsigned int t)
	{
	    int linear, quadratic, cubic;

	    // a*t, Q16
	    linear = (a_i * t) >> 16;

	    // 2*a held in Q15, then multiplied by t twice
	    quadratic = 2 * (a_i >> 1);
	    quadratic = (quadratic * t) >> 16;
	    quadratic = (quadratic * t) >> 16;

	    // a*t^3, Q16: start from a*t and multiply by t twice more
	    cubic = linear;
	    cubic = (cubic * t) >> 16;
	    cubic = (cubic * t) >> 16;

	    // Align the Q16 terms with the Q15 one, combine, then shift to Q12.
	    return ((quadratic - (linear >> 1) - (cubic >> 1)) >> 3);
	}
	#else
	// C0 = -a*t^3 + a*t^2
	//
	// Float form of the C0 weight, -a*t^3 + a*t^2, using the file-level
	// parameter a.  Only compiled when FIXED_POINT is not defined.
	float C0(float t)
	{
	return -a * t * t * t + a * t * t;
	}

	// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
	//
	// Float form of the C1 weight, -(a+2)*t^3 + (2*a+3)*t^2 - a*t, using the
	// file-level parameter a.  Only compiled when FIXED_POINT is not defined.
	float C1(float t)
	{
	return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
	}

	// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
	//
	// Float form of the C2 weight, (a+2)*t^3 - (a+3)*t^2 + 1, using the
	// file-level parameter a.  Only compiled when FIXED_POINT is not defined.
	float C2(float t)
	{
	return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
	}

	// C3 = a*t - 2*a*t^2 + a*t^3
	//
	// Float form of the C3 weight, a*t^3 - 2*a*t^2 + a*t, using the
	// file-level parameter a.  Only compiled when FIXED_POINT is not defined.
	float C3(float t)
	{
	return a * t * t * t - 2.0f * a * t * t + a * t;
	}
	#endif

	#if 0
	// Self-check that compares the fixed-point weights against the float ones
	// at 10000 evenly spaced phases and returns the number of mismatches.
	// A mismatch is counted when the magnitudes differ by more than 1 (in
	// units of 1/4096), or when the four weights of a phase do not sum to
	// roughly 4096.
	// NOTE(review): this sits inside "#if 0" and calls both the c?_fixed and
	// the C? functions, which live in opposite branches of the FIXED_POINT
	// #ifdef, so it cannot be enabled without also building both sets.
	int compare_real_fixed()
	{
	    int i, errors = 0;
	    float mult = 1.0 / 10000.0;
	    // step size scaled by 2^32 so that (fixed_mult * i) >> 16 is in Q16
	    unsigned int fixed_mult = mult * 4294967296;//65536;
	    unsigned int phase_offset_int;
	    float phase_offset_real;

	    for (i = 0; i < 10000; i++)
	    {
	        int fixed0, fixed1, fixed2, fixed3, fixed_total;
	        int real0, real1, real2, real3, real_total;

	        // the same phase as a float and as a Q16 integer
	        phase_offset_real = (float)i * mult;
	        phase_offset_int = (fixed_mult * i) >> 16;

	        fixed0 = c0_fixed(phase_offset_int);
	        real0 = C0(phase_offset_real) * 4096.0;

	        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
	            errors++;

	        fixed1 = c1_fixed(phase_offset_int);
	        real1 = C1(phase_offset_real) * 4096.0;

	        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
	            errors++;

	        fixed2 = c2_fixed(phase_offset_int);
	        real2 = C2(phase_offset_real) * 4096.0;

	        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
	            errors++;

	        fixed3 = c3_fixed(phase_offset_int);
	        real3 = C3(phase_offset_real) * 4096.0;

	        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
	            errors++;

	        // the four weights of one phase should add up to about 1.0 (4096)
	        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
	        real_total = real0 + real1 + real2 + real3;

	        if ((fixed_total > 4097) || (fixed_total < 4094))
	            errors ++;

	        if ((real_total > 4097) || (real_total < 4095))
	            errors ++;
	    }

	    return errors;
	}
	#endif

	// Find the greatest common divisor of two integers. The method used here
	// is slow compared to the division-based Euclidean algorithm, but it does
	// not require any division.
	int gcd(int a, int b)
	{
	    // The subtraction loop below never terminates when either value is
	    // zero (or negative).  Returning 0 would make callers that divide by
	    // the result fail, so such inputs yield 1 instead.
	    if (a <= 0 || b <= 0)
	        return 1;

	    // Keep taking the smaller value away from the larger one until the
	    // two meet; the value they meet at is the common divisor.
	    while (a != b)
	    {
	        if (a > b)
	            a -= b;
	        else
	            b -= a;
	    }

	    return a;
	}

	// Clear the shared scaler state to all zero bytes and record that the
	// one-time initialisation has been done.
	void bicubic_coefficient_init()
	{
	    g_first_time = 0;
	    vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
	}

	// Release every table owned by the shared scaler state and zero the state.
	// Does nothing if the state was never initialised.
	void bicubic_coefficient_destroy()
	{
	    if (g_first_time)
	        return;

	    // index tables
	    if (g_b_scaler.l_w)
	    {
	        vpx_free(g_b_scaler.l_w);
	    }

	    if (g_b_scaler.l_h)
	    {
	        vpx_free(g_b_scaler.l_h);
	    }

	    if (g_b_scaler.l_h_uv)
	    {
	        vpx_free(g_b_scaler.l_h_uv);
	    }

	    // coefficient tables
	    if (g_b_scaler.c_w)
	    {
	        vpx_free(g_b_scaler.c_w);
	    }

	    if (g_b_scaler.c_h)
	    {
	        vpx_free(g_b_scaler.c_h);
	    }

	    if (g_b_scaler.c_h_uv)
	    {
	        vpx_free(g_b_scaler.c_h_uv);
	    }

	    // Leaves all pointers null and all cached dimensions zero.
	    vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
	}

	// Create the coefficients that will be used for the cubic interpolation.
	// Because scaling does not have to be equal in the vertical and horizontal
	// directions, the phase offsets will be different. There are 4
	// coefficients for each point, two on each side. The array holds the
	// 4 coefficients for one phase, followed by those for the next phase.
	// Free every table held in g_b_scaler and null the pointers so that a
	// later free or use cannot touch stale memory.
	static void bicubic_release_tables(void)
	{
	    if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);

	    if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);

	    if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);

	    if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);

	    if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);

	    if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);

	    g_b_scaler.l_w = NULL;
	    g_b_scaler.l_h = NULL;
	    g_b_scaler.l_h_uv = NULL;
	    g_b_scaler.c_w = NULL;
	    g_b_scaler.c_h = NULL;
	    g_b_scaler.c_h_uv = NULL;
	}

	// Fill taps[0 .. 4*n-1] with the four Q12 weights of each of the n phases
	// of an n:d (output:input) ratio.  Phase i sits at the fractional position
	// ((i * d) mod n) / n.  With reversed == 0 the weights are stored in the
	// order c0,c1,c2,c3; otherwise in the order c3,c2,c1,c0.
	static void bicubic_fill_taps(short *taps, int n, int d, int reversed)
	{
	    int i;
	#ifdef FIXED_POINT
	    unsigned int fixed_mult = 0;
	    int product_val = 0;

	    // 2^32 / n as an integer.  For n == 1 that value does not fit in 32
	    // bits, and it is not needed either: the only phase is 0.
	    if (n > 1)
	        fixed_mult = (1.0 / (float)n) * 4294967296;

	#endif

	    for (i = 0; i < n; i++)
	    {
	        short k0, k1, k2, k3;
	#ifdef FIXED_POINT
	        // Q16 fraction for this phase
	        int phase_offset_int = (fixed_mult * product_val) >> 16;

	        k0 = c0_fixed(phase_offset_int);
	        k1 = c1_fixed(phase_offset_int);
	        k2 = c2_fixed(phase_offset_int);
	        k3 = c3_fixed(phase_offset_int);

	        // Step to the next phase and wrap back into [0, n).  A loop is
	        // used because d can exceed 2*n when shrinking, where a single
	        // subtraction would leave a fraction of 1.0 or more.  Repeated
	        // subtraction also avoids a division.
	        product_val += d;

	        while (product_val >= n)
	            product_val -= n;

	#else
	        float phase_offset = (float)((i * d) % n) / (float)n;

	        k0 = (C0(phase_offset) * 4096.0);
	        k1 = (C1(phase_offset) * 4096.0);
	        k2 = (C2(phase_offset) * 4096.0);
	        k3 = (C3(phase_offset) * 4096.0);
	#endif

	        if (reversed)
	        {
	            taps[i*4]   = k3;
	            taps[i*4+1] = k2;
	            taps[i*4+2] = k1;
	            taps[i*4+3] = k0;
	        }
	        else
	        {
	            taps[i*4]   = k0;
	            taps[i*4+1] = k1;
	            taps[i*4+2] = k2;
	            taps[i*4+3] = k3;
	        }
	    }
	}

	// Build the per-phase filter weights and the output-to-input index tables
	// for the given input and output dimensions, storing them in g_b_scaler.
	//
	// Returns 0 when the tables are ready (including when they were already
	// built for exactly these dimensions), or -1 when the output size is not
	// positive or a table could not be allocated.
	int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
	{
	    int i;
	    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

	    if (g_first_time)
	        bicubic_coefficient_init();


	    // check to see if the coefficients have already been set up correctly
	    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
	        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
	        return 0;

	    g_b_scaler.in_width = in_width;
	    g_b_scaler.in_height = in_height;
	    g_b_scaler.out_width = out_width;
	    g_b_scaler.out_height = out_height;

	    // Don't want to allow crazy scaling, just try and prevent a catastrophic
	    // failure here.  The dimensions are recorded before failing, as the
	    // original code did.
	    if (out_width <= 0 || out_height <= 0)
	        return -1;

	    // reduce in/out width and height ratios using the gcd
	    gcd_w = gcd(out_width, in_width);
	    gcd_h = gcd(out_height, in_height);
	    gcd_h_uv = gcd(out_height, in_height / 2);

	    // the numerators are kept in the shared state so the scaling loop can
	    // use them without recalculating.
	    g_b_scaler.nw = out_width / gcd_w;
	    d_w = in_width / gcd_w;

	    g_b_scaler.nh = out_height / gcd_h;
	    d_h = in_height / gcd_h;

	    g_b_scaler.nh_uv = out_height / gcd_h_uv;
	    d_h_uv = (in_height / 2) / gcd_h_uv;

	    // drop the tables of any previous configuration
	    bicubic_release_tables();

	    // The two height tables get one extra entry because the fill loop at
	    // the end of this function writes indices 0..out_height inclusive.
	    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * sizeof(short));
	    g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));
	    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * sizeof(short));

	    g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * sizeof(short));
	    g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * sizeof(short));
	    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * sizeof(short));

	    if (!g_b_scaler.l_w || !g_b_scaler.l_h || !g_b_scaler.l_h_uv
	        || !g_b_scaler.c_w || !g_b_scaler.c_h || !g_b_scaler.c_h_uv)
	    {
	        bicubic_release_tables();

	        // Forget the cached dimensions so that a later call with the same
	        // sizes tries again instead of reporting success with no tables.
	        g_b_scaler.in_width = 0;
	        g_b_scaler.in_height = 0;
	        g_b_scaler.out_width = 0;
	        g_b_scaler.out_height = 0;
	        return -1;
	    }

	    g_b_scaler.hbuf = g_hbuf;
	    g_b_scaler.hbuf_uv = g_hbuf_uv;

	    // Set up polyphase filter taps.  The coefficients are scaled by 2^12
	    // so that integer math can be used in the main scaling loop.  The
	    // horizontal table is stored in reverse tap order.
	    bicubic_fill_taps(g_b_scaler.c_w, g_b_scaler.nw, d_w, 1);
	    bicubic_fill_taps(g_b_scaler.c_h, g_b_scaler.nh, d_h, 0);
	    bicubic_fill_taps(g_b_scaler.c_h_uv, g_b_scaler.nh_uv, d_h_uv, 0);

	    // Create the arrays that map each output column / line to an input
	    // column / line.  This needs a division per entry, which is why it is
	    // done once here rather than in the scaling loop.
	    for (i = 0; i < out_width; i++)
	    {
	        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

	        if ((g_b_scaler.l_w[i] + 2) <= in_width)
	            g_b_scaler.max_usable_out_width = i;

	    }

	    for (i = 0; i < out_height + 1; i++)
	    {
	        g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
	        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
	    }

	    return 0;
	}

	// Saturate a filter result to the 0..255 range of an 8-bit sample.  The
	// cubic weights have negative lobes, so sums can fall outside that range.
	static unsigned char bicubic_clamp_pixel(int value)
	{
	    if (value < 0)
	        return 0;

	    if (value > 255)
	        return 255;

	    return (unsigned char)value;
	}

	// Scale one 8-bit plane using the tables held in g_b_scaler (built by
	// bicubic_coefficient_setup).  Each output line is produced by filtering
	// four input lines vertically into a scratch row and then filtering that
	// row horizontally.
	//
	// in_width/in_height/in_stride    geometry of input_image
	// out_width/out_height/out_stride geometry of output_image
	//
	// Returns 0 on success, -1 if the tables have not been set up.
	//
	// NOTE(review): the scratch row written here is g_b_scaler.hbuf; this
	// function cannot see its capacity, so in_width must not exceed it.
	int bicubic_scale(int in_width, int in_height, int in_stride,
	                  int out_width, int out_height, int out_stride,
	                  unsigned char *input_image, unsigned char *output_image)
	{
	    short *RESTRICT l_w, * RESTRICT l_h;
	    short *RESTRICT c_w, * RESTRICT c_h;
	    unsigned char *RESTRICT ip, * RESTRICT op;
	    unsigned char *RESTRICT hbuf;
	    unsigned char *above;
	    int h, w, lw, lh;
	    int temp_sum;
	    int phase_offset_w, phase_offset_h;

	    c_w = g_b_scaler.c_w;
	    c_h = g_b_scaler.c_h;

	    op = output_image;

	    l_w = g_b_scaler.l_w;
	    l_h = g_b_scaler.l_h;

	    // Without tables there is nothing sensible to do.
	    if (!c_w || !c_h || !l_w || !l_h || !g_b_scaler.hbuf)
	        return -1;

	    phase_offset_h = 0;

	    for (h = 0; h < out_height; h++)
	    {
	        // select the row to work on
	        lh = l_h[h];
	        ip = input_image + (in_stride * lh);

	        // Filter the row vertically into the scratch buffer.  A phase of 0
	        // selects the input row itself, so the scratch pointer is simply
	        // aimed at the input.  The same is done for the last lines, which
	        // have no two rows below them.
	        if (phase_offset_h && (lh < in_height - 2))
	        {
	            hbuf = g_b_scaler.hbuf;

	            // Row 0 has no row above it; reuse row 0 for that tap rather
	            // than reading in front of the image.
	            above = (lh > 0) ? (ip - in_stride) : ip;

	            for (w = 0; w < in_width; w++)
	            {
	                temp_sum =  c_h[phase_offset_h*4+3] * above[w];
	                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
	                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
	                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];

	                hbuf[w] = bicubic_clamp_pixel(temp_sum >> 12);
	            }
	        }
	        else
	            hbuf = ip;

	        // increase the phase offset for the next time around.
	        if (++phase_offset_h >= g_b_scaler.nh)
	            phase_offset_h = 0;

	        // now filter and expand it horizontally into the final
	        // output buffer
	        phase_offset_w = 0;

	        for (w = 0; w < out_width; w++)
	        {
	            // get the index to use to expand the image
	            lw = l_w[w];

	            // Boundary conditions are decided before any sample is read,
	            // so the taps at lw - 1 and lw + 2 are never fetched from
	            // outside the row.
	            if (lw == 0)
	                temp_sum = hbuf[0];
	            else if ((lw + 2) >= in_width)
	                temp_sum = hbuf[lw];
	            else
	            {
	                temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
	                temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
	                temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
	                temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
	                temp_sum = bicubic_clamp_pixel(temp_sum >> 12);
	            }

	            if (++phase_offset_w >= g_b_scaler.nw)
	                phase_offset_w = 0;

	            op[w] = temp_sum;
	        }

	        op += out_stride;
	    }

	    return 0;
	}

	// Zero the cached output dimensions.  bicubic_coefficient_setup() compares
	// its arguments against these cached values to decide whether the tables
	// are already built, so a following call with a positive output size will
	// rebuild them.
	void bicubic_scale_frame_reset()
	{
	    g_b_scaler.out_height = 0;
	    g_b_scaler.out_width = 0;
	}

	// Scale all three planes of src into dst at new_width x new_height.  The
	// chroma planes are scaled to half that size in each direction.  The
	// width, height and stride fields of dst are overwritten; its strides are
	// set equal to its widths.  This function does not build the coefficient
	// tables itself.
	//
	// Both buffer arguments are pointers: the body reaches their fields
	// through "->".
	void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
	                         int new_width, int new_height)
	{

	    dst->y_width = new_width;
	    dst->y_height = new_height;
	    dst->uv_width = new_width / 2;
	    dst->uv_height = new_height / 2;

	    dst->y_stride = dst->y_width;
	    dst->uv_stride = dst->uv_width;

	    // Y plane
	    bicubic_scale(src->y_width, src->y_height, src->y_stride,
	                  new_width, new_height, dst->y_stride,
	                  src->y_buffer, dst->y_buffer);

	    // U plane
	    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
	                  new_width / 2, new_height / 2, dst->uv_stride,
	                  src->u_buffer, dst->u_buffer);

	    // V plane
	    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
	                  new_width / 2, new_height / 2, dst->uv_stride,
	                  src->v_buffer, dst->v_buffer);
	}