Merge "Fix YUV intrinsic" into jb-mr2-dev
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 3a49c0d..3d989bd 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -18,6 +18,10 @@
 #include "rsCpuIntrinsic.h"
 #include "rsCpuIntrinsicInlines.h"
 
+#ifndef RS_COMPATIBILITY_LIB
+#include "hardware/gralloc.h"
+#endif
+
 using namespace android;
 using namespace android::renderscript;
 
@@ -99,6 +103,7 @@
 };
 
 extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
 
 void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
                                            uint32_t xstart, uint32_t xend,
@@ -109,39 +114,87 @@
         return;
     }
     const uchar *pinY = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
-    const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
     const size_t strideY = cp->alloc->mHal.drvState.lod[0].stride;
-    const size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
-
     const uchar *Y = pinY + (p->y * strideY);
-    const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
 
     uchar4 *out = (uchar4 *)p->out;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1 - 1) >> 3;
-        if(len > 0) {
-            rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
-            x1 += len << 3;
-            out += len << 3;
-        }
+    switch (cp->alloc->mHal.state.yuv) {
+    // In API 17 there was no yuv format and the intrinsic treated everything as NV21
+    case 0:
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+    case HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
 #endif
+        {
+            const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+            const size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
+            const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
 
-       // ALOGE("y %i  %i  %i", p->y, x1, x2);
-        while(x1 < x2) {
-            uchar u = uv[(x1 & 0xffffe) + 1];
-            uchar v = uv[(x1 & 0xffffe) + 0];
-            *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-            out++;
-            x1++;
-            *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-            out++;
-            x1++;
+            if(x2 > x1) {
+        #if defined(ARCH_ARM_HAVE_NEON)
+                int32_t len = (x2 - x1 - 1) >> 3;
+                if(len > 0) {
+                    rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
+                }
+        #endif
+
+               // ALOGE("y %i  %i  %i", p->y, x1, x2);
+                while(x1 < x2) {
+                    uchar u = uv[(x1 & 0xffffe) + 1];
+                    uchar v = uv[(x1 & 0xffffe) + 0];
+                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+                    out++;
+                    x1++;
+                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+                    out++;
+                    x1++;
+                }
+            }
         }
+        break;
+
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+    case HAL_PIXEL_FORMAT_YV12:
+        {
+            // Planar chroma: lod[1] and lod[2] each hold one plane, and one chroma
+            // row is shared by two luma rows (p->y >> 1).
+            const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+            const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
+            const uchar *u = pinU + ((p->y >> 1) * strideU);
+
+            const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
+            const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
+            const uchar *v = pinV + ((p->y >> 1) * strideV);
+
+            if(x2 > x1) {
+        #if defined(ARCH_ARM_HAVE_NEON)
+                // Bulk path: 8 pixels per unit of len; the "- 1" always leaves a scalar tail.
+                int32_t len = (x2 - x1 - 1) >> 3;
+                if(len > 0) {
+                    rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
+                }
+        #endif
+
+                // Scalar tail, one pixel per pass so an odd-length span never writes past x2.
+                // Chroma is subsampled 2:1 horizontally, so it is indexed by x1 >> 1, not x1.
+                while(x1 < x2) {
+                    const uint32_t cx = x1 >> 1;
+                    *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+                    out++;
+                    x1++;
+                }
+            }
+        }
+        break;
+#endif
     }
+
 }
 
 RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index b93a038..c8dc9bf 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -504,6 +504,100 @@
         bx          lr
 END(rsdIntrinsicYuv_K)
 
+/*
+    Function called with the following arguments: dst, Y, u, v, len, YuvCoeff
+        r0 = dst
+        r1 = Y
+        r2 = U (loaded into d14 below and used in the G and B terms)
+        r3 = V (loaded into d12 below and used in the R and G terms)
+        ---- Args below will be in the stack ----
+        sp = length (pixels / 8); must be non-zero, the loop tests it only after the first pass
+        sp+4 = YuvCoeff
+
+        This function converts 8 pixels per iteration
+*/
+ENTRY(rsdIntrinsicYuv2_K)
+        push        {r4, r5, r6, lr}        @ preserve clobbered int registers
+        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
+
+        mov  r5, #16                        @ Integer 16 in r5; post-increment step for the two offset loads below
+
+        ldr         r4, [sp, #64+16+4]      @ load the coeffs address in r4; skips 64 bytes (vpush Q4-Q7) + 16 bytes (push) + 4
+        ldr         r6, [sp, #64+16]        @ load the length in r6; skips 64 bytes (vpush Q4-Q7) + 16 bytes (push)
+        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
+        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
+        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
+
+        mov         r4, #4                  @ Integer 4 in r4; post-increment step for the U and V pointers
+
+        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
+                                            @ the coeffs matrix (Q2)
+
+        1:
+        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
+        vld1.8      {d12}, [r3], r4         @ load 8 bytes from the V plane (r3), advance r3 by 4 (r4); only the low 4 are used
+        vld1.8      {d14}, [r2], r4         @ load 8 bytes from the U plane (r2), advance r2 by 4 (r4); only the low 4 are used
+        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
+        pld         [r2, #64]               @ preloading data from address u(r2) + 64 for subsequent loops (r3 is not preloaded)
+
+        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
+        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
+        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
+
+        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
+        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
+        vmov.u16    d11, d10                @ Copying V to d11
+        vmov.u16    d13, d12                @ Copying U to d13
+        vzip.u16    d10, d11                @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
+        vzip.u16    d12, d13                @ Q6 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
+
+
+        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
+        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
+        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
+        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
+
+                                            @                  R    G    B
+                                            @     Pixel(0-3)  Q8,  Q9, Q10
+                                            @     Pixel(4-7) Q11, Q12, Q13
+                                            @
+
+                                            @ Pixel(0-3)
+        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8((Y-16)*298)  + (V-128) * 409
+        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9((Y-16)*298)  + (V-128) * (-208)
+        vmlal.s16   Q9,  d12, d4[2]         @                           + (U-128) * (-100)
+        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10((Y-16)*298) + (U-128) * 516
+
+                                            @ Pixel(4-7)
+        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11((Y-16)*298) + (V-128) * 409
+        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12((Y-16)*298) + (V-128) * (-208)
+        vmlal.s16   Q12, d13, d4[2]         @                           + (U-128) * (-100)
+        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13((Y-16)*298) + (U-128) * 516
+
+                                            @ Pixel(0-3)
+        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
+
+                                            @ Pixel(4-7)
+        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
+
+        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
+
+        subs        r6, r6, #1              @ Decrement the remaining length (r6) and set the flags for the bne below
+        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
+
+        bne 1b                              @ if not done with length, loop
+
+        vpop        {Q4-Q7}                 @ Restore Vregisters
+        pop         {r4, r5, r6, lr}        @ Restore int registers
+        bx          lr
+END(rsdIntrinsicYuv2_K)
+
 /* Convolve 5x5 */
 
 /*