Fix YUV intrinsic handling of YV12

The planar YV12 format (HAL_PIXEL_FORMAT_YV12) was not handled by the
built-in YUV intrinsic, which treated every allocation as interleaved
NV21.  This CL adds a YV12 path, including a NEON kernel
(rsdIntrinsicYuv2_K) that takes separate U and V plane pointers.  The
scope is confined to the YUV intrinsic and cannot impact other paths.
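
For reference, the new path implements standard 4:2:0 planar
addressing: the U and V planes are half width and half height, so
pixel (x, y) samples chroma at (x >> 1, y >> 1).  A minimal scalar
sketch of that addressing (illustrative only; yv12ToRGBA and the
packed-plane, even-size assumptions are not part of this CL):

    // Uses the same uchar/uchar4 types and rsYuvToRGBA_uchar4 helper
    // as the intrinsic below.
    static void yv12ToRGBA(uchar4 *out, const uchar *Y,
                           const uchar *u, const uchar *v,
                           size_t strideY, size_t strideC,
                           uint32_t w, uint32_t h) {
        for (uint32_t y = 0; y < h; y++) {
            const uchar *py = Y + y * strideY;
            const uchar *pu = u + (y >> 1) * strideC;  // half-height planes
            const uchar *pv = v + (y >> 1) * strideC;
            for (uint32_t x = 0; x < w; x++) {
                // Chroma planes are half width: sample index is x >> 1.
                *out++ = rsYuvToRGBA_uchar4(py[x], pu[x >> 1], pv[x >> 1]);
            }
        }
    }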

Bug: 8566866

Change-Id: I9a96b2117b0676213f4906b0ca12416693b9c0e4
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 3a49c0d..3d989bd 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -18,6 +18,10 @@
 #include "rsCpuIntrinsic.h"
 #include "rsCpuIntrinsicInlines.h"
 
+#ifndef RS_COMPATIBILITY_LIB
+#include "hardware/gralloc.h"
+#endif
+
 using namespace android;
 using namespace android::renderscript;
 
@@ -99,6 +103,7 @@
 };
 
 extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
 
 void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
                                            uint32_t xstart, uint32_t xend,
@@ -109,39 +114,87 @@
         return;
     }
     const uchar *pinY = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
-    const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
     const size_t strideY = cp->alloc->mHal.drvState.lod[0].stride;
-    const size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
-
     const uchar *Y = pinY + (p->y * strideY);
-    const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
 
     uchar4 *out = (uchar4 *)p->out;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1 - 1) >> 3;
-        if(len > 0) {
-            rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
-            x1 += len << 3;
-            out += len << 3;
-        }
+    switch (cp->alloc->mHal.state.yuv) {
+    // In API 17 there was no YUV format field, so the intrinsic treated everything as NV21
+    case 0:
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+    case HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
 #endif
+        {
+            const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+            const size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
+            const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
 
-       // ALOGE("y %i  %i  %i", p->y, x1, x2);
-        while(x1 < x2) {
-            uchar u = uv[(x1 & 0xffffe) + 1];
-            uchar v = uv[(x1 & 0xffffe) + 0];
-            *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-            out++;
-            x1++;
-            *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-            out++;
-            x1++;
+            if(x2 > x1) {
+        #if defined(ARCH_ARM_HAVE_NEON)
+                int32_t len = (x2 - x1 - 1) >> 3;
+                if(len > 0) {
+                    rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
+                }
+        #endif
+
+               // ALOGE("y %i  %i  %i", p->y, x1, x2);
+                while(x1 < x2) {
+                    uchar u = uv[(x1 & 0xffffe) + 1];
+                    uchar v = uv[(x1 & 0xffffe) + 0];
+                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+                    out++;
+                    x1++;
+                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+                    out++;
+                    x1++;
+                }
+            }
         }
+        break;
+
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+    case HAL_PIXEL_FORMAT_YV12:
+        {
+            const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+            const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
+            const uchar *u = pinU + ((p->y >> 1) * strideU);
+
+            const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
+            const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
+            const uchar *v = pinV + ((p->y >> 1) * strideV);
+
+            if(x2 > x1) {
+        #if defined(ARCH_ARM_HAVE_NEON)
+                int32_t len = (x2 - x1 - 1) >> 3;
+                if(len > 0) {
+                    rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
+                }
+        #endif
+
+               // ALOGE("y %i  %i  %i", p->y, x1, x2);
+                while(x1 < x2) {
+                    uchar ut = u[x1 >> 1];  // chroma planes are half width
+                    uchar vt = v[x1 >> 1];
+                    *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
+                    out++;
+                    x1++;
+                    *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
+                    out++;
+                    x1++;
+                }
+            }
+        }
+        break;
+#endif
     }
+
 }
 
 RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index b93a038..c8dc9bf 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -504,6 +504,106 @@
         bx          lr
 END(rsdIntrinsicYuv_K)
 
+/*
+    Function called with the following arguments: dst, Y, u, v, len, YuvCoeff
+        r0 = dst
+        r1 = Y
+        r2 = U
+        r3 = V
+        ---- Args below will be in the stack ----
+        sp = length (pixels / 8)
+        sp+4 = YuvCoeff
+
+        This function converts 8 pixels per iteration
+*/
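+/*
+    For reference, the coeff table (loaded into Q2, d6 and d8 below) is
+    assumed to hold eight int16 multipliers (298, 409, -100, 516, -208,
+    255; the last two entries are unused here), followed by the Y offset
+    byte (16) and the U/V offset byte (128), each 16 bytes apart.
+*/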
+ENTRY(rsdIntrinsicYuv2_K)
+        push        {r4, r5, r6, lr}        @ preserve clobbered int registers
+        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
+
+        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
+
+        ldr         r4, [sp, #64+16+4]      @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
+        ldr         r6, [sp, #64+16]        @ load the length in r6 (16*4 + 4*4)
+        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
+        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
+        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
+
+        mov         r4, #4                  @ Integer 4 in r4; used as an incrementing value
+
+        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
+                                            @ the coeffs matrix (Q2)
+
+        1:
+        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
+        vld1.8      {d12}, [r3], r4         @ load 8 V bytes (only 4 used) from the V plane (r3), advance by 4 (in r4)
+        vld1.8      {d14}, [r2], r4         @ load 8 U bytes (only 4 used) from the U plane (r2), advance by 4 (in r4)
+        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
+        pld         [r2, #64]               @ preloading data from address u(r2) + 64 for subsequent loops
+
+        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
+        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
+        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
+
+        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
+        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
+        vmov.u16    d11, d10                @ Copying V to d11
+        vmov.u16    d13, d12                @ Copying U to d13
+        vzip.u16    d10, d11                @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
+        vzip.u16    d12, d13                @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
+
+
+        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
+        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
+        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
+        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
+
+                                            @                  R    G    B
+                                            @     Pixel(0-3)  Q8,  Q9, Q10
+                                            @     Pixel(4-7) Q11, Q12, Q13
+                                            @
+
+                                            @ Pixel(0-3)
+        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
+        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
+        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
+        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
+
+                                            @ Pixel(4-7)
+        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
+        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
+        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
+        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
+
+                                            @ Pixel(0-3)
+        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
+
+                                            @ Pixel(4-7)
+        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
+
+        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
+
+        subs        r6, r6, #1              @ Checking length (r6)
+        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
+
+        bne 1b                              @ if not done with length, loop
+
+        vpop        {Q4-Q7}                 @ Restore Vregisters
+        pop         {r4, r5, r6, lr}        @ Restore int registers
+        bx          lr
+END(rsdIntrinsicYuv2_K)
+
 /* Convolve 5x5 */
 
 /*