Merge "Start making RS 64-bit clean." into jb-mr2-dev
diff --git a/CleanSpec.mk b/CleanSpec.mk
new file mode 100644
index 0000000..4c9c547
--- /dev/null
+++ b/CleanSpec.mk
@@ -0,0 +1,51 @@
+# Copyright (C) 2013 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# If you don't need to do a full clean build but would like to touch
+# a file or delete some intermediate files, add a clean step to the end
+# of the list.  These steps will only be run once, if they haven't been
+# run before.
+#
+# E.g.:
+#     $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
+#     $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
+#
+# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
+# files that are missing or have been moved.
+#
+# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
+# Use $(OUT_DIR) to refer to the "out" directory.
+#
+# If you need to re-do something that's already mentioned, just copy
+# the command and add it to the bottom of the list.  E.g., if a change
+# that you made last week required touching a file and a change you
+# made today requires touching the same file, just copy the old
+# touch step and add it to the end of the list.
+#
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# For example:
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/AndroidTests_intermediates)
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/JAVA_LIBRARIES/core_intermediates)
+#$(call add-clean-step, find $(OUT_DIR) -type f -name "IGTalkSession*" -print0 | xargs -0 rm -f)
+#$(call add-clean-step, rm -rf $(PRODUCT_OUT)/data/*)
+$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/*/src/RenderScript.stamp)
+$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/*/src/renderscript/)
+
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
diff --git a/cpp/RenderScript.cpp b/cpp/RenderScript.cpp
index 503798b..134d34b 100644
--- a/cpp/RenderScript.cpp
+++ b/cpp/RenderScript.cpp
@@ -20,7 +20,6 @@
 
 #include "RenderScript.h"
 #include "rs.h"
-#include "rsUtils.h"
 
 using namespace android;
 using namespace RSC;
diff --git a/cpp/Type.cpp b/cpp/Type.cpp
index 8fa505c..312020a 100644
--- a/cpp/Type.cpp
+++ b/cpp/Type.cpp
@@ -19,7 +19,6 @@
 
 #include <rs.h>
 #include "RenderScript.h"
-#include "rsUtils.h"
 
 using namespace android;
 using namespace RSC;
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index 97420fb..be011aa 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -17,7 +17,7 @@
 #ifndef ANDROID_RSCPPSTRUCTS_H
 #define ANDROID_RSCPPSTRUCTS_H
 
-#include "rsUtils.h"
+#include "rsCppUtils.h"
 #ifndef RS_SERVER
 #include "utils/RefBase.h"
 #else
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index ddc87f9..142ca6e 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -33,7 +33,12 @@
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
     LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
     LOCAL_SRC_FILES+= \
-        rsCpuIntrinsics_neon.S
+        rsCpuIntrinsics_neon.S \
+        rsCpuSample_neon.S
+endif
+
+ifeq ($(ARCH_ARM_HAVE_VFP),true)
+    LOCAL_CFLAGS += -DARCH_ARM_HAVE_VFP
 endif
 
 LOCAL_SHARED_LIBRARIES += libRS libcutils libutils libsync
diff --git a/cpu_ref/rsCpuSample_neon.S b/cpu_ref/rsCpuSample_neon.S
new file mode 100644
index 0000000..5f1060b
--- /dev/null
+++ b/cpu_ref/rsCpuSample_neon.S
@@ -0,0 +1,293 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+/* Clamped 2D RGBA sampling helper. Arguments on entry:
+        r0 = base pointer
+        r1 = image stride
+        r2 = w
+        r3 = h
+        [sp]     = float u
+        [sp, #4] = float v
+*/
+
+ENTRY(rsdCpuLinearClamp2D_RGBA_k2)
+    push            {r4-r8, r10, r11, lr}
+    vpush           {q4-q7}
+
+    // Load u, v from the caller's stack: skip the 32 bytes of core registers and 64 bytes of q4-q7 pushed above
+    ldr r4, [sp, #32+64]
+    ldr r5, [sp, #32+64+4]
+    vmov d18, r4, r5                                // d18 = float  u, v
+
+
+//    float pixelU = (u * w) - 0.5f;
+//    float pixelV = (v * h) - 0.5f;
+    vmov d16, r2, r3                                // d16 = int  w, h
+    vcvt.f32.s32 d17, d16                           // d17 = float w, h
+    vmul.f32 d20, d18, d17                          // d20 = pixelUV (uv * wh)
+
+    vld1.f32 d19, =0x3F000000  // 0.5  NOTE(review): vld1 with an "=literal" operand - confirm the assembler accepts this form
+    vsub.f32 d20, d20, d19                          // d20 = pixelUV (uv * wh) - 0.5f
+
+//    int iu = pixelU;
+//    int iv = pixelV;
+    vcvt.s32.f32 d21, d20                           // d21 = iPixelUV
+
+
+    //float fracU = pixelU - iu;
+    //float fracV = pixelV - iv;
+    vcvt.s32.f32 d19, d20                           // d19 = (int)pixelUV
+    vcvt.f32.s32 d19, d19                           // d19 = (float)iuv
+    vsub.f32 d0, d20, d19                           // d0 = fract = pixelUV - iuv
+
+
+    //float oneMinusFracU = 1.0f - fracU;
+    //float oneMinusFracV = 1.0f - fracV;
+    vld1.f32 d22, =0x3F800000  // 1.0
+    vsub.f32 d1, d22, d0                            // d1 = oneMinusFrac
+
+
+    //float weightsX1 = oneMinusFracU * oneMinusFracV;
+    //float weightsY1 = fracU * oneMinusFracV;
+    //float weightsX2 = fracV * oneMinusFracU;
+    //float weightsY2 = fracU * fracV;
+    vmul.f32 d2, d1, d1[1]                          // d2 = 1mu * 1mv , 1mv * 1mv
+    vmul.f32 d3, d0, d1[1]                          // d3 = u * 1mv , v * 1mv
+    vmul.f32 d4, d1, d0[1]                          // d4 = v * 1mu , v * 1mv
+    vmul.f32 d5, d0, d0[1]                          // d5 = u * v,  v * v
+
+    //int nextX = rsMax(0, rsMin(iu + 1, w - 1));
+    //int nextY = rsMax(0, rsMin(iv + 1, h - 1));
+    //int locationX = rsMax(0, rsMin(iu, w - 1));
+    //int locationY = rsMax(0, rsMin(iv, h - 1));
+    vmov.u32 d6, #1
+    vmov.u32 d8, #0
+    vsub.s32 d16, d16, d6                           // d16 = w - 1, h - 1
+    vadd.s32 d7, d6, d21                            // d7 = iuv + 1
+
+    vmin.s32 d7, d7, d16
+    vmin.s32 d21, d21, d16
+    vmax.s32 d7, d7, d8                             // d7 = next
+    vmax.s32 d21, d21, d8                           // d21 = location
+
+    mov r2, #4
+    vmov d6, r2, r1                                 // d6 = 4, stride
+    vmul.s32 d7, d6                                 // d7 = nextX*4, nextY * stride
+    vmul.s32 d21, d6                                // d21 = locationX*4, locationY * stride
+
+    //uchar4 *p0c = (uchar4*)&p[(locationY * stride) + (locationX * 4)];
+    //uchar4 *p1c = (uchar4*)&p[(locationY * stride) + (nextX * 4)];
+    //uchar4 *p2c = (uchar4*)&p[(nextY * stride) + (locationX * 4)];
+    //uchar4 *p3c = (uchar4*)&p[(nextY * stride) + (nextX * 4)];
+    vmov r2, r3, d7                                 // r2 = nextX*4,  r3 = nextY *stride
+    vmov r4, r5, d21                                // r4 = locX*4,  r5 = locY*Stride
+    add r3, r3, r0                                  // r3 = p + nextY*stride
+    add r5, r5, r0                                  // r5 = p + locY*stride
+
+    //float4 p0 = convert_float4(*p0c);
+    //float4 p1 = convert_float4(*p1c);
+    add r1, r5, r4                                  // *p0c
+    ldr r0, [r1]
+    add r1, r5, r2                                  // *p1c
+    ldr r1, [r1]
+    vmov d0, r0, r1                                 // d0 = p0, p1
+
+    //float4 p2 = convert_float4(*p2c);
+    //float4 p3 = convert_float4(*p3c);
+    add r1, r3, r4                                  // *p2c
+    ldr r0, [r1]
+    add r1, r3, r2                                  // *p3c
+    ldr r1, [r1]
+    vmov d1, r0, r1                                 // d1 = p2, p3
+
+    //return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
+    vmovl.u8 q3, d0                                 // q3 (d6, d7) = p0, p1 widened to u16
+    vmovl.u8 q4, d1                                 // q4 (d8, d9) = p2, p3 widened to u16
+    vmovl.u16 q3, d6                                // q3 = p0 as u32; NOTE(review): overwrites d7 (p1) before the next line reads it
+    vmovl.u16 q4, d7                                // NOTE(review): overwrites d8/d9 (p2, p3) that the next two lines read - confirm intent
+    vmovl.u16 q5, d8
+    vmovl.u16 q6, d9
+    vcvt.f32.u32 q3, q3
+    vcvt.f32.u32 q4, q4
+    vcvt.f32.u32 q5, q5
+    vcvt.f32.u32 q6, q6
+    // The weighted blend of the four texels is commented out below, so only q3 is scaled and returned.
+    //vmul.f32 q3, q3, d2[0]
+    //vmla.f32 q3, q4, d3[0]
+    //vmla.f32 q3, q5, d4[0]
+    //vmla.f32 q3, q6, d5[0]
+
+    vld1.f32 d0, =0x3B808081  // 1.f / 255.f
+    vmul.f32 q3, q3, d0[0]
+
+    vmov r0, r1, d6                                 // return q3 in r0-r3
+    vmov r2, r3, d7
+
+    mov r3, #0x3F800000                             // r3 = bit pattern of 1.0f, replacing the lane just copied from d7
+
+    /* We're done, bye! */
+    vpop            {q4-q7}
+    pop             {r4-r8, r10, r11, lr}
+    bx              lr
+END(rsdCpuLinearClamp2D_RGBA_k2)
+
+
+
+
+
+
+
+
+
+/* Arguments on entry:
+        r0 = base pointer
+        r1 = image stride
+        r2 = iu
+        r3 = iv
+        [sp, #0..7]  = w, h (loaded into d3)
+        [sp, #8..23] = four floats loaded into q0 and used as the blend weights
+*/
+
+ENTRY(rsdCpuLinearClamp2D_RGBA_k)
+    push            {r4-r8, r10, r11, lr}
+    vpush           {q4-q7}
+
+    vmov d2, r2, r3                                 // d2 = iu, iv
+
+    add r4, sp, #32+64                              // r4 = first stack argument (past the 96 bytes pushed above)
+    vld1.32 d3, [r4]!                               // d3 = w, h
+    vld1.32 {q0}, [r4]!                             // q0 = next 16 bytes of stack arguments (weights, see below)
+
+    // NOTE(review): d30/d31 are multiplied below but are not written earlier in this routine; d2/d3 are never read.
+    mov r2, #4
+    vmov d6, r2, r1                                 // d6 = 4, stride
+    vmul.s32 d30, d6                                // d30 = nextX*4, nextY * stride
+    vmul.s32 d31, d6                                // d31 = locationX*4, locationY * stride
+
+    //uchar4 *p0c = (uchar4*)&p[(locationY * stride) + (locationX * 4)];
+    //uchar4 *p1c = (uchar4*)&p[(locationY * stride) + (nextX * 4)];
+    //uchar4 *p2c = (uchar4*)&p[(nextY * stride) + (locationX * 4)];
+    //uchar4 *p3c = (uchar4*)&p[(nextY * stride) + (nextX * 4)];
+    vmov r2, r3, d30                                // r2 = nextX*4,  r3 = nextY *stride
+    vmov r4, r5, d31                                // r4 = locX*4,  r5 = locY*Stride
+    add r3, r3, r0                                  // r3 = p + nextY*stride
+    add r5, r5, r0                                  // r5 = p + locY*stride
+
+    //float4 p0 = convert_float4(*p0c);
+    //float4 p1 = convert_float4(*p1c);
+    add r1, r5, r4                                  // *p0c
+    ldr r0, [r1]
+    add r1, r5, r2                                  // *p1c
+    ldr r1, [r1]
+    vmov d30, r0, r1                                 // d30 = p0, p1
+
+    //float4 p2 = convert_float4(*p2c);
+    //float4 p3 = convert_float4(*p3c);
+    add r1, r3, r4                                  // *p2c
+    ldr r0, [r1]
+    add r1, r3, r2                                  // *p3c
+    ldr r1, [r1]
+    vmov d31, r0, r1                                 // d31 = p2, p3
+
+    //return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
+    vmovl.u8 q2, d30                                // d4 = p0, d5 = p1 as u16 channels
+    vmovl.u8 q3, d31                                // d6 = p2, d7 = p3 as u16 channels
+    vmovl.u16 q8, d4                                // q8..q11 = p0..p3 as u32 channels
+    vmovl.u16 q9, d5
+    vmovl.u16 q10, d6
+    vmovl.u16 q11, d7
+    vcvt.f32.u32 q8, q8, #8                         // fixed-point convert with 8 fraction bits (value / 256)
+    vcvt.f32.u32 q9, q9, #8
+    vcvt.f32.u32 q10, q10, #8
+    vcvt.f32.u32 q11, q11, #8
+
+    vmul.f32 q3, q8, d0[0]                          // q3 = p0 * q0[0]
+    vmla.f32 q3, q9, d0[1]                          //    + p1 * q0[1]
+    vmla.f32 q3, q10, d1[0]                         //    + p2 * q0[2]
+    vmla.f32 q3, q11, d1[1]                         //    + p3 * q0[3]
+
+///    vld1.f32 d0, =0x3B808081  // 1.f / 255.f
+//    vmul.f32 q3, q3, d0[0]
+
+    vmov r0, r1, d6                                 // return q3 in r0-r3
+    vmov r2, r3, d7
+
+    mov r3, #0x3F800000                             // r3 = bit pattern of 1.0f, replacing the lane just copied from d7
+
+    /* We're done, bye! */
+    vpop            {q4-q7}
+    pop             {r4-r8, r10, r11, lr}
+    bx              lr
+END(rsdCpuLinearClamp2D_RGBA_k)
+
+
+
+/* Weighted sum of four RGBA8888 texels, computed in fixed point. Arguments on entry:
+        r0 = uint8_t *ptr
+        r1 = image stride
+        r2,r3 = iPixel
+        [sp], [sp, #4] = next.x, next.y
+        q0 = weights (in); q0 = float4 result (out)
+*/
+
+ENTRY(rsdCpuGetSample2D_RGBA_k)
+    push            {r4-r8, lr}
+
+    ldr r4, [sp, #24]                           // next.x (24 = bytes pushed above)
+    ldr r5, [sp, #24+4]                         // next.y
+
+    mul r3, r3, r1                                  // iPixel.y * stride
+    mul r5, r5, r1                                  // next.y * stride
+
+    add r2, r0, r2, LSL #2                          // r2 = ptr + iPixel.x * 4
+    add r4, r0, r4, LSL #2                          // r4 = ptr + next.x * 4
+
+    ldr r0, [r2, r3]                                // r0 = p[(locationY * stride) + (locationX * 4)]
+    ldr r1, [r4, r3]                                // r1 = p[(locationY * stride) + (nextX * 4)]
+    ldr r2, [r2, r5]                                // r2 = p[(nextY * stride) + (locationX * 4)]
+    ldr r3, [r4, r5]                                // r3 = p[(nextY * stride) + (nextX * 4)]
+
+    vmov d30, r0, r1                                 // d30 = p0, p1
+    vmov d31, r2, r3                                 // d31 = p2, p3
+
+    vcvt.u32.f32 q0, q0, #8                         // weights -> unsigned fixed point with 8 fraction bits
+    vmovn.u32 d0, q0                                // d0 = the four weights narrowed to u16
+
+    //return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
+    vmovl.u8 q2, d30                                // d4 = p0, d5 = p1 as u16 channels
+    vmovl.u8 q3, d31                                // d6 = p2, d7 = p3 as u16 channels
+
+    vmull.u16 q8, d4, d0[0]                         // q8  = p0 * w0
+    vmlal.u16 q8, d5, d0[1]                         // q8 += p1 * w1
+    vmlal.u16 q8, d6, d0[2]                         // q8 += p2 * w2
+    vmlal.u16 q8, d7, d0[3]                         // q8 += p3 * w3
+
+    vcvt.f32.u32 q3, q8, #8                         // back to float, removing the 8 fraction bits
+
+    ldr r1, =0x3B808081  // 1.f / 255.f
+    vmov.32 d0[0], r1
+    vmul.f32 q0, q3, d0[0]                          // q0 = blended texel scaled by 1/255
+
+    /* We're done, bye! */
+    pop             {r4-r8, lr}
+    bx              lr
+END(rsdCpuGetSample2D_RGBA_k)
+
diff --git a/driver/rsdSampler.cpp b/driver/rsdSampler.cpp
index 96b27d4..c8c338f 100644
--- a/driver/rsdSampler.cpp
+++ b/driver/rsdSampler.cpp
@@ -29,11 +29,840 @@
 #include <GLES/glext.h>
 #endif
 
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uint8_t uchar4 __attribute__((ext_vector_type(4)));
+
 using namespace android;
 using namespace android::renderscript;
 
-bool rsdSamplerInit(const android::renderscript::Context *,
-                    const android::renderscript::Sampler *) {
+#if defined(ARCH_ARM_HAVE_VFP)
+    #define LOCAL_CALL __attribute__((pcs("aapcs-vfp")))
+#else
+    #define LOCAL_CALL
+#endif
+
+// 565 Conversion bits taken from SkBitmap
+#define SK_R16_BITS     5
+#define SK_G16_BITS     6
+#define SK_B16_BITS     5
+
+#define SK_R16_SHIFT    (SK_B16_BITS + SK_G16_BITS)
+#define SK_G16_SHIFT    (SK_B16_BITS)
+#define SK_B16_SHIFT    0
+
+#define SK_R16_MASK     ((1 << SK_R16_BITS) - 1)
+#define SK_G16_MASK     ((1 << SK_G16_BITS) - 1)
+#define SK_B16_MASK     ((1 << SK_B16_BITS) - 1)
+
+static inline unsigned SkR16ToR32(unsigned r) {  // widen red from SK_R16_BITS to 8 bits by bit replication
+    return (r >> (2 * SK_R16_BITS - 8)) | (r << (8 - SK_R16_BITS));
+}
+
+static inline unsigned SkG16ToG32(unsigned g) {  // widen green from SK_G16_BITS to 8 bits by bit replication
+    return (g >> (2 * SK_G16_BITS - 8)) | (g << (8 - SK_G16_BITS));
+}
+
+static inline unsigned SkB16ToB32(unsigned b) {  // widen blue from SK_B16_BITS to 8 bits by bit replication
+    return (b >> (2 * SK_B16_BITS - 8)) | (b << (8 - SK_B16_BITS));
+}
+
+#define SkPacked16ToR32(c)      SkR16ToR32(SkGetPackedR16(c))
+#define SkPacked16ToG32(c)      SkG16ToG32(SkGetPackedG16(c))
+#define SkPacked16ToB32(c)      SkB16ToB32(SkGetPackedB16(c))
+
+#define SkGetPackedR16(color)   (((unsigned)(color) >> SK_R16_SHIFT) & SK_R16_MASK)
+#define SkGetPackedG16(color)   (((unsigned)(color) >> SK_G16_SHIFT) & SK_G16_MASK)
+#define SkGetPackedB16(color)   (((unsigned)(color) >> SK_B16_SHIFT) & SK_B16_MASK)
+
+// Unpacks a packed 565 colour into float channels (x = red, y = green, z = blue), each widened to 8 bits.
+static float3 getFrom565(uint16_t color) {
+    float3 rgb = {(float)SkPacked16ToR32(color),
+                  (float)SkPacked16ToG32(color),
+                  (float)SkPacked16ToB32(color)};
+    return rgb;
+}
+
+
+
+/**
+* Allocation sampling
+*/
+// Single-channel fetch: byte x of p converted to float.
+static inline float getElementAt1(const uint8_t *p, int32_t x) {
+    return p[x];
+}
+
+static inline float2 getElementAt2(const uint8_t *p, int32_t x) {
+    p += x * 2;  // two bytes per texel
+    float2 pair = {p[0], p[1]};
+    return pair;
+}
+
+static inline float3 getElementAt3(const uint8_t *p, int32_t x) {
+    p += x * 4;  // texels are four bytes apart; only the first three bytes are read
+    float3 rgb = {p[0], p[1], p[2]};
+    return rgb;
+}
+
+static inline float4 getElementAt4(const uint8_t *p, int32_t x) {
+    p += x * 4;  // four bytes per texel
+    float4 rgba = {p[0], p[1], p[2], p[3]};
+    return rgba;
+}
+
+// Returns the x-th 16-bit 565 texel of p, expanded to float red/green/blue channels.
+static inline float3 getElementAt565(const uint8_t *p, int32_t x) {
+    // Index in 16-bit texels. Previously x was scaled but never used, so texel 0 was always read.
+    return getFrom565(((const uint16_t *)p)[x]);
+}
+
+// Single-channel fetch at column x of row y; consecutive rows are stride bytes apart.
+static inline float getElementAt1(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+    const uint8_t *row = p + y * stride;
+    return row[x];
+}
+
+// Two-channel fetch at column x of row y; two bytes per texel, rows stride bytes apart.
+static inline float2 getElementAt2(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+    const uint8_t *texel = p + y * stride + x * 2;
+    float2 pair = {texel[0], texel[1]};
+    return pair;
+}
+
+// Three-channel fetch at column x of row y; texels are four bytes apart, the fourth byte is not read.
+static inline float3 getElementAt3(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+    const uint8_t *texel = p + y * stride + x * 4;
+    float3 rgb = {texel[0], texel[1], texel[2]};
+    return rgb;
+}
+
+// Four-channel fetch at column x of row y; four bytes per texel, rows stride bytes apart.
+static inline float4 getElementAt4(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+    const uint8_t *texel = p + y * stride + x * 4;
+    float4 rgba = {texel[0], texel[1], texel[2], texel[3]};
+    return rgba;
+}
+
+// Returns the 565 texel at column x of row y (rows stride bytes apart) as float red/green/blue.
+static inline float3 getElementAt565(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+    p += y * stride;
+    // Index in 16-bit texels. Previously x was scaled but never used, so column 0 was always read.
+    return getFrom565(((const uint16_t *)p)[x]);
+}
+
+
+
+
+
+// Blends two single-channel texels as alpha: returns {0, 0, 0, (p[iPixel]*w0 + p[next]*w1) / 255}.
+static float4 LOCAL_CALL
+            getSample1D_A(const uint8_t *p, int32_t iPixel,
+                          int32_t next, float w0, float w1) {
+    const float texel0 = getElementAt1(p, iPixel);
+    const float texel1 = getElementAt1(p, next);
+    const float alpha = (texel0 * w0 + texel1 * w1) * (1.f / 255.f);
+    float4 out = {0.f, 0.f, 0.f, alpha};
+    return out;
+}
+// Blends two single-channel texels as luminance: returns {l, l, l, 1} with l scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample1D_L(const uint8_t *p, int32_t iPixel,
+                          int32_t next, float w0, float w1) {
+    const float texel0 = getElementAt1(p, iPixel);
+    const float texel1 = getElementAt1(p, next);
+    const float lum = (texel0 * w0 + texel1 * w1) * (1.f / 255.f);
+    float4 out = {lum, lum, lum, 1.f};
+    return out;
+}
+// Blends two luminance/alpha texels: returns {l, l, l, a}, both scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample1D_LA(const uint8_t *p, int32_t iPixel,
+                           int32_t next, float w0, float w1) {
+    const float2 texel0 = getElementAt2(p, iPixel);
+    const float2 texel1 = getElementAt2(p, next);
+    const float2 la = (texel0 * w0 + texel1 * w1) * (1.f / 255.f);
+    float4 out = {la.x, la.x, la.x, la.y};
+    return out;
+}
+// Blends two RGB texels with weights w0 (iPixel) and w1 (next); returns {r, g, b, 1} scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample1D_RGB(const uint8_t *p, int32_t iPixel,
+                            int32_t next, float w0, float w1) {
+    float3 p0 = getElementAt3(p, iPixel);
+    float3 p1 = getElementAt3(p, next);
+    float3 r = (p0 * w0 + p1 * w1) * (1.f / 255.f);
+    float4 ret = {r.x, r.y, r.z, 1.f};  // green comes from r.y; r.x used to be duplicated here
+    return ret;
+}
+// Blends two 565 texels with weights w0 (iPixel) and w1 (next); returns {r, g, b, 1} scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample1D_565(const uint8_t *p, int32_t iPixel,
+                           int32_t next, float w0, float w1) {
+    float3 p0 = getElementAt565(p, iPixel);
+    float3 p1 = getElementAt565(p, next);
+    float3 r = (p0 * w0 + p1 * w1) * (1.f / 255.f);
+    float4 ret = {r.x, r.y, r.z, 1.f};  // green comes from r.y; r.x used to be duplicated here
+    return ret;
+}
+// Blends two RGBA texels with weights w0 (iPixel) and w1 (next), scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample1D_RGBA(const uint8_t *p, int32_t iPixel,
+                             int32_t next, float w0, float w1) {
+    const float4 texel0 = getElementAt4(p, iPixel);
+    const float4 texel1 = getElementAt4(p, next);
+    const float4 blended = texel0 * w0 + texel1 * w1;
+    return blended * (1.f / 255.f);
+}
+
+
+// Weighted sum of four single-channel texels as alpha; returns {0, 0, 0, a} with a scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample2D_A(const uint8_t *p, size_t stride,
+                          int locX, int locY, int nextX, int nextY,
+                          float w0, float w1, float w2, float w3) {
+    const float t00 = getElementAt1(p, stride, locX, locY);
+    const float t10 = getElementAt1(p, stride, nextX, locY);
+    const float t01 = getElementAt1(p, stride, locX, nextY);
+    const float t11 = getElementAt1(p, stride, nextX, nextY);
+    const float alpha = (t00 * w0 + t10 * w1 + t01 * w2 + t11 * w3) * (1.f / 255.f);
+    float4 out = {0.f, 0.f, 0.f, alpha};
+    return out;
+}
+// Weighted sum of four single-channel texels as luminance; returns {l, l, l, 1} with l scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample2D_L(const uint8_t *p, size_t stride,
+                         int locX, int locY, int nextX, int nextY,
+                         float w0, float w1, float w2, float w3) {
+    const float t00 = getElementAt1(p, stride, locX, locY);
+    const float t10 = getElementAt1(p, stride, nextX, locY);
+    const float t01 = getElementAt1(p, stride, locX, nextY);
+    const float t11 = getElementAt1(p, stride, nextX, nextY);
+    const float lum = (t00 * w0 + t10 * w1 + t01 * w2 + t11 * w3) * (1.f / 255.f);
+    float4 out = {lum, lum, lum, 1.f};
+    return out;
+}
+// Weighted sum of four luminance/alpha texels; returns {l, l, l, a}, both scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample2D_LA(const uint8_t *p, size_t stride,
+                         int locX, int locY, int nextX, int nextY,
+                         float w0, float w1, float w2, float w3) {
+    const float2 t00 = getElementAt2(p, stride, locX, locY);
+    const float2 t10 = getElementAt2(p, stride, nextX, locY);
+    const float2 t01 = getElementAt2(p, stride, locX, nextY);
+    const float2 t11 = getElementAt2(p, stride, nextX, nextY);
+    const float2 la = (t00 * w0 + t10 * w1 + t01 * w2 + t11 * w3) * (1.f / 255.f);
+    float4 out = {la.x, la.x, la.x, la.y};
+    return out;
+}
+// Weighted sum of four texels read as 4-byte RGBx; returns {r, g, b, 1} with colour scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample2D_RGB(const uint8_t *p, size_t stride,
+                         int locX, int locY, int nextX, int nextY,
+                         float w0, float w1, float w2, float w3) {
+    const float4 t00 = getElementAt4(p, stride, locX, locY);
+    const float4 t10 = getElementAt4(p, stride, nextX, locY);
+    const float4 t01 = getElementAt4(p, stride, locX, nextY);
+    const float4 t11 = getElementAt4(p, stride, nextX, nextY);
+    const float4 rgbx = (t00 * w0 + t10 * w1 + t01 * w2 + t11 * w3) * (1.f / 255.f);
+    float4 out = {rgbx.x, rgbx.y, rgbx.z, 1.f};
+    return out;
+}
+// Weighted sum of four RGBA texels, all four channels scaled by 1/255.
+static float4 LOCAL_CALL
+            getSample2D_RGBA(const uint8_t *p, size_t stride,
+                         int locX, int locY, int nextX, int nextY,
+                         float w0, float w1, float w2, float w3) {
+    const float4 t00 = getElementAt4(p, stride, locX, locY);
+    const float4 t10 = getElementAt4(p, stride, nextX, locY);
+    const float4 t01 = getElementAt4(p, stride, locX, nextY);
+    const float4 t11 = getElementAt4(p, stride, nextX, nextY);
+    const float4 blended = t00 * w0 + t10 * w1 + t01 * w2 + t11 * w3;
+    return blended * (1.f / 255.f);
+}
+// Weighted sum of four 565 texels; returns {r, g, b, 1} with colour scaled by 1/255.
+static float4 getSample2D_565(const uint8_t *p, size_t stride,
+                         int locX, int locY, int nextX, int nextY,
+                         float w0, float w1, float w2, float w3) {
+    const float3 t00 = getElementAt565(p, stride, locX, locY);
+    const float3 t10 = getElementAt565(p, stride, nextX, locY);
+    const float3 t01 = getElementAt565(p, stride, locX, nextY);
+    const float3 t11 = getElementAt565(p, stride, nextX, nextY);
+    const float3 rgb = (t00 * w0 + t10 * w1 + t01 * w2 + t11 * w3) * (1.f / 255.f);
+    float4 out = {rgb.x, rgb.y, rgb.z, 1.f};
+    return out;
+}
+
+
+extern "C" {  // C linkage so these names match symbols defined outside C++ (e.g. in assembly)
+    typedef float4 Sampler2DFn(const uint8_t *p, size_t stride,
+                               int lx, int ly, int nx, int ny,
+                               float w0, float w1, float w2, float w3) LOCAL_CALL;
+    // NOTE(review): rsCpuSample_neon.S in this change defines only rsdCpuGetSample2D_RGBA_k; confirm the rest exist.
+    Sampler2DFn rsdCpuGetSample2D_L_k;
+    Sampler2DFn rsdCpuGetSample2D_A_k;
+    Sampler2DFn rsdCpuGetSample2D_LA_k;
+    Sampler2DFn rsdCpuGetSample2D_RGB_k;
+    Sampler2DFn rsdCpuGetSample2D_RGBA_k;
+}
+
+#if 0  // disabled variant: assembly samplers, of which only the RGBA entry is filled in
+static Sampler2DFn* GetBilinearSampleTable2D[] = {
+    0, 0, 0, 0, 0, 0, 0,
+    0,//rsdCpuGetSample2D_L_k,
+    0,//rsdCpuGetSample2D_A_k,
+    0,//rsdCpuGetSample2D_LA_k,
+    0,//rsdCpuGetSample2D_RGB_k,
+    rsdCpuGetSample2D_RGBA_k
+};
+
+#else  // active variant: C samplers; the table is indexed by RsDataKind in the 2D sample_LOD_LinearPixel
+static Sampler2DFn* GetBilinearSampleTable2D[] = {
+    0, 0, 0, 0, 0, 0, 0,  // seven slots with no sampler; presumably RS_KIND_PIXEL_L has value 7 - TODO confirm
+    &getSample2D_L,
+    &getSample2D_A,
+    &getSample2D_LA,
+    &getSample2D_RGB,
+    &getSample2D_RGBA,
+};
+#endif
+
+
+// Maps texel index coord into [0, size) according to the sampler wrap mode; unknown modes yield 0.
+static int applyWrapMode(RsSamplerValue mode, int coord, int size) {
+    switch (mode) {
+    case RS_SAMPLER_WRAP:
+        coord = coord % size;
+        if (coord < 0) {
+            coord += size;
+        }
+        break;
+    case RS_SAMPLER_CLAMP:
+        coord = rsMax(0, rsMin(coord, size - 1));
+        break;
+
+    case RS_SAMPLER_MIRRORED_REPEAT:
+        coord = coord % (size * 2);
+        if (coord < 0) {
+            coord = (size * 2) + coord;
+        }
+        if (coord >= size) {
+            // Reflect size..2*size-1 onto size-1..0. Without the -1, coord == size stayed out of range.
+            coord = (size * 2) - 1 - coord;
+        }
+        break;
+    default:
+        coord = 0;
+        rsAssert(0);
+    }
+    return coord;
+}
+
+// Linear filter between the two texels nearest uv on mip level lod of a 1D allocation.
+static float4
+        sample_LOD_LinearPixel(Allocation *a, const Type *type,
+                               RsDataKind dk, RsDataType dt,
+                               Sampler *s,
+                               float uv, int32_t lod) {
+    RsSamplerValue wrapS = s->mHal.state.wrapS;
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    float pixelUV = uv * (float)(sourceW);
+    // floorf (not truncation) keeps frac in [0, 1) for negative coordinates as well.
+    int32_t iPixel = (int32_t)floorf(pixelUV);
+    float frac = pixelUV - (float)iPixel;
+
+    // Shift by half a texel so the weights are measured between texel centres.
+    if (frac < 0.5f) {
+        iPixel -= 1;
+        frac += 0.5f;
+    } else {
+        frac -= 0.5f;
+    }
+    float oneMinusFrac = 1.0f - frac;
+
+    int32_t next = applyWrapMode(wrapS, iPixel + 1, sourceW);
+    int32_t loc = applyWrapMode(wrapS, iPixel, sourceW);
+    const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+
+    // Texel loc is weighted by oneMinusFrac and texel next by frac (the index next used to be passed as w0).
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+        return getSample1D_565(ptr, loc, next, oneMinusFrac, frac);
+    }
+    switch(dk) {
+    case RS_KIND_PIXEL_L:
+        return getSample1D_L(ptr, loc, next, oneMinusFrac, frac);
+    case RS_KIND_PIXEL_A:
+        return getSample1D_A(ptr, loc, next, oneMinusFrac, frac);
+    case RS_KIND_PIXEL_LA:
+        return getSample1D_LA(ptr, loc, next, oneMinusFrac, frac);
+    case RS_KIND_PIXEL_RGB:
+        return getSample1D_RGB(ptr, loc, next, oneMinusFrac, frac);
+    case RS_KIND_PIXEL_RGBA:
+        return getSample1D_RGBA(ptr, loc, next, oneMinusFrac, frac);
+    case RS_KIND_PIXEL_YUV:
+    case RS_KIND_USER:
+    case RS_KIND_INVALID:
+    case RS_KIND_PIXEL_DEPTH:
+        rsAssert(0);
+        break;
+    }
+
+    return 0.f;
+}
+
+// Nearest-texel fetch at uv on mip level lod of a 1D allocation; channels come back scaled by 1/255.
+static float4
+        sample_LOD_NearestPixel(Allocation *a, const Type *type,
+                                RsDataKind dk, RsDataType dt,
+                                Sampler *s, float uv, int32_t lod) {
+    RsSamplerValue wrapS = s->mHal.state.wrapS;
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    int32_t iPixel = (int32_t)(uv * (float)(sourceW));
+    // Always index with the wrapped coordinate; the raw iPixel may lie outside the row.
+    int32_t location = applyWrapMode(wrapS, iPixel, sourceW);
+    const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+
+    // Alpha starts at 255 so the common 1/255 scale yields 1 for formats that carry no alpha.
+    float4 result = {0.f, 0.f, 0.f, 255.f};
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+        result.xyz = getElementAt565(ptr, location);
+        return result * (1.f / 255.f);
+    }
+
+    switch(dk) {
+    case RS_KIND_PIXEL_L:
+        {
+            float t = getElementAt1(ptr, location);
+            result.xyz = t;
+        }
+        break;
+    case RS_KIND_PIXEL_A:
+        result.w = getElementAt1(ptr, location);
+        break;
+    case RS_KIND_PIXEL_LA:
+        {
+            float2 t = getElementAt2(ptr, location);
+            result.xyz = t.x;
+            result.w = t.y;
+        }
+        break;
+    case RS_KIND_PIXEL_RGB:
+        result.xyz = getElementAt3(ptr, location);
+        break;
+    case RS_KIND_PIXEL_RGBA:
+        result = getElementAt4(ptr, location);
+        break;
+    case RS_KIND_PIXEL_YUV:
+    case RS_KIND_USER:
+    case RS_KIND_INVALID:
+    case RS_KIND_PIXEL_DEPTH:
+        rsAssert(0);
+        break;
+    }
+
+    return result * (1.f / 255.f);
+}
+
+
+// Bilinear filter of the four texels around (u, v) on mip level lod of a 2D allocation.
+static float4
+        sample_LOD_LinearPixel(Allocation *a, const Type *type,
+                               RsDataKind dk, RsDataType dt,
+                               Sampler *s, float u, float v, int32_t lod) {
+    const RsSamplerValue wrapS = s->mHal.state.wrapS;
+    const RsSamplerValue wrapT = s->mHal.state.wrapT;
+    const int sourceW = type->mHal.state.lodDimX[lod];
+    const int sourceH = type->mHal.state.lodDimY[lod];
+    float pixelU = u * (float)sourceW;
+    float pixelV = v * (float)sourceH;
+    // floorf (not truncation) keeps the fractions in [0, 1) for negative coordinates as well.
+    int iPixelU = (int)floorf(pixelU);
+    int iPixelV = (int)floorf(pixelV);
+    float fracU = pixelU - iPixelU;
+    float fracV = pixelV - iPixelV;
+    if (fracU < 0.5f) {
+        iPixelU -= 1;
+        fracU += 0.5f;
+    } else {
+        fracU -= 0.5f;
+    }
+    if (fracV < 0.5f) {
+        iPixelV -= 1;
+        fracV += 0.5f;
+    } else {
+        fracV -= 0.5f;
+    }
+
+    float w1 = (1.0f - fracU) * (1.0f - fracV);
+    float w2 = fracU * (1.0f - fracV);
+    float w3 = (1.0f - fracU) * fracV;
+    float w4 = fracU * fracV;
+    int nextX = applyWrapMode(wrapS, iPixelU + 1, sourceW);
+    int nextY = applyWrapMode(wrapT, iPixelV + 1, sourceH);
+    int locX = applyWrapMode(wrapS, iPixelU, sourceW);
+    int locY = applyWrapMode(wrapT, iPixelV, sourceH);
+    const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+    size_t stride = a->mHal.drvState.lod[lod].stride;
+
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+        return getSample2D_565(ptr, stride, locX, locY, nextX, nextY, w1, w2, w3, w4);
+    }
+    // dk indexes the table directly: reject kinds past its end or with no sampler entry.
+    const size_t kinds = sizeof(GetBilinearSampleTable2D) / sizeof(GetBilinearSampleTable2D[0]);
+    if ((size_t)dk >= kinds || GetBilinearSampleTable2D[dk] == 0) {
+        rsAssert(0);
+        return 0.f;
+    }
+    return GetBilinearSampleTable2D[dk](ptr, stride, locX, locY, nextX, nextY, w1, w2, w3, w4);
+}
+
+// Nearest-texel fetch at (u, v) on mip level lod of a 2D allocation.
+// Every path returns channels scaled by 1/255; formats without alpha report alpha = 1.
+static float4
+        sample_LOD_NearestPixel(Allocation *a, const Type *type,
+                                RsDataKind dk, RsDataType dt,
+                                Sampler *s,
+                                float u, float v, int32_t lod) {
+    RsSamplerValue wrapS = s->mHal.state.wrapS;
+    RsSamplerValue wrapT = s->mHal.state.wrapT;
+
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    int32_t sourceH = type->mHal.state.lodDimY[lod];
+
+    int locX = applyWrapMode(wrapS, u * sourceW, sourceW);
+    int locY = applyWrapMode(wrapT, v * sourceH, sourceH);
+    const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+    size_t stride = a->mHal.drvState.lod[lod].stride;
+
+    // Alpha starts at 255 so the common 1/255 scale yields 1 for formats that carry no alpha.
+    float4 result = {0.f, 0.f, 0.f, 255.f};
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+        result.xyz = getElementAt565(ptr, stride, locX, locY);
+        return result * (1.f / 255.f);
+    }
+
+    switch(dk) {
+    case RS_KIND_PIXEL_L:
+        {
+            float t = getElementAt1(ptr, stride, locX, locY);
+            result.xyz = t;
+        }
+        break;
+    case RS_KIND_PIXEL_A:
+        result.w = getElementAt1(ptr, stride, locX, locY);
+        break;
+    case RS_KIND_PIXEL_LA:
+        {
+            float2 t = getElementAt2(ptr, stride, locX, locY);
+            result.xyz = t.x;
+            result.w = t.y;
+        }
+        break;
+    case RS_KIND_PIXEL_RGB:
+        result.xyz = getElementAt3(ptr, stride, locX, locY);
+        break;
+    case RS_KIND_PIXEL_RGBA:
+        result = getElementAt4(ptr, stride, locX, locY);
+        break;
+
+    case RS_KIND_PIXEL_YUV:
+    case RS_KIND_USER:
+    case RS_KIND_INVALID:
+    case RS_KIND_PIXEL_DEPTH:
+        rsAssert(0);
+        break;
+    }
+
+    return result * (1.f / 255.f);
+}
+
+
+
+// Samples 1D allocation a at coordinate u with sampler s; lod selects the mip level(s) and filter.
+static float4 GenericSample1D(Allocation *a, Sampler *s, float u, float lod) {
+    const Type *type = a->getType();
+    const Element *element = type->getElement();
+    const RsDataKind kind = element->getKind();
+    const RsDataType dataType = element->getType();
+
+    // Only unsigned 8-bit and 565 data of a non-user kind is sampled; anything else yields zero.
+    const bool sampleable = (dataType == RS_TYPE_UNSIGNED_8 || dataType == RS_TYPE_UNSIGNED_5_6_5);
+    if (kind == RS_KIND_USER || !sampleable) {
+        return 0.f;
+    }
+
+    if ((a->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE) == 0) {
+        const Context *ctx = RsdCpuReference::getTlsContext();
+        ctx->setError(RS_ERROR_BAD_VALUE, "Sampling from texture witout USAGE_GRAPHICS_TEXTURE.");
+        return 0.f;
+    }
+
+    // lod <= 0: use level 0 with the magnification filter.
+    if (lod <= 0.0f) {
+        return (s->mHal.state.magFilter == RS_SAMPLER_NEAREST)
+                ? sample_LOD_NearestPixel(a, type, kind, dataType, s, u, 0)
+                : sample_LOD_LinearPixel(a, type, kind, dataType, s, u, 0);
+    }
+
+    const int32_t lastLOD = type->mHal.state.lodCount - 1;
+    switch (s->mHal.state.minFilter) {
+    case RS_SAMPLER_LINEAR_MIP_NEAREST: {
+        lod = rsMin(lod, (float)lastLOD);
+        return sample_LOD_LinearPixel(a, type, kind, dataType, s, u, (int32_t)round(lod));
+    }
+    case RS_SAMPLER_LINEAR_MIP_LINEAR: {
+        const int32_t lower = rsMin((int32_t)floor(lod), lastLOD);
+        const int32_t upper = rsMin((int32_t)ceil(lod), lastLOD);
+        float4 sample0 = sample_LOD_LinearPixel(a, type, kind, dataType, s, u, lower);
+        float4 sample1 = sample_LOD_LinearPixel(a, type, kind, dataType, s, u, upper);
+        const float blend = lod - (float)lower;
+        return sample0 * (1.0f - blend) + sample1 * blend;
+    }
+    default:
+        return sample_LOD_NearestPixel(a, type, kind, dataType, s, u, 0);
+    }
+}
+
+static float4 GenericSample2D(Allocation *a, Sampler *s, float u, float v, float lod) {
+    // Samples 'a' at coordinate (u, v) with sampler 's'; 'lod' picks the mip level(s).
+    // Unsupported formats return 0; a missing texture usage flag also sets an error.
+    const Type *type = a->getType();
+    const Element *elem = type->getElement();
+    const RsDataKind dk = elem->getKind();
+    const RsDataType dt = elem->getType();
+    if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {
+        return 0.f;
+    }
+    if (!(a->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE)) {
+        const Context *rsc = RsdCpuReference::getTlsContext();
+        rsc->setError(RS_ERROR_BAD_VALUE, "Sampling from texture without USAGE_GRAPHICS_TEXTURE.");
+        return 0.f;
+    }
+    // lod <= 0 is handled with the mag filter, always on level 0.
+    if (lod <= 0.0f) {
+        if (s->mHal.state.magFilter == RS_SAMPLER_NEAREST) {
+            return sample_LOD_NearestPixel(a, type, dk, dt, s, u, v, 0);
+        }
+        return sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, 0);
+    }
+
+    if (s->mHal.state.minFilter == RS_SAMPLER_LINEAR_MIP_NEAREST) {
+        int32_t maxLOD = type->mHal.state.lodCount - 1;
+        lod = rsMin(lod, (float)maxLOD);
+        int32_t nearestLOD = (int32_t)round(lod);
+        return sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, nearestLOD);
+    }
+
+    if (s->mHal.state.minFilter == RS_SAMPLER_LINEAR_MIP_LINEAR) {
+        int32_t lod0 = (int32_t)floor(lod);
+        int32_t lod1 = (int32_t)ceil(lod);
+        int32_t maxLOD = type->mHal.state.lodCount - 1;
+        lod0 = rsMin(lod0, maxLOD);
+        lod1 = rsMin(lod1, maxLOD);
+        float4 sample0 = sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, lod0);
+        float4 sample1 = sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, lod1);
+        float frac = lod - (float)lod0;
+        return sample0 * (1.0f - frac) + sample1 * frac;
+    }
+    // NOTE(review): any other minFilter (e.g. plain LINEAR) with lod > 0 ends up here - confirm nearest is intended.
+    return sample_LOD_NearestPixel(a, type, dk, dt, s, u, v, 0);
+}
+
+
+
+
+// Installed by rsdSamplerInit for NEAREST + WRAP. Slot order must match pixel kind in rsDefines.h.
+static void *NearestWrap[] = {
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+static void *NearestClamp[] = {  // NEAREST + CLAMP; slot order must match pixel kind in rsDefines.h
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+static void *NearestMirroredRepeat[] = {  // NEAREST + MIRRORED_REPEAT; slot order must match pixel kind in rsDefines.h
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+// Installed for LINEAR and LINEAR_MIP_LINEAR + WRAP. Slot order must match pixel kind in rsDefines.h.
+static void *LinearWrap[] = {
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+// Installed for LINEAR and LINEAR_MIP_LINEAR + CLAMP. Slot order must match pixel kind in rsDefines.h.
+static void *LinearClamp[] = {
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+// Installed for LINEAR and LINEAR_MIP_LINEAR + MIRRORED_REPEAT. Slot order must match pixel kind in rsDefines.h.
+static void *LinearMirroredRepeat[] = {
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+// Default table set first by rsdSamplerInit. Slot order must match pixel kind in rsDefines.h.
+static void *Generic[] = {
+    reinterpret_cast<void *>(GenericSample1D),  // 1D L
+    reinterpret_cast<void *>(GenericSample1D),  // 1D A
+    reinterpret_cast<void *>(GenericSample1D),  // 1D LA
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGB
+    reinterpret_cast<void *>(GenericSample1D),  // 1D RGBA
+    0,                                          // 1D slot left empty
+    reinterpret_cast<void *>(GenericSample1D),  // 1D YUV
+    // 2D samplers, same kind order as above.
+    reinterpret_cast<void *>(GenericSample2D),  // 2D L
+    reinterpret_cast<void *>(GenericSample2D),  // 2D A
+    reinterpret_cast<void *>(GenericSample2D),  // 2D LA
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGB
+    reinterpret_cast<void *>(GenericSample2D),  // 2D RGBA
+    0,                                          // 2D slot left empty
+    reinterpret_cast<void *>(GenericSample2D),  // 2D YUV
+};
+
+bool rsdSamplerInit(const Context *, const Sampler *s) {
+    // Picks the sampler function table for s and stores it in s->mHal.drv; always returns true.
+    s->mHal.drv = Generic;  // default, kept unless a more specific table is chosen below
+    if ((s->mHal.state.minFilter == s->mHal.state.magFilter) &&
+        (s->mHal.state.wrapS == s->mHal.state.wrapT)) {
+        // We have fast paths for these.
+        // NOTE(review): every table visible here still points at the generic samplers.
+        switch(s->mHal.state.minFilter) {
+        case RS_SAMPLER_NEAREST:
+            switch(s->mHal.state.wrapS) {
+            case RS_SAMPLER_WRAP:
+                s->mHal.drv = NearestWrap;
+                break;
+            case RS_SAMPLER_CLAMP:
+                s->mHal.drv = NearestClamp;
+                break;
+            case RS_SAMPLER_MIRRORED_REPEAT:
+                s->mHal.drv = NearestMirroredRepeat;
+                break;
+            default:  // other wrap values keep the Generic table
+                break;
+            }
+            break;
+        case RS_SAMPLER_LINEAR:
+            switch(s->mHal.state.wrapS) {
+            case RS_SAMPLER_WRAP:
+                s->mHal.drv = LinearWrap;
+                break;
+            case RS_SAMPLER_CLAMP:
+                s->mHal.drv = LinearClamp;
+                break;
+            case RS_SAMPLER_MIRRORED_REPEAT:
+                s->mHal.drv = LinearMirroredRepeat;
+                break;
+            default:  // other wrap values keep the Generic table
+                break;
+            }
+            break;
+        case RS_SAMPLER_LINEAR_MIP_LINEAR:  // uses the same tables as RS_SAMPLER_LINEAR
+            switch(s->mHal.state.wrapS) {
+            case RS_SAMPLER_WRAP:
+                s->mHal.drv = LinearWrap;
+                break;
+            case RS_SAMPLER_CLAMP:
+                s->mHal.drv = LinearClamp;
+                break;
+            case RS_SAMPLER_MIRRORED_REPEAT:
+                s->mHal.drv = LinearMirroredRepeat;
+                break;
+            default:  // other wrap values keep the Generic table
+                break;
+            }
+            break;
+        default:  // any other filter value: assert, and the Generic table stays in place
+            rsAssert(0);
+            break;
+        }
+
+    }
+
     return true;
 }
 
diff --git a/java/Android.mk b/java/Android.mk
new file mode 100644
index 0000000..6145a3d
--- /dev/null
+++ b/java/Android.mk
@@ -0,0 +1,3 @@
+LOCAL_PATH:=$(call my-dir)
+
+include $(call all-makefiles-under,$(LOCAL_PATH))
diff --git a/java/tests/Android.mk b/java/tests/Android.mk
new file mode 100644
index 0000000..9b55c42
--- /dev/null
+++ b/java/tests/Android.mk
@@ -0,0 +1,3 @@
+ifneq (true,$(TARGET_BUILD_PDK))
+include $(call all-subdir-makefiles)
+endif
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
index 8cf46c2..975027a 100644
--- a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
@@ -424,8 +424,16 @@
 
 
         mRS = RenderScript.create(this);
-        mInPixelsAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mInPixelsAllocation2 = Allocation.createFromBitmap(mRS, mBitmapIn2);
+        mInPixelsAllocation = Allocation.createFromBitmap(mRS, mBitmapIn,
+                                                          Allocation.MipmapControl.MIPMAP_NONE,
+                                                          Allocation.USAGE_SHARED |
+                                                          Allocation.USAGE_GRAPHICS_TEXTURE |
+                                                          Allocation.USAGE_SCRIPT);
+        mInPixelsAllocation2 = Allocation.createFromBitmap(mRS, mBitmapIn2,
+                                                           Allocation.MipmapControl.MIPMAP_NONE,
+                                                           Allocation.USAGE_SHARED |
+                                                           Allocation.USAGE_GRAPHICS_TEXTURE |
+                                                           Allocation.USAGE_SCRIPT);
         mOutPixelsAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
 
 
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index 8645ae5..c0eeeea 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -82,6 +82,7 @@
         unitTests.add(new UT_rstime(this, mRes, mCtx));
         unitTests.add(new UT_rstypes(this, mRes, mCtx));
         unitTests.add(new UT_alloc(this, mRes, mCtx));
+        unitTests.add(new UT_static_globals(this, mRes, mCtx));
         unitTests.add(new UT_refcount(this, mRes, mCtx));
         unitTests.add(new UT_foreach(this, mRes, mCtx));
         unitTests.add(new UT_foreach_bounds(this, mRes, mCtx));
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_static_globals.java b/java/tests/RsTest/src/com/android/rs/test/UT_static_globals.java
new file mode 100644
index 0000000..f2b309c
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_static_globals.java
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+
+public class UT_static_globals extends UnitTest {
+    private Resources mRes;
+
+    protected UT_static_globals(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "Static Globals", ctx);
+        this.mRes = res;
+    }
+    /** Runs root() over a one-cell I32 allocation, then invokes the script-side check. */
+    public void run() {
+        final RenderScript rs = RenderScript.create(mCtx);
+        final ScriptC_static_globals script = new ScriptC_static_globals(rs);
+        rs.setMessageHandler(mRsMessage);
+        final Type oneCell = new Type.Builder(rs, Element.I32(rs)).setX(1).create();
+        final Allocation target = Allocation.createTyped(rs, oneCell);
+        script.forEach_root(target);
+        script.invoke_static_globals_test();
+        rs.finish();
+        waitForMessage();
+        rs.destroy();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/static_globals.rs b/java/tests/RsTest/src/com/android/rs/test/static_globals.rs
new file mode 100644
index 0000000..3e19faa
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/static_globals.rs
@@ -0,0 +1,17 @@
+#include "shared.rsh"
+
+static bool b = false;  // written by root(), read by static_globals_test()
+
+void root(const int *o, uint32_t x, uint32_t y) {
+    b = true;
+}
+
+void static_globals_test() {
+    // b can only be true if root() ran and its write to the static global stuck.
+    if (b) {
+        rsSendToClientBlocking(RS_MSG_TEST_PASSED);
+    } else {
+        rsSendToClientBlocking(RS_MSG_TEST_FAILED);
+    }
+}
+
diff --git a/rsCppUtils.h b/rsCppUtils.h
new file mode 100644
index 0000000..abae7d8
--- /dev/null
+++ b/rsCppUtils.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RS_CPP_UTILS_H
+#define ANDROID_RS_CPP_UTILS_H
+
+#ifndef RS_SERVER
+#include <utils/Log.h>
+#include <utils/String8.h>
+#include <utils/Vector.h>
+#include <cutils/atomic.h>
+#endif
+
+#include <stdint.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <time.h>
+
+#include <math.h>
+
+#ifdef RS_SERVER
+
+#include <string>
+#include <vector>
+#include <algorithm>
+
+namespace android {
+
+    // server has no Vector or String8 classes; implement on top of STL
+    // Server-side String8 layered on std::string.
+    class String8: public std::string {
+    public:
+        String8() : std::string() {}
+        // Copies the NUL-terminated string at ptr.
+        String8(const char *ptr) : std::string(ptr) {}
+
+        // Read-only view of the contents as a C string.
+        const char* string() const {
+            return c_str();
+        }
+
+        // Replaces the contents with the first len characters of str.
+        void setTo(const char* str, ssize_t len) {
+            assign(str, len);
+        }
+        // Replaces the contents with the NUL-terminated string str.
+        void setTo(const char* str) {
+            assign(str);
+        }
+    };
+
+    // Server-side Vector layered on std::vector.
+    template <class T> class Vector: public std::vector<T> {
+    public:
+        void push(T obj) {
+            std::vector<T>::push_back(obj);
+        }
+        void removeAt(uint32_t index) {
+            std::vector<T>::erase(std::vector<T>::begin() + index);
+        }
+        // Appends obj and returns the index it was stored at.
+        ssize_t add(const T& obj) {
+            std::vector<T>::push_back(obj);
+            return std::vector<T>::size() - 1;
+        }
+        // NOTE(review): resize() changes size(), not just capacity - confirm intended.
+        void setCapacity(ssize_t capacity) {
+            std::vector<T>::resize(capacity);
+        }
+        T* editArray() {
+            return std::vector<T>::data();
+        }
+        const T* array() {
+            return std::vector<T>::data();
+        }
+    };
+
+    // bool variant stored as chars; presumably so editArray()/array() can expose a
+    // plain array, which the bit-packed std::vector<bool> cannot - confirm.
+    template<> class Vector<bool>: public std::vector<char> {
+    public:
+        void push(bool obj) {
+            push_back(obj);
+        }
+        void removeAt(uint32_t index) {
+            erase(begin() + index);
+        }
+        ssize_t add(const bool& obj) {
+            push_back(obj);
+            return size() - 1;
+        }
+        void setCapacity(ssize_t capacity) {
+            resize(capacity);
+        }
+        bool* editArray() {
+            return reinterpret_cast<bool*>(data());
+        }
+        const bool* array() {
+            return reinterpret_cast<const bool*>(data());
+        }
+    };
+
+}
+
+#endif // RS_SERVER
+
+namespace android {
+namespace renderscript {
+
+#if 1  // Asserts enabled: a failed rsAssert reports through ALOGE and execution continues.
+#define rsAssert(v) do {if(!(v)) ALOGE("rsAssert failed: %s, in %s at %i", #v, __FILE__, __LINE__);} while (0)
+#else  // Disabled form: expands to an empty loop and never evaluates v.
+#define rsAssert(v) while (0)
+#endif
+
+// Returns the smaller of in1 and in2; in1 is returned unless in1 > in2.
+template<typename T>
+T rsMin(T in1, T in2) {
+    if (!(in1 > in2)) {
+        return in1;
+    }
+    return in2;
+}
+
+// Returns the larger of in1 and in2.
+// in1 is returned unless in1 < in2, so it also wins ties.
+template<typename T>
+T rsMax(T in1, T in2) {
+    return (in1 < in2) ? in2
+                       : in1;
+}
+
+// Returns the index of the highest set bit of val (0 when val <= 1).
+template<typename T>
+T rsFindHighBit(T val) {
+    uint32_t highest = 0;
+    for (; val > 1; val >>= 1) {
+        ++highest;
+    }
+    return highest;
+}
+// True when val has at most one bit set, so 0 is reported as a power of two as well.
+template<typename T>
+bool rsIsPow2(T val) {
+    return !(val & (val - 1));
+}
+// Returns v when rsIsPow2(v) holds (this includes 0), else the next power of two above v.
+template<typename T>
+T rsHigherPow2(T v) {
+    if (rsIsPow2(v)) {
+        return v;
+    }
+    return static_cast<T>(1) << (rsFindHighBit(v) + 1);  // shift in T: an int 1 cannot reach 2^31 and above
+}
+// Returns v when rsIsPow2(v) holds (this includes 0), else the largest power of two below v.
+template<typename T>
+T rsLowerPow2(T v) {
+    if (rsIsPow2(v)) {
+        return v;
+    }
+    return static_cast<T>(1) << rsFindHighBit(v);  // shift in T: an int 1 cannot reach 2^31 and above
+}
+
+// Rounds v up to a multiple of r (a non-zero power of two); returns v unchanged on bad r or overflow.
+template<typename T>
+T rsRound(T v, unsigned int r) {
+    if (r == 0 || (r & (r - 1)) != 0) {
+        rsAssert(false && "Must be power of 2 for rounding up");
+        return v;
+    }
+    T res = v + (r - 1);
+    if (res < v) {
+        rsAssert(false && "Overflow of rounding operation");
+        return v;
+    }
+    // Mask in T: a 32-bit ~(r - 1) would zero-extend and clear the high bits of a wider T.
+    return res & ~static_cast<T>(r - 1);
+}
+
+// Packs r, g, b (expected 0-255 each) into 565: r in bits 15-11, g in bits 10-5, b in bits 4-0.
+static inline uint16_t rs888to565(uint32_t r, uint32_t g, uint32_t b) {
+    const uint32_t red5 = r >> 3;
+    const uint32_t green6 = g >> 2;
+    const uint32_t blue5 = b >> 3;
+    return (uint16_t)((red5 << 11) | (green6 << 5) | blue5);
+}
+// Averages four 565 pixels field by field: each field is summed, then divided by 4.
+static inline uint16_t rsBoxFilter565(uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4) {
+    const uint32_t low5 = (i1 & 0x1f) + (i2 & 0x1f) + (i3 & 0x1f) + (i4 & 0x1f);
+    const uint32_t mid6 = ((i1 >> 5) & 0x3f) + ((i2 >> 5) & 0x3f) + ((i3 >> 5) & 0x3f) + ((i4 >> 5) & 0x3f);
+    const uint32_t top5 = (i1 >> 11) + (i2 >> 11) + (i3 >> 11) + (i4 >> 11);
+    return (low5 >> 2) | ((mid6 >> 2) << 5) | ((top5 >> 2) << 11);
+}
+// Averages four 8888 pixels byte lane by byte lane: each lane is summed, then divided by 4.
+static inline uint32_t rsBoxFilter8888(uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4) {
+    const uint32_t lane0 = (i1 & 0xff) + (i2 & 0xff) + (i3 & 0xff) + (i4 & 0xff);
+    const uint32_t lane1 = ((i1 >> 8) & 0xff) + ((i2 >> 8) & 0xff) + ((i3 >> 8) & 0xff) + ((i4 >> 8) & 0xff);
+    const uint32_t lane2 = ((i1 >> 16) & 0xff) + ((i2 >> 16) & 0xff) + ((i3 >> 16) & 0xff) + ((i4 >> 16) & 0xff);
+    const uint32_t lane3 = ((i1 >> 24) & 0xff) + ((i2 >> 24) & 0xff) + ((i3 >> 24) & 0xff) + ((i4 >> 24) & 0xff);
+    return (lane0 >> 2) | ((lane1 >> 2) << 8) | ((lane2 >> 2) << 16) | ((lane3 >> 2) << 24);
+}
+
+}
+}
+
+#endif //ANDROID_RS_CPP_UTILS_H
+
+
diff --git a/rsUtils.h b/rsUtils.h
index 3ef6b5a..d698928 100644
--- a/rsUtils.h
+++ b/rsUtils.h
@@ -20,211 +20,17 @@
 #define LOG_NDEBUG 0
 #define LOG_TAG "RenderScript"
 
-#ifndef RS_SERVER
-#include <utils/Log.h>
-#include <utils/String8.h>
-#include <utils/Vector.h>
-#include <cutils/atomic.h>
-#endif
-
-#include <stdint.h>
-
-#include <stdlib.h>
-#include <pthread.h>
-#include <time.h>
-
-#include <math.h>
-
 #ifdef RS_SERVER
 
-#include <string>
-#include <vector>
-#include <algorithm>
-
 #define ALOGE(...)
 #define ALOGV(...)
 #define ALOGW(...)
 #define ALOGD(...)
 
-namespace android {
-
-    // server has no Vector or String8 classes; implement on top of STL
-    class String8: public std::string {
-    public:
-    String8(const char *ptr) : std::string(ptr) {
-
-        }
-    String8() : std::string() {
-
-        }
-
-        const char* string() const {
-            return this->c_str();
-        }
-
-        void setTo(const char* str, ssize_t len) {
-            this->assign(str, len);
-        }
-        void setTo(const char* str) {
-            this->assign(str);
-        }
-
-    };
-
-    template <class T> class Vector: public std::vector<T> {
-    public:
-        void push(T obj) {
-            this->push_back(obj);
-        }
-        void removeAt(uint32_t index) {
-            this->erase(this->begin() + index);
-        }
-        ssize_t add(const T& obj) {
-            this->push_back(obj);
-            return this->size() - 1;
-        }
-        void setCapacity(ssize_t capacity) {
-            this->resize(capacity);
-        }
-
-        T* editArray() {
-            return this->data();
-        }
-
-        const T* array() {
-            return this->data();
-        }
-
-    };
-
-    template<> class Vector<bool>: public std::vector<char> {
-    public:
-        void push(bool obj) {
-            this->push_back(obj);
-        }
-        void removeAt(uint32_t index) {
-            this->erase(this->begin() + index);
-        }
-        ssize_t add(const bool& obj) {
-            this->push_back(obj);
-            return this->size() - 1;
-        }
-        void setCapacity(ssize_t capacity) {
-            this->resize(capacity);
-        }
-
-        bool* editArray() {
-            return (bool*)this->data();
-        }
-
-        const bool* array() {
-            return (const bool*)this->data();
-        }
-    };
-
-}
-
-#endif // RS_SERVER
-
-namespace android {
-namespace renderscript {
-
-#if 1
-#define rsAssert(v) do {if(!(v)) ALOGE("rsAssert failed: %s, in %s at %i", #v, __FILE__, __LINE__);} while (0)
-#else
-#define rsAssert(v) while (0)
 #endif
 
-template<typename T>
-T rsMin(T in1, T in2)
-{
-    if (in1 > in2) {
-        return in2;
-    }
-    return in1;
-}
+#include "rsCppUtils.h"
 
-template<typename T>
-T rsMax(T in1, T in2) {
-    if (in1 < in2) {
-        return in2;
-    }
-    return in1;
-}
-
-template<typename T>
-T rsFindHighBit(T val) {
-    uint32_t bit = 0;
-    while (val > 1) {
-        bit++;
-        val>>=1;
-    }
-    return bit;
-}
-
-template<typename T>
-bool rsIsPow2(T val) {
-    return (val & (val-1)) == 0;
-}
-
-template<typename T>
-T rsHigherPow2(T v) {
-    if (rsIsPow2(v)) {
-        return v;
-    }
-    return 1 << (rsFindHighBit(v) + 1);
-}
-
-template<typename T>
-T rsLowerPow2(T v) {
-    if (rsIsPow2(v)) {
-        return v;
-    }
-    return 1 << rsFindHighBit(v);
-}
-
-template<typename T>
-T rsRound(T v, unsigned int r) {
-    // Only valid for rounding up to powers of 2.
-    if ((r & (r - 1)) != 0) {
-        rsAssert(false && "Must be power of 2 for rounding up");
-        return v;
-    }
-    T res = v + (r - 1);
-    if (res < v) {
-        rsAssert(false && "Overflow of rounding operation");
-        return v;
-    }
-    res &= ~(r - 1);
-    return res;
-}
-
-static inline uint16_t rs888to565(uint32_t r, uint32_t g, uint32_t b) {
-    uint16_t t = 0;
-    t |= b >> 3;
-    t |= (g >> 2) << 5;
-    t |= (r >> 3) << 11;
-    return t;
-}
-
-static inline uint16_t rsBoxFilter565(uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4) {
-    uint32_t r = ((i1 & 0x1f) + (i2 & 0x1f) + (i3 & 0x1f) + (i4 & 0x1f));
-    uint32_t g = ((i1 >> 5) & 0x3f) + ((i2 >> 5) & 0x3f) + ((i3 >> 5) & 0x3f) + ((i4 >> 5) & 0x3f);
-    uint32_t b = ((i1 >> 11) + (i2 >> 11) + (i3 >> 11) + (i4 >> 11));
-    return (r >> 2) | ((g >> 2) << 5) | ((b >> 2) << 11);
-}
-
-static inline uint32_t rsBoxFilter8888(uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4) {
-    uint32_t r = (i1 & 0xff) +         (i2 & 0xff) +         (i3 & 0xff) +         (i4 & 0xff);
-    uint32_t g = ((i1 >> 8) & 0xff) +  ((i2 >> 8) & 0xff) +  ((i3 >> 8) & 0xff) +  ((i4 >> 8) & 0xff);
-    uint32_t b = ((i1 >> 16) & 0xff) + ((i2 >> 16) & 0xff) + ((i3 >> 16) & 0xff) + ((i4 >> 16) & 0xff);
-    uint32_t a = ((i1 >> 24) & 0xff) + ((i2 >> 24) & 0xff) + ((i3 >> 24) & 0xff) + ((i4 >> 24) & 0xff);
-    return (r >> 2) | ((g >> 2) << 8) | ((b >> 2) << 16) | ((a >> 2) << 24);
-}
-
-}
-}
-
-#endif //ANDROID_RS_OBJECT_BASE_H
+#endif //ANDROID_RS_UTILS_H