start refactoring rsSample
Change-Id: I3bc2209991dcb2a13a2629e8ecfe6955ae83a9a8
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index ddc87f9..142ca6e 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -33,7 +33,12 @@
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
LOCAL_SRC_FILES+= \
- rsCpuIntrinsics_neon.S
+ rsCpuIntrinsics_neon.S \
+ rsCpuSample_neon.S
+endif
+
+ifeq ($(ARCH_ARM_HAVE_VFP),true)
+ LOCAL_CFLAGS += -DARCH_ARM_HAVE_VFP
endif
LOCAL_SHARED_LIBRARIES += libRS libcutils libutils libsync
diff --git a/cpu_ref/rsCpuSample_neon.S b/cpu_ref/rsCpuSample_neon.S
new file mode 100644
index 0000000..5f1060b
--- /dev/null
+++ b/cpu_ref/rsCpuSample_neon.S
@@ -0,0 +1,293 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+/*
+ r0 = base pointer
+ r1 = image stride
+ r2 = w
+ r3 = h
+ sp = float u
+ sp = float v
+*/
+
+ENTRY(rsdCpuLinearClamp2D_RGBA_k2)
+ push {r4-r8, r10, r11, lr}
+ vpush {q4-q7}
+
+ // Load uv
+ ldr r4, [sp, #32+64]
+ ldr r5, [sp, #32+64+4]
+ vmov d18, r4, r5 // d18 = float u, v
+
+
+// float pixelU = (u * w) - 0.5f;
+// float pixelV = (v * h) - 0.5f;
+ vmov d16, r2, r3 // d16 = int w, h
+ vcvt.f32.s32 d17, d16 // d17 = float w, h
+ vmul.f32 d20, d18, d17 // d20 = pixelUV (uv * wh)
+
+ vld1.f32 d19, =0x3F000000 // 0.5
+ vsub.f32 d20, d20, d19 // d20 = pixelUV (uv * wh) - 0.5f
+
+// int iu = pixelU;
+// int iv = pixelV;
+ vcvt.s32.f32 d21, d20 // d21 = iPixelUV
+
+
+ //float fracU = pixelU - iu;
+ //float fracV = pixelV - iv;
+ vcvt.s32.f32 d19, d20 //
+ vcvt.f32.s32 d19, d19 // d19 = (float)iuv
+ vsub.f32 d0, d20, d19 // d0 = fract = pixelUV - iuv
+
+
+ //float oneMinusFracU = 1.0f - fracU;
+ //float oneMinusFracV = 1.0f - fracV;
+ vld1.f32 d22, =0x3F800000 // 0.5
+ vsub.f32 d1, d22, d0 // d1 = oneMinusFrac
+
+
+ //float weightsX1 = oneMinusFracU * oneMinusFracV;
+ //float weightsY1 = fracU * oneMinusFracV;
+ //float weightsX2 = fracV * oneMinusFracU;
+ //float weightsY2 = fracU * fracV;
+ vmul.f32 d2, d1, d1[1] // d2 = 1mu * 1mv , 1mv * 1mv
+ vmul.f32 d3, d0, d1[1] // d3 = u * 1mv , v * 1mv
+ vmul.f32 d4, d1, d0[1] // d4 = v * 1mu , v * 1mv
+ vmul.f32 d5, d0, d0[1] // d5 = u * v, v * v
+
+ //int nextX = rsMax(0, rsMin(iu + 1, w - 1));
+ //int nextY = rsMax(0, rsMin(iv + 1, h - 1));
+ //int locationX = rsMax(0, rsMin(iu, w - 1));
+ //int locationY = rsMax(0, rsMin(iv, h - 1));
+ vmov.u32 d6, #1
+ vmov.u32 d8, #0
+ vsub.s32 d16, d16, d6 // d16 = h -1, w -1
+ vadd.s32 d7, d6, d21 // d7 = iuv + 1
+
+ vmin.s32 d7, d7, d16
+ vmin.s32 d21, d21, d16
+ vmax.s32 d7, d7, d8 // d7 = next
+ vmax.s32 d21, d21, d8 // d21 = location
+
+ mov r2, #4
+ vmov d6, r2, r1 // d6 = 4, stride
+ vmul.s32 d7, d6 // d7 = nextX*4, nextY * stride
+ vmul.s32 d21, d6 // d21 = locationX*4, locationY * stride
+
+ //uchar4 *p0c = (uchar4*)&p[(locationY * stride) + (locationX * 4)];
+ //uchar4 *p1c = (uchar4*)&p[(locationY * stride) + (nextX * 4)];
+ //uchar4 *p2c = (uchar4*)&p[(nextY * stride) + (locationX * 4)];
+ //uchar4 *p3c = (uchar4*)&p[(nextY * stride) + (nextX * 4)];
+ vmov r2, r3, d7 // r2 = nextX*4, r3 = nextY *stride
+ vmov r4, r5, d21 // r4 = locX*4, r5 = locY*Stride
+ add r3, r3, r0 // r3 = p + nextY*stride
+ add r5, r5, r0 // r5 = p + locY*stride
+
+ //float4 p0 = convert_float4(*p0c);
+ //float4 p1 = convert_float4(*p1c);
+ add r1, r5, r4 // *p0c
+ ldr r0, [r1]
+ add r1, r5, r2 // *p1c
+ ldr r1, [r1]
+ vmov d0, r0, r1 // d0 = p0, p1
+
+ //float4 p2 = convert_float4(*p2c);
+ //float4 p3 = convert_float4(*p3c);
+ add r1, r3, r4 // *p2c
+ ldr r0, [r1]
+ add r1, r3, r2 // *p3c
+ ldr r1, [r1]
+ vmov d1, r0, r1 // d1 = p0, p1
+
+ //return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
+ vmovl.u8 q3, d0
+ vmovl.u8 q4, d1
+ vmovl.u16 q3, d6
+ vmovl.u16 q4, d7
+ vmovl.u16 q5, d8
+ vmovl.u16 q6, d9
+ vcvt.f32.u32 q3, q3
+ vcvt.f32.u32 q4, q4
+ vcvt.f32.u32 q5, q5
+ vcvt.f32.u32 q6, q6
+
+ //vmul.f32 q3, q3, d2[0]
+ //vmla.f32 q3, q4, d3[0]
+ //vmla.f32 q3, q5, d4[0]
+ //vmla.f32 q3, q6, d5[0]
+
+ vld1.f32 d0, =0x3B808081 // 1.f / 255.f
+ vmul.f32 q3, q3, d0[0]
+
+ vmov r0, r1, d6
+ vmov r2, r3, d7
+
+ mov r3, #0x3F800000
+
+ /* We're done, bye! */
+ vpop {q4-q7}
+ pop {r4-r8, r10, r11, lr}
+ bx lr
+END(rsdCpuLinearClamp2D_RGBA_k2)
+
+
+
+
+
+
+
+
+
+/*
+ r0 = base pointer
+ r1 = image stride
+ r2 = iu
+ r3 = iv
+ sp = w
+ sp = h
+*/
+
+ENTRY(rsdCpuLinearClamp2D_RGBA_k)
+ push {r4-r8, r10, r11, lr}
+ vpush {q4-q7}
+
+ vmov d2, r2, r3
+
+ add r4, sp, #32+64
+ vld1.32 d3, [r4]!
+ vld1.32 {q0}, [r4]!
+
+
+ mov r2, #4
+ vmov d6, r2, r1 // d6 = 4, stride
+ vmul.s32 d30, d6 // d30 = nextX*4, nextY * stride
+ vmul.s32 d31, d6 // d31 = locationX*4, locationY * stride
+
+ //uchar4 *p0c = (uchar4*)&p[(locationY * stride) + (locationX * 4)];
+ //uchar4 *p1c = (uchar4*)&p[(locationY * stride) + (nextX * 4)];
+ //uchar4 *p2c = (uchar4*)&p[(nextY * stride) + (locationX * 4)];
+ //uchar4 *p3c = (uchar4*)&p[(nextY * stride) + (nextX * 4)];
+ vmov r2, r3, d30 // r2 = nextX*4, r3 = nextY *stride
+ vmov r4, r5, d31 // r4 = locX*4, r5 = locY*Stride
+ add r3, r3, r0 // r3 = p + nextY*stride
+ add r5, r5, r0 // r5 = p + locY*stride
+
+ //float4 p0 = convert_float4(*p0c);
+ //float4 p1 = convert_float4(*p1c);
+ add r1, r5, r4 // *p0c
+ ldr r0, [r1]
+ add r1, r5, r2 // *p1c
+ ldr r1, [r1]
+ vmov d30, r0, r1 // d0 = p0, p1
+
+ //float4 p2 = convert_float4(*p2c);
+ //float4 p3 = convert_float4(*p3c);
+ add r1, r3, r4 // *p2c
+ ldr r0, [r1]
+ add r1, r3, r2 // *p3c
+ ldr r1, [r1]
+ vmov d31, r0, r1 // d1 = p0, p1
+
+ //return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
+ vmovl.u8 q2, d30
+ vmovl.u8 q3, d31
+ vmovl.u16 q8, d4
+ vmovl.u16 q9, d5
+ vmovl.u16 q10, d6
+ vmovl.u16 q11, d7
+ vcvt.f32.u32 q8, q8, #8
+ vcvt.f32.u32 q9, q9, #8
+ vcvt.f32.u32 q10, q10, #8
+ vcvt.f32.u32 q11, q11, #8
+
+ vmul.f32 q3, q8, d0[0]
+ vmla.f32 q3, q9, d0[1]
+ vmla.f32 q3, q10, d1[0]
+ vmla.f32 q3, q11, d1[1]
+
+/// vld1.f32 d0, =0x3B808081 // 1.f / 255.f
+// vmul.f32 q3, q3, d0[0]
+
+ vmov r0, r1, d6
+ vmov r2, r3, d7
+
+ mov r3, #0x3F800000
+
+ /* We're done, bye! */
+ vpop {q4-q7}
+ pop {r4-r8, r10, r11, lr}
+ bx lr
+END(rsdCpuLinearClamp2D_RGBA_k)
+
+
+
+/*
+ r0 = uint8_t *ptr
+ r1 = image stride
+ r2,r3 = iPixel
+ sp0,1 = next
+ q0 = weights
+*/
+
+ENTRY(rsdCpuGetSample2D_RGBA_k)
+ push {r4-r8, lr}
+
+ ldr r4, [sp, #24] // next.x
+ ldr r5, [sp, #24+4] // next.y
+
+ mul r3, r3, r1 // iPixel.y * stride
+ mul r5, r5, r1 // next.y * stride
+
+ add r2, r0, r2, LSL #2
+ add r4, r0, r4, LSL #2
+
+ ldr r0, [r2, r3] // r0 = p[(locationY * stride) + (locationX * 4)]
+ ldr r1, [r4, r3] // r1 = p[(locationY * stride) + (nextX * 4)]
+ ldr r2, [r2, r5] // r2 = p[(nextY * stride) + (locationX * 4)]
+ ldr r3, [r4, r5] // r3 = p[(nextY * stride) + (nextX * 4)]
+
+ vmov d30, r0, r1 // d30 = p0, p1
+ vmov d31, r2, r3 // d31 = p2, p3
+
+ vcvt.u32.f32 q0, q0, #8
+ vmovn.u32 d0, q0
+
+ //return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
+ vmovl.u8 q2, d30
+ vmovl.u8 q3, d31
+
+ vmull.u16 q8, d4, d0[0]
+ vmlal.u16 q8, d5, d0[1]
+ vmlal.u16 q8, d6, d0[2]
+ vmlal.u16 q8, d7, d0[3]
+
+ vcvt.f32.u32 q3, q8, #8
+
+ ldr r1, =0x3B808081 // 1.f / 255.f
+ vmov.32 d0[0], r1
+ vmul.f32 q0, q3, d0[0]
+
+ /* We're done, bye! */
+ pop {r4-r8, lr}
+ bx lr
+END(rsdCpuGetSample2D_RGBA_k)
+
diff --git a/driver/rsdSampler.cpp b/driver/rsdSampler.cpp
index 96b27d4..c8c338f 100644
--- a/driver/rsdSampler.cpp
+++ b/driver/rsdSampler.cpp
@@ -29,11 +29,840 @@
#include <GLES/glext.h>
#endif
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uint8_t uchar4 __attribute__((ext_vector_type(4)));
+
using namespace android;
using namespace android::renderscript;
-bool rsdSamplerInit(const android::renderscript::Context *,
- const android::renderscript::Sampler *) {
+#if defined(ARCH_ARM_HAVE_VFP)
+ #define LOCAL_CALL __attribute__((pcs("aapcs-vfp")))
+#else
+ #define LOCAL_CALL
+#endif
+
+// 565 Conversion bits taken from SkBitmap
+#define SK_R16_BITS 5
+#define SK_G16_BITS 6
+#define SK_B16_BITS 5
+
+#define SK_R16_SHIFT (SK_B16_BITS + SK_G16_BITS)
+#define SK_G16_SHIFT (SK_B16_BITS)
+#define SK_B16_SHIFT 0
+
+#define SK_R16_MASK ((1 << SK_R16_BITS) - 1)
+#define SK_G16_MASK ((1 << SK_G16_BITS) - 1)
+#define SK_B16_MASK ((1 << SK_B16_BITS) - 1)
+
+static inline unsigned SkR16ToR32(unsigned r) {
+ return (r << (8 - SK_R16_BITS)) | (r >> (2 * SK_R16_BITS - 8));
+}
+
+static inline unsigned SkG16ToG32(unsigned g) {
+ return (g << (8 - SK_G16_BITS)) | (g >> (2 * SK_G16_BITS - 8));
+}
+
+static inline unsigned SkB16ToB32(unsigned b) {
+ return (b << (8 - SK_B16_BITS)) | (b >> (2 * SK_B16_BITS - 8));
+}
+
+#define SkPacked16ToR32(c) SkR16ToR32(SkGetPackedR16(c))
+#define SkPacked16ToG32(c) SkG16ToG32(SkGetPackedG16(c))
+#define SkPacked16ToB32(c) SkB16ToB32(SkGetPackedB16(c))
+
+#define SkGetPackedR16(color) (((unsigned)(color) >> SK_R16_SHIFT) & SK_R16_MASK)
+#define SkGetPackedG16(color) (((unsigned)(color) >> SK_G16_SHIFT) & SK_G16_MASK)
+#define SkGetPackedB16(color) (((unsigned)(color) >> SK_B16_SHIFT) & SK_B16_MASK)
+
+static float3 getFrom565(uint16_t color) {
+ float3 result;
+ result.x = (float)SkPacked16ToR32(color);
+ result.y = (float)SkPacked16ToG32(color);
+ result.z = (float)SkPacked16ToB32(color);
+ return result;
+}
+
+
+
+/**
+* Allocation sampling
+*/
+static inline float getElementAt1(const uint8_t *p, int32_t x) {
+ float r = p[x];
+ return r;
+}
+
+static inline float2 getElementAt2(const uint8_t *p, int32_t x) {
+ x *= 2;
+ float2 r = {p[x], p[x+1]};
+ return r;
+}
+
+static inline float3 getElementAt3(const uint8_t *p, int32_t x) {
+ x *= 4;
+ float3 r = {p[x], p[x+1], p[x+2]};
+ return r;
+}
+
+static inline float4 getElementAt4(const uint8_t *p, int32_t x) {
+ x *= 4;
+ float4 r = {p[x], p[x+1], p[x+2], p[x+3]};
+ return r;
+}
+
+static inline float3 getElementAt565(const uint8_t *p, int32_t x) {
+ x *= 2;
+ float3 r = getFrom565(((const uint16_t *)p)[0]);
+ return r;
+}
+
+static inline float getElementAt1(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+ p += y * stride;
+ float r = p[x];
+ return r;
+}
+
+static inline float2 getElementAt2(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+ p += y * stride;
+ x *= 2;
+ float2 r = {p[x], p[x+1]};
+ return r;
+}
+
+static inline float3 getElementAt3(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+ p += y * stride;
+ x *= 4;
+ float3 r = {p[x], p[x+1], p[x+2]};
+ return r;
+}
+
+static inline float4 getElementAt4(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+ p += y * stride;
+ x *= 4;
+ float4 r = {p[x], p[x+1], p[x+2], p[x+3]};
+ return r;
+}
+
+static inline float3 getElementAt565(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
+ p += y * stride;
+ x *= 2;
+ float3 r = getFrom565(((const uint16_t *)p)[0]);
+ return r;
+}
+
+
+
+
+
+static float4 LOCAL_CALL
+ getSample1D_A(const uint8_t *p, int32_t iPixel,
+ int32_t next, float w0, float w1) {
+ float p0 = getElementAt1(p, iPixel);
+ float p1 = getElementAt1(p, next);
+ float r = p0 * w0 + p1 * w1;
+ r *= (1.f / 255.f);
+ float4 ret = {0.f, 0.f, 0.f, r};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample1D_L(const uint8_t *p, int32_t iPixel,
+ int32_t next, float w0, float w1) {
+ float p0 = getElementAt1(p, iPixel);
+ float p1 = getElementAt1(p, next);
+ float r = p0 * w0 + p1 * w1;
+ r *= (1.f / 255.f);
+ float4 ret = {r, r, r, 1.f};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample1D_LA(const uint8_t *p, int32_t iPixel,
+ int32_t next, float w0, float w1) {
+ float2 p0 = getElementAt2(p, iPixel);
+ float2 p1 = getElementAt2(p, next);
+ float2 r = p0 * w0 + p1 * w1;
+ r *= (1.f / 255.f);
+ float4 ret = {r.x, r.x, r.x, r.y};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample1D_RGB(const uint8_t *p, int32_t iPixel,
+ int32_t next, float w0, float w1) {
+ float3 p0 = getElementAt3(p, iPixel);
+ float3 p1 = getElementAt3(p, next);
+ float3 r = p0 * w0 + p1 * w1;
+ r *= (1.f / 255.f);
+ float4 ret = {r.x, r.x, r.z, 1.f};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample1D_565(const uint8_t *p, int32_t iPixel,
+ int32_t next, float w0, float w1) {
+ float3 p0 = getElementAt565(p, iPixel);
+ float3 p1 = getElementAt565(p, next);
+ float3 r = p0 * w0 + p1 * w1;
+ r *= (1.f / 255.f);
+ float4 ret = {r.x, r.x, r.z, 1.f};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample1D_RGBA(const uint8_t *p, int32_t iPixel,
+ int32_t next, float w0, float w1) {
+ float4 p0 = getElementAt4(p, iPixel);
+ float4 p1 = getElementAt4(p, next);
+ float4 r = p0 * w0 + p1 * w1;
+ r *= (1.f / 255.f);
+ return r;
+}
+
+
+static float4 LOCAL_CALL
+ getSample2D_A(const uint8_t *p, size_t stride,
+ int locX, int locY, int nextX, int nextY,
+ float w0, float w1, float w2, float w3) {
+ float p0 = getElementAt1(p, stride, locX, locY);
+ float p1 = getElementAt1(p, stride, nextX, locY);
+ float p2 = getElementAt1(p, stride, locX, nextY);
+ float p3 = getElementAt1(p, stride, nextX, nextY);
+ float r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+ r *= (1.f / 255.f);
+ float4 ret = {0.f, 0.f, 0.f, r};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample2D_L(const uint8_t *p, size_t stride,
+ int locX, int locY, int nextX, int nextY,
+ float w0, float w1, float w2, float w3) {
+ float p0 = getElementAt1(p, stride, locX, locY);
+ float p1 = getElementAt1(p, stride, nextX, locY);
+ float p2 = getElementAt1(p, stride, locX, nextY);
+ float p3 = getElementAt1(p, stride, nextX, nextY);
+ float r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+ r *= (1.f / 255.f);
+ float4 ret = {r, r, r, 1.f};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample2D_LA(const uint8_t *p, size_t stride,
+ int locX, int locY, int nextX, int nextY,
+ float w0, float w1, float w2, float w3) {
+ float2 p0 = getElementAt2(p, stride, locX, locY);
+ float2 p1 = getElementAt2(p, stride, nextX, locY);
+ float2 p2 = getElementAt2(p, stride, locX, nextY);
+ float2 p3 = getElementAt2(p, stride, nextX, nextY);
+ float2 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+ r *= (1.f / 255.f);
+ float4 ret = {r.x, r.x, r.x, r.y};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample2D_RGB(const uint8_t *p, size_t stride,
+ int locX, int locY, int nextX, int nextY,
+ float w0, float w1, float w2, float w3) {
+ float4 p0 = getElementAt4(p, stride, locX, locY);
+ float4 p1 = getElementAt4(p, stride, nextX, locY);
+ float4 p2 = getElementAt4(p, stride, locX, nextY);
+ float4 p3 = getElementAt4(p, stride, nextX, nextY);
+ float4 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+ r *= (1.f / 255.f);
+ float4 ret = {r.x, r.y, r.z, 1.f};
+ return ret;
+}
+static float4 LOCAL_CALL
+ getSample2D_RGBA(const uint8_t *p, size_t stride,
+ int locX, int locY, int nextX, int nextY,
+ float w0, float w1, float w2, float w3) {
+ float4 p0 = getElementAt4(p, stride, locX, locY);
+ float4 p1 = getElementAt4(p, stride, nextX, locY);
+ float4 p2 = getElementAt4(p, stride, locX, nextY);
+ float4 p3 = getElementAt4(p, stride, nextX, nextY);
+ float4 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+ r *= (1.f / 255.f);
+ return r;
+}
+static float4 getSample2D_565(const uint8_t *p, size_t stride,
+ int locX, int locY, int nextX, int nextY,
+ float w0, float w1, float w2, float w3) {
+ float3 p0 = getElementAt565(p, stride, locX, locY);
+ float3 p1 = getElementAt565(p, stride, nextX, locY);
+ float3 p2 = getElementAt565(p, stride, locX, nextY);
+ float3 p3 = getElementAt565(p, stride, nextX, nextY);
+ float3 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+ r *= (1.f / 255.f);
+ float4 ret = {r.x, r.y, r.z, 1.f};
+ return ret;
+}
+
+
+extern "C" {
+ typedef float4 Sampler2DFn(const uint8_t *p, size_t stride,
+ int lx, int ly, int nx, int ny,
+ float w0, float w1, float w2, float w3) LOCAL_CALL;
+
+ Sampler2DFn rsdCpuGetSample2D_L_k;
+ Sampler2DFn rsdCpuGetSample2D_A_k;
+ Sampler2DFn rsdCpuGetSample2D_LA_k;
+ Sampler2DFn rsdCpuGetSample2D_RGB_k;
+ Sampler2DFn rsdCpuGetSample2D_RGBA_k;
+}
+
+#if 0
+static Sampler2DFn* GetBilinearSampleTable2D[] = {
+ 0, 0, 0, 0, 0, 0, 0,
+ 0,//rsdCpuGetSample2D_L_k,
+ 0,//rsdCpuGetSample2D_A_k,
+ 0,//rsdCpuGetSample2D_LA_k,
+ 0,//rsdCpuGetSample2D_RGB_k,
+ rsdCpuGetSample2D_RGBA_k
+};
+
+#else
+static Sampler2DFn* GetBilinearSampleTable2D[] = {
+ 0, 0, 0, 0, 0, 0, 0,
+ &getSample2D_L,
+ &getSample2D_A,
+ &getSample2D_LA,
+ &getSample2D_RGB,
+ &getSample2D_RGBA,
+};
+#endif
+
+
+static int applyWrapMode(RsSamplerValue mode, int coord, int size) {
+ switch (mode) {
+ case RS_SAMPLER_WRAP:
+ coord = coord % size;
+ if (coord < 0) {
+ coord += size;
+ }
+ break;
+
+ case RS_SAMPLER_CLAMP:
+ coord = rsMax(0, rsMin(coord, size - 1));
+ break;
+
+ case RS_SAMPLER_MIRRORED_REPEAT:
+ coord = coord % (size * 2);
+ if (coord < 0) {
+ coord = (size * 2) + coord;
+ }
+ if (coord >= size) {
+ coord = (size * 2) - coord;
+ }
+ break;
+
+ default:
+ coord = 0;
+ rsAssert(0);
+ }
+ return coord;
+}
+
+static float4
+ sample_LOD_LinearPixel(Allocation *a, const Type *type,
+ RsDataKind dk, RsDataType dt,
+ Sampler *s,
+ float uv, int32_t lod) {
+ RsSamplerValue wrapS = s->mHal.state.wrapS;
+ int32_t sourceW = type->mHal.state.lodDimX[lod];
+ float pixelUV = uv * (float)(sourceW);
+ int32_t iPixel = (int32_t)(pixelUV);
+ float frac = pixelUV - (float)iPixel;
+
+ if (frac < 0.5f) {
+ iPixel -= 1;
+ frac += 0.5f;
+ } else {
+ frac -= 0.5f;
+ }
+
+ float oneMinusFrac = 1.0f - frac;
+
+ int32_t next = applyWrapMode(wrapS, iPixel + 1, sourceW);
+ int32_t loc = applyWrapMode(wrapS, iPixel, sourceW);
+
+ const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+
+ if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+ return getSample1D_565(ptr, loc, next, next, frac);
+ }
+
+ switch(dk) {
+ case RS_KIND_PIXEL_L:
+ return getSample1D_L(ptr, loc, next, next, frac);
+ case RS_KIND_PIXEL_A:
+ return getSample1D_A(ptr, loc, next, next, frac);
+ case RS_KIND_PIXEL_LA:
+ return getSample1D_LA(ptr, loc, next, next, frac);
+ case RS_KIND_PIXEL_RGB:
+ return getSample1D_RGB(ptr, loc, next, next, frac);
+ case RS_KIND_PIXEL_RGBA:
+ return getSample1D_RGBA(ptr, loc, next, next, frac);
+
+ case RS_KIND_PIXEL_YUV:
+ case RS_KIND_USER:
+ case RS_KIND_INVALID:
+ case RS_KIND_PIXEL_DEPTH:
+ rsAssert(0);
+ break;
+ }
+
+ return 0.f;
+}
+
+static float4
+ sample_LOD_NearestPixel(Allocation *a, const Type *type,
+ RsDataKind dk, RsDataType dt,
+ Sampler *s, float uv, int32_t lod) {
+ RsSamplerValue wrapS = s->mHal.state.wrapS;
+ int32_t sourceW = type->mHal.state.lodDimX[lod];
+ int32_t iPixel = (int32_t)(uv * (float)(sourceW));
+ int32_t location = applyWrapMode(wrapS, iPixel, sourceW);
+
+
+ const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+
+ float4 result = {0.f, 0.f, 0.f, 1.f};
+ if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+ result.xyz = getElementAt565(ptr, iPixel);
+ return result;
+ }
+
+ switch(dk) {
+ case RS_KIND_PIXEL_L:
+ {
+ float t = getElementAt1(ptr, iPixel);
+ result.xyz = t;
+ }
+ break;
+ case RS_KIND_PIXEL_A:
+ result.w = getElementAt1(ptr, iPixel);
+ break;
+ case RS_KIND_PIXEL_LA:
+ {
+ float2 t = getElementAt2(ptr, iPixel);
+ result.xyz = t.x;
+ result.w = t.y;
+ }
+ break;
+ case RS_KIND_PIXEL_RGB:
+ result.xyz = getElementAt3(ptr, iPixel);
+ break;
+ case RS_KIND_PIXEL_RGBA:
+ result = getElementAt4(ptr, iPixel);
+ break;
+
+ case RS_KIND_PIXEL_YUV:
+ case RS_KIND_USER:
+ case RS_KIND_INVALID:
+ case RS_KIND_PIXEL_DEPTH:
+ rsAssert(0);
+ break;
+ }
+
+ return result * (1.f / 255.f);
+}
+
+
+static float4
+ sample_LOD_LinearPixel(Allocation *a, const Type *type,
+ RsDataKind dk, RsDataType dt,
+ Sampler *s, float u, float v, int32_t lod) {
+ const RsSamplerValue wrapS = s->mHal.state.wrapS;
+ const RsSamplerValue wrapT = s->mHal.state.wrapT;
+ const int sourceW = type->mHal.state.lodDimX[lod];
+ const int sourceH = type->mHal.state.lodDimY[lod];
+
+ float pixelU = u * (float)sourceW;
+ float pixelV = v * (float)sourceH;
+ int iPixelU = (int)pixelU;
+ int iPixelV = (int)pixelV;
+
+ float fracU = pixelU - iPixelU;
+ float fracV = pixelV - iPixelV;
+
+ if (fracU < 0.5f) {
+ iPixelU -= 1;
+ fracU += 0.5f;
+ } else {
+ fracU -= 0.5f;
+ }
+ if (fracV < 0.5f) {
+ iPixelV -= 1;
+ fracV += 0.5f;
+ } else {
+ fracV -= 0.5f;
+ }
+ float oneMinusFracU = 1.0f - fracU;
+ float oneMinusFracV = 1.0f - fracV;
+
+ float w1 = oneMinusFracU * oneMinusFracV;
+ float w2 = fracU * oneMinusFracV;
+ float w3 = oneMinusFracU * fracV;
+ float w4 = fracU * fracV;
+
+ int nextX = applyWrapMode(wrapS, iPixelU + 1, sourceW);
+ int nextY = applyWrapMode(wrapT, iPixelV + 1, sourceH);
+ int locX = applyWrapMode(wrapS, iPixelU, sourceW);
+ int locY = applyWrapMode(wrapT, iPixelV, sourceH);
+
+ const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+ size_t stride = a->mHal.drvState.lod[lod].stride;
+
+ if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+ return getSample2D_565(ptr, stride, locX, locY, nextX, nextY, w1, w2, w3, w4);
+ }
+
+ return GetBilinearSampleTable2D[dk](ptr, stride, locX, locY, nextX, nextY, w1, w2, w3, w4);
+}
+
+static float4
+ sample_LOD_NearestPixel(Allocation *a, const Type *type,
+ RsDataKind dk, RsDataType dt,
+ Sampler *s,
+ float u, float v, int32_t lod) {
+ RsSamplerValue wrapS = s->mHal.state.wrapS;
+ RsSamplerValue wrapT = s->mHal.state.wrapT;
+
+ int32_t sourceW = type->mHal.state.lodDimX[lod];
+ int32_t sourceH = type->mHal.state.lodDimY[lod];
+
+ int locX = applyWrapMode(wrapS, u * sourceW, sourceW);
+ int locY = applyWrapMode(wrapT, v * sourceH, sourceH);
+
+
+ const uint8_t *ptr = (const uint8_t *)a->mHal.drvState.lod[lod].mallocPtr;
+ size_t stride = a->mHal.drvState.lod[lod].stride;
+
+ float4 result = {0.f, 0.f, 0.f, 1.f};
+ if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+ result.xyz = getElementAt565(ptr, stride, locX, locY);
+ return result;
+ }
+
+ switch(dk) {
+ case RS_KIND_PIXEL_L:
+ {
+ float t = getElementAt1(ptr, stride, locX, locY);
+ result.xyz = t;
+ }
+ break;
+ case RS_KIND_PIXEL_A:
+ result.w = getElementAt1(ptr, stride, locX, locY);
+ break;
+ case RS_KIND_PIXEL_LA:
+ {
+ float2 t = getElementAt2(ptr, stride, locX, locY);
+ result.xyz = t.x;
+ result.w = t.y;
+ }
+ break;
+ case RS_KIND_PIXEL_RGB:
+ result.xyz = getElementAt3(ptr, stride, locX, locY);
+ break;
+ case RS_KIND_PIXEL_RGBA:
+ result = getElementAt4(ptr, stride, locX, locY);
+ break;
+
+
+ case RS_KIND_PIXEL_YUV:
+ case RS_KIND_USER:
+ case RS_KIND_INVALID:
+ case RS_KIND_PIXEL_DEPTH:
+ rsAssert(0);
+ break;
+ }
+
+ return result * (1.f / 255.f);
+}
+
+
+
+static float4 GenericSample1D(Allocation *a, Sampler *s, float u, float lod) {
+ const Type *type = a->getType();
+ const Element *elem = type->getElement();
+ const RsDataKind dk = elem->getKind();
+ const RsDataType dt = elem->getType();
+
+ if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {
+ return 0.f;
+ }
+
+ if (!(a->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE)) {
+ const Context *rsc = RsdCpuReference::getTlsContext();
+ rsc->setError(RS_ERROR_BAD_VALUE, "Sampling from texture witout USAGE_GRAPHICS_TEXTURE.");
+ return 0.f;
+ }
+
+ if (lod <= 0.0f) {
+ if (s->mHal.state.magFilter == RS_SAMPLER_NEAREST) {
+ return sample_LOD_NearestPixel(a, type, dk, dt, s, u, 0);
+ }
+ return sample_LOD_LinearPixel(a, type, dk, dt, s, u, 0);
+ }
+
+ if (s->mHal.state.minFilter == RS_SAMPLER_LINEAR_MIP_NEAREST) {
+ int32_t maxLOD = type->mHal.state.lodCount - 1;
+ lod = rsMin(lod, (float)maxLOD);
+ int32_t nearestLOD = (int32_t)round(lod);
+ return sample_LOD_LinearPixel(a, type, dk, dt, s, u, nearestLOD);
+ }
+
+ if (s->mHal.state.minFilter == RS_SAMPLER_LINEAR_MIP_LINEAR) {
+ int32_t lod0 = (int32_t)floor(lod);
+ int32_t lod1 = (int32_t)ceil(lod);
+ int32_t maxLOD = type->mHal.state.lodCount - 1;
+ lod0 = rsMin(lod0, maxLOD);
+ lod1 = rsMin(lod1, maxLOD);
+ float4 sample0 = sample_LOD_LinearPixel(a, type, dk, dt, s, u, lod0);
+ float4 sample1 = sample_LOD_LinearPixel(a, type, dk, dt, s, u, lod1);
+ float frac = lod - (float)lod0;
+ return sample0 * (1.0f - frac) + sample1 * frac;
+ }
+
+ return sample_LOD_NearestPixel(a, type, dk, dt, s, u, 0);
+}
+
+static float4 GenericSample2D(Allocation *a, Sampler *s, float u, float v, float lod) {
+ const Type *type = a->getType();
+ const Element *elem = type->getElement();
+ const RsDataKind dk = elem->getKind();
+ const RsDataType dt = elem->getType();
+
+ if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {
+ return 0.f;
+ }
+
+ if (!(a->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE)) {
+ const Context *rsc = RsdCpuReference::getTlsContext();
+ rsc->setError(RS_ERROR_BAD_VALUE, "Sampling from texture witout USAGE_GRAPHICS_TEXTURE.");
+ return 0.f;
+ }
+
+ if (lod <= 0.0f) {
+ if (s->mHal.state.magFilter == RS_SAMPLER_NEAREST) {
+ return sample_LOD_NearestPixel(a, type, dk, dt, s, u, v, 0);
+ }
+ return sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, 0);
+ }
+
+ if (s->mHal.state.minFilter == RS_SAMPLER_LINEAR_MIP_NEAREST) {
+ int32_t maxLOD = type->mHal.state.lodCount - 1;
+ lod = rsMin(lod, (float)maxLOD);
+ int32_t nearestLOD = (int32_t)round(lod);
+ return sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, nearestLOD);
+ }
+
+ if (s->mHal.state.minFilter == RS_SAMPLER_LINEAR_MIP_LINEAR) {
+ int32_t lod0 = (int32_t)floor(lod);
+ int32_t lod1 = (int32_t)ceil(lod);
+ int32_t maxLOD = type->mHal.state.lodCount - 1;
+ lod0 = rsMin(lod0, maxLOD);
+ lod1 = rsMin(lod1, maxLOD);
+ float4 sample0 = sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, lod0);
+ float4 sample1 = sample_LOD_LinearPixel(a, type, dk, dt, s, u, v, lod1);
+ float frac = lod - (float)lod0;
+ return sample0 * (1.0f - frac) + sample1 * frac;
+ }
+
+ return sample_LOD_NearestPixel(a, type, dk, dt, s, u, v, 0);
+}
+
+
+
+
+// Must match pixel kind in rsDefines.h
+static void * NearestWrap[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+static void * NearestClamp[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+static void * NearestMirroredRepeat[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+// Must match pixel kind in rsDefines.h
+static void * LinearWrap[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+// Must match pixel kind in rsDefines.h
+static void * LinearClamp[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+// Must match pixel kind in rsDefines.h
+static void * LinearMirroredRepeat[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+// Must match pixel kind in rsDefines.h
+static void * Generic[] = {
+ (void *) GenericSample1D, // L,
+ (void *) GenericSample1D, // A,
+ (void *) GenericSample1D, // LA,
+ (void *) GenericSample1D, // RGB,
+ (void *) GenericSample1D, // RGBA,
+ 0,
+ (void *) GenericSample1D, // YUV
+
+ (void *) GenericSample2D, // L,
+ (void *) GenericSample2D, // A,
+ (void *) GenericSample2D, // LA,
+ (void *) GenericSample2D, // RGB,
+ (void *) GenericSample2D, // RGBA,
+ 0,
+ (void *) GenericSample2D, // YUV
+};
+
+bool rsdSamplerInit(const Context *, const Sampler *s) {
+ s->mHal.drv = Generic;
+
+ if ((s->mHal.state.minFilter == s->mHal.state.magFilter) &&
+ (s->mHal.state.wrapS == s->mHal.state.wrapT)) {
+ // We have fast paths for these.
+
+ switch(s->mHal.state.minFilter) {
+ case RS_SAMPLER_NEAREST:
+ switch(s->mHal.state.wrapS) {
+ case RS_SAMPLER_WRAP:
+ s->mHal.drv = NearestWrap;
+ break;
+ case RS_SAMPLER_CLAMP:
+ s->mHal.drv = NearestClamp;
+ break;
+ case RS_SAMPLER_MIRRORED_REPEAT:
+ s->mHal.drv = NearestMirroredRepeat;
+ break;
+ default:
+ break;
+ }
+ break;
+ case RS_SAMPLER_LINEAR:
+ switch(s->mHal.state.wrapS) {
+ case RS_SAMPLER_WRAP:
+ s->mHal.drv = LinearWrap;
+ break;
+ case RS_SAMPLER_CLAMP:
+ s->mHal.drv = LinearClamp;
+ break;
+ case RS_SAMPLER_MIRRORED_REPEAT:
+ s->mHal.drv = LinearMirroredRepeat;
+ break;
+ default:
+ break;
+ }
+ break;
+ case RS_SAMPLER_LINEAR_MIP_LINEAR:
+ switch(s->mHal.state.wrapS) {
+ case RS_SAMPLER_WRAP:
+ s->mHal.drv = LinearWrap;
+ break;
+ case RS_SAMPLER_CLAMP:
+ s->mHal.drv = LinearClamp;
+ break;
+ case RS_SAMPLER_MIRRORED_REPEAT:
+ s->mHal.drv = LinearMirroredRepeat;
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ rsAssert(0);
+ break;
+ }
+
+ }
+
return true;
}