Apply eSize optimization to getElementAt

Implement 3D versions of the typed get/set functions

Performance improves 2-4% in most benchmarks that use these.

Change-Id: I6c752b3381d9b3c866b50039c33767fef46b2d20
diff --git a/lib/Renderscript/runtime/rs_allocation.c b/lib/Renderscript/runtime/rs_allocation.c
index e348d3a..de7ddbd 100644
--- a/lib/Renderscript/runtime/rs_allocation.c
+++ b/lib/Renderscript/runtime/rs_allocation.c
@@ -97,62 +97,103 @@
     memcpy((void*)&p[(eSize * x) + (y * stride)], ptr, eSize);
 }
 
-#define SET_ELEMENT_AT(T)                                               \
+extern void __attribute__((overloadable))  /* untyped 3D store: copies one element from ptr to (x, y, z) */
+        rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y, uint32_t z) {
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    uint8_t *base = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;  /* start of LOD 0 storage */
+    const uint32_t elemBytes = alloc->mHal.state.elementSizeBytes;     /* size taken from the allocation, not a type */
+    const uint32_t yPitch = alloc->mHal.drvState.lod[0].stride;        /* bytes added per unit of y */
+    const uint32_t zPitch = yPitch * alloc->mHal.drvState.lod[0].dimY; /* bytes added per unit of z */
+    memcpy(&base[(elemBytes * x) + (y * yPitch) + (z * zPitch)], ptr, elemBytes);
+}
+
+#define ELEMENT_AT(T) /* typed 1D/2D/3D setters and getters for element type T */ \
     extern void __attribute__((overloadable))                           \
-    __rsSetElementAt_##T(rs_allocation a, T val, uint32_t x) {          \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x) {            \
         Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
+        uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; /* stored through, so not const */ \
         const uint32_t eSize = sizeof(T);                               \
         *((T*)&p[(eSize * x)]) = val;                                   \
     }                                                                   \
     extern void __attribute__((overloadable))                           \
-    __rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
         Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
+        uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
         const uint32_t eSize = sizeof(T);                               \
         const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
         *((T*)&p[(eSize * x) + (y * stride)]) = val;                    \
+    }                                                                   \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z) { /* _##T suffix, like the 1D/2D setters */ \
+        Allocation_t *alloc = (Allocation_t *)a.p;                      \
+        uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
+        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
+        const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;         \
+        uint8_t *dp = &p[(sizeof(T) * x) + (y * stride) + (z * stride * dimY)]; /* each z step spans stride * dimY bytes */ \
+        ((T*)dp)[0] = val;                                              \
+    }                                                                   \
+    extern T __attribute__((overloadable))                              \
+    rsGetElementAt_##T(rs_allocation a, uint32_t x) {                   \
+        Allocation_t *alloc = (Allocation_t *)a.p;                      \
+        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
+        return *((const T*)&p[(sizeof(T) * x)]); /* element size is sizeof(T), no lookup in the allocation */ \
+    }                                                                   \
+    extern T __attribute__((overloadable))                              \
+    rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y) {       \
+        Allocation_t *alloc = (Allocation_t *)a.p;                      \
+        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
+        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
+        return *((const T*)&p[(sizeof(T) * x) + (y * stride)]);         \
+    }                                                                   \
+    extern T __attribute__((overloadable))                              \
+    rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) { \
+        Allocation_t *alloc = (Allocation_t *)a.p;                      \
+        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
+        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
+        const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;         \
+        const uint8_t *dp = &p[(sizeof(T) * x) + (y * stride) + (z * stride * dimY)]; \
+        return ((const T*)dp)[0];                                       \
     }
 
-SET_ELEMENT_AT(char)
-SET_ELEMENT_AT(char2)
-SET_ELEMENT_AT(char3)
-SET_ELEMENT_AT(char4)
-SET_ELEMENT_AT(uchar)
-SET_ELEMENT_AT(uchar2)
-SET_ELEMENT_AT(uchar3)
-SET_ELEMENT_AT(uchar4)
-SET_ELEMENT_AT(short)
-SET_ELEMENT_AT(short2)
-SET_ELEMENT_AT(short3)
-SET_ELEMENT_AT(short4)
-SET_ELEMENT_AT(ushort)
-SET_ELEMENT_AT(ushort2)
-SET_ELEMENT_AT(ushort3)
-SET_ELEMENT_AT(ushort4)
-SET_ELEMENT_AT(int)
-SET_ELEMENT_AT(int2)
-SET_ELEMENT_AT(int3)
-SET_ELEMENT_AT(int4)
-SET_ELEMENT_AT(uint)
-SET_ELEMENT_AT(uint2)
-SET_ELEMENT_AT(uint3)
-SET_ELEMENT_AT(uint4)
-SET_ELEMENT_AT(long)
-SET_ELEMENT_AT(long2)
-SET_ELEMENT_AT(long3)
-SET_ELEMENT_AT(long4)
-SET_ELEMENT_AT(ulong)
-SET_ELEMENT_AT(ulong2)
-SET_ELEMENT_AT(ulong3)
-SET_ELEMENT_AT(ulong4)
-SET_ELEMENT_AT(float)
-SET_ELEMENT_AT(float2)
-SET_ELEMENT_AT(float3)
-SET_ELEMENT_AT(float4)
-SET_ELEMENT_AT(double)
-SET_ELEMENT_AT(double2)
-SET_ELEMENT_AT(double3)
-SET_ELEMENT_AT(double4)
+ELEMENT_AT(char)  /* one expansion per listed scalar/vector type; each defines that type's set/get accessors */
+ELEMENT_AT(char2)
+ELEMENT_AT(char3)
+ELEMENT_AT(char4)
+ELEMENT_AT(uchar)
+ELEMENT_AT(uchar2)
+ELEMENT_AT(uchar3)
+ELEMENT_AT(uchar4)
+ELEMENT_AT(short)
+ELEMENT_AT(short2)
+ELEMENT_AT(short3)
+ELEMENT_AT(short4)
+ELEMENT_AT(ushort)
+ELEMENT_AT(ushort2)
+ELEMENT_AT(ushort3)
+ELEMENT_AT(ushort4)
+ELEMENT_AT(int)
+ELEMENT_AT(int2)
+ELEMENT_AT(int3)
+ELEMENT_AT(int4)
+ELEMENT_AT(uint)
+ELEMENT_AT(uint2)
+ELEMENT_AT(uint3)
+ELEMENT_AT(uint4)
+ELEMENT_AT(long)
+ELEMENT_AT(long2)
+ELEMENT_AT(long3)
+ELEMENT_AT(long4)
+ELEMENT_AT(ulong)
+ELEMENT_AT(ulong2)
+ELEMENT_AT(ulong3)
+ELEMENT_AT(ulong4)
+ELEMENT_AT(float)
+ELEMENT_AT(float2)
+ELEMENT_AT(float3)
+ELEMENT_AT(float4)
+ELEMENT_AT(double)
+ELEMENT_AT(double2)
+ELEMENT_AT(double3)
+ELEMENT_AT(double4)
 
-#undef SET_ELEMENT_AT
+#undef ELEMENT_AT