/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <machine/cpu-features.h>
#include <machine/asm.h>
/*
r0 = base pointer
r1 = image stride
r2 = w
r3 = h
sp+0 = float u
sp+4 = float v
*/
ENTRY(rsdCpuLinearClamp2D_RGBA_k2)
push {r4-r8, r10, r11, lr}
vpush {q4-q7}
// Load uv
ldr r4, [sp, #32+64]
ldr r5, [sp, #32+64+4]
vmov d18, r4, r5 // d18 = float u, v
// float pixelU = (u * w) - 0.5f;
// float pixelV = (v * h) - 0.5f;
vmov d16, r2, r3 // d16 = int w, h
vcvt.f32.s32 d17, d16 // d17 = float w, h
vmul.f32 d20, d18, d17 // d20 = pixelUV (uv * wh)
vld1.f32 d19, =0x3F000000 // 0.5
vsub.f32 d20, d20, d19 // d20 = pixelUV (uv * wh) - 0.5f
// int iu = pixelU;
// int iv = pixelV;
vcvt.s32.f32 d21, d20 // d21 = iPixelUV
//float fracU = pixelU - iu;
//float fracV = pixelV - iv;
vcvt.s32.f32 d19, d20 // d19 = (int)pixelUV
vcvt.f32.s32 d19, d19 // d19 = (float)iuv
vsub.f32 d0, d20, d19 // d0 = fract = pixelUV - iuv
//float oneMinusFracU = 1.0f - fracU;
//float oneMinusFracV = 1.0f - fracV;
vld1.f32 d22, =0x3F800000 // 1.0
vsub.f32 d1, d22, d0 // d1 = oneMinusFrac
//float weightsX1 = oneMinusFracU * oneMinusFracV;
//float weightsY1 = fracU * oneMinusFracV;
//float weightsX2 = fracV * oneMinusFracU;
//float weightsY2 = fracU * fracV;
vmul.f32 d2, d1, d1[1] // d2 = 1mu * 1mv , 1mv * 1mv
vmul.f32 d3, d0, d1[1] // d3 = u * 1mv , v * 1mv
vmul.f32 d4, d1, d0[1] // d4 = v * 1mu , v * 1mv
vmul.f32 d5, d0, d0[1] // d5 = u * v, v * v
//int nextX = rsMax(0, rsMin(iu + 1, w - 1));
//int nextY = rsMax(0, rsMin(iv + 1, h - 1));
//int locationX = rsMax(0, rsMin(iu, w - 1));
//int locationY = rsMax(0, rsMin(iv, h - 1));
vmov.u32 d6, #1
vmov.u32 d8, #0
vsub.s32 d16, d16, d6 // d16 = w - 1, h - 1
vadd.s32 d7, d6, d21 // d7 = iuv + 1
vmin.s32 d7, d7, d16
vmin.s32 d21, d21, d16
vmax.s32 d7, d7, d8 // d7 = next
vmax.s32 d21, d21, d8 // d21 = location
mov r2, #4
vmov d6, r2, r1 // d6 = 4, stride
vmul.s32 d7, d6 // d7 = nextX*4, nextY * stride
vmul.s32 d21, d6 // d21 = locationX*4, locationY * stride
//uchar4 *p0c = (uchar4*)&p[(locationY * stride) + (locationX * 4)];
//uchar4 *p1c = (uchar4*)&p[(locationY * stride) + (nextX * 4)];
//uchar4 *p2c = (uchar4*)&p[(nextY * stride) + (locationX * 4)];
//uchar4 *p3c = (uchar4*)&p[(nextY * stride) + (nextX * 4)];
vmov r2, r3, d7 // r2 = nextX*4, r3 = nextY *stride
vmov r4, r5, d21 // r4 = locX*4, r5 = locY*Stride
add r3, r3, r0 // r3 = p + nextY*stride
add r5, r5, r0 // r5 = p + locY*stride
//float4 p0 = convert_float4(*p0c);
//float4 p1 = convert_float4(*p1c);
add r1, r5, r4 // *p0c
ldr r0, [r1]
add r1, r5, r2 // *p1c
ldr r1, [r1]
vmov d0, r0, r1 // d0 = p0, p1
//float4 p2 = convert_float4(*p2c);
//float4 p3 = convert_float4(*p3c);
add r1, r3, r4 // *p2c
ldr r0, [r1]
add r1, r3, r2 // *p3c
ldr r1, [r1]
vmov d1, r0, r1 // d1 = p2, p3
//return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
vmovl.u8 q3, d0 // q3 = p0, p1 as u16 (d6 = p0, d7 = p1)
vmovl.u8 q4, d1 // q4 = p2, p3 as u16 (d8 = p2, d9 = p3)
vmovl.u16 q5, d8 // q5 = p2 as u32
vmovl.u16 q6, d9 // q6 = p3 as u32
vmovl.u16 q4, d7 // q4 = p1 as u32 (d8, d9 already consumed)
vmovl.u16 q3, d6 // q3 = p0 as u32 (d7 already consumed)
vcvt.f32.u32 q3, q3 // q3 = p0 as float
vcvt.f32.u32 q4, q4 // q4 = p1 as float
vcvt.f32.u32 q5, q5 // q5 = p2 as float
vcvt.f32.u32 q6, q6 // q6 = p3 as float
//vmul.f32 q3, q3, d2[0]
//vmla.f32 q3, q4, d3[0]
//vmla.f32 q3, q5, d4[0]
//vmla.f32 q3, q6, d5[0]
vld1.f32 d0, =0x3B808081 // 1.f / 255.f
vmul.f32 q3, q3, d0[0]
vmov r0, r1, d6 // r0, r1 = result.xy
vmov r2, r3, d7 // r2, r3 = result.zw
mov r3, #0x3F800000 // r3 = 1.0f
/* We're done, bye! */
vpop {q4-q7}
pop {r4-r8, r10, r11, lr}
bx lr
END(rsdCpuLinearClamp2D_RGBA_k2)
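/*
 * For reference, a rough scalar C sketch of the bilinear "linear clamp"
 * RGBA sample the routine above computes with NEON. Names (clampi, lx/nx,
 * out[]) are illustrative only, not the actual RenderScript API:
 *
 *   #include <stdint.h>
 *
 *   static inline int clampi(int x, int lo, int hi) {
 *       return x < lo ? lo : (x > hi ? hi : x);
 *   }
 *
 *   // p: base pointer, stride: bytes per row, w/h: size, u/v: 0..1, out: RGBA
 *   static void linearClamp2D_RGBA(const uint8_t *p, int stride, int w, int h,
 *                                  float u, float v, float out[4]) {
 *       float pixelU = u * w - 0.5f, pixelV = v * h - 0.5f;
 *       int iu = (int)pixelU, iv = (int)pixelV;
 *       float fu = pixelU - iu, fv = pixelV - iv;
 *       int lx = clampi(iu, 0, w - 1),     ly = clampi(iv, 0, h - 1);
 *       int nx = clampi(iu + 1, 0, w - 1), ny = clampi(iv + 1, 0, h - 1);
 *       const uint8_t *p0 = p + ly * stride + lx * 4;  // location
 *       const uint8_t *p1 = p + ly * stride + nx * 4;  // next in x
 *       const uint8_t *p2 = p + ny * stride + lx * 4;  // next in y
 *       const uint8_t *p3 = p + ny * stride + nx * 4;  // next in x and y
 *       float w0 = (1.0f - fu) * (1.0f - fv), w1 = fu * (1.0f - fv);
 *       float w2 = (1.0f - fu) * fv,          w3 = fu * fv;
 *       for (int c = 0; c < 4; c++)
 *           out[c] = (p0[c] * w0 + p1[c] * w1 + p2[c] * w2 + p3[c] * w3)
 *                    * (1.0f / 255.0f);
 *   }
 */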
/*
r0 = base pointer
r1 = image stride
r2 = iu
r3 = iv
sp+0 = w
sp+4 = h
sp+8 = float4 weights (loaded into q0)
*/
ENTRY(rsdCpuLinearClamp2D_RGBA_k)
push {r4-r8, r10, r11, lr}
vpush {q4-q7}
vmov d2, r2, r3 // d2 = iu, iv
add r4, sp, #32+64 // r4 = stack args (past saved regs)
vld1.32 d3, [r4]! // d3 = w, h
vld1.32 {q0}, [r4]! // q0 = weights
mov r2, #4
vmov d6, r2, r1 // d6 = 4, stride
vmul.s32 d30, d6 // d30 = nextX*4, nextY * stride
vmul.s32 d31, d6 // d31 = locationX*4, locationY * stride
//uchar4 *p0c = (uchar4*)&p[(locationY * stride) + (locationX * 4)];
//uchar4 *p1c = (uchar4*)&p[(locationY * stride) + (nextX * 4)];
//uchar4 *p2c = (uchar4*)&p[(nextY * stride) + (locationX * 4)];
//uchar4 *p3c = (uchar4*)&p[(nextY * stride) + (nextX * 4)];
vmov r2, r3, d30 // r2 = nextX*4, r3 = nextY *stride
vmov r4, r5, d31 // r4 = locX*4, r5 = locY*Stride
add r3, r3, r0 // r3 = p + nextY*stride
add r5, r5, r0 // r5 = p + locY*stride
//float4 p0 = convert_float4(*p0c);
//float4 p1 = convert_float4(*p1c);
add r1, r5, r4 // *p0c
ldr r0, [r1]
add r1, r5, r2 // *p1c
ldr r1, [r1]
vmov d30, r0, r1 // d30 = p0, p1
//float4 p2 = convert_float4(*p2c);
//float4 p3 = convert_float4(*p3c);
add r1, r3, r4 // *p2c
ldr r0, [r1]
add r1, r3, r2 // *p3c
ldr r1, [r1]
vmov d31, r0, r1 // d31 = p2, p3
//return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
vmovl.u8 q2, d30 // q2 = p0, p1 as u16
vmovl.u8 q3, d31 // q3 = p2, p3 as u16
vmovl.u16 q8, d4 // q8 = p0 as u32
vmovl.u16 q9, d5 // q9 = p1 as u32
vmovl.u16 q10, d6 // q10 = p2 as u32
vmovl.u16 q11, d7 // q11 = p3 as u32
vcvt.f32.u32 q8, q8, #8 // q8 = p0 / 256.0f
vcvt.f32.u32 q9, q9, #8 // q9 = p1 / 256.0f
vcvt.f32.u32 q10, q10, #8 // q10 = p2 / 256.0f
vcvt.f32.u32 q11, q11, #8 // q11 = p3 / 256.0f
vmul.f32 q3, q8, d0[0] // q3 = p0 * weightsX1
vmla.f32 q3, q9, d0[1] // q3 += p1 * weightsY1
vmla.f32 q3, q10, d1[0] // q3 += p2 * weightsX2
vmla.f32 q3, q11, d1[1] // q3 += p3 * weightsY2
/// vld1.f32 d0, =0x3B808081 // 1.f / 255.f
// vmul.f32 q3, q3, d0[0]
vmov r0, r1, d6 // r0, r1 = result.xy
vmov r2, r3, d7 // r2, r3 = result.zw
mov r3, #0x3F800000 // r3 = 1.0f
/* We're done, bye! */
vpop {q4-q7}
pop {r4-r8, r10, r11, lr}
bx lr
END(rsdCpuLinearClamp2D_RGBA_k)
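/*
 * The _k variant takes precomputed data (iu/iv in r2/r3, with w/h and,
 * apparently, the four bilinear weights on the stack) and normalizes via
 * the fixed-point convert "vcvt.f32.u32 ..., #8", i.e. a divide by 256,
 * instead of the explicit 1/255 multiply left commented out above. A rough
 * C sketch of that tail; p0..p3 and wt[] are assumed names for the four
 * fetched texels and the weights in q0:
 *
 *   #include <stdint.h>
 *
 *   static void blend_div256(const uint8_t p0[4], const uint8_t p1[4],
 *                            const uint8_t p2[4], const uint8_t p3[4],
 *                            const float wt[4], float out[4]) {
 *       for (int c = 0; c < 4; c++) {
 *           // Each texel is scaled by 1/256 (the #8 fixed-point convert),
 *           // then weighted and summed; 1/256 approximates the 1/255 scale.
 *           out[c] = (p0[c] / 256.0f) * wt[0] + (p1[c] / 256.0f) * wt[1]
 *                  + (p2[c] / 256.0f) * wt[2] + (p3[c] / 256.0f) * wt[3];
 *       }
 *   }
 */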
/*
r0 = uint8_t *ptr
r1 = image stride
r2,r3 = iPixel
sp+0, sp+4 = next.x, next.y
q0 = weights
*/
ENTRY(rsdCpuGetSample2D_RGBA_k)
push {r4-r8, lr}
ldr r4, [sp, #24] // next.x
ldr r5, [sp, #24+4] // next.y
mul r3, r3, r1 // iPixel.y * stride
mul r5, r5, r1 // next.y * stride
add r2, r0, r2, LSL #2 // r2 = p + iPixel.x * 4
add r4, r0, r4, LSL #2 // r4 = p + next.x * 4
ldr r0, [r2, r3] // r0 = p[(locationY * stride) + (locationX * 4)]
ldr r1, [r4, r3] // r1 = p[(locationY * stride) + (nextX * 4)]
ldr r2, [r2, r5] // r2 = p[(nextY * stride) + (locationX * 4)]
ldr r3, [r4, r5] // r3 = p[(nextY * stride) + (nextX * 4)]
vmov d30, r0, r1 // d30 = p0, p1
vmov d31, r2, r3 // d31 = p2, p3
vcvt.u32.f32 q0, q0, #8 // q0 = weights * 256 (8.8 fixed point, u32)
vmovn.u32 d0, q0 // d0 = weights as u16
//return (p0 * weightsX1 + p1 * weightsY1 + p2 * weightsX2 + p3 * weightsY2) * 0.003921569f;
vmovl.u8 q2, d30 // q2 = p0, p1 as u16
vmovl.u8 q3, d31 // q3 = p2, p3 as u16
vmull.u16 q8, d4, d0[0] // q8 = p0 * weightsX1
vmlal.u16 q8, d5, d0[1] // q8 += p1 * weightsY1
vmlal.u16 q8, d6, d0[2] // q8 += p2 * weightsX2
vmlal.u16 q8, d7, d0[3] // q8 += p3 * weightsY2
vcvt.f32.u32 q3, q8, #8 // q3 = weighted sum / 256.0f
ldr r1, =0x3B808081 // 1.f / 255.f
vmov.32 d0[0], r1 // d0[0] = 1/255
vmul.f32 q0, q3, d0[0] // q0 = result * (1/255)
/* We're done, bye! */
pop {r4-r8, lr}
bx lr
END(rsdCpuGetSample2D_RGBA_k)
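/*
 * rsdCpuGetSample2D_RGBA_k does the weighting in 8.8 fixed point: the float
 * weights in q0 are converted to u16 (vcvt #8 + vmovn), the four texels are
 * accumulated with integer multiply-adds, and the 32-bit sum is converted
 * back to float with 8 fractional bits before the 1/255 scale. A rough C
 * sketch of that arithmetic; the function and parameter names are assumed:
 *
 *   #include <stdint.h>
 *
 *   static void blend_fixed_point(const uint8_t p0[4], const uint8_t p1[4],
 *                                 const uint8_t p2[4], const uint8_t p3[4],
 *                                 const float wt[4], float out[4]) {
 *       uint16_t w16[4];
 *       for (int i = 0; i < 4; i++)
 *           w16[i] = (uint16_t)(wt[i] * 256.0f);       // 8.8 fixed point
 *       for (int c = 0; c < 4; c++) {
 *           uint32_t acc = (uint32_t)p0[c] * w16[0] + (uint32_t)p1[c] * w16[1]
 *                        + (uint32_t)p2[c] * w16[2] + (uint32_t)p3[c] * w16[3];
 *           out[c] = (acc / 256.0f) * (1.0f / 255.0f); // back to float, 0..1
 *       }
 *   }
 */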