Merge "Clamp rsPackPixel" into jb-mr2-dev
diff --git a/lib/Renderscript/runtime/Android.mk b/lib/Renderscript/runtime/Android.mk
index 3005624..80b310d 100755
--- a/lib/Renderscript/runtime/Android.mk
+++ b/lib/Renderscript/runtime/Android.mk
@@ -28,7 +28,6 @@
     rs_sample.c \
     rs_sampler.c \
     convert.ll \
-    pixel_packing.ll \
     rsClamp.ll
 
 clcore_files := \
diff --git a/lib/Renderscript/runtime/arch/generic.c b/lib/Renderscript/runtime/arch/generic.c
index ab92227..77ce4c5 100644
--- a/lib/Renderscript/runtime/arch/generic.c
+++ b/lib/Renderscript/runtime/arch/generic.c
@@ -20,13 +20,52 @@
 extern short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high);
 extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
 extern uchar4 __attribute__((overloadable)) convert_uchar4(short4);
+extern uchar4 __attribute__((overloadable)) convert_uchar4(float4);
+extern float4 __attribute__((overloadable)) convert_float4(uchar4);
 extern float __attribute__((overloadable)) sqrt(float);
 
+/**
+ * clz
+ */
+extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
+    return __builtin_clz(v);
+}
+extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
+    return (uint16_t)__builtin_clz(v);
+}
+extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
+    return (uint8_t)__builtin_clz(v);
+}
+extern int32_t __attribute__((overloadable)) clz(int32_t v) {
+    return (int32_t)__builtin_clz((uint32_t)v);
+}
+extern int16_t __attribute__((overloadable)) clz(int16_t v) {
+    return (int16_t)__builtin_clz(v);
+}
+extern int8_t __attribute__((overloadable)) clz(int8_t v) {
+    return (int8_t)__builtin_clz(v);
+}
+
+/**
+ * abs: magnitude of a signed integer, returned in the unsigned type of the same width.
+ * The 32-bit overload negates in unsigned arithmetic because -v overflows (undefined behaviour)
+ * for the most negative int32_t; that input yields 0x80000000. The narrower overloads are promoted
+ * to int before negation (int assumed wider than 16 bits), so no overflow can occur there.
+ */
+extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
+    return (v < 0) ? (0u - (uint32_t)v) : (uint32_t)v;
+}
+extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
+    return (uint16_t)((v < 0) ? -v : v);
+}
+extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
+    return (uint8_t)((v < 0) ? -v : v);
+}
+
 
 /*
  * CLAMP
  */
-
 extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
     return amount < low ? low : (amount > high ? high : amount);
 }
@@ -706,6 +745,16 @@
     return fmin(v1, v2);
 }
 
+/* step: 0.f when v < edge, else 1.f (NaN compares false, giving 1.f). NOTE(review): not overloadable, unlike clz/abs - confirm. */
+extern float step(float edge, float v) {
+    return (v < edge) ? 0.f : 1.f;
+}
+
+/* sign: 1.f for positive input, -1.f for negative input; any other value (zeros, NaN)
+ * is handed back unchanged. NOTE(review): not marked overloadable, unlike clz/abs - confirm intended. */
+extern float sign(float value) {
+    return (value > 0) ? 1.f : ((value < 0) ? -1.f : value);
+}
 
 /*
  * YUV
@@ -842,6 +891,9 @@
     return r;
 }
 
+/**
+ * matrix ops
+ */
 
 extern float4 __attribute__((overloadable))
 rsMatrixMultiply(const rs_matrix4x4 *m, float4 in) {
@@ -911,3 +963,44 @@
     return rsMatrixMultiply((const rs_matrix3x3 *)m, in);
 }
 
+/**
+ * Pixel Ops
+ */
+/* Pack r, g, b into a uchar4: each is scaled by 255, biased by 0.5, clamped to [0, 255]; w is fixed at 255. */
+extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b) {
+    uchar4 packed;
+    packed.x = (uchar)clamp(r * 255.f + 0.5f, 0.f, 255.f);
+    packed.y = (uchar)clamp(g * 255.f + 0.5f, 0.f, 255.f);
+    packed.z = (uchar)clamp(b * 255.f + 0.5f, 0.f, 255.f);
+    packed.w = 255;
+    return packed;
+}
+
+/* Pack r, g, b, a into a uchar4: each is scaled by 255, biased by 0.5 and clamped to [0, 255] before the cast. */
+extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a) {
+    uchar4 packed;
+    packed.x = (uchar)clamp(r * 255.f + 0.5f, 0.f, 255.f);
+    packed.y = (uchar)clamp(g * 255.f + 0.5f, 0.f, 255.f);
+    packed.z = (uchar)clamp(b * 255.f + 0.5f, 0.f, 255.f);
+    packed.w = (uchar)clamp(a * 255.f + 0.5f, 0.f, 255.f);
+    return packed;
+}
+
+/* Pack a float3 colour: scale by 255, add 0.5, pass through clamp(.., 0.f, 255.f); w is fixed at 255. */
+extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color) {
+    float3 scaled = color * 255.f;   /* multiply and add stay separate statements, as before */
+    scaled = scaled + 0.5f;
+    scaled = clamp(scaled, 0.f, 255.f);
+    uchar4 packed = {scaled.x, scaled.y, scaled.z, 255};
+    return packed;
+}
+
+/* Pack a float4 colour: scale by 255, add 0.5, pass through clamp(.., 0.f, 255.f), then narrow each lane. */
+extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color) {
+    float4 scaled = color * 255.f;   /* multiply and add stay separate statements, as before */
+    scaled = scaled + 0.5f;
+    scaled = clamp(scaled, 0.f, 255.f);
+    uchar4 packed = {scaled.x, scaled.y, scaled.z, scaled.w};
+    return packed;
+}
+
diff --git a/lib/Renderscript/runtime/arch/neon.ll b/lib/Renderscript/runtime/arch/neon.ll
index 3b85e1b..466a623 100644
--- a/lib/Renderscript/runtime/arch/neon.ll
+++ b/lib/Renderscript/runtime/arch/neon.ll
@@ -1006,3 +1006,57 @@
   ret <4 x float> %r
 }
 
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;              pixel ops                 ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16  ; 255.0 in all four lanes
+@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16  ; 0.5 in all four lanes
+@fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16  ; 0.0 in all four lanes
+
+declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone
+declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone
+
+; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color) -- convert_uchar4(clamp(color * 255 + 0.5, 0, 255))
+define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
+    %f255 = load <4 x float>* @fc_255.0, align 16  ; scale factor, also the clamp upper bound
+    %f05 = load <4 x float>* @fc_0.5, align 16  ; bias added after scaling
+    %f0 = load <4 x float>* @fc_0, align 16  ; clamp lower bound
+    %v1 = fmul <4 x float> %f255, %color  ; color * 255
+    %v2 = fadd <4 x float> %f05, %v1  ; + 0.5
+    %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone  ; clamp(v2, 0, 255) ahead of the narrowing conversion
+    %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone  ; float4 -> uchar4
+    ret <4 x i8> %v4
+}
+
+; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color) -- widens to 4 lanes, sets lane 3 to 1.0, defers to the float4 version
+define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
+    %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; lanes 0-2 from %color; mask index 3 selects from the undef operand
+    %2 = insertelement <4 x float> %1, float 1.0, i32 3  ; overwrite lane 3 with 1.0
+    %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone  ; reuse the float4 implementation
+    ret <4 x i8> %3
+}
+
+; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b) -- builds <r, g, b, 1.0> and defers to the float4 version
+define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
+    %1 = insertelement <4 x float> undef, float %r, i32 0  ; lane 0 = r
+    %2 = insertelement <4 x float> %1, float %g, i32 1  ; lane 1 = g
+    %3 = insertelement <4 x float> %2, float %b, i32 2  ; lane 2 = b
+    %4 = insertelement <4 x float> %3, float 1.0, i32 3  ; lane 3 = 1.0 (no alpha argument in this overload)
+    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone  ; reuse the float4 implementation
+    ret <4 x i8> %5
+}
+
+; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a) -- builds <r, g, b, a> and defers to the float4 version
+define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
+    %1 = insertelement <4 x float> undef, float %r, i32 0  ; lane 0 = r
+    %2 = insertelement <4 x float> %1, float %g, i32 1  ; lane 1 = g
+    %3 = insertelement <4 x float> %2, float %b, i32 2  ; lane 2 = b
+    %4 = insertelement <4 x float> %3, float %a, i32 3  ; lane 3 = a
+    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone  ; reuse the float4 implementation
+    ret <4 x i8> %5
+}
+
diff --git a/lib/Renderscript/runtime/pixel_packing.ll b/lib/Renderscript/runtime/pixel_packing.ll
deleted file mode 100644
index 65401a6..0000000
--- a/lib/Renderscript/runtime/pixel_packing.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16
-@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16
-
-declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone
-declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
-define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
-    %f255 = load <4 x float>* @fc_255.0, align 16
-    %f05 = load <4 x float>* @fc_0.5, align 16
-    %v1 = fmul <4 x float> %f255, %color
-    %v2 = fadd <4 x float> %f05, %v1
-    %v3 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v2) nounwind readnone
-    ret <4 x i8> %v3
-}
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
-define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
-    %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-    %2 = insertelement <4 x float> %1, float 1.0, i32 3
-    %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone
-    ret <4 x i8> %3
-}
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
-define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
-    %1 = insertelement <4 x float> undef, float %r, i32 0
-    %2 = insertelement <4 x float> %1, float %g, i32 1
-    %3 = insertelement <4 x float> %2, float %b, i32 2
-    %4 = insertelement <4 x float> %3, float 1.0, i32 3
-    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
-    ret <4 x i8> %5
-}
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
-define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
-    %1 = insertelement <4 x float> undef, float %r, i32 0
-    %2 = insertelement <4 x float> %1, float %g, i32 1
-    %3 = insertelement <4 x float> %2, float %b, i32 2
-    %4 = insertelement <4 x float> %3, float %a, i32 3
-    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
-    ret <4 x i8> %5
-}
-