Enable additional assembly optimizations on ARM

Add the CRYPTOGAMS GF(2^m) multiplication (bn/asm/armv4-gf2m.s) and
GHASH (modes/asm/ghash-armv4.s) routines to the ARM build, and define
OPENSSL_BN_ASM_GF2m and GHASH_ASM so OpenSSL uses them.

Bug: 6168278
Change-Id: Icb87356462ff2219c939bfeedc6aac7f4db69af7
diff --git a/crypto/Android.mk b/crypto/Android.mk
index fb599ce..d53baa9 100644
--- a/crypto/Android.mk
+++ b/crypto/Android.mk
@@ -1,12 +1,14 @@
 LOCAL_PATH:= $(call my-dir)
 
-arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
+arm_cflags := -DOPENSSL_BN_ASM_GF2m -DOPENSSL_BN_ASM_MONT -DGHASH_ASM -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
 mips_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM
 
 arm_src_files := \
     aes/asm/aes-armv4.s \
+    bn/asm/armv4-gf2m.s \
     bn/asm/armv4-mont.s \
     bn/bn_asm.c \
+    modes/asm/ghash-armv4.s \
     sha/asm/sha1-armv4-large.s \
     sha/asm/sha256-armv4.s \
     sha/asm/sha512-armv4.s
diff --git a/crypto/bn/asm/armv4-gf2m.s b/crypto/bn/asm/armv4-gf2m.s
new file mode 100644
index 0000000..2c209e1
--- /dev/null
+++ b/crypto/bn/asm/armv4-gf2m.s
@@ -0,0 +1,213 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.type	mul_1x1_neon,%function
+.align	5
+mul_1x1_neon:
+	vshl.u64	d2,d16,#8	@ q1-q3 are shifted copies of a
+	vmull.p8	q0,d16,d17	@ a·bb
+	vshl.u64	d4,d16,#16
+	vmull.p8	q1,d2,d17	@ a<<8·bb
+	vshl.u64	d6,d16,#24
+	vmull.p8	q2,d4,d17	@ a<<16·bb
+	vshr.u64	d2,#8
+	vmull.p8	q3,d6,d17	@ a<<24·bb
+	vshl.u64	d3,#24
+	veor		d0,d2
+	vshr.u64	d4,#16
+	veor		d0,d3
+	vshl.u64	d5,#16
+	veor		d0,d4
+	vshr.u64	d6,#24
+	veor		d0,d5
+	vshl.u64	d7,#8
+	veor		d0,d6
+	veor		d0,d7
+	.word	0xe12fff1e	@ bx lr
+.size	mul_1x1_neon,.-mul_1x1_neon
+#endif
+.type	mul_1x1_ialu,%function
+.align	5
+mul_1x1_ialu:
+	mov	r4,#0
+	bic	r5,r1,#3<<30		@ a1=a&0x3fffffff
+	str	r4,[sp,#0]		@ tab[0]=0
+	add	r6,r5,r5		@ a2=a1<<1
+	str	r5,[sp,#4]		@ tab[1]=a1
+	eor	r7,r5,r6		@ a1^a2
+	str	r6,[sp,#8]		@ tab[2]=a2
+	mov	r8,r5,lsl#2		@ a4=a1<<2
+	str	r7,[sp,#12]		@ tab[3]=a1^a2
+	eor	r9,r5,r8		@ a1^a4
+	str	r8,[sp,#16]		@ tab[4]=a4
+	eor	r4,r6,r8		@ a2^a4
+	str	r9,[sp,#20]		@ tab[5]=a1^a4
+	eor	r7,r7,r8		@ a1^a2^a4
+	str	r4,[sp,#24]		@ tab[6]=a2^a4
+	and	r8,r12,r0,lsl#2
+	str	r7,[sp,#28]		@ tab[7]=a1^a2^a4
+
+	and	r9,r12,r0,lsr#1
+	ldr	r5,[sp,r8]		@ tab[b       & 0x7]
+	and	r8,r12,r0,lsr#4
+	ldr	r7,[sp,r9]		@ tab[b >>  3 & 0x7]
+	and	r9,r12,r0,lsr#7
+	ldr	r6,[sp,r8]		@ tab[b >>  6 & 0x7]
+	eor	r5,r5,r7,lsl#3	@ stall
+	mov	r4,r7,lsr#29
+	ldr	r7,[sp,r9]		@ tab[b >>  9 & 0x7]
+
+	and	r8,r12,r0,lsr#10
+	eor	r5,r5,r6,lsl#6
+	eor	r4,r4,r6,lsr#26
+	ldr	r6,[sp,r8]		@ tab[b >> 12 & 0x7]
+
+	and	r9,r12,r0,lsr#13
+	eor	r5,r5,r7,lsl#9
+	eor	r4,r4,r7,lsr#23
+	ldr	r7,[sp,r9]		@ tab[b >> 15 & 0x7]
+
+	and	r8,r12,r0,lsr#16
+	eor	r5,r5,r6,lsl#12
+	eor	r4,r4,r6,lsr#20
+	ldr	r6,[sp,r8]		@ tab[b >> 18 & 0x7]
+
+	and	r9,r12,r0,lsr#19
+	eor	r5,r5,r7,lsl#15
+	eor	r4,r4,r7,lsr#17
+	ldr	r7,[sp,r9]		@ tab[b >> 21 & 0x7]
+
+	and	r8,r12,r0,lsr#22
+	eor	r5,r5,r6,lsl#18
+	eor	r4,r4,r6,lsr#14
+	ldr	r6,[sp,r8]		@ tab[b >> 24 & 0x7]
+
+	and	r9,r12,r0,lsr#25
+	eor	r5,r5,r7,lsl#21
+	eor	r4,r4,r7,lsr#11
+	ldr	r7,[sp,r9]		@ tab[b >> 27 & 0x7]
+
+	tst	r1,#1<<30
+	and	r8,r12,r0,lsr#28
+	eor	r5,r5,r6,lsl#24
+	eor	r4,r4,r6,lsr#8
+	ldr	r6,[sp,r8]		@ tab[b >> 30      ]
+
+	eorne	r5,r5,r0,lsl#30
+	eorne	r4,r4,r0,lsr#2
+	tst	r1,#1<<31
+	eor	r5,r5,r7,lsl#27
+	eor	r4,r4,r7,lsr#5
+	eorne	r5,r5,r0,lsl#31
+	eorne	r4,r4,r0,lsr#1
+	eor	r5,r5,r6,lsl#30
+	eor	r4,r4,r6,lsr#2
+
+	mov	pc,lr
+.size	mul_1x1_ialu,.-mul_1x1_ialu
+.global	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,%function
+.align	5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+.Lpic:	ldr	r12,[pc,r12]
+	tst	r12,#1
+	beq	.Lialu
+
+	veor	d18,d18
+	vmov.32	d19,r3,r3		@ two copies of b1
+	vmov.32	d18[0],r1		@ a1
+
+	veor	d20,d20
+	vld1.32	d21[],[sp,:32]	@ two copies of b0
+	vmov.32	d20[0],r2		@ a0
+	mov	r12,lr
+
+	vmov	d16,d18
+	vmov	d17,d19
+	bl	mul_1x1_neon		@ a1·b1
+	vmov	d22,d0
+
+	vmov	d16,d20
+	vmov	d17,d21
+	bl	mul_1x1_neon		@ a0·b0
+	vmov	d23,d0
+
+	veor	d16,d20,d18
+	veor	d17,d21,d19
+	veor	d20,d23,d22
+	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)
+
+	veor	d0,d20			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	vshl.u64 d1,d0,#32
+	vshr.u64 d0,d0,#32
+	veor	d23,d1
+	veor	d22,d0
+	vst1.32	{d23[0]},[r0,:32]!
+	vst1.32	{d23[1]},[r0,:32]!
+	vst1.32	{d22[0]},[r0,:32]!
+	vst1.32	{d22[1]},[r0,:32]
+	bx	r12
+.align	4
+.Lialu:
+#endif
+	stmdb	sp!,{r4-r10,lr}
+	mov	r10,r0			@ reassign 1st argument
+	mov	r0,r3			@ r0=b1
+	ldr	r3,[sp,#32]		@ load b0
+	mov	r12,#7<<2
+	sub	sp,sp,#32		@ allocate tab[8]
+
+	bl	mul_1x1_ialu		@ a1·b1
+	str	r5,[r10,#8]
+	str	r4,[r10,#12]
+
+	eor	r0,r0,r3		@ flip b0 and b1
+	 eor	r1,r1,r2		@ flip a0 and a1
+	eor	r3,r3,r0
+	 eor	r2,r2,r1
+	eor	r0,r0,r3
+	 eor	r1,r1,r2
+	bl	mul_1x1_ialu		@ a0·b0
+	str	r5,[r10]
+	str	r4,[r10,#4]
+
+	eor	r1,r1,r2
+	eor	r0,r0,r3
+	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
+	ldmia	r10,{r6-r9}
+	eor	r5,r5,r4
+	eor	r4,r4,r7
+	eor	r5,r5,r6
+	eor	r4,r4,r8
+	eor	r5,r5,r9
+	eor	r4,r4,r9
+	str	r4,[r10,#8]
+	eor	r5,r5,r4
+	add	sp,sp,#32		@ destroy tab[8]
+	str	r5,[r10,#4]
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r10,pc}
+#else
+	ldmia	sp!,{r4-r10,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align	5
+
+.comm	OPENSSL_armcap_P,4,4
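
For reference, the new file exports bn_GF2m_mul_2x2(r, a1, a0, b1, b0), the
routine crypto/bn/bn_gf2m.c calls when OPENSSL_BN_ASM_GF2m is defined. The
sketch below is a minimal, unoptimized C reference for what it computes on a
32-bit BN_ULONG build: the 128-bit carry-less product of the polynomials
(a1:a0) and (b1:b0) over GF(2), stored least-significant word first in r[].
The name gf2m_mul_2x2_ref and the bit-serial loop are illustrative
assumptions; the assembly itself uses a 4-bit table (ialu path) or NEON
vmull.p8 instead.

#include <stdint.h>

/* Reference only, not part of the patch. */
static void gf2m_mul_2x2_ref(uint32_t r[4],
                             uint32_t a1, uint32_t a0,
                             uint32_t b1, uint32_t b0)
{
    uint64_t a = ((uint64_t)a1 << 32) | a0;   /* polynomial a1:a0 */
    uint64_t b = ((uint64_t)b1 << 32) | b0;   /* polynomial b1:b0 */
    uint64_t lo = 0, hi = 0;
    int i;

    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {              /* bit i of b set: xor in a<<i */
            lo ^= a << i;
            if (i != 0)
                hi ^= a >> (64 - i);     /* bits of a<<i above bit 63   */
        }
    }
    r[0] = (uint32_t)lo;                 /* least significant word      */
    r[1] = (uint32_t)(lo >> 32);
    r[2] = (uint32_t)hi;
    r[3] = (uint32_t)(hi >> 32);         /* most significant word       */
}
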
diff --git a/crypto/modes/asm/ghash-armv4.s b/crypto/modes/asm/ghash-armv4.s
new file mode 100644
index 0000000..4da2156
--- /dev/null
+++ b/crypto/modes/asm/ghash-armv4.s
@@ -0,0 +1,408 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.type	rem_4bit,%object
+.align	5
+rem_4bit:
+.short	0x0000,0x1C20,0x3840,0x2460
+.short	0x7080,0x6CA0,0x48C0,0x54E0
+.short	0xE100,0xFD20,0xD940,0xC560
+.short	0x9180,0x8DA0,0xA9C0,0xB5E0
+.size	rem_4bit,.-rem_4bit
+
+.type	rem_4bit_get,%function
+rem_4bit_get:
+	sub	r2,pc,#8
+	sub	r2,r2,#32	@ &rem_4bit
+	b	.Lrem_4bit_got
+	nop
+.size	rem_4bit_get,.-rem_4bit_get
+
+.global	gcm_ghash_4bit
+.type	gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+	sub	r12,pc,#8
+	add	r3,r2,r3		@ r3 to point at the end
+	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
+	sub	r12,r12,#48		@ &rem_4bit
+
+	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
+	stmdb	sp!,{r4-r11}		@ ... to stack
+
+	ldrb	r12,[r2,#15]
+	ldrb	r14,[r0,#15]
+.Louter:
+	eor	r12,r12,r14
+	and	r14,r12,#0xf0
+	and	r12,r12,#0x0f
+	mov	r3,#14
+
+	add	r7,r1,r12,lsl#4
+	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
+	add	r11,r1,r14
+	ldrb	r12,[r2,#14]
+
+	and	r14,r4,#0xf		@ rem
+	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
+	add	r14,r14,r14
+	eor	r4,r8,r4,lsr#4
+	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
+	eor	r4,r4,r5,lsl#28
+	ldrb	r14,[r0,#14]
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+	eor	r12,r12,r14
+	and	r14,r12,#0xf0
+	and	r12,r12,#0x0f
+	eor	r7,r7,r8,lsl#16
+
+.Linner:
+	add	r11,r1,r12,lsl#4
+	and	r12,r4,#0xf		@ rem
+	subs	r3,r3,#1
+	add	r12,r12,r12
+	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
+	eor	r4,r8,r4,lsr#4
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
+	eor	r6,r10,r6,lsr#4
+	ldrplb	r12,[r2,r3]
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+
+	add	r11,r1,r14
+	and	r14,r4,#0xf		@ rem
+	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
+	add	r14,r14,r14
+	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
+	eor	r4,r8,r4,lsr#4
+	ldrplb	r8,[r0,r3]
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	ldrh	r9,[sp,r14]
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eorpl	r12,r12,r8
+	eor	r7,r11,r7,lsr#4
+	andpl	r14,r12,#0xf0
+	andpl	r12,r12,#0x0f
+	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Linner
+
+	ldr	r3,[sp,#32]		@ re-load r3/end
+	add	r2,r2,#16
+	mov	r14,r4
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r4,r4
+	str	r4,[r0,#12]
+#elif defined(__ARMEB__)
+	str	r4,[r0,#12]
+#else
+	mov	r9,r4,lsr#8
+	strb	r4,[r0,#12+3]
+	mov	r10,r4,lsr#16
+	strb	r9,[r0,#12+2]
+	mov	r11,r4,lsr#24
+	strb	r10,[r0,#12+1]
+	strb	r11,[r0,#12]
+#endif
+	cmp	r2,r3
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r5,r5
+	str	r5,[r0,#8]
+#elif defined(__ARMEB__)
+	str	r5,[r0,#8]
+#else
+	mov	r9,r5,lsr#8
+	strb	r5,[r0,#8+3]
+	mov	r10,r5,lsr#16
+	strb	r9,[r0,#8+2]
+	mov	r11,r5,lsr#24
+	strb	r10,[r0,#8+1]
+	strb	r11,[r0,#8]
+#endif
+	ldrneb	r12,[r2,#15]
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r6,r6
+	str	r6,[r0,#4]
+#elif defined(__ARMEB__)
+	str	r6,[r0,#4]
+#else
+	mov	r9,r6,lsr#8
+	strb	r6,[r0,#4+3]
+	mov	r10,r6,lsr#16
+	strb	r9,[r0,#4+2]
+	mov	r11,r6,lsr#24
+	strb	r10,[r0,#4+1]
+	strb	r11,[r0,#4]
+#endif
+	
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r7,r7
+	str	r7,[r0,#0]
+#elif defined(__ARMEB__)
+	str	r7,[r0,#0]
+#else
+	mov	r9,r7,lsr#8
+	strb	r7,[r0,#0+3]
+	mov	r10,r7,lsr#16
+	strb	r9,[r0,#0+2]
+	mov	r11,r7,lsr#24
+	strb	r10,[r0,#0+1]
+	strb	r11,[r0,#0]
+#endif
+	
+	bne	.Louter
+
+	add	sp,sp,#36
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global	gcm_gmult_4bit
+.type	gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+	stmdb	sp!,{r4-r11,lr}
+	ldrb	r12,[r0,#15]
+	b	rem_4bit_get
+.Lrem_4bit_got:
+	and	r14,r12,#0xf0
+	and	r12,r12,#0x0f
+	mov	r3,#14
+
+	add	r7,r1,r12,lsl#4
+	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
+	ldrb	r12,[r0,#14]
+
+	add	r11,r1,r14
+	and	r14,r4,#0xf		@ rem
+	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
+	add	r14,r14,r14
+	eor	r4,r8,r4,lsr#4
+	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+	and	r14,r12,#0xf0
+	eor	r7,r7,r8,lsl#16
+	and	r12,r12,#0x0f
+
+.Loop:
+	add	r11,r1,r12,lsl#4
+	and	r12,r4,#0xf		@ rem
+	subs	r3,r3,#1
+	add	r12,r12,r12
+	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
+	eor	r4,r8,r4,lsr#4
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
+	eor	r6,r10,r6,lsr#4
+	ldrplb	r12,[r0,r3]
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+
+	add	r11,r1,r14
+	and	r14,r4,#0xf		@ rem
+	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
+	add	r14,r14,r14
+	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
+	eor	r4,r8,r4,lsr#4
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+	andpl	r14,r12,#0xf0
+	andpl	r12,r12,#0x0f
+	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Loop
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r4,r4
+	str	r4,[r0,#12]
+#elif defined(__ARMEB__)
+	str	r4,[r0,#12]
+#else
+	mov	r9,r4,lsr#8
+	strb	r4,[r0,#12+3]
+	mov	r10,r4,lsr#16
+	strb	r9,[r0,#12+2]
+	mov	r11,r4,lsr#24
+	strb	r10,[r0,#12+1]
+	strb	r11,[r0,#12]
+#endif
+	
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r5,r5
+	str	r5,[r0,#8]
+#elif defined(__ARMEB__)
+	str	r5,[r0,#8]
+#else
+	mov	r9,r5,lsr#8
+	strb	r5,[r0,#8+3]
+	mov	r10,r5,lsr#16
+	strb	r9,[r0,#8+2]
+	mov	r11,r5,lsr#24
+	strb	r10,[r0,#8+1]
+	strb	r11,[r0,#8]
+#endif
+	
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r6,r6
+	str	r6,[r0,#4]
+#elif defined(__ARMEB__)
+	str	r6,[r0,#4]
+#else
+	mov	r9,r6,lsr#8
+	strb	r6,[r0,#4+3]
+	mov	r10,r6,lsr#16
+	strb	r9,[r0,#4+2]
+	mov	r11,r6,lsr#24
+	strb	r10,[r0,#4+1]
+	strb	r11,[r0,#4]
+#endif
+	
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r7,r7
+	str	r7,[r0,#0]
+#elif defined(__ARMEB__)
+	str	r7,[r0,#0]
+#else
+	mov	r9,r7,lsr#8
+	strb	r7,[r0,#0+3]
+	mov	r10,r7,lsr#16
+	strb	r9,[r0,#0+2]
+	mov	r11,r7,lsr#24
+	strb	r10,[r0,#0+1]
+	strb	r11,[r0,#0]
+#endif
+	
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.global	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	sub		r1,#16		@ point at H in GCM128_CTX
+	vld1.64		d29,[r0,:64]!@ load Xi
+	vmov.i32	d5,#0xe1		@ our irreducible polynomial
+	vld1.64		d28,[r0,:64]!
+	vshr.u64	d5,#32
+	vldmia		r1,{d0-d1}	@ load H
+	veor		q12,q12
+#ifdef __ARMEL__
+	vrev64.8	q14,q14
+#endif
+	veor		q13,q13
+	veor		q11,q11
+	mov		r1,#16
+	veor		q10,q10
+	mov		r3,#16
+	veor		d2,d2
+	vdup.8		d4,d28[0]	@ broadcast lowest byte
+	b		.Linner_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.global	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64		d21,[r0,:64]!	@ load Xi
+	vmov.i32	d5,#0xe1		@ our irreducible polynomial
+	vld1.64		d20,[r0,:64]!
+	vshr.u64	d5,#32
+	vldmia		r0,{d0-d1}		@ load H
+	veor		q12,q12
+	nop
+#ifdef __ARMEL__
+	vrev64.8	q10,q10
+#endif
+.Louter_neon:
+	vld1.64		d29,[r2]!	@ load inp
+	veor		q13,q13
+	vld1.64		d28,[r2]!
+	veor		q11,q11
+	mov		r1,#16
+#ifdef __ARMEL__
+	vrev64.8	q14,q14
+#endif
+	veor		d2,d2
+	veor		q14,q10			@ inp^=Xi
+	veor		q10,q10
+	vdup.8		d4,d28[0]	@ broadcast lowest byte
+.Linner_neon:
+	subs		r1,r1,#1
+	vmull.p8	q9,d1,d4		@ H.lo·Xi[i]
+	vmull.p8	q8,d0,d4		@ H.hi·Xi[i]
+	vext.8		q14,q12,#1		@ IN>>=8
+
+	veor		q10,q13		@ modulo-scheduled part
+	vshl.i64	d22,#48
+	vdup.8		d4,d28[0]	@ broadcast lowest byte
+	veor		d3,d18,d20
+
+	veor		d21,d22
+	vuzp.8		q9,q8
+	vsli.8		d2,d3,#1		@ compose the "carry" byte
+	vext.8		q10,q12,#1		@ Z>>=8
+
+	vmull.p8	q11,d2,d5		@ "carry"·0xe1
+	vshr.u8		d2,d3,#7		@ save Z's bottom bit
+	vext.8		q13,q9,q12,#1	@ Qlo>>=8
+	veor		q10,q8
+	bne		.Linner_neon
+
+	veor		q10,q13		@ modulo-scheduled artefact
+	vshl.i64	d22,#48
+	veor		d21,d22
+
+	@ finalization, normalize Z:Zo
+	vand		d2,d5		@ suffices to mask the bit
+	vshr.u64	d3,d20,#63
+	vshl.i64	q10,#1
+	subs		r3,#16
+	vorr		q10,q1		@ Z=Z:Zo<<1
+	bne		.Louter_neon
+
+#ifdef __ARMEL__
+	vrev64.8	q10,q10
+#endif
+	sub		r0,#16	
+	vst1.64		d21,[r0,:64]!	@ write out Xi
+	vst1.64		d20,[r0,:64]
+
+	.word	0xe12fff1e	@ bx lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align  2
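
For reference, this file provides gcm_gmult_4bit and gcm_ghash_4bit (plus the
gcm_gmult_neon/gcm_ghash_neon variants) for the GHASH_ASM path in
crypto/modes/gcm128.c, matching the prototypes used there:
gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]) and
gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len).
The assembly is table-driven: it walks Xi a nibble at a time through the
16-entry Htable and folds the shifted-out bits back in with the rem_4bit
constants. The sketch below shows only the defining GF(2^128) multiplication
from the GCM spec, as a bit-serial reference with the hypothetical name
ghash_mul_ref; it is not the table algorithm implemented here. With it,
gcm_ghash_4bit amounts to Xi = (Xi ^ block) * H for each 16-byte input block.

#include <stdint.h>
#include <string.h>

/*
 * Reference only, not part of the patch: Z = X * Y in GF(2^128) with GCM's
 * bit ordering (bit 0 of a block is the most significant bit of byte 0) and
 * reduction polynomial x^128 + x^7 + x^2 + x + 1.
 */
static void ghash_mul_ref(uint8_t Z[16], const uint8_t X[16], const uint8_t Y[16])
{
    uint8_t V[16], acc[16] = {0};
    int i, j, lsb;

    memcpy(V, Y, 16);
    for (i = 0; i < 128; i++) {
        if ((X[i / 8] >> (7 - (i % 8))) & 1)    /* bit i of X set: acc ^= V */
            for (j = 0; j < 16; j++)
                acc[j] ^= V[j];
        lsb = V[15] & 1;                        /* V = V >> 1               */
        for (j = 15; j > 0; j--)
            V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
        V[0] >>= 1;
        if (lsb)                                /* reduce by 0xE1 || 0^120  */
            V[0] ^= 0xE1;
    }
    memcpy(Z, acc, 16);
}
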
diff --git a/import_openssl.sh b/import_openssl.sh
index a5dd3e6..da2f533 100755
--- a/import_openssl.sh
+++ b/import_openssl.sh
@@ -136,7 +136,9 @@
 
   # Generate arm asm
   perl crypto/aes/asm/aes-armv4.pl         > crypto/aes/asm/aes-armv4.s
+  perl crypto/bn/asm/armv4-gf2m.pl         > crypto/bn/asm/armv4-gf2m.s
   perl crypto/bn/asm/armv4-mont.pl         > crypto/bn/asm/armv4-mont.s
+  perl crypto/modes/asm/ghash-armv4.pl     > crypto/modes/asm/ghash-armv4.s
   perl crypto/sha/asm/sha1-armv4-large.pl  > crypto/sha/asm/sha1-armv4-large.s
   perl crypto/sha/asm/sha256-armv4.pl      > crypto/sha/asm/sha256-armv4.s
   perl crypto/sha/asm/sha512-armv4.pl      > crypto/sha/asm/sha512-armv4.s
diff --git a/patches/crypto_Android.mk b/patches/crypto_Android.mk
index fb599ce..d53baa9 100644
--- a/patches/crypto_Android.mk
+++ b/patches/crypto_Android.mk
@@ -1,12 +1,14 @@
 LOCAL_PATH:= $(call my-dir)
 
-arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
+arm_cflags := -DOPENSSL_BN_ASM_GF2m -DOPENSSL_BN_ASM_MONT -DGHASH_ASM -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
 mips_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM
 
 arm_src_files := \
     aes/asm/aes-armv4.s \
+    bn/asm/armv4-gf2m.s \
     bn/asm/armv4-mont.s \
     bn/bn_asm.c \
+    modes/asm/ghash-armv4.s \
     sha/asm/sha1-armv4-large.s \
     sha/asm/sha256-armv4.s \
     sha/asm/sha512-armv4.s