Add x86_64 assembly files.

This patch modifies import_openssl.sh to also generate assembly
files for x86_64 (using the appropriate Perl scripts).

These new sources are not used by the Android build, but by the
Chromium "linux_redux" build, which uses OpenSSL as its SSL engine.

Change-Id: I3d1435de17f2de10633a71b9197b6cec328e93a7
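
The generation step is sketched below for the two AES files in this
patch. This is a hedged sketch, not the exact import_openssl.sh hunk:
the "elf" flavour argument is an assumption (it matches the .type/.size
ELF directives in the emitted files), and the real script would invoke
the remaining x86_64 perlasm modules the same way:

    # Each x86_64 perlasm script takes a flavour and an output path.
    perl crypto/aes/asm/aes-x86_64.pl elf crypto/aes/asm/aes-x86_64.S
    perl crypto/aes/asm/aesni-x86_64.pl elf crypto/aes/asm/aesni-x86_64.S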
diff --git a/crypto/aes/asm/aes-x86_64.S b/crypto/aes/asm/aes-x86_64.S
new file mode 100644
index 0000000..e385566
--- /dev/null
+++ b/crypto/aes/asm/aes-x86_64.S
@@ -0,0 +1,2541 @@
+.text	
+.type	_x86_64_AES_encrypt,@function
+.align	16
+_x86_64_AES_encrypt:
+	xorl	0(%r15),%eax
+	xorl	4(%r15),%ebx
+	xorl	8(%r15),%ecx
+	xorl	12(%r15),%edx
+
+	movl	240(%r15),%r13d
+	subl	$1,%r13d
+	jmp	.Lenc_loop
+.align	16
+.Lenc_loop:
+
+	movzbl	%al,%esi
+	movzbl	%bl,%edi
+	movzbl	%cl,%ebp
+	movl	0(%r14,%rsi,8),%r10d
+	movl	0(%r14,%rdi,8),%r11d
+	movl	0(%r14,%rbp,8),%r12d
+
+	movzbl	%bh,%esi
+	movzbl	%ch,%edi
+	movzbl	%dl,%ebp
+	xorl	3(%r14,%rsi,8),%r10d
+	xorl	3(%r14,%rdi,8),%r11d
+	movl	0(%r14,%rbp,8),%r8d
+
+	movzbl	%dh,%esi
+	shrl	$16,%ecx
+	movzbl	%ah,%ebp
+	xorl	3(%r14,%rsi,8),%r12d
+	shrl	$16,%edx
+	xorl	3(%r14,%rbp,8),%r8d
+
+	shrl	$16,%ebx
+	leaq	16(%r15),%r15
+	shrl	$16,%eax
+
+	movzbl	%cl,%esi
+	movzbl	%dl,%edi
+	movzbl	%al,%ebp
+	xorl	2(%r14,%rsi,8),%r10d
+	xorl	2(%r14,%rdi,8),%r11d
+	xorl	2(%r14,%rbp,8),%r12d
+
+	movzbl	%dh,%esi
+	movzbl	%ah,%edi
+	movzbl	%bl,%ebp
+	xorl	1(%r14,%rsi,8),%r10d
+	xorl	1(%r14,%rdi,8),%r11d
+	xorl	2(%r14,%rbp,8),%r8d
+
+	movl	12(%r15),%edx
+	movzbl	%bh,%edi
+	movzbl	%ch,%ebp
+	movl	0(%r15),%eax
+	xorl	1(%r14,%rdi,8),%r12d
+	xorl	1(%r14,%rbp,8),%r8d
+
+	movl	4(%r15),%ebx
+	movl	8(%r15),%ecx
+	xorl	%r10d,%eax
+	xorl	%r11d,%ebx
+	xorl	%r12d,%ecx
+	xorl	%r8d,%edx
+	subl	$1,%r13d
+	jnz	.Lenc_loop
+	movzbl	%al,%esi
+	movzbl	%bl,%edi
+	movzbl	%cl,%ebp
+	movzbl	2(%r14,%rsi,8),%r10d
+	movzbl	2(%r14,%rdi,8),%r11d
+	movzbl	2(%r14,%rbp,8),%r12d
+
+	movzbl	%dl,%esi
+	movzbl	%bh,%edi
+	movzbl	%ch,%ebp
+	movzbl	2(%r14,%rsi,8),%r8d
+	movl	0(%r14,%rdi,8),%edi
+	movl	0(%r14,%rbp,8),%ebp
+
+	andl	$65280,%edi
+	andl	$65280,%ebp
+
+	xorl	%edi,%r10d
+	xorl	%ebp,%r11d
+	shrl	$16,%ecx
+
+	movzbl	%dh,%esi
+	movzbl	%ah,%edi
+	shrl	$16,%edx
+	movl	0(%r14,%rsi,8),%esi
+	movl	0(%r14,%rdi,8),%edi
+
+	andl	$65280,%esi
+	andl	$65280,%edi
+	shrl	$16,%ebx
+	xorl	%esi,%r12d
+	xorl	%edi,%r8d
+	shrl	$16,%eax
+
+	movzbl	%cl,%esi
+	movzbl	%dl,%edi
+	movzbl	%al,%ebp
+	movl	0(%r14,%rsi,8),%esi
+	movl	0(%r14,%rdi,8),%edi
+	movl	0(%r14,%rbp,8),%ebp
+
+	andl	$16711680,%esi
+	andl	$16711680,%edi
+	andl	$16711680,%ebp
+
+	xorl	%esi,%r10d
+	xorl	%edi,%r11d
+	xorl	%ebp,%r12d
+
+	movzbl	%bl,%esi
+	movzbl	%dh,%edi
+	movzbl	%ah,%ebp
+	movl	0(%r14,%rsi,8),%esi
+	movl	2(%r14,%rdi,8),%edi
+	movl	2(%r14,%rbp,8),%ebp
+
+	andl	$16711680,%esi
+	andl	$4278190080,%edi
+	andl	$4278190080,%ebp
+
+	xorl	%esi,%r8d
+	xorl	%edi,%r10d
+	xorl	%ebp,%r11d
+
+	movzbl	%bh,%esi
+	movzbl	%ch,%edi
+	movl	16+12(%r15),%edx
+	movl	2(%r14,%rsi,8),%esi
+	movl	2(%r14,%rdi,8),%edi
+	movl	16+0(%r15),%eax
+
+	andl	$4278190080,%esi
+	andl	$4278190080,%edi
+
+	xorl	%esi,%r12d
+	xorl	%edi,%r8d
+
+	movl	16+4(%r15),%ebx
+	movl	16+8(%r15),%ecx
+	xorl	%r10d,%eax
+	xorl	%r11d,%ebx
+	xorl	%r12d,%ecx
+	xorl	%r8d,%edx
+.byte	0xf3,0xc3			
+.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
+.type	_x86_64_AES_encrypt_compact,@function
+.align	16
+_x86_64_AES_encrypt_compact:
+	leaq	128(%r14),%r8
+	movl	0-128(%r8),%edi
+	movl	32-128(%r8),%ebp
+	movl	64-128(%r8),%r10d
+	movl	96-128(%r8),%r11d
+	movl	128-128(%r8),%edi
+	movl	160-128(%r8),%ebp
+	movl	192-128(%r8),%r10d
+	movl	224-128(%r8),%r11d
+	jmp	.Lenc_loop_compact
+.align	16
+.Lenc_loop_compact:
+	xorl	0(%r15),%eax
+	xorl	4(%r15),%ebx
+	xorl	8(%r15),%ecx
+	xorl	12(%r15),%edx
+	leaq	16(%r15),%r15
+	movzbl	%al,%r10d
+	movzbl	%bl,%r11d
+	movzbl	%cl,%r12d
+	movzbl	(%r14,%r10,1),%r10d
+	movzbl	(%r14,%r11,1),%r11d
+	movzbl	(%r14,%r12,1),%r12d
+
+	movzbl	%dl,%r8d
+	movzbl	%bh,%esi
+	movzbl	%ch,%edi
+	movzbl	(%r14,%r8,1),%r8d
+	movzbl	(%r14,%rsi,1),%r9d
+	movzbl	(%r14,%rdi,1),%r13d
+
+	movzbl	%dh,%ebp
+	movzbl	%ah,%esi
+	shrl	$16,%ecx
+	movzbl	(%r14,%rbp,1),%ebp
+	movzbl	(%r14,%rsi,1),%esi
+	shrl	$16,%edx
+
+	movzbl	%cl,%edi
+	shll	$8,%r9d
+	shll	$8,%r13d
+	movzbl	(%r14,%rdi,1),%edi
+	xorl	%r9d,%r10d
+	xorl	%r13d,%r11d
+
+	movzbl	%dl,%r9d
+	shrl	$16,%eax
+	shrl	$16,%ebx
+	movzbl	%al,%r13d
+	shll	$8,%ebp
+	shll	$8,%esi
+	movzbl	(%r14,%r9,1),%r9d
+	movzbl	(%r14,%r13,1),%r13d
+	xorl	%ebp,%r12d
+	xorl	%esi,%r8d
+
+	movzbl	%bl,%ebp
+	movzbl	%dh,%esi
+	shll	$16,%edi
+	movzbl	(%r14,%rbp,1),%ebp
+	movzbl	(%r14,%rsi,1),%esi
+	xorl	%edi,%r10d
+
+	movzbl	%ah,%edi
+	shrl	$8,%ecx
+	shrl	$8,%ebx
+	movzbl	(%r14,%rdi,1),%edi
+	movzbl	(%r14,%rcx,1),%edx
+	movzbl	(%r14,%rbx,1),%ecx
+	shll	$16,%r9d
+	shll	$16,%r13d
+	shll	$16,%ebp
+	xorl	%r9d,%r11d
+	xorl	%r13d,%r12d
+	xorl	%ebp,%r8d
+
+	shll	$24,%esi
+	shll	$24,%edi
+	shll	$24,%edx
+	xorl	%esi,%r10d
+	shll	$24,%ecx
+	xorl	%edi,%r11d
+	movl	%r10d,%eax
+	movl	%r11d,%ebx
+	xorl	%r12d,%ecx
+	xorl	%r8d,%edx
+	cmpq	16(%rsp),%r15
+	je	.Lenc_compact_done
+	movl	%eax,%esi
+	movl	%ebx,%edi
+	andl	$2155905152,%esi
+	andl	$2155905152,%edi
+	movl	%esi,%r10d
+	movl	%edi,%r11d
+	shrl	$7,%r10d
+	leal	(%rax,%rax,1),%r8d
+	shrl	$7,%r11d
+	leal	(%rbx,%rbx,1),%r9d
+	subl	%r10d,%esi
+	subl	%r11d,%edi
+	andl	$4278124286,%r8d
+	andl	$4278124286,%r9d
+	andl	$454761243,%esi
+	andl	$454761243,%edi
+	movl	%eax,%r10d
+	movl	%ebx,%r11d
+	xorl	%esi,%r8d
+	xorl	%edi,%r9d
+
+	xorl	%r8d,%eax
+	xorl	%r9d,%ebx
+	movl	%ecx,%esi
+	movl	%edx,%edi
+	roll	$24,%eax
+	roll	$24,%ebx
+	andl	$2155905152,%esi
+	andl	$2155905152,%edi
+	xorl	%r8d,%eax
+	xorl	%r9d,%ebx
+	movl	%esi,%r12d
+	movl	%edi,%ebp
+	rorl	$16,%r10d
+	rorl	$16,%r11d
+	shrl	$7,%r12d
+	leal	(%rcx,%rcx,1),%r8d
+	xorl	%r10d,%eax
+	xorl	%r11d,%ebx
+	shrl	$7,%ebp
+	leal	(%rdx,%rdx,1),%r9d
+	rorl	$8,%r10d
+	rorl	$8,%r11d
+	subl	%r12d,%esi
+	subl	%ebp,%edi
+	xorl	%r10d,%eax
+	xorl	%r11d,%ebx
+
+	andl	$4278124286,%r8d
+	andl	$4278124286,%r9d
+	andl	$454761243,%esi
+	andl	$454761243,%edi
+	movl	%ecx,%r12d
+	movl	%edx,%ebp
+	xorl	%esi,%r8d
+	xorl	%edi,%r9d
+
+	xorl	%r8d,%ecx
+	xorl	%r9d,%edx
+	roll	$24,%ecx
+	roll	$24,%edx
+	xorl	%r8d,%ecx
+	xorl	%r9d,%edx
+	movl	0(%r14),%esi
+	rorl	$16,%r12d
+	rorl	$16,%ebp
+	movl	64(%r14),%edi
+	xorl	%r12d,%ecx
+	xorl	%ebp,%edx
+	movl	128(%r14),%r8d
+	rorl	$8,%r12d
+	rorl	$8,%ebp
+	movl	192(%r14),%r9d
+	xorl	%r12d,%ecx
+	xorl	%ebp,%edx
+	jmp	.Lenc_loop_compact
+.align	16
+.Lenc_compact_done:
+	xorl	0(%r15),%eax
+	xorl	4(%r15),%ebx
+	xorl	8(%r15),%ecx
+	xorl	12(%r15),%edx
+.byte	0xf3,0xc3			
+.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+.globl	AES_encrypt
+.type	AES_encrypt,@function
+.align	16
+.globl	asm_AES_encrypt
+.hidden	asm_AES_encrypt
+asm_AES_encrypt:
+AES_encrypt:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+	movq	%rsp,%r10
+	leaq	-63(%rdx),%rcx
+	andq	$-64,%rsp
+	subq	%rsp,%rcx
+	negq	%rcx
+	andq	$960,%rcx
+	subq	%rcx,%rsp
+	subq	$32,%rsp
+
+	movq	%rsi,16(%rsp)
+	movq	%r10,24(%rsp)
+.Lenc_prologue:
+
+	movq	%rdx,%r15
+	movl	240(%r15),%r13d
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+
+	shll	$4,%r13d
+	leaq	(%r15,%r13,1),%rbp
+	movq	%r15,(%rsp)
+	movq	%rbp,8(%rsp)
+
+
+	leaq	.LAES_Te+2048(%rip),%r14
+	leaq	768(%rsp),%rbp
+	subq	%r14,%rbp
+	andq	$768,%rbp
+	leaq	(%r14,%rbp,1),%r14
+
+	call	_x86_64_AES_encrypt_compact
+
+	movq	16(%rsp),%r9
+	movq	24(%rsp),%rsi
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lenc_epilogue:
+	.byte	0xf3,0xc3
+.size	AES_encrypt,.-AES_encrypt
+.type	_x86_64_AES_decrypt,@function
+.align	16
+_x86_64_AES_decrypt:
+	xorl	0(%r15),%eax
+	xorl	4(%r15),%ebx
+	xorl	8(%r15),%ecx
+	xorl	12(%r15),%edx
+
+	movl	240(%r15),%r13d
+	subl	$1,%r13d
+	jmp	.Ldec_loop
+.align	16
+.Ldec_loop:
+
+	movzbl	%al,%esi
+	movzbl	%bl,%edi
+	movzbl	%cl,%ebp
+	movl	0(%r14,%rsi,8),%r10d
+	movl	0(%r14,%rdi,8),%r11d
+	movl	0(%r14,%rbp,8),%r12d
+
+	movzbl	%dh,%esi
+	movzbl	%ah,%edi
+	movzbl	%dl,%ebp
+	xorl	3(%r14,%rsi,8),%r10d
+	xorl	3(%r14,%rdi,8),%r11d
+	movl	0(%r14,%rbp,8),%r8d
+
+	movzbl	%bh,%esi
+	shrl	$16,%eax
+	movzbl	%ch,%ebp
+	xorl	3(%r14,%rsi,8),%r12d
+	shrl	$16,%edx
+	xorl	3(%r14,%rbp,8),%r8d
+
+	shrl	$16,%ebx
+	leaq	16(%r15),%r15
+	shrl	$16,%ecx
+
+	movzbl	%cl,%esi
+	movzbl	%dl,%edi
+	movzbl	%al,%ebp
+	xorl	2(%r14,%rsi,8),%r10d
+	xorl	2(%r14,%rdi,8),%r11d
+	xorl	2(%r14,%rbp,8),%r12d
+
+	movzbl	%bh,%esi
+	movzbl	%ch,%edi
+	movzbl	%bl,%ebp
+	xorl	1(%r14,%rsi,8),%r10d
+	xorl	1(%r14,%rdi,8),%r11d
+	xorl	2(%r14,%rbp,8),%r8d
+
+	movzbl	%dh,%esi
+	movl	12(%r15),%edx
+	movzbl	%ah,%ebp
+	xorl	1(%r14,%rsi,8),%r12d
+	movl	0(%r15),%eax
+	xorl	1(%r14,%rbp,8),%r8d
+
+	xorl	%r10d,%eax
+	movl	4(%r15),%ebx
+	movl	8(%r15),%ecx
+	xorl	%r12d,%ecx
+	xorl	%r11d,%ebx
+	xorl	%r8d,%edx
+	subl	$1,%r13d
+	jnz	.Ldec_loop
+	leaq	2048(%r14),%r14
+	movzbl	%al,%esi
+	movzbl	%bl,%edi
+	movzbl	%cl,%ebp
+	movzbl	(%r14,%rsi,1),%r10d
+	movzbl	(%r14,%rdi,1),%r11d
+	movzbl	(%r14,%rbp,1),%r12d
+
+	movzbl	%dl,%esi
+	movzbl	%dh,%edi
+	movzbl	%ah,%ebp
+	movzbl	(%r14,%rsi,1),%r8d
+	movzbl	(%r14,%rdi,1),%edi
+	movzbl	(%r14,%rbp,1),%ebp
+
+	shll	$8,%edi
+	shll	$8,%ebp
+
+	xorl	%edi,%r10d
+	xorl	%ebp,%r11d
+	shrl	$16,%edx
+
+	movzbl	%bh,%esi
+	movzbl	%ch,%edi
+	shrl	$16,%eax
+	movzbl	(%r14,%rsi,1),%esi
+	movzbl	(%r14,%rdi,1),%edi
+
+	shll	$8,%esi
+	shll	$8,%edi
+	shrl	$16,%ebx
+	xorl	%esi,%r12d
+	xorl	%edi,%r8d
+	shrl	$16,%ecx
+
+	movzbl	%cl,%esi
+	movzbl	%dl,%edi
+	movzbl	%al,%ebp
+	movzbl	(%r14,%rsi,1),%esi
+	movzbl	(%r14,%rdi,1),%edi
+	movzbl	(%r14,%rbp,1),%ebp
+
+	shll	$16,%esi
+	shll	$16,%edi
+	shll	$16,%ebp
+
+	xorl	%esi,%r10d
+	xorl	%edi,%r11d
+	xorl	%ebp,%r12d
+
+	movzbl	%bl,%esi
+	movzbl	%bh,%edi
+	movzbl	%ch,%ebp
+	movzbl	(%r14,%rsi,1),%esi
+	movzbl	(%r14,%rdi,1),%edi
+	movzbl	(%r14,%rbp,1),%ebp
+
+	shll	$16,%esi
+	shll	$24,%edi
+	shll	$24,%ebp
+
+	xorl	%esi,%r8d
+	xorl	%edi,%r10d
+	xorl	%ebp,%r11d
+
+	movzbl	%dh,%esi
+	movzbl	%ah,%edi
+	movl	16+12(%r15),%edx
+	movzbl	(%r14,%rsi,1),%esi
+	movzbl	(%r14,%rdi,1),%edi
+	movl	16+0(%r15),%eax
+
+	shll	$24,%esi
+	shll	$24,%edi
+
+	xorl	%esi,%r12d
+	xorl	%edi,%r8d
+
+	movl	16+4(%r15),%ebx
+	movl	16+8(%r15),%ecx
+	leaq	-2048(%r14),%r14
+	xorl	%r10d,%eax
+	xorl	%r11d,%ebx
+	xorl	%r12d,%ecx
+	xorl	%r8d,%edx
+.byte	0xf3,0xc3			
+.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
+.type	_x86_64_AES_decrypt_compact,@function
+.align	16
+_x86_64_AES_decrypt_compact:
+	leaq	128(%r14),%r8
+	movl	0-128(%r8),%edi
+	movl	32-128(%r8),%ebp
+	movl	64-128(%r8),%r10d
+	movl	96-128(%r8),%r11d
+	movl	128-128(%r8),%edi
+	movl	160-128(%r8),%ebp
+	movl	192-128(%r8),%r10d
+	movl	224-128(%r8),%r11d
+	jmp	.Ldec_loop_compact
+
+.align	16
+.Ldec_loop_compact:
+	xorl	0(%r15),%eax
+	xorl	4(%r15),%ebx
+	xorl	8(%r15),%ecx
+	xorl	12(%r15),%edx
+	leaq	16(%r15),%r15
+	movzbl	%al,%r10d
+	movzbl	%bl,%r11d
+	movzbl	%cl,%r12d
+	movzbl	(%r14,%r10,1),%r10d
+	movzbl	(%r14,%r11,1),%r11d
+	movzbl	(%r14,%r12,1),%r12d
+
+	movzbl	%dl,%r8d
+	movzbl	%dh,%esi
+	movzbl	%ah,%edi
+	movzbl	(%r14,%r8,1),%r8d
+	movzbl	(%r14,%rsi,1),%r9d
+	movzbl	(%r14,%rdi,1),%r13d
+
+	movzbl	%bh,%ebp
+	movzbl	%ch,%esi
+	shrl	$16,%ecx
+	movzbl	(%r14,%rbp,1),%ebp
+	movzbl	(%r14,%rsi,1),%esi
+	shrl	$16,%edx
+
+	movzbl	%cl,%edi
+	shll	$8,%r9d
+	shll	$8,%r13d
+	movzbl	(%r14,%rdi,1),%edi
+	xorl	%r9d,%r10d
+	xorl	%r13d,%r11d
+
+	movzbl	%dl,%r9d
+	shrl	$16,%eax
+	shrl	$16,%ebx
+	movzbl	%al,%r13d
+	shll	$8,%ebp
+	shll	$8,%esi
+	movzbl	(%r14,%r9,1),%r9d
+	movzbl	(%r14,%r13,1),%r13d
+	xorl	%ebp,%r12d
+	xorl	%esi,%r8d
+
+	movzbl	%bl,%ebp
+	movzbl	%bh,%esi
+	shll	$16,%edi
+	movzbl	(%r14,%rbp,1),%ebp
+	movzbl	(%r14,%rsi,1),%esi
+	xorl	%edi,%r10d
+
+	movzbl	%ch,%edi
+	shll	$16,%r9d
+	shll	$16,%r13d
+	movzbl	(%r14,%rdi,1),%ebx
+	xorl	%r9d,%r11d
+	xorl	%r13d,%r12d
+
+	movzbl	%dh,%edi
+	shrl	$8,%eax
+	shll	$16,%ebp
+	movzbl	(%r14,%rdi,1),%ecx
+	movzbl	(%r14,%rax,1),%edx
+	xorl	%ebp,%r8d
+
+	shll	$24,%esi
+	shll	$24,%ebx
+	shll	$24,%ecx
+	xorl	%esi,%r10d
+	shll	$24,%edx
+	xorl	%r11d,%ebx
+	movl	%r10d,%eax
+	xorl	%r12d,%ecx
+	xorl	%r8d,%edx
+	cmpq	16(%rsp),%r15
+	je	.Ldec_compact_done
+
+	movq	256+0(%r14),%rsi
+	shlq	$32,%rbx
+	shlq	$32,%rdx
+	movq	256+8(%r14),%rdi
+	orq	%rbx,%rax
+	orq	%rdx,%rcx
+	movq	256+16(%r14),%rbp
+	movq	%rax,%rbx
+	movq	%rcx,%rdx
+	andq	%rsi,%rbx
+	andq	%rsi,%rdx
+	movq	%rbx,%r9
+	movq	%rdx,%r12
+	shrq	$7,%r9
+	leaq	(%rax,%rax,1),%r8
+	shrq	$7,%r12
+	leaq	(%rcx,%rcx,1),%r11
+	subq	%r9,%rbx
+	subq	%r12,%rdx
+	andq	%rdi,%r8
+	andq	%rdi,%r11
+	andq	%rbp,%rbx
+	andq	%rbp,%rdx
+	xorq	%r8,%rbx
+	xorq	%r11,%rdx
+	movq	%rbx,%r8
+	movq	%rdx,%r11
+
+	andq	%rsi,%rbx
+	andq	%rsi,%rdx
+	movq	%rbx,%r10
+	movq	%rdx,%r13
+	shrq	$7,%r10
+	leaq	(%r8,%r8,1),%r9
+	shrq	$7,%r13
+	leaq	(%r11,%r11,1),%r12
+	subq	%r10,%rbx
+	subq	%r13,%rdx
+	andq	%rdi,%r9
+	andq	%rdi,%r12
+	andq	%rbp,%rbx
+	andq	%rbp,%rdx
+	xorq	%r9,%rbx
+	xorq	%r12,%rdx
+	movq	%rbx,%r9
+	movq	%rdx,%r12
+
+	andq	%rsi,%rbx
+	andq	%rsi,%rdx
+	movq	%rbx,%r10
+	movq	%rdx,%r13
+	shrq	$7,%r10
+	xorq	%rax,%r8
+	shrq	$7,%r13
+	xorq	%rcx,%r11
+	subq	%r10,%rbx
+	subq	%r13,%rdx
+	leaq	(%r9,%r9,1),%r10
+	leaq	(%r12,%r12,1),%r13
+	xorq	%rax,%r9
+	xorq	%rcx,%r12
+	andq	%rdi,%r10
+	andq	%rdi,%r13
+	andq	%rbp,%rbx
+	andq	%rbp,%rdx
+	xorq	%rbx,%r10
+	xorq	%rdx,%r13
+
+	xorq	%r10,%rax
+	xorq	%r13,%rcx
+	xorq	%r10,%r8
+	xorq	%r13,%r11
+	movq	%rax,%rbx
+	movq	%rcx,%rdx
+	xorq	%r10,%r9
+	xorq	%r13,%r12
+	shrq	$32,%rbx
+	shrq	$32,%rdx
+	xorq	%r8,%r10
+	xorq	%r11,%r13
+	roll	$8,%eax
+	roll	$8,%ecx
+	xorq	%r9,%r10
+	xorq	%r12,%r13
+
+	roll	$8,%ebx
+	roll	$8,%edx
+	xorl	%r10d,%eax
+	xorl	%r13d,%ecx
+	shrq	$32,%r10
+	shrq	$32,%r13
+	xorl	%r10d,%ebx
+	xorl	%r13d,%edx
+
+	movq	%r8,%r10
+	movq	%r11,%r13
+	shrq	$32,%r10
+	shrq	$32,%r13
+	roll	$24,%r8d
+	roll	$24,%r11d
+	roll	$24,%r10d
+	roll	$24,%r13d
+	xorl	%r8d,%eax
+	xorl	%r11d,%ecx
+	movq	%r9,%r8
+	movq	%r12,%r11
+	xorl	%r10d,%ebx
+	xorl	%r13d,%edx
+
+	movq	0(%r14),%rsi
+	shrq	$32,%r8
+	shrq	$32,%r11
+	movq	64(%r14),%rdi
+	roll	$16,%r9d
+	roll	$16,%r12d
+	movq	128(%r14),%rbp
+	roll	$16,%r8d
+	roll	$16,%r11d
+	movq	192(%r14),%r10
+	xorl	%r9d,%eax
+	xorl	%r12d,%ecx
+	movq	256(%r14),%r13
+	xorl	%r8d,%ebx
+	xorl	%r11d,%edx
+	jmp	.Ldec_loop_compact
+.align	16
+.Ldec_compact_done:
+	xorl	0(%r15),%eax
+	xorl	4(%r15),%ebx
+	xorl	8(%r15),%ecx
+	xorl	12(%r15),%edx
+.byte	0xf3,0xc3			
+.size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+.globl	AES_decrypt
+.type	AES_decrypt,@function
+.align	16
+.globl	asm_AES_decrypt
+.hidden	asm_AES_decrypt
+asm_AES_decrypt:
+AES_decrypt:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+	movq	%rsp,%r10
+	leaq	-63(%rdx),%rcx
+	andq	$-64,%rsp
+	subq	%rsp,%rcx
+	negq	%rcx
+	andq	$960,%rcx
+	subq	%rcx,%rsp
+	subq	$32,%rsp
+
+	movq	%rsi,16(%rsp)
+	movq	%r10,24(%rsp)
+.Ldec_prologue:
+
+	movq	%rdx,%r15
+	movl	240(%r15),%r13d
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+
+	shll	$4,%r13d
+	leaq	(%r15,%r13,1),%rbp
+	movq	%r15,(%rsp)
+	movq	%rbp,8(%rsp)
+
+
+	leaq	.LAES_Td+2048(%rip),%r14
+	leaq	768(%rsp),%rbp
+	subq	%r14,%rbp
+	andq	$768,%rbp
+	leaq	(%r14,%rbp,1),%r14
+	shrq	$3,%rbp
+	addq	%rbp,%r14
+
+	call	_x86_64_AES_decrypt_compact
+
+	movq	16(%rsp),%r9
+	movq	24(%rsp),%rsi
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Ldec_epilogue:
+	.byte	0xf3,0xc3
+.size	AES_decrypt,.-AES_decrypt
+.globl	private_AES_set_encrypt_key
+.type	private_AES_set_encrypt_key,@function
+.align	16
+private_AES_set_encrypt_key:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$8,%rsp
+.Lenc_key_prologue:
+
+	call	_x86_64_AES_set_encrypt_key
+
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbp
+	movq	48(%rsp),%rbx
+	addq	$56,%rsp
+.Lenc_key_epilogue:
+	.byte	0xf3,0xc3
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+
+.type	_x86_64_AES_set_encrypt_key,@function
+.align	16
+_x86_64_AES_set_encrypt_key:
+	movl	%esi,%ecx
+	movq	%rdi,%rsi
+	movq	%rdx,%rdi
+
+	testq	$-1,%rsi
+	jz	.Lbadpointer
+	testq	$-1,%rdi
+	jz	.Lbadpointer
+
+	leaq	.LAES_Te(%rip),%rbp
+	leaq	2048+128(%rbp),%rbp
+
+
+	movl	0-128(%rbp),%eax
+	movl	32-128(%rbp),%ebx
+	movl	64-128(%rbp),%r8d
+	movl	96-128(%rbp),%edx
+	movl	128-128(%rbp),%eax
+	movl	160-128(%rbp),%ebx
+	movl	192-128(%rbp),%r8d
+	movl	224-128(%rbp),%edx
+
+	cmpl	$128,%ecx
+	je	.L10rounds
+	cmpl	$192,%ecx
+	je	.L12rounds
+	cmpl	$256,%ecx
+	je	.L14rounds
+	movq	$-2,%rax
+	jmp	.Lexit
+
+.L10rounds:
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%rdx
+	movq	%rax,0(%rdi)
+	movq	%rdx,8(%rdi)
+
+	shrq	$32,%rdx
+	xorl	%ecx,%ecx
+	jmp	.L10shortcut
+.align	4
+.L10loop:
+	movl	0(%rdi),%eax
+	movl	12(%rdi),%edx
+.L10shortcut:
+	movzbl	%dl,%esi
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$24,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shrl	$16,%edx
+	movzbl	%dl,%esi
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$8,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shll	$16,%ebx
+	xorl	%ebx,%eax
+
+	xorl	1024-128(%rbp,%rcx,4),%eax
+	movl	%eax,16(%rdi)
+	xorl	4(%rdi),%eax
+	movl	%eax,20(%rdi)
+	xorl	8(%rdi),%eax
+	movl	%eax,24(%rdi)
+	xorl	12(%rdi),%eax
+	movl	%eax,28(%rdi)
+	addl	$1,%ecx
+	leaq	16(%rdi),%rdi
+	cmpl	$10,%ecx
+	jl	.L10loop
+
+	movl	$10,80(%rdi)
+	xorq	%rax,%rax
+	jmp	.Lexit
+
+.L12rounds:
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%rbx
+	movq	16(%rsi),%rdx
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rdx,16(%rdi)
+
+	shrq	$32,%rdx
+	xorl	%ecx,%ecx
+	jmp	.L12shortcut
+.align	4
+.L12loop:
+	movl	0(%rdi),%eax
+	movl	20(%rdi),%edx
+.L12shortcut:
+	movzbl	%dl,%esi
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$24,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shrl	$16,%edx
+	movzbl	%dl,%esi
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$8,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shll	$16,%ebx
+	xorl	%ebx,%eax
+
+	xorl	1024-128(%rbp,%rcx,4),%eax
+	movl	%eax,24(%rdi)
+	xorl	4(%rdi),%eax
+	movl	%eax,28(%rdi)
+	xorl	8(%rdi),%eax
+	movl	%eax,32(%rdi)
+	xorl	12(%rdi),%eax
+	movl	%eax,36(%rdi)
+
+	cmpl	$7,%ecx
+	je	.L12break
+	addl	$1,%ecx
+
+	xorl	16(%rdi),%eax
+	movl	%eax,40(%rdi)
+	xorl	20(%rdi),%eax
+	movl	%eax,44(%rdi)
+
+	leaq	24(%rdi),%rdi
+	jmp	.L12loop
+.L12break:
+	movl	$12,72(%rdi)
+	xorq	%rax,%rax
+	jmp	.Lexit
+
+.L14rounds:
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%rbx
+	movq	16(%rsi),%rcx
+	movq	24(%rsi),%rdx
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+
+	shrq	$32,%rdx
+	xorl	%ecx,%ecx
+	jmp	.L14shortcut
+.align	4
+.L14loop:
+	movl	0(%rdi),%eax
+	movl	28(%rdi),%edx
+.L14shortcut:
+	movzbl	%dl,%esi
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$24,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shrl	$16,%edx
+	movzbl	%dl,%esi
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$8,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shll	$16,%ebx
+	xorl	%ebx,%eax
+
+	xorl	1024-128(%rbp,%rcx,4),%eax
+	movl	%eax,32(%rdi)
+	xorl	4(%rdi),%eax
+	movl	%eax,36(%rdi)
+	xorl	8(%rdi),%eax
+	movl	%eax,40(%rdi)
+	xorl	12(%rdi),%eax
+	movl	%eax,44(%rdi)
+
+	cmpl	$6,%ecx
+	je	.L14break
+	addl	$1,%ecx
+
+	movl	%eax,%edx
+	movl	16(%rdi),%eax
+	movzbl	%dl,%esi
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shrl	$16,%edx
+	shll	$8,%ebx
+	movzbl	%dl,%esi
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	movzbl	%dh,%esi
+	shll	$16,%ebx
+	xorl	%ebx,%eax
+
+	movzbl	-128(%rbp,%rsi,1),%ebx
+	shll	$24,%ebx
+	xorl	%ebx,%eax
+
+	movl	%eax,48(%rdi)
+	xorl	20(%rdi),%eax
+	movl	%eax,52(%rdi)
+	xorl	24(%rdi),%eax
+	movl	%eax,56(%rdi)
+	xorl	28(%rdi),%eax
+	movl	%eax,60(%rdi)
+
+	leaq	32(%rdi),%rdi
+	jmp	.L14loop
+.L14break:
+	movl	$14,48(%rdi)
+	xorq	%rax,%rax
+	jmp	.Lexit
+
+.Lbadpointer:
+	movq	$-1,%rax
+.Lexit:
+.byte	0xf3,0xc3			
+.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
+.globl	private_AES_set_decrypt_key
+.type	private_AES_set_decrypt_key,@function
+.align	16
+private_AES_set_decrypt_key:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdx
+.Ldec_key_prologue:
+
+	call	_x86_64_AES_set_encrypt_key
+	movq	(%rsp),%r8
+	cmpl	$0,%eax
+	jne	.Labort
+
+	movl	240(%r8),%r14d
+	xorq	%rdi,%rdi
+	leaq	(%rdi,%r14,4),%rcx
+	movq	%r8,%rsi
+	leaq	(%r8,%rcx,4),%rdi
+.align	4
+.Linvert:
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%rbx
+	movq	0(%rdi),%rcx
+	movq	8(%rdi),%rdx
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,0(%rsi)
+	movq	%rdx,8(%rsi)
+	leaq	16(%rsi),%rsi
+	leaq	-16(%rdi),%rdi
+	cmpq	%rsi,%rdi
+	jne	.Linvert
+
+	leaq	.LAES_Te+2048+1024(%rip),%rax
+
+	movq	40(%rax),%rsi
+	movq	48(%rax),%rdi
+	movq	56(%rax),%rbp
+
+	movq	%r8,%r15
+	subl	$1,%r14d
+.align	4
+.Lpermute:
+	leaq	16(%r15),%r15
+	movq	0(%r15),%rax
+	movq	8(%r15),%rcx
+	movq	%rax,%rbx
+	movq	%rcx,%rdx
+	andq	%rsi,%rbx
+	andq	%rsi,%rdx
+	movq	%rbx,%r9
+	movq	%rdx,%r12
+	shrq	$7,%r9
+	leaq	(%rax,%rax,1),%r8
+	shrq	$7,%r12
+	leaq	(%rcx,%rcx,1),%r11
+	subq	%r9,%rbx
+	subq	%r12,%rdx
+	andq	%rdi,%r8
+	andq	%rdi,%r11
+	andq	%rbp,%rbx
+	andq	%rbp,%rdx
+	xorq	%r8,%rbx
+	xorq	%r11,%rdx
+	movq	%rbx,%r8
+	movq	%rdx,%r11
+
+	andq	%rsi,%rbx
+	andq	%rsi,%rdx
+	movq	%rbx,%r10
+	movq	%rdx,%r13
+	shrq	$7,%r10
+	leaq	(%r8,%r8,1),%r9
+	shrq	$7,%r13
+	leaq	(%r11,%r11,1),%r12
+	subq	%r10,%rbx
+	subq	%r13,%rdx
+	andq	%rdi,%r9
+	andq	%rdi,%r12
+	andq	%rbp,%rbx
+	andq	%rbp,%rdx
+	xorq	%r9,%rbx
+	xorq	%r12,%rdx
+	movq	%rbx,%r9
+	movq	%rdx,%r12
+
+	andq	%rsi,%rbx
+	andq	%rsi,%rdx
+	movq	%rbx,%r10
+	movq	%rdx,%r13
+	shrq	$7,%r10
+	xorq	%rax,%r8
+	shrq	$7,%r13
+	xorq	%rcx,%r11
+	subq	%r10,%rbx
+	subq	%r13,%rdx
+	leaq	(%r9,%r9,1),%r10
+	leaq	(%r12,%r12,1),%r13
+	xorq	%rax,%r9
+	xorq	%rcx,%r12
+	andq	%rdi,%r10
+	andq	%rdi,%r13
+	andq	%rbp,%rbx
+	andq	%rbp,%rdx
+	xorq	%rbx,%r10
+	xorq	%rdx,%r13
+
+	xorq	%r10,%rax
+	xorq	%r13,%rcx
+	xorq	%r10,%r8
+	xorq	%r13,%r11
+	movq	%rax,%rbx
+	movq	%rcx,%rdx
+	xorq	%r10,%r9
+	xorq	%r13,%r12
+	shrq	$32,%rbx
+	shrq	$32,%rdx
+	xorq	%r8,%r10
+	xorq	%r11,%r13
+	roll	$8,%eax
+	roll	$8,%ecx
+	xorq	%r9,%r10
+	xorq	%r12,%r13
+
+	roll	$8,%ebx
+	roll	$8,%edx
+	xorl	%r10d,%eax
+	xorl	%r13d,%ecx
+	shrq	$32,%r10
+	shrq	$32,%r13
+	xorl	%r10d,%ebx
+	xorl	%r13d,%edx
+
+	movq	%r8,%r10
+	movq	%r11,%r13
+	shrq	$32,%r10
+	shrq	$32,%r13
+	roll	$24,%r8d
+	roll	$24,%r11d
+	roll	$24,%r10d
+	roll	$24,%r13d
+	xorl	%r8d,%eax
+	xorl	%r11d,%ecx
+	movq	%r9,%r8
+	movq	%r12,%r11
+	xorl	%r10d,%ebx
+	xorl	%r13d,%edx
+
+
+	shrq	$32,%r8
+	shrq	$32,%r11
+
+	roll	$16,%r9d
+	roll	$16,%r12d
+
+	roll	$16,%r8d
+	roll	$16,%r11d
+
+	xorl	%r9d,%eax
+	xorl	%r12d,%ecx
+
+	xorl	%r8d,%ebx
+	xorl	%r11d,%edx
+	movl	%eax,0(%r15)
+	movl	%ebx,4(%r15)
+	movl	%ecx,8(%r15)
+	movl	%edx,12(%r15)
+	subl	$1,%r14d
+	jnz	.Lpermute
+
+	xorq	%rax,%rax
+.Labort:
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbp
+	movq	48(%rsp),%rbx
+	addq	$56,%rsp
+.Ldec_key_epilogue:
+	.byte	0xf3,0xc3
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.globl	AES_cbc_encrypt
+.type	AES_cbc_encrypt,@function
+.align	16
+
+.globl	asm_AES_cbc_encrypt
+.hidden	asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
+AES_cbc_encrypt:
+	cmpq	$0,%rdx
+	je	.Lcbc_epilogue
+	pushfq
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+.Lcbc_prologue:
+
+	cld
+	movl	%r9d,%r9d
+
+	leaq	.LAES_Te(%rip),%r14
+	cmpq	$0,%r9
+	jne	.Lcbc_picked_te
+	leaq	.LAES_Td(%rip),%r14
+.Lcbc_picked_te:
+
+	movl	OPENSSL_ia32cap_P(%rip),%r10d
+	cmpq	$512,%rdx
+	jb	.Lcbc_slow_prologue
+	testq	$15,%rdx
+	jnz	.Lcbc_slow_prologue
+	btl	$28,%r10d
+	jc	.Lcbc_slow_prologue
+
+
+	leaq	-88-248(%rsp),%r15
+	andq	$-64,%r15
+
+
+	movq	%r14,%r10
+	leaq	2304(%r14),%r11
+	movq	%r15,%r12
+	andq	$4095,%r10
+	andq	$4095,%r11
+	andq	$4095,%r12
+
+	cmpq	%r11,%r12
+	jb	.Lcbc_te_break_out
+	subq	%r11,%r12
+	subq	%r12,%r15
+	jmp	.Lcbc_te_ok
+.Lcbc_te_break_out:
+	subq	%r10,%r12
+	andq	$4095,%r12
+	addq	$320,%r12
+	subq	%r12,%r15
+.align	4
+.Lcbc_te_ok:
+
+	xchgq	%rsp,%r15
+
+	movq	%r15,16(%rsp)
+.Lcbc_fast_body:
+	movq	%rdi,24(%rsp)
+	movq	%rsi,32(%rsp)
+	movq	%rdx,40(%rsp)
+	movq	%rcx,48(%rsp)
+	movq	%r8,56(%rsp)
+	movl	$0,80+240(%rsp)
+	movq	%r8,%rbp
+	movq	%r9,%rbx
+	movq	%rsi,%r9
+	movq	%rdi,%r8
+	movq	%rcx,%r15
+
+	movl	240(%r15),%eax
+
+	movq	%r15,%r10
+	subq	%r14,%r10
+	andq	$4095,%r10
+	cmpq	$2304,%r10
+	jb	.Lcbc_do_ecopy
+	cmpq	$4096-248,%r10
+	jb	.Lcbc_skip_ecopy
+.align	4
+.Lcbc_do_ecopy:
+	movq	%r15,%rsi
+	leaq	80(%rsp),%rdi
+	leaq	80(%rsp),%r15
+	movl	$30,%ecx
+.long	0x90A548F3	
+	movl	%eax,(%rdi)
+.Lcbc_skip_ecopy:
+	movq	%r15,0(%rsp)
+
+	movl	$18,%ecx
+.align	4
+.Lcbc_prefetch_te:
+	movq	0(%r14),%r10
+	movq	32(%r14),%r11
+	movq	64(%r14),%r12
+	movq	96(%r14),%r13
+	leaq	128(%r14),%r14
+	subl	$1,%ecx
+	jnz	.Lcbc_prefetch_te
+	leaq	-2304(%r14),%r14
+
+	cmpq	$0,%rbx
+	je	.LFAST_DECRYPT
+
+
+	movl	0(%rbp),%eax
+	movl	4(%rbp),%ebx
+	movl	8(%rbp),%ecx
+	movl	12(%rbp),%edx
+
+.align	4
+.Lcbc_fast_enc_loop:
+	xorl	0(%r8),%eax
+	xorl	4(%r8),%ebx
+	xorl	8(%r8),%ecx
+	xorl	12(%r8),%edx
+	movq	0(%rsp),%r15
+	movq	%r8,24(%rsp)
+
+	call	_x86_64_AES_encrypt
+
+	movq	24(%rsp),%r8
+	movq	40(%rsp),%r10
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	leaq	16(%r8),%r8
+	leaq	16(%r9),%r9
+	subq	$16,%r10
+	testq	$-16,%r10
+	movq	%r10,40(%rsp)
+	jnz	.Lcbc_fast_enc_loop
+	movq	56(%rsp),%rbp
+	movl	%eax,0(%rbp)
+	movl	%ebx,4(%rbp)
+	movl	%ecx,8(%rbp)
+	movl	%edx,12(%rbp)
+
+	jmp	.Lcbc_fast_cleanup
+
+
+.align	16
+.LFAST_DECRYPT:
+	cmpq	%r8,%r9
+	je	.Lcbc_fast_dec_in_place
+
+	movq	%rbp,64(%rsp)
+.align	4
+.Lcbc_fast_dec_loop:
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movq	0(%rsp),%r15
+	movq	%r8,24(%rsp)
+
+	call	_x86_64_AES_decrypt
+
+	movq	64(%rsp),%rbp
+	movq	24(%rsp),%r8
+	movq	40(%rsp),%r10
+	xorl	0(%rbp),%eax
+	xorl	4(%rbp),%ebx
+	xorl	8(%rbp),%ecx
+	xorl	12(%rbp),%edx
+	movq	%r8,%rbp
+
+	subq	$16,%r10
+	movq	%r10,40(%rsp)
+	movq	%rbp,64(%rsp)
+
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	leaq	16(%r8),%r8
+	leaq	16(%r9),%r9
+	jnz	.Lcbc_fast_dec_loop
+	movq	56(%rsp),%r12
+	movq	0(%rbp),%r10
+	movq	8(%rbp),%r11
+	movq	%r10,0(%r12)
+	movq	%r11,8(%r12)
+	jmp	.Lcbc_fast_cleanup
+
+.align	16
+.Lcbc_fast_dec_in_place:
+	movq	0(%rbp),%r10
+	movq	8(%rbp),%r11
+	movq	%r10,0+64(%rsp)
+	movq	%r11,8+64(%rsp)
+.align	4
+.Lcbc_fast_dec_in_place_loop:
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movq	0(%rsp),%r15
+	movq	%r8,24(%rsp)
+
+	call	_x86_64_AES_decrypt
+
+	movq	24(%rsp),%r8
+	movq	40(%rsp),%r10
+	xorl	0+64(%rsp),%eax
+	xorl	4+64(%rsp),%ebx
+	xorl	8+64(%rsp),%ecx
+	xorl	12+64(%rsp),%edx
+
+	movq	0(%r8),%r11
+	movq	8(%r8),%r12
+	subq	$16,%r10
+	jz	.Lcbc_fast_dec_in_place_done
+
+	movq	%r11,0+64(%rsp)
+	movq	%r12,8+64(%rsp)
+
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	leaq	16(%r8),%r8
+	leaq	16(%r9),%r9
+	movq	%r10,40(%rsp)
+	jmp	.Lcbc_fast_dec_in_place_loop
+.Lcbc_fast_dec_in_place_done:
+	movq	56(%rsp),%rdi
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+.align	4
+.Lcbc_fast_cleanup:
+	cmpl	$0,80+240(%rsp)
+	leaq	80(%rsp),%rdi
+	je	.Lcbc_exit
+	movl	$30,%ecx
+	xorq	%rax,%rax
+.long	0x90AB48F3	
+
+	jmp	.Lcbc_exit
+
+
+.align	16
+.Lcbc_slow_prologue:
+
+	leaq	-88(%rsp),%rbp
+	andq	$-64,%rbp
+
+	leaq	-88-63(%rcx),%r10
+	subq	%rbp,%r10
+	negq	%r10
+	andq	$960,%r10
+	subq	%r10,%rbp
+
+	xchgq	%rsp,%rbp
+
+	movq	%rbp,16(%rsp)
+.Lcbc_slow_body:
+
+
+
+
+	movq	%r8,56(%rsp)
+	movq	%r8,%rbp
+	movq	%r9,%rbx
+	movq	%rsi,%r9
+	movq	%rdi,%r8
+	movq	%rcx,%r15
+	movq	%rdx,%r10
+
+	movl	240(%r15),%eax
+	movq	%r15,0(%rsp)
+	shll	$4,%eax
+	leaq	(%r15,%rax,1),%rax
+	movq	%rax,8(%rsp)
+
+
+	leaq	2048(%r14),%r14
+	leaq	768-8(%rsp),%rax
+	subq	%r14,%rax
+	andq	$768,%rax
+	leaq	(%r14,%rax,1),%r14
+
+	cmpq	$0,%rbx
+	je	.LSLOW_DECRYPT
+
+
+	testq	$-16,%r10
+	movl	0(%rbp),%eax
+	movl	4(%rbp),%ebx
+	movl	8(%rbp),%ecx
+	movl	12(%rbp),%edx
+	jz	.Lcbc_slow_enc_tail	
+
+.align	4
+.Lcbc_slow_enc_loop:
+	xorl	0(%r8),%eax
+	xorl	4(%r8),%ebx
+	xorl	8(%r8),%ecx
+	xorl	12(%r8),%edx
+	movq	0(%rsp),%r15
+	movq	%r8,24(%rsp)
+	movq	%r9,32(%rsp)
+	movq	%r10,40(%rsp)
+
+	call	_x86_64_AES_encrypt_compact
+
+	movq	24(%rsp),%r8
+	movq	32(%rsp),%r9
+	movq	40(%rsp),%r10
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	leaq	16(%r8),%r8
+	leaq	16(%r9),%r9
+	subq	$16,%r10
+	testq	$-16,%r10
+	jnz	.Lcbc_slow_enc_loop
+	testq	$15,%r10
+	jnz	.Lcbc_slow_enc_tail
+	movq	56(%rsp),%rbp
+	movl	%eax,0(%rbp)
+	movl	%ebx,4(%rbp)
+	movl	%ecx,8(%rbp)
+	movl	%edx,12(%rbp)
+
+	jmp	.Lcbc_exit
+
+.align	4
+.Lcbc_slow_enc_tail:
+	movq	%rax,%r11
+	movq	%rcx,%r12
+	movq	%r10,%rcx
+	movq	%r8,%rsi
+	movq	%r9,%rdi
+.long	0x9066A4F3		
+	movq	$16,%rcx
+	subq	%r10,%rcx
+	xorq	%rax,%rax
+.long	0x9066AAF3		
+	movq	%r9,%r8
+	movq	$16,%r10
+	movq	%r11,%rax
+	movq	%r12,%rcx
+	jmp	.Lcbc_slow_enc_loop	
+
+.align	16
+.LSLOW_DECRYPT:
+	shrq	$3,%rax
+	addq	%rax,%r14
+
+	movq	0(%rbp),%r11
+	movq	8(%rbp),%r12
+	movq	%r11,0+64(%rsp)
+	movq	%r12,8+64(%rsp)
+
+.align	4
+.Lcbc_slow_dec_loop:
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movq	0(%rsp),%r15
+	movq	%r8,24(%rsp)
+	movq	%r9,32(%rsp)
+	movq	%r10,40(%rsp)
+
+	call	_x86_64_AES_decrypt_compact
+
+	movq	24(%rsp),%r8
+	movq	32(%rsp),%r9
+	movq	40(%rsp),%r10
+	xorl	0+64(%rsp),%eax
+	xorl	4+64(%rsp),%ebx
+	xorl	8+64(%rsp),%ecx
+	xorl	12+64(%rsp),%edx
+
+	movq	0(%r8),%r11
+	movq	8(%r8),%r12
+	subq	$16,%r10
+	jc	.Lcbc_slow_dec_partial
+	jz	.Lcbc_slow_dec_done
+
+	movq	%r11,0+64(%rsp)
+	movq	%r12,8+64(%rsp)
+
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	leaq	16(%r8),%r8
+	leaq	16(%r9),%r9
+	jmp	.Lcbc_slow_dec_loop
+.Lcbc_slow_dec_done:
+	movq	56(%rsp),%rdi
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+
+	movl	%eax,0(%r9)
+	movl	%ebx,4(%r9)
+	movl	%ecx,8(%r9)
+	movl	%edx,12(%r9)
+
+	jmp	.Lcbc_exit
+
+.align	4
+.Lcbc_slow_dec_partial:
+	movq	56(%rsp),%rdi
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+
+	movl	%eax,0+64(%rsp)
+	movl	%ebx,4+64(%rsp)
+	movl	%ecx,8+64(%rsp)
+	movl	%edx,12+64(%rsp)
+
+	movq	%r9,%rdi
+	leaq	64(%rsp),%rsi
+	leaq	16(%r10),%rcx
+.long	0x9066A4F3	
+	jmp	.Lcbc_exit
+
+.align	16
+.Lcbc_exit:
+	movq	16(%rsp),%rsi
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lcbc_popfq:
+	popfq
+.Lcbc_epilogue:
+	.byte	0xf3,0xc3
+.size	AES_cbc_encrypt,.-AES_cbc_encrypt
+.align	64
+.LAES_Te:
+.long	0xa56363c6,0xa56363c6
+.long	0x847c7cf8,0x847c7cf8
+.long	0x997777ee,0x997777ee
+.long	0x8d7b7bf6,0x8d7b7bf6
+.long	0x0df2f2ff,0x0df2f2ff
+.long	0xbd6b6bd6,0xbd6b6bd6
+.long	0xb16f6fde,0xb16f6fde
+.long	0x54c5c591,0x54c5c591
+.long	0x50303060,0x50303060
+.long	0x03010102,0x03010102
+.long	0xa96767ce,0xa96767ce
+.long	0x7d2b2b56,0x7d2b2b56
+.long	0x19fefee7,0x19fefee7
+.long	0x62d7d7b5,0x62d7d7b5
+.long	0xe6abab4d,0xe6abab4d
+.long	0x9a7676ec,0x9a7676ec
+.long	0x45caca8f,0x45caca8f
+.long	0x9d82821f,0x9d82821f
+.long	0x40c9c989,0x40c9c989
+.long	0x877d7dfa,0x877d7dfa
+.long	0x15fafaef,0x15fafaef
+.long	0xeb5959b2,0xeb5959b2
+.long	0xc947478e,0xc947478e
+.long	0x0bf0f0fb,0x0bf0f0fb
+.long	0xecadad41,0xecadad41
+.long	0x67d4d4b3,0x67d4d4b3
+.long	0xfda2a25f,0xfda2a25f
+.long	0xeaafaf45,0xeaafaf45
+.long	0xbf9c9c23,0xbf9c9c23
+.long	0xf7a4a453,0xf7a4a453
+.long	0x967272e4,0x967272e4
+.long	0x5bc0c09b,0x5bc0c09b
+.long	0xc2b7b775,0xc2b7b775
+.long	0x1cfdfde1,0x1cfdfde1
+.long	0xae93933d,0xae93933d
+.long	0x6a26264c,0x6a26264c
+.long	0x5a36366c,0x5a36366c
+.long	0x413f3f7e,0x413f3f7e
+.long	0x02f7f7f5,0x02f7f7f5
+.long	0x4fcccc83,0x4fcccc83
+.long	0x5c343468,0x5c343468
+.long	0xf4a5a551,0xf4a5a551
+.long	0x34e5e5d1,0x34e5e5d1
+.long	0x08f1f1f9,0x08f1f1f9
+.long	0x937171e2,0x937171e2
+.long	0x73d8d8ab,0x73d8d8ab
+.long	0x53313162,0x53313162
+.long	0x3f15152a,0x3f15152a
+.long	0x0c040408,0x0c040408
+.long	0x52c7c795,0x52c7c795
+.long	0x65232346,0x65232346
+.long	0x5ec3c39d,0x5ec3c39d
+.long	0x28181830,0x28181830
+.long	0xa1969637,0xa1969637
+.long	0x0f05050a,0x0f05050a
+.long	0xb59a9a2f,0xb59a9a2f
+.long	0x0907070e,0x0907070e
+.long	0x36121224,0x36121224
+.long	0x9b80801b,0x9b80801b
+.long	0x3de2e2df,0x3de2e2df
+.long	0x26ebebcd,0x26ebebcd
+.long	0x6927274e,0x6927274e
+.long	0xcdb2b27f,0xcdb2b27f
+.long	0x9f7575ea,0x9f7575ea
+.long	0x1b090912,0x1b090912
+.long	0x9e83831d,0x9e83831d
+.long	0x742c2c58,0x742c2c58
+.long	0x2e1a1a34,0x2e1a1a34
+.long	0x2d1b1b36,0x2d1b1b36
+.long	0xb26e6edc,0xb26e6edc
+.long	0xee5a5ab4,0xee5a5ab4
+.long	0xfba0a05b,0xfba0a05b
+.long	0xf65252a4,0xf65252a4
+.long	0x4d3b3b76,0x4d3b3b76
+.long	0x61d6d6b7,0x61d6d6b7
+.long	0xceb3b37d,0xceb3b37d
+.long	0x7b292952,0x7b292952
+.long	0x3ee3e3dd,0x3ee3e3dd
+.long	0x712f2f5e,0x712f2f5e
+.long	0x97848413,0x97848413
+.long	0xf55353a6,0xf55353a6
+.long	0x68d1d1b9,0x68d1d1b9
+.long	0x00000000,0x00000000
+.long	0x2cededc1,0x2cededc1
+.long	0x60202040,0x60202040
+.long	0x1ffcfce3,0x1ffcfce3
+.long	0xc8b1b179,0xc8b1b179
+.long	0xed5b5bb6,0xed5b5bb6
+.long	0xbe6a6ad4,0xbe6a6ad4
+.long	0x46cbcb8d,0x46cbcb8d
+.long	0xd9bebe67,0xd9bebe67
+.long	0x4b393972,0x4b393972
+.long	0xde4a4a94,0xde4a4a94
+.long	0xd44c4c98,0xd44c4c98
+.long	0xe85858b0,0xe85858b0
+.long	0x4acfcf85,0x4acfcf85
+.long	0x6bd0d0bb,0x6bd0d0bb
+.long	0x2aefefc5,0x2aefefc5
+.long	0xe5aaaa4f,0xe5aaaa4f
+.long	0x16fbfbed,0x16fbfbed
+.long	0xc5434386,0xc5434386
+.long	0xd74d4d9a,0xd74d4d9a
+.long	0x55333366,0x55333366
+.long	0x94858511,0x94858511
+.long	0xcf45458a,0xcf45458a
+.long	0x10f9f9e9,0x10f9f9e9
+.long	0x06020204,0x06020204
+.long	0x817f7ffe,0x817f7ffe
+.long	0xf05050a0,0xf05050a0
+.long	0x443c3c78,0x443c3c78
+.long	0xba9f9f25,0xba9f9f25
+.long	0xe3a8a84b,0xe3a8a84b
+.long	0xf35151a2,0xf35151a2
+.long	0xfea3a35d,0xfea3a35d
+.long	0xc0404080,0xc0404080
+.long	0x8a8f8f05,0x8a8f8f05
+.long	0xad92923f,0xad92923f
+.long	0xbc9d9d21,0xbc9d9d21
+.long	0x48383870,0x48383870
+.long	0x04f5f5f1,0x04f5f5f1
+.long	0xdfbcbc63,0xdfbcbc63
+.long	0xc1b6b677,0xc1b6b677
+.long	0x75dadaaf,0x75dadaaf
+.long	0x63212142,0x63212142
+.long	0x30101020,0x30101020
+.long	0x1affffe5,0x1affffe5
+.long	0x0ef3f3fd,0x0ef3f3fd
+.long	0x6dd2d2bf,0x6dd2d2bf
+.long	0x4ccdcd81,0x4ccdcd81
+.long	0x140c0c18,0x140c0c18
+.long	0x35131326,0x35131326
+.long	0x2fececc3,0x2fececc3
+.long	0xe15f5fbe,0xe15f5fbe
+.long	0xa2979735,0xa2979735
+.long	0xcc444488,0xcc444488
+.long	0x3917172e,0x3917172e
+.long	0x57c4c493,0x57c4c493
+.long	0xf2a7a755,0xf2a7a755
+.long	0x827e7efc,0x827e7efc
+.long	0x473d3d7a,0x473d3d7a
+.long	0xac6464c8,0xac6464c8
+.long	0xe75d5dba,0xe75d5dba
+.long	0x2b191932,0x2b191932
+.long	0x957373e6,0x957373e6
+.long	0xa06060c0,0xa06060c0
+.long	0x98818119,0x98818119
+.long	0xd14f4f9e,0xd14f4f9e
+.long	0x7fdcdca3,0x7fdcdca3
+.long	0x66222244,0x66222244
+.long	0x7e2a2a54,0x7e2a2a54
+.long	0xab90903b,0xab90903b
+.long	0x8388880b,0x8388880b
+.long	0xca46468c,0xca46468c
+.long	0x29eeeec7,0x29eeeec7
+.long	0xd3b8b86b,0xd3b8b86b
+.long	0x3c141428,0x3c141428
+.long	0x79dedea7,0x79dedea7
+.long	0xe25e5ebc,0xe25e5ebc
+.long	0x1d0b0b16,0x1d0b0b16
+.long	0x76dbdbad,0x76dbdbad
+.long	0x3be0e0db,0x3be0e0db
+.long	0x56323264,0x56323264
+.long	0x4e3a3a74,0x4e3a3a74
+.long	0x1e0a0a14,0x1e0a0a14
+.long	0xdb494992,0xdb494992
+.long	0x0a06060c,0x0a06060c
+.long	0x6c242448,0x6c242448
+.long	0xe45c5cb8,0xe45c5cb8
+.long	0x5dc2c29f,0x5dc2c29f
+.long	0x6ed3d3bd,0x6ed3d3bd
+.long	0xefacac43,0xefacac43
+.long	0xa66262c4,0xa66262c4
+.long	0xa8919139,0xa8919139
+.long	0xa4959531,0xa4959531
+.long	0x37e4e4d3,0x37e4e4d3
+.long	0x8b7979f2,0x8b7979f2
+.long	0x32e7e7d5,0x32e7e7d5
+.long	0x43c8c88b,0x43c8c88b
+.long	0x5937376e,0x5937376e
+.long	0xb76d6dda,0xb76d6dda
+.long	0x8c8d8d01,0x8c8d8d01
+.long	0x64d5d5b1,0x64d5d5b1
+.long	0xd24e4e9c,0xd24e4e9c
+.long	0xe0a9a949,0xe0a9a949
+.long	0xb46c6cd8,0xb46c6cd8
+.long	0xfa5656ac,0xfa5656ac
+.long	0x07f4f4f3,0x07f4f4f3
+.long	0x25eaeacf,0x25eaeacf
+.long	0xaf6565ca,0xaf6565ca
+.long	0x8e7a7af4,0x8e7a7af4
+.long	0xe9aeae47,0xe9aeae47
+.long	0x18080810,0x18080810
+.long	0xd5baba6f,0xd5baba6f
+.long	0x887878f0,0x887878f0
+.long	0x6f25254a,0x6f25254a
+.long	0x722e2e5c,0x722e2e5c
+.long	0x241c1c38,0x241c1c38
+.long	0xf1a6a657,0xf1a6a657
+.long	0xc7b4b473,0xc7b4b473
+.long	0x51c6c697,0x51c6c697
+.long	0x23e8e8cb,0x23e8e8cb
+.long	0x7cdddda1,0x7cdddda1
+.long	0x9c7474e8,0x9c7474e8
+.long	0x211f1f3e,0x211f1f3e
+.long	0xdd4b4b96,0xdd4b4b96
+.long	0xdcbdbd61,0xdcbdbd61
+.long	0x868b8b0d,0x868b8b0d
+.long	0x858a8a0f,0x858a8a0f
+.long	0x907070e0,0x907070e0
+.long	0x423e3e7c,0x423e3e7c
+.long	0xc4b5b571,0xc4b5b571
+.long	0xaa6666cc,0xaa6666cc
+.long	0xd8484890,0xd8484890
+.long	0x05030306,0x05030306
+.long	0x01f6f6f7,0x01f6f6f7
+.long	0x120e0e1c,0x120e0e1c
+.long	0xa36161c2,0xa36161c2
+.long	0x5f35356a,0x5f35356a
+.long	0xf95757ae,0xf95757ae
+.long	0xd0b9b969,0xd0b9b969
+.long	0x91868617,0x91868617
+.long	0x58c1c199,0x58c1c199
+.long	0x271d1d3a,0x271d1d3a
+.long	0xb99e9e27,0xb99e9e27
+.long	0x38e1e1d9,0x38e1e1d9
+.long	0x13f8f8eb,0x13f8f8eb
+.long	0xb398982b,0xb398982b
+.long	0x33111122,0x33111122
+.long	0xbb6969d2,0xbb6969d2
+.long	0x70d9d9a9,0x70d9d9a9
+.long	0x898e8e07,0x898e8e07
+.long	0xa7949433,0xa7949433
+.long	0xb69b9b2d,0xb69b9b2d
+.long	0x221e1e3c,0x221e1e3c
+.long	0x92878715,0x92878715
+.long	0x20e9e9c9,0x20e9e9c9
+.long	0x49cece87,0x49cece87
+.long	0xff5555aa,0xff5555aa
+.long	0x78282850,0x78282850
+.long	0x7adfdfa5,0x7adfdfa5
+.long	0x8f8c8c03,0x8f8c8c03
+.long	0xf8a1a159,0xf8a1a159
+.long	0x80898909,0x80898909
+.long	0x170d0d1a,0x170d0d1a
+.long	0xdabfbf65,0xdabfbf65
+.long	0x31e6e6d7,0x31e6e6d7
+.long	0xc6424284,0xc6424284
+.long	0xb86868d0,0xb86868d0
+.long	0xc3414182,0xc3414182
+.long	0xb0999929,0xb0999929
+.long	0x772d2d5a,0x772d2d5a
+.long	0x110f0f1e,0x110f0f1e
+.long	0xcbb0b07b,0xcbb0b07b
+.long	0xfc5454a8,0xfc5454a8
+.long	0xd6bbbb6d,0xd6bbbb6d
+.long	0x3a16162c,0x3a16162c
+.byte	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte	0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte	0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte	0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte	0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte	0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte	0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte	0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte	0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte	0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte	0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte	0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte	0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte	0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte	0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte	0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte	0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte	0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte	0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte	0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte	0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte	0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte	0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte	0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte	0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte	0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte	0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte	0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte	0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte	0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte	0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte	0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte	0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte	0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte	0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte	0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte	0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte	0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte	0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte	0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte	0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte	0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte	0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte	0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte	0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte	0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte	0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte	0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte	0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte	0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte	0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte	0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte	0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte	0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte	0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte	0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte	0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte	0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte	0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte	0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte	0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte	0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte	0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte	0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte	0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte	0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte	0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte	0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte	0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte	0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte	0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte	0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte	0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte	0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte	0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte	0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte	0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte	0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte	0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte	0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte	0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte	0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte	0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte	0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte	0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte	0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte	0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte	0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte	0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte	0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte	0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte	0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte	0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte	0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte	0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte	0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte	0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte	0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte	0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte	0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte	0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte	0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte	0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte	0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte	0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte	0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte	0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte	0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte	0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte	0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte	0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte	0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte	0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte	0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte	0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte	0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte	0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte	0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte	0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte	0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte	0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
+.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
+.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
+.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+.align	64
+.LAES_Td:
+.long	0x50a7f451,0x50a7f451
+.long	0x5365417e,0x5365417e
+.long	0xc3a4171a,0xc3a4171a
+.long	0x965e273a,0x965e273a
+.long	0xcb6bab3b,0xcb6bab3b
+.long	0xf1459d1f,0xf1459d1f
+.long	0xab58faac,0xab58faac
+.long	0x9303e34b,0x9303e34b
+.long	0x55fa3020,0x55fa3020
+.long	0xf66d76ad,0xf66d76ad
+.long	0x9176cc88,0x9176cc88
+.long	0x254c02f5,0x254c02f5
+.long	0xfcd7e54f,0xfcd7e54f
+.long	0xd7cb2ac5,0xd7cb2ac5
+.long	0x80443526,0x80443526
+.long	0x8fa362b5,0x8fa362b5
+.long	0x495ab1de,0x495ab1de
+.long	0x671bba25,0x671bba25
+.long	0x980eea45,0x980eea45
+.long	0xe1c0fe5d,0xe1c0fe5d
+.long	0x02752fc3,0x02752fc3
+.long	0x12f04c81,0x12f04c81
+.long	0xa397468d,0xa397468d
+.long	0xc6f9d36b,0xc6f9d36b
+.long	0xe75f8f03,0xe75f8f03
+.long	0x959c9215,0x959c9215
+.long	0xeb7a6dbf,0xeb7a6dbf
+.long	0xda595295,0xda595295
+.long	0x2d83bed4,0x2d83bed4
+.long	0xd3217458,0xd3217458
+.long	0x2969e049,0x2969e049
+.long	0x44c8c98e,0x44c8c98e
+.long	0x6a89c275,0x6a89c275
+.long	0x78798ef4,0x78798ef4
+.long	0x6b3e5899,0x6b3e5899
+.long	0xdd71b927,0xdd71b927
+.long	0xb64fe1be,0xb64fe1be
+.long	0x17ad88f0,0x17ad88f0
+.long	0x66ac20c9,0x66ac20c9
+.long	0xb43ace7d,0xb43ace7d
+.long	0x184adf63,0x184adf63
+.long	0x82311ae5,0x82311ae5
+.long	0x60335197,0x60335197
+.long	0x457f5362,0x457f5362
+.long	0xe07764b1,0xe07764b1
+.long	0x84ae6bbb,0x84ae6bbb
+.long	0x1ca081fe,0x1ca081fe
+.long	0x942b08f9,0x942b08f9
+.long	0x58684870,0x58684870
+.long	0x19fd458f,0x19fd458f
+.long	0x876cde94,0x876cde94
+.long	0xb7f87b52,0xb7f87b52
+.long	0x23d373ab,0x23d373ab
+.long	0xe2024b72,0xe2024b72
+.long	0x578f1fe3,0x578f1fe3
+.long	0x2aab5566,0x2aab5566
+.long	0x0728ebb2,0x0728ebb2
+.long	0x03c2b52f,0x03c2b52f
+.long	0x9a7bc586,0x9a7bc586
+.long	0xa50837d3,0xa50837d3
+.long	0xf2872830,0xf2872830
+.long	0xb2a5bf23,0xb2a5bf23
+.long	0xba6a0302,0xba6a0302
+.long	0x5c8216ed,0x5c8216ed
+.long	0x2b1ccf8a,0x2b1ccf8a
+.long	0x92b479a7,0x92b479a7
+.long	0xf0f207f3,0xf0f207f3
+.long	0xa1e2694e,0xa1e2694e
+.long	0xcdf4da65,0xcdf4da65
+.long	0xd5be0506,0xd5be0506
+.long	0x1f6234d1,0x1f6234d1
+.long	0x8afea6c4,0x8afea6c4
+.long	0x9d532e34,0x9d532e34
+.long	0xa055f3a2,0xa055f3a2
+.long	0x32e18a05,0x32e18a05
+.long	0x75ebf6a4,0x75ebf6a4
+.long	0x39ec830b,0x39ec830b
+.long	0xaaef6040,0xaaef6040
+.long	0x069f715e,0x069f715e
+.long	0x51106ebd,0x51106ebd
+.long	0xf98a213e,0xf98a213e
+.long	0x3d06dd96,0x3d06dd96
+.long	0xae053edd,0xae053edd
+.long	0x46bde64d,0x46bde64d
+.long	0xb58d5491,0xb58d5491
+.long	0x055dc471,0x055dc471
+.long	0x6fd40604,0x6fd40604
+.long	0xff155060,0xff155060
+.long	0x24fb9819,0x24fb9819
+.long	0x97e9bdd6,0x97e9bdd6
+.long	0xcc434089,0xcc434089
+.long	0x779ed967,0x779ed967
+.long	0xbd42e8b0,0xbd42e8b0
+.long	0x888b8907,0x888b8907
+.long	0x385b19e7,0x385b19e7
+.long	0xdbeec879,0xdbeec879
+.long	0x470a7ca1,0x470a7ca1
+.long	0xe90f427c,0xe90f427c
+.long	0xc91e84f8,0xc91e84f8
+.long	0x00000000,0x00000000
+.long	0x83868009,0x83868009
+.long	0x48ed2b32,0x48ed2b32
+.long	0xac70111e,0xac70111e
+.long	0x4e725a6c,0x4e725a6c
+.long	0xfbff0efd,0xfbff0efd
+.long	0x5638850f,0x5638850f
+.long	0x1ed5ae3d,0x1ed5ae3d
+.long	0x27392d36,0x27392d36
+.long	0x64d90f0a,0x64d90f0a
+.long	0x21a65c68,0x21a65c68
+.long	0xd1545b9b,0xd1545b9b
+.long	0x3a2e3624,0x3a2e3624
+.long	0xb1670a0c,0xb1670a0c
+.long	0x0fe75793,0x0fe75793
+.long	0xd296eeb4,0xd296eeb4
+.long	0x9e919b1b,0x9e919b1b
+.long	0x4fc5c080,0x4fc5c080
+.long	0xa220dc61,0xa220dc61
+.long	0x694b775a,0x694b775a
+.long	0x161a121c,0x161a121c
+.long	0x0aba93e2,0x0aba93e2
+.long	0xe52aa0c0,0xe52aa0c0
+.long	0x43e0223c,0x43e0223c
+.long	0x1d171b12,0x1d171b12
+.long	0x0b0d090e,0x0b0d090e
+.long	0xadc78bf2,0xadc78bf2
+.long	0xb9a8b62d,0xb9a8b62d
+.long	0xc8a91e14,0xc8a91e14
+.long	0x8519f157,0x8519f157
+.long	0x4c0775af,0x4c0775af
+.long	0xbbdd99ee,0xbbdd99ee
+.long	0xfd607fa3,0xfd607fa3
+.long	0x9f2601f7,0x9f2601f7
+.long	0xbcf5725c,0xbcf5725c
+.long	0xc53b6644,0xc53b6644
+.long	0x347efb5b,0x347efb5b
+.long	0x7629438b,0x7629438b
+.long	0xdcc623cb,0xdcc623cb
+.long	0x68fcedb6,0x68fcedb6
+.long	0x63f1e4b8,0x63f1e4b8
+.long	0xcadc31d7,0xcadc31d7
+.long	0x10856342,0x10856342
+.long	0x40229713,0x40229713
+.long	0x2011c684,0x2011c684
+.long	0x7d244a85,0x7d244a85
+.long	0xf83dbbd2,0xf83dbbd2
+.long	0x1132f9ae,0x1132f9ae
+.long	0x6da129c7,0x6da129c7
+.long	0x4b2f9e1d,0x4b2f9e1d
+.long	0xf330b2dc,0xf330b2dc
+.long	0xec52860d,0xec52860d
+.long	0xd0e3c177,0xd0e3c177
+.long	0x6c16b32b,0x6c16b32b
+.long	0x99b970a9,0x99b970a9
+.long	0xfa489411,0xfa489411
+.long	0x2264e947,0x2264e947
+.long	0xc48cfca8,0xc48cfca8
+.long	0x1a3ff0a0,0x1a3ff0a0
+.long	0xd82c7d56,0xd82c7d56
+.long	0xef903322,0xef903322
+.long	0xc74e4987,0xc74e4987
+.long	0xc1d138d9,0xc1d138d9
+.long	0xfea2ca8c,0xfea2ca8c
+.long	0x360bd498,0x360bd498
+.long	0xcf81f5a6,0xcf81f5a6
+.long	0x28de7aa5,0x28de7aa5
+.long	0x268eb7da,0x268eb7da
+.long	0xa4bfad3f,0xa4bfad3f
+.long	0xe49d3a2c,0xe49d3a2c
+.long	0x0d927850,0x0d927850
+.long	0x9bcc5f6a,0x9bcc5f6a
+.long	0x62467e54,0x62467e54
+.long	0xc2138df6,0xc2138df6
+.long	0xe8b8d890,0xe8b8d890
+.long	0x5ef7392e,0x5ef7392e
+.long	0xf5afc382,0xf5afc382
+.long	0xbe805d9f,0xbe805d9f
+.long	0x7c93d069,0x7c93d069
+.long	0xa92dd56f,0xa92dd56f
+.long	0xb31225cf,0xb31225cf
+.long	0x3b99acc8,0x3b99acc8
+.long	0xa77d1810,0xa77d1810
+.long	0x6e639ce8,0x6e639ce8
+.long	0x7bbb3bdb,0x7bbb3bdb
+.long	0x097826cd,0x097826cd
+.long	0xf418596e,0xf418596e
+.long	0x01b79aec,0x01b79aec
+.long	0xa89a4f83,0xa89a4f83
+.long	0x656e95e6,0x656e95e6
+.long	0x7ee6ffaa,0x7ee6ffaa
+.long	0x08cfbc21,0x08cfbc21
+.long	0xe6e815ef,0xe6e815ef
+.long	0xd99be7ba,0xd99be7ba
+.long	0xce366f4a,0xce366f4a
+.long	0xd4099fea,0xd4099fea
+.long	0xd67cb029,0xd67cb029
+.long	0xafb2a431,0xafb2a431
+.long	0x31233f2a,0x31233f2a
+.long	0x3094a5c6,0x3094a5c6
+.long	0xc066a235,0xc066a235
+.long	0x37bc4e74,0x37bc4e74
+.long	0xa6ca82fc,0xa6ca82fc
+.long	0xb0d090e0,0xb0d090e0
+.long	0x15d8a733,0x15d8a733
+.long	0x4a9804f1,0x4a9804f1
+.long	0xf7daec41,0xf7daec41
+.long	0x0e50cd7f,0x0e50cd7f
+.long	0x2ff69117,0x2ff69117
+.long	0x8dd64d76,0x8dd64d76
+.long	0x4db0ef43,0x4db0ef43
+.long	0x544daacc,0x544daacc
+.long	0xdf0496e4,0xdf0496e4
+.long	0xe3b5d19e,0xe3b5d19e
+.long	0x1b886a4c,0x1b886a4c
+.long	0xb81f2cc1,0xb81f2cc1
+.long	0x7f516546,0x7f516546
+.long	0x04ea5e9d,0x04ea5e9d
+.long	0x5d358c01,0x5d358c01
+.long	0x737487fa,0x737487fa
+.long	0x2e410bfb,0x2e410bfb
+.long	0x5a1d67b3,0x5a1d67b3
+.long	0x52d2db92,0x52d2db92
+.long	0x335610e9,0x335610e9
+.long	0x1347d66d,0x1347d66d
+.long	0x8c61d79a,0x8c61d79a
+.long	0x7a0ca137,0x7a0ca137
+.long	0x8e14f859,0x8e14f859
+.long	0x893c13eb,0x893c13eb
+.long	0xee27a9ce,0xee27a9ce
+.long	0x35c961b7,0x35c961b7
+.long	0xede51ce1,0xede51ce1
+.long	0x3cb1477a,0x3cb1477a
+.long	0x59dfd29c,0x59dfd29c
+.long	0x3f73f255,0x3f73f255
+.long	0x79ce1418,0x79ce1418
+.long	0xbf37c773,0xbf37c773
+.long	0xeacdf753,0xeacdf753
+.long	0x5baafd5f,0x5baafd5f
+.long	0x146f3ddf,0x146f3ddf
+.long	0x86db4478,0x86db4478
+.long	0x81f3afca,0x81f3afca
+.long	0x3ec468b9,0x3ec468b9
+.long	0x2c342438,0x2c342438
+.long	0x5f40a3c2,0x5f40a3c2
+.long	0x72c31d16,0x72c31d16
+.long	0x0c25e2bc,0x0c25e2bc
+.long	0x8b493c28,0x8b493c28
+.long	0x41950dff,0x41950dff
+.long	0x7101a839,0x7101a839
+.long	0xdeb30c08,0xdeb30c08
+.long	0x9ce4b4d8,0x9ce4b4d8
+.long	0x90c15664,0x90c15664
+.long	0x6184cb7b,0x6184cb7b
+.long	0x70b632d5,0x70b632d5
+.long	0x745c6c48,0x745c6c48
+.long	0x4257b8d0,0x4257b8d0
+.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte	65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
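+# (the string above decodes to "AES for x86_64, CRYPTOGAMS by <appro@openssl.org>")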
+.align	64
diff --git a/crypto/aes/asm/aesni-x86_64.S b/crypto/aes/asm/aesni-x86_64.S
new file mode 100644
index 0000000..917c832
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86_64.S
@@ -0,0 +1,2535 @@
+.text	
+.globl	aesni_encrypt
+.type	aesni_encrypt,@function
+.align	16
+aesni_encrypt:
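+# aesni_encrypt(in %rdi, out %rsi, const AES_KEY *key %rdx): single-block
+# encrypt; the loop counter is loaded from 240(%rdx).  AES-NI opcodes are
+# emitted as raw .byte sequences for assemblers that predate them:
+# 102,15,56,220,209 is aesenc %xmm1,%xmm2, 102,15,56,221,209 is
+# aesenclast %xmm1,%xmm2, and 0xf3,0xc3 is a two-byte "rep ret".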
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_1:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	.Loop_enc1_1	
+.byte	102,15,56,221,209
+	movups	%xmm2,(%rsi)
+	.byte	0xf3,0xc3
+.size	aesni_encrypt,.-aesni_encrypt
+
+.globl	aesni_decrypt
+.type	aesni_decrypt,@function
+.align	16
+aesni_decrypt:
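+# aesni_decrypt(in %rdi, out %rsi, const AES_KEY *key %rdx): single-block
+# decrypt; same shape as aesni_encrypt, with aesdec (102,15,56,222,...)
+# and aesdeclast (102,15,56,223,...).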
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_2:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	.Loop_dec1_2	
+.byte	102,15,56,223,209
+	movups	%xmm2,(%rsi)
+	.byte	0xf3,0xc3
+.size	aesni_decrypt,.-aesni_decrypt
+.type	_aesni_encrypt3,@function
+.align	16
+_aesni_encrypt3:
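+# Internal helper: encrypts three blocks (%xmm2-%xmm4) in parallel with the
+# schedule at %rcx, interleaving aesenc instructions to hide their latency.
+# %eax is halved because each loop iteration applies two round keys.  The
+# 4-, 6- and 8-block variants below follow the same pattern.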
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	(%rcx),%xmm0
+
+.Lenc_loop3:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	decl	%eax
+.byte	102,15,56,220,225
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,224
+	movups	(%rcx),%xmm0
+	jnz	.Lenc_loop3
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	.byte	0xf3,0xc3
+.size	_aesni_encrypt3,.-_aesni_encrypt3
+.type	_aesni_decrypt3,@function
+.align	16
+_aesni_decrypt3:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	(%rcx),%xmm0
+
+.Ldec_loop3:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	decl	%eax
+.byte	102,15,56,222,225
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,222,224
+	movups	(%rcx),%xmm0
+	jnz	.Ldec_loop3
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	.byte	0xf3,0xc3
+.size	_aesni_decrypt3,.-_aesni_decrypt3
+.type	_aesni_encrypt4,@function
+.align	16
+_aesni_encrypt4:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	(%rcx),%xmm0
+
+.Lenc_loop4:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	decl	%eax
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	(%rcx),%xmm0
+	jnz	.Lenc_loop4
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	.byte	0xf3,0xc3
+.size	_aesni_encrypt4,.-_aesni_encrypt4
+.type	_aesni_decrypt4,@function
+.align	16
+_aesni_decrypt4:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	(%rcx),%xmm0
+
+.Ldec_loop4:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	decl	%eax
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	(%rcx),%xmm0
+	jnz	.Ldec_loop4
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	.byte	0xf3,0xc3
+.size	_aesni_decrypt4,.-_aesni_decrypt4
+.type	_aesni_encrypt6,@function
+.align	16
+_aesni_encrypt6:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,233
+	pxor	%xmm0,%xmm7
+	decl	%eax
+.byte	102,15,56,220,241
+	movups	(%rcx),%xmm0
+.byte	102,15,56,220,249
+	jmp	.Lenc_loop6_enter
+.align	16
+.Lenc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	decl	%eax
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.Lenc_loop6_enter:
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	(%rcx),%xmm0
+	jnz	.Lenc_loop6
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	.byte	0xf3,0xc3
+.size	_aesni_encrypt6,.-_aesni_encrypt6
+.type	_aesni_decrypt6,@function
+.align	16
+_aesni_decrypt6:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm7
+	decl	%eax
+.byte	102,15,56,222,241
+	movups	(%rcx),%xmm0
+.byte	102,15,56,222,249
+	jmp	.Ldec_loop6_enter
+.align	16
+.Ldec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	decl	%eax
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.Ldec_loop6_enter:
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	(%rcx),%xmm0
+	jnz	.Ldec_loop6
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	.byte	0xf3,0xc3
+.size	_aesni_decrypt6,.-_aesni_decrypt6
+.type	_aesni_encrypt8,@function
+.align	16
+_aesni_encrypt8:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,233
+	pxor	%xmm0,%xmm7
+	decl	%eax
+.byte	102,15,56,220,241
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,220,249
+	pxor	%xmm0,%xmm9
+	movups	(%rcx),%xmm0
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	16(%rcx),%xmm1
+	jmp	.Lenc_loop8_enter
+.align	16
+.Lenc_loop8:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	decl	%eax
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	16(%rcx),%xmm1
+.Lenc_loop8_enter:
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	(%rcx),%xmm0
+	jnz	.Lenc_loop8
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+.byte	102,68,15,56,221,192
+.byte	102,68,15,56,221,200
+	.byte	0xf3,0xc3
+.size	_aesni_encrypt8,.-_aesni_encrypt8
+.type	_aesni_decrypt8,@function
+.align	16
+_aesni_decrypt8:
+	movups	(%rcx),%xmm0
+	shrl	$1,%eax
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm7
+	decl	%eax
+.byte	102,15,56,222,241
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm9
+	movups	(%rcx),%xmm0
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	16(%rcx),%xmm1
+	jmp	.Ldec_loop8_enter
+.align	16
+.Ldec_loop8:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	decl	%eax
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	16(%rcx),%xmm1
+.Ldec_loop8_enter:
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	(%rcx),%xmm0
+	jnz	.Ldec_loop8
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+.byte	102,68,15,56,223,192
+.byte	102,68,15,56,223,200
+	.byte	0xf3,0xc3
+.size	_aesni_decrypt8,.-_aesni_decrypt8
+.globl	aesni_ecb_encrypt
+.type	aesni_ecb_encrypt,@function
+.align	16
+aesni_ecb_encrypt:
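+# aesni_ecb_encrypt(in %rdi, out %rsi, len %rdx, key %rcx, enc %r8d):
+# the length is truncated to a multiple of 16, and %r8d selects encrypt
+# (non-zero) or decrypt.  The bulk path runs eight blocks per iteration;
+# the tail code dispatches on the remaining 1-7 blocks.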
+	andq	$-16,%rdx
+	jz	.Lecb_ret
+
+	movl	240(%rcx),%eax
+	movups	(%rcx),%xmm0
+	movq	%rcx,%r11
+	movl	%eax,%r10d
+	testl	%r8d,%r8d
+	jz	.Lecb_decrypt
+
+	cmpq	$128,%rdx
+	jb	.Lecb_enc_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	jmp	.Lecb_enc_loop8_enter
+.align	16
+.Lecb_enc_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	subq	$128,%rdx
+	jnc	.Lecb_enc_loop8
+
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	addq	$128,%rdx
+	jz	.Lecb_ret
+
+.Lecb_enc_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$32,%rdx
+	jb	.Lecb_enc_one
+	movups	16(%rdi),%xmm3
+	je	.Lecb_enc_two
+	movups	32(%rdi),%xmm4
+	cmpq	$64,%rdx
+	jb	.Lecb_enc_three
+	movups	48(%rdi),%xmm5
+	je	.Lecb_enc_four
+	movups	64(%rdi),%xmm6
+	cmpq	$96,%rdx
+	jb	.Lecb_enc_five
+	movups	80(%rdi),%xmm7
+	je	.Lecb_enc_six
+	movdqu	96(%rdi),%xmm8
+	call	_aesni_encrypt8
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_3:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_3	
+.byte	102,15,56,221,209
+	movups	%xmm2,(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_two:
+	xorps	%xmm4,%xmm4
+	call	_aesni_encrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_three:
+	call	_aesni_encrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_four:
+	call	_aesni_encrypt4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_six:
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	jmp	.Lecb_ret
+
+.align	16
+.Lecb_decrypt:
+	cmpq	$128,%rdx
+	jb	.Lecb_dec_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	jmp	.Lecb_dec_loop8_enter
+.align	16
+.Lecb_dec_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	movups	(%r11),%xmm0
+	subq	$128,%rdx
+	jnc	.Lecb_dec_loop8
+
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	addq	$128,%rdx
+	jz	.Lecb_ret
+
+.Lecb_dec_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$32,%rdx
+	jb	.Lecb_dec_one
+	movups	16(%rdi),%xmm3
+	je	.Lecb_dec_two
+	movups	32(%rdi),%xmm4
+	cmpq	$64,%rdx
+	jb	.Lecb_dec_three
+	movups	48(%rdi),%xmm5
+	je	.Lecb_dec_four
+	movups	64(%rdi),%xmm6
+	cmpq	$96,%rdx
+	jb	.Lecb_dec_five
+	movups	80(%rdi),%xmm7
+	je	.Lecb_dec_six
+	movups	96(%rdi),%xmm8
+	movups	(%rcx),%xmm0
+	call	_aesni_decrypt8
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_4:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_4	
+.byte	102,15,56,223,209
+	movups	%xmm2,(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_two:
+	xorps	%xmm4,%xmm4
+	call	_aesni_decrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_three:
+	call	_aesni_decrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_four:
+	call	_aesni_decrypt4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_six:
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+
+.Lecb_ret:
+	.byte	0xf3,0xc3
+.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
+.globl	aesni_ccm64_encrypt_blocks
+.type	aesni_ccm64_encrypt_blocks,@function
+.align	16
+aesni_ccm64_encrypt_blocks:
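+# aesni_ccm64_encrypt_blocks(in %rdi, out %rsi, blocks %rdx, key %rcx,
+# ivec %r8, cmac %r9): CCM with a 64-bit counter.  %xmm9 holds the counter
+# block (byte-swapped via .Lbswap_mask, stepped with .Lincrement64) and
+# %xmm3 accumulates the CBC-MAC, written back to (%r9) on exit.  The
+# decrypt counterpart below mirrors this, MACing the recovered plaintext.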
+	movl	240(%rcx),%eax
+	movdqu	(%r8),%xmm9
+	movdqa	.Lincrement64(%rip),%xmm6
+	movdqa	.Lbswap_mask(%rip),%xmm7
+
+	shrl	$1,%eax
+	leaq	0(%rcx),%r11
+	movdqu	(%r9),%xmm3
+	movdqa	%xmm9,%xmm2
+	movl	%eax,%r10d
+.byte	102,68,15,56,0,207
+	jmp	.Lccm64_enc_outer
+.align	16
+.Lccm64_enc_outer:
+	movups	(%r11),%xmm0
+	movl	%r10d,%eax
+	movups	(%rdi),%xmm8
+
+	xorps	%xmm0,%xmm2
+	movups	16(%r11),%xmm1
+	xorps	%xmm8,%xmm0
+	leaq	32(%r11),%rcx
+	xorps	%xmm0,%xmm3
+	movups	(%rcx),%xmm0
+
+.Lccm64_enc2_loop:
+.byte	102,15,56,220,209
+	decl	%eax
+.byte	102,15,56,220,217
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,216
+	movups	0(%rcx),%xmm0
+	jnz	.Lccm64_enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	paddq	%xmm6,%xmm9
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+
+	decq	%rdx
+	leaq	16(%rdi),%rdi
+	xorps	%xmm2,%xmm8
+	movdqa	%xmm9,%xmm2
+	movups	%xmm8,(%rsi)
+	leaq	16(%rsi),%rsi
+.byte	102,15,56,0,215
+	jnz	.Lccm64_enc_outer
+
+	movups	%xmm3,(%r9)
+	.byte	0xf3,0xc3
+.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+.globl	aesni_ccm64_decrypt_blocks
+.type	aesni_ccm64_decrypt_blocks,@function
+.align	16
+aesni_ccm64_decrypt_blocks:
+	movl	240(%rcx),%eax
+	movups	(%r8),%xmm9
+	movdqu	(%r9),%xmm3
+	movdqa	.Lincrement64(%rip),%xmm6
+	movdqa	.Lbswap_mask(%rip),%xmm7
+
+	movaps	%xmm9,%xmm2
+	movl	%eax,%r10d
+	movq	%rcx,%r11
+.byte	102,68,15,56,0,207
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_5:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_5	
+.byte	102,15,56,221,209
+	movups	(%rdi),%xmm8
+	paddq	%xmm6,%xmm9
+	leaq	16(%rdi),%rdi
+	jmp	.Lccm64_dec_outer
+.align	16
+.Lccm64_dec_outer:
+	xorps	%xmm2,%xmm8
+	movdqa	%xmm9,%xmm2
+	movl	%r10d,%eax
+	movups	%xmm8,(%rsi)
+	leaq	16(%rsi),%rsi
+.byte	102,15,56,0,215
+
+	subq	$1,%rdx
+	jz	.Lccm64_dec_break
+
+	movups	(%r11),%xmm0
+	shrl	$1,%eax
+	movups	16(%r11),%xmm1
+	xorps	%xmm0,%xmm8
+	leaq	32(%r11),%rcx
+	xorps	%xmm0,%xmm2
+	xorps	%xmm8,%xmm3
+	movups	(%rcx),%xmm0
+
+.Lccm64_dec2_loop:
+.byte	102,15,56,220,209
+	decl	%eax
+.byte	102,15,56,220,217
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,216
+	movups	0(%rcx),%xmm0
+	jnz	.Lccm64_dec2_loop
+	movups	(%rdi),%xmm8
+	paddq	%xmm6,%xmm9
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	leaq	16(%rdi),%rdi
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	jmp	.Lccm64_dec_outer
+
+.align	16
+.Lccm64_dec_break:
+
+	movups	(%r11),%xmm0
+	movups	16(%r11),%xmm1
+	xorps	%xmm0,%xmm8
+	leaq	32(%r11),%r11
+	xorps	%xmm8,%xmm3
+.Loop_enc1_6:
+.byte	102,15,56,220,217
+	decl	%eax
+	movups	(%r11),%xmm1
+	leaq	16(%r11),%r11
+	jnz	.Loop_enc1_6	
+.byte	102,15,56,221,217
+	movups	%xmm3,(%r9)
+	.byte	0xf3,0xc3
+.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+.globl	aesni_ctr32_encrypt_blocks
+.type	aesni_ctr32_encrypt_blocks,@function
+.align	16
+aesni_ctr32_encrypt_blocks:
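+# aesni_ctr32_encrypt_blocks(in %rdi, out %rsi, blocks %rdx, key %rcx,
+# ivec %r8): CTR mode with a 32-bit big-endian counter in the last word of
+# the IV.  The .byte sequences 102,69,15,58,34/22,... are pinsrd/pextrd,
+# building six counter values at a time; .Lincrement32 (6,6,6,0) steps
+# them each iteration.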
+	cmpq	$1,%rdx
+	je	.Lctr32_one_shortcut
+
+	movdqu	(%r8),%xmm14
+	movdqa	.Lbswap_mask(%rip),%xmm15
+	xorl	%eax,%eax
+.byte	102,69,15,58,22,242,3
+.byte	102,68,15,58,34,240,3
+
+	movl	240(%rcx),%eax
+	bswapl	%r10d
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+.byte	102,69,15,58,34,226,0
+	leaq	3(%r10),%r11
+.byte	102,69,15,58,34,235,0
+	incl	%r10d
+.byte	102,69,15,58,34,226,1
+	incq	%r11
+.byte	102,69,15,58,34,235,1
+	incl	%r10d
+.byte	102,69,15,58,34,226,2
+	incq	%r11
+.byte	102,69,15,58,34,235,2
+	movdqa	%xmm12,-40(%rsp)
+.byte	102,69,15,56,0,231
+	movdqa	%xmm13,-24(%rsp)
+.byte	102,69,15,56,0,239
+
+	pshufd	$192,%xmm12,%xmm2
+	pshufd	$128,%xmm12,%xmm3
+	pshufd	$64,%xmm12,%xmm4
+	cmpq	$6,%rdx
+	jb	.Lctr32_tail
+	shrl	$1,%eax
+	movq	%rcx,%r11
+	movl	%eax,%r10d
+	subq	$6,%rdx
+	jmp	.Lctr32_loop6
+
+.align	16
+.Lctr32_loop6:
+	pshufd	$192,%xmm13,%xmm5
+	por	%xmm14,%xmm2
+	movups	(%r11),%xmm0
+	pshufd	$128,%xmm13,%xmm6
+	por	%xmm14,%xmm3
+	movups	16(%r11),%xmm1
+	pshufd	$64,%xmm13,%xmm7
+	por	%xmm14,%xmm4
+	por	%xmm14,%xmm5
+	xorps	%xmm0,%xmm2
+	por	%xmm14,%xmm6
+	por	%xmm14,%xmm7
+
+
+
+
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,220,209
+	leaq	32(%r11),%rcx
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,217
+	movdqa	.Lincrement32(%rip),%xmm13
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,220,225
+	movdqa	-40(%rsp),%xmm12
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,233
+	pxor	%xmm0,%xmm7
+	movups	(%rcx),%xmm0
+	decl	%eax
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	jmp	.Lctr32_enc_loop6_enter
+.align	16
+.Lctr32_enc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	decl	%eax
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.Lctr32_enc_loop6_enter:
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	(%rcx),%xmm0
+	jnz	.Lctr32_enc_loop6
+
+.byte	102,15,56,220,209
+	paddd	%xmm13,%xmm12
+.byte	102,15,56,220,217
+	paddd	-24(%rsp),%xmm13
+.byte	102,15,56,220,225
+	movdqa	%xmm12,-40(%rsp)
+.byte	102,15,56,220,233
+	movdqa	%xmm13,-24(%rsp)
+.byte	102,15,56,220,241
+.byte	102,69,15,56,0,231
+.byte	102,15,56,220,249
+.byte	102,69,15,56,0,239
+
+.byte	102,15,56,221,208
+	movups	(%rdi),%xmm8
+.byte	102,15,56,221,216
+	movups	16(%rdi),%xmm9
+.byte	102,15,56,221,224
+	movups	32(%rdi),%xmm10
+.byte	102,15,56,221,232
+	movups	48(%rdi),%xmm11
+.byte	102,15,56,221,240
+	movups	64(%rdi),%xmm1
+.byte	102,15,56,221,248
+	movups	80(%rdi),%xmm0
+	leaq	96(%rdi),%rdi
+
+	xorps	%xmm2,%xmm8
+	pshufd	$192,%xmm12,%xmm2
+	xorps	%xmm3,%xmm9
+	pshufd	$128,%xmm12,%xmm3
+	movups	%xmm8,(%rsi)
+	xorps	%xmm4,%xmm10
+	pshufd	$64,%xmm12,%xmm4
+	movups	%xmm9,16(%rsi)
+	xorps	%xmm5,%xmm11
+	movups	%xmm10,32(%rsi)
+	xorps	%xmm6,%xmm1
+	movups	%xmm11,48(%rsi)
+	xorps	%xmm7,%xmm0
+	movups	%xmm1,64(%rsi)
+	movups	%xmm0,80(%rsi)
+	leaq	96(%rsi),%rsi
+	movl	%r10d,%eax
+	subq	$6,%rdx
+	jnc	.Lctr32_loop6
+
+	addq	$6,%rdx
+	jz	.Lctr32_done
+	movq	%r11,%rcx
+	leal	1(%rax,%rax,1),%eax
+
+.Lctr32_tail:
+	por	%xmm14,%xmm2
+	movups	(%rdi),%xmm8
+	cmpq	$2,%rdx
+	jb	.Lctr32_one
+
+	por	%xmm14,%xmm3
+	movups	16(%rdi),%xmm9
+	je	.Lctr32_two
+
+	pshufd	$192,%xmm13,%xmm5
+	por	%xmm14,%xmm4
+	movups	32(%rdi),%xmm10
+	cmpq	$4,%rdx
+	jb	.Lctr32_three
+
+	pshufd	$128,%xmm13,%xmm6
+	por	%xmm14,%xmm5
+	movups	48(%rdi),%xmm11
+	je	.Lctr32_four
+
+	por	%xmm14,%xmm6
+	xorps	%xmm7,%xmm7
+
+	call	_aesni_encrypt6
+
+	movups	64(%rdi),%xmm1
+	xorps	%xmm2,%xmm8
+	xorps	%xmm3,%xmm9
+	movups	%xmm8,(%rsi)
+	xorps	%xmm4,%xmm10
+	movups	%xmm9,16(%rsi)
+	xorps	%xmm5,%xmm11
+	movups	%xmm10,32(%rsi)
+	xorps	%xmm6,%xmm1
+	movups	%xmm11,48(%rsi)
+	movups	%xmm1,64(%rsi)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_one_shortcut:
+	movups	(%r8),%xmm2
+	movups	(%rdi),%xmm8
+	movl	240(%rcx),%eax
+.Lctr32_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_7:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_7	
+.byte	102,15,56,221,209
+	xorps	%xmm2,%xmm8
+	movups	%xmm8,(%rsi)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_two:
+	xorps	%xmm4,%xmm4
+	call	_aesni_encrypt3
+	xorps	%xmm2,%xmm8
+	xorps	%xmm3,%xmm9
+	movups	%xmm8,(%rsi)
+	movups	%xmm9,16(%rsi)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_three:
+	call	_aesni_encrypt3
+	xorps	%xmm2,%xmm8
+	xorps	%xmm3,%xmm9
+	movups	%xmm8,(%rsi)
+	xorps	%xmm4,%xmm10
+	movups	%xmm9,16(%rsi)
+	movups	%xmm10,32(%rsi)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_four:
+	call	_aesni_encrypt4
+	xorps	%xmm2,%xmm8
+	xorps	%xmm3,%xmm9
+	movups	%xmm8,(%rsi)
+	xorps	%xmm4,%xmm10
+	movups	%xmm9,16(%rsi)
+	xorps	%xmm5,%xmm11
+	movups	%xmm10,32(%rsi)
+	movups	%xmm11,48(%rsi)
+
+.Lctr32_done:
+	.byte	0xf3,0xc3
+.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+.globl	aesni_xts_encrypt
+.type	aesni_xts_encrypt,@function
+.align	16
+aesni_xts_encrypt:
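+# aesni_xts_encrypt(in %rdi, out %rsi, len %rdx, key1 %rcx, key2 %r8,
+# iv %r9): the IV is first encrypted with key2 (.Loop_enc1_8) to form the
+# initial tweak in %xmm15; successive tweaks are doubled in GF(2^128)
+# using .Lxts_magic (the 0x87 reduction).  A trailing partial block is
+# handled by ciphertext stealing at .Lxts_enc_steal.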
+	leaq	-104(%rsp),%rsp
+	movups	(%r9),%xmm15
+	movl	240(%r8),%eax
+	movl	240(%rcx),%r10d
+	movups	(%r8),%xmm0
+	movups	16(%r8),%xmm1
+	leaq	32(%r8),%r8
+	xorps	%xmm0,%xmm15
+.Loop_enc1_8:
+.byte	102,68,15,56,220,249
+	decl	%eax
+	movups	(%r8),%xmm1
+	leaq	16(%r8),%r8
+	jnz	.Loop_enc1_8	
+.byte	102,68,15,56,221,249
+	movq	%rcx,%r11
+	movl	%r10d,%eax
+	movq	%rdx,%r9
+	andq	$-16,%rdx
+
+	movdqa	.Lxts_magic(%rip),%xmm8
+	pxor	%xmm14,%xmm14
+	pcmpgtd	%xmm15,%xmm14
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm10
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm11
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm12
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm13
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	subq	$96,%rdx
+	jc	.Lxts_enc_short
+
+	shrl	$1,%eax
+	subl	$1,%eax
+	movl	%eax,%r10d
+	jmp	.Lxts_enc_grandloop
+
+.align	16
+.Lxts_enc_grandloop:
+	pshufd	$19,%xmm14,%xmm9
+	movdqa	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	movdqu	0(%rdi),%xmm2
+	pand	%xmm8,%xmm9
+	movdqu	16(%rdi),%xmm3
+	pxor	%xmm9,%xmm15
+
+	movdqu	32(%rdi),%xmm4
+	pxor	%xmm10,%xmm2
+	movdqu	48(%rdi),%xmm5
+	pxor	%xmm11,%xmm3
+	movdqu	64(%rdi),%xmm6
+	pxor	%xmm12,%xmm4
+	movdqu	80(%rdi),%xmm7
+	leaq	96(%rdi),%rdi
+	pxor	%xmm13,%xmm5
+	movups	(%r11),%xmm0
+	pxor	%xmm14,%xmm6
+	pxor	%xmm15,%xmm7
+
+
+
+	movups	16(%r11),%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movdqa	%xmm10,0(%rsp)
+.byte	102,15,56,220,209
+	leaq	32(%r11),%rcx
+	pxor	%xmm0,%xmm4
+	movdqa	%xmm11,16(%rsp)
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm5
+	movdqa	%xmm12,32(%rsp)
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm13,48(%rsp)
+.byte	102,15,56,220,233
+	pxor	%xmm0,%xmm7
+	movups	(%rcx),%xmm0
+	decl	%eax
+	movdqa	%xmm14,64(%rsp)
+.byte	102,15,56,220,241
+	movdqa	%xmm15,80(%rsp)
+.byte	102,15,56,220,249
+	pxor	%xmm14,%xmm14
+	pcmpgtd	%xmm15,%xmm14
+	jmp	.Lxts_enc_loop6_enter
+
+.align	16
+.Lxts_enc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	decl	%eax
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.Lxts_enc_loop6_enter:
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	(%rcx),%xmm0
+	jnz	.Lxts_enc_loop6
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,220,209
+	pand	%xmm8,%xmm9
+.byte	102,15,56,220,217
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,220,225
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	16(%rcx),%xmm1
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm10
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,220,208
+	pand	%xmm8,%xmm9
+.byte	102,15,56,220,216
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,220,224
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	32(%rcx),%xmm0
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm11
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,220,209
+	pand	%xmm8,%xmm9
+.byte	102,15,56,220,217
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,220,225
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm12
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,221,208
+	pand	%xmm8,%xmm9
+.byte	102,15,56,221,216
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,221,224
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm13
+	paddq	%xmm15,%xmm15
+	xorps	0(%rsp),%xmm2
+	pand	%xmm8,%xmm9
+	xorps	16(%rsp),%xmm3
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+
+	xorps	32(%rsp),%xmm4
+	movups	%xmm2,0(%rsi)
+	xorps	48(%rsp),%xmm5
+	movups	%xmm3,16(%rsi)
+	xorps	64(%rsp),%xmm6
+	movups	%xmm4,32(%rsi)
+	xorps	80(%rsp),%xmm7
+	movups	%xmm5,48(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	leaq	96(%rsi),%rsi
+	subq	$96,%rdx
+	jnc	.Lxts_enc_grandloop
+
+	leal	3(%rax,%rax,1),%eax
+	movq	%r11,%rcx
+	movl	%eax,%r10d
+
+.Lxts_enc_short:
+	addq	$96,%rdx
+	jz	.Lxts_enc_done
+
+	cmpq	$32,%rdx
+	jb	.Lxts_enc_one
+	je	.Lxts_enc_two
+
+	cmpq	$64,%rdx
+	jb	.Lxts_enc_three
+	je	.Lxts_enc_four
+
+	pshufd	$19,%xmm14,%xmm9
+	movdqa	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	movdqu	(%rdi),%xmm2
+	pand	%xmm8,%xmm9
+	movdqu	16(%rdi),%xmm3
+	pxor	%xmm9,%xmm15
+
+	movdqu	32(%rdi),%xmm4
+	pxor	%xmm10,%xmm2
+	movdqu	48(%rdi),%xmm5
+	pxor	%xmm11,%xmm3
+	movdqu	64(%rdi),%xmm6
+	leaq	80(%rdi),%rdi
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm6
+
+	call	_aesni_encrypt6
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm15,%xmm10
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+	movdqu	%xmm2,(%rsi)
+	xorps	%xmm13,%xmm5
+	movdqu	%xmm3,16(%rsi)
+	xorps	%xmm14,%xmm6
+	movdqu	%xmm4,32(%rsi)
+	movdqu	%xmm5,48(%rsi)
+	movdqu	%xmm6,64(%rsi)
+	leaq	80(%rsi),%rsi
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_one:
+	movups	(%rdi),%xmm2
+	leaq	16(%rdi),%rdi
+	xorps	%xmm10,%xmm2
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_9:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_9	
+.byte	102,15,56,221,209
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm11,%xmm10
+	movups	%xmm2,(%rsi)
+	leaq	16(%rsi),%rsi
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_two:
+	movups	(%rdi),%xmm2
+	movups	16(%rdi),%xmm3
+	leaq	32(%rdi),%rdi
+	xorps	%xmm10,%xmm2
+	xorps	%xmm11,%xmm3
+
+	call	_aesni_encrypt3
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm12,%xmm10
+	xorps	%xmm11,%xmm3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	leaq	32(%rsi),%rsi
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_three:
+	movups	(%rdi),%xmm2
+	movups	16(%rdi),%xmm3
+	movups	32(%rdi),%xmm4
+	leaq	48(%rdi),%rdi
+	xorps	%xmm10,%xmm2
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+
+	call	_aesni_encrypt3
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm13,%xmm10
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	leaq	48(%rsi),%rsi
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_four:
+	movups	(%rdi),%xmm2
+	movups	16(%rdi),%xmm3
+	movups	32(%rdi),%xmm4
+	xorps	%xmm10,%xmm2
+	movups	48(%rdi),%xmm5
+	leaq	64(%rdi),%rdi
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+	xorps	%xmm13,%xmm5
+
+	call	_aesni_encrypt4
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm15,%xmm10
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+	movups	%xmm2,(%rsi)
+	xorps	%xmm13,%xmm5
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	leaq	64(%rsi),%rsi
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_done:
+	andq	$15,%r9
+	jz	.Lxts_enc_ret
+	movq	%r9,%rdx
+
+.Lxts_enc_steal:
+	movzbl	(%rdi),%eax
+	movzbl	-16(%rsi),%ecx
+	leaq	1(%rdi),%rdi
+	movb	%al,-16(%rsi)
+	movb	%cl,0(%rsi)
+	leaq	1(%rsi),%rsi
+	subq	$1,%rdx
+	jnz	.Lxts_enc_steal
+
+	subq	%r9,%rsi
+	movq	%r11,%rcx
+	movl	%r10d,%eax
+
+	movups	-16(%rsi),%xmm2
+	xorps	%xmm10,%xmm2
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_10:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_10	
+.byte	102,15,56,221,209
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,-16(%rsi)
+
+.Lxts_enc_ret:
+	leaq	104(%rsp),%rsp
+.Lxts_enc_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_xts_encrypt,.-aesni_xts_encrypt
+.globl	aesni_xts_decrypt
+.type	aesni_xts_decrypt,@function
+.align	16
+aesni_xts_decrypt:
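+# aesni_xts_decrypt: mirror of aesni_xts_encrypt.  When the length is not
+# a multiple of 16, the setnz/shlq/subq below holds back the last full
+# block so ciphertext stealing (.Lxts_dec_steal) can apply the tweaks in
+# the required order.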
+	leaq	-104(%rsp),%rsp
+	movups	(%r9),%xmm15
+	movl	240(%r8),%eax
+	movl	240(%rcx),%r10d
+	movups	(%r8),%xmm0
+	movups	16(%r8),%xmm1
+	leaq	32(%r8),%r8
+	xorps	%xmm0,%xmm15
+.Loop_enc1_11:
+.byte	102,68,15,56,220,249
+	decl	%eax
+	movups	(%r8),%xmm1
+	leaq	16(%r8),%r8
+	jnz	.Loop_enc1_11	
+.byte	102,68,15,56,221,249
+	xorl	%eax,%eax
+	testq	$15,%rdx
+	setnz	%al
+	shlq	$4,%rax
+	subq	%rax,%rdx
+
+	movq	%rcx,%r11
+	movl	%r10d,%eax
+	movq	%rdx,%r9
+	andq	$-16,%rdx
+
+	movdqa	.Lxts_magic(%rip),%xmm8
+	pxor	%xmm14,%xmm14
+	pcmpgtd	%xmm15,%xmm14
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm10
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm11
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm12
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm13
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm9
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+	subq	$96,%rdx
+	jc	.Lxts_dec_short
+
+	shrl	$1,%eax
+	subl	$1,%eax
+	movl	%eax,%r10d
+	jmp	.Lxts_dec_grandloop
+
+.align	16
+.Lxts_dec_grandloop:
+	pshufd	$19,%xmm14,%xmm9
+	movdqa	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	movdqu	0(%rdi),%xmm2
+	pand	%xmm8,%xmm9
+	movdqu	16(%rdi),%xmm3
+	pxor	%xmm9,%xmm15
+
+	movdqu	32(%rdi),%xmm4
+	pxor	%xmm10,%xmm2
+	movdqu	48(%rdi),%xmm5
+	pxor	%xmm11,%xmm3
+	movdqu	64(%rdi),%xmm6
+	pxor	%xmm12,%xmm4
+	movdqu	80(%rdi),%xmm7
+	leaq	96(%rdi),%rdi
+	pxor	%xmm13,%xmm5
+	movups	(%r11),%xmm0
+	pxor	%xmm14,%xmm6
+	pxor	%xmm15,%xmm7
+
+
+
+	movups	16(%r11),%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movdqa	%xmm10,0(%rsp)
+.byte	102,15,56,222,209
+	leaq	32(%r11),%rcx
+	pxor	%xmm0,%xmm4
+	movdqa	%xmm11,16(%rsp)
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+	movdqa	%xmm12,32(%rsp)
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm13,48(%rsp)
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm7
+	movups	(%rcx),%xmm0
+	decl	%eax
+	movdqa	%xmm14,64(%rsp)
+.byte	102,15,56,222,241
+	movdqa	%xmm15,80(%rsp)
+.byte	102,15,56,222,249
+	pxor	%xmm14,%xmm14
+	pcmpgtd	%xmm15,%xmm14
+	jmp	.Lxts_dec_loop6_enter
+
+.align	16
+.Lxts_dec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	decl	%eax
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.Lxts_dec_loop6_enter:
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	leaq	32(%rcx),%rcx
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	(%rcx),%xmm0
+	jnz	.Lxts_dec_loop6
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,222,209
+	pand	%xmm8,%xmm9
+.byte	102,15,56,222,217
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,222,225
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	16(%rcx),%xmm1
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm10
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,222,208
+	pand	%xmm8,%xmm9
+.byte	102,15,56,222,216
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,222,224
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	32(%rcx),%xmm0
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm11
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,222,209
+	pand	%xmm8,%xmm9
+.byte	102,15,56,222,217
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,222,225
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm12
+	paddq	%xmm15,%xmm15
+.byte	102,15,56,223,208
+	pand	%xmm8,%xmm9
+.byte	102,15,56,223,216
+	pcmpgtd	%xmm15,%xmm14
+.byte	102,15,56,223,224
+	pxor	%xmm9,%xmm15
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+
+	pshufd	$19,%xmm14,%xmm9
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm15,%xmm13
+	paddq	%xmm15,%xmm15
+	xorps	0(%rsp),%xmm2
+	pand	%xmm8,%xmm9
+	xorps	16(%rsp),%xmm3
+	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm9,%xmm15
+
+	xorps	32(%rsp),%xmm4
+	movups	%xmm2,0(%rsi)
+	xorps	48(%rsp),%xmm5
+	movups	%xmm3,16(%rsi)
+	xorps	64(%rsp),%xmm6
+	movups	%xmm4,32(%rsi)
+	xorps	80(%rsp),%xmm7
+	movups	%xmm5,48(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	leaq	96(%rsi),%rsi
+	subq	$96,%rdx
+	jnc	.Lxts_dec_grandloop
+
+	leal	3(%rax,%rax,1),%eax
+	movq	%r11,%rcx
+	movl	%eax,%r10d
+
+.Lxts_dec_short:
+	addq	$96,%rdx
+	jz	.Lxts_dec_done
+
+	cmpq	$32,%rdx
+	jb	.Lxts_dec_one
+	je	.Lxts_dec_two
+
+	cmpq	$64,%rdx
+	jb	.Lxts_dec_three
+	je	.Lxts_dec_four
+
+	pshufd	$19,%xmm14,%xmm9
+	movdqa	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	movdqu	(%rdi),%xmm2
+	pand	%xmm8,%xmm9
+	movdqu	16(%rdi),%xmm3
+	pxor	%xmm9,%xmm15
+
+	movdqu	32(%rdi),%xmm4
+	pxor	%xmm10,%xmm2
+	movdqu	48(%rdi),%xmm5
+	pxor	%xmm11,%xmm3
+	movdqu	64(%rdi),%xmm6
+	leaq	80(%rdi),%rdi
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm6
+
+	call	_aesni_decrypt6
+
+	xorps	%xmm10,%xmm2
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+	movdqu	%xmm2,(%rsi)
+	xorps	%xmm13,%xmm5
+	movdqu	%xmm3,16(%rsi)
+	xorps	%xmm14,%xmm6
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm14,%xmm14
+	movdqu	%xmm5,48(%rsi)
+	pcmpgtd	%xmm15,%xmm14
+	movdqu	%xmm6,64(%rsi)
+	leaq	80(%rsi),%rsi
+	pshufd	$19,%xmm14,%xmm11
+	andq	$15,%r9
+	jz	.Lxts_dec_ret
+
+	movdqa	%xmm15,%xmm10
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm11
+	pxor	%xmm15,%xmm11
+	jmp	.Lxts_dec_done2
+
+.align	16
+.Lxts_dec_one:
+	movups	(%rdi),%xmm2
+	leaq	16(%rdi),%rdi
+	xorps	%xmm10,%xmm2
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_12:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_12	
+.byte	102,15,56,223,209
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm11,%xmm10
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm12,%xmm11
+	leaq	16(%rsi),%rsi
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_two:
+	movups	(%rdi),%xmm2
+	movups	16(%rdi),%xmm3
+	leaq	32(%rdi),%rdi
+	xorps	%xmm10,%xmm2
+	xorps	%xmm11,%xmm3
+
+	call	_aesni_decrypt3
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm12,%xmm10
+	xorps	%xmm11,%xmm3
+	movdqa	%xmm13,%xmm11
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	leaq	32(%rsi),%rsi
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_three:
+	movups	(%rdi),%xmm2
+	movups	16(%rdi),%xmm3
+	movups	32(%rdi),%xmm4
+	leaq	48(%rdi),%rdi
+	xorps	%xmm10,%xmm2
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+
+	call	_aesni_decrypt3
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm13,%xmm10
+	xorps	%xmm11,%xmm3
+	movdqa	%xmm15,%xmm11
+	xorps	%xmm12,%xmm4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	leaq	48(%rsi),%rsi
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_four:
+	pshufd	$19,%xmm14,%xmm9
+	movdqa	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	movups	(%rdi),%xmm2
+	pand	%xmm8,%xmm9
+	movups	16(%rdi),%xmm3
+	pxor	%xmm9,%xmm15
+
+	movups	32(%rdi),%xmm4
+	xorps	%xmm10,%xmm2
+	movups	48(%rdi),%xmm5
+	leaq	64(%rdi),%rdi
+	xorps	%xmm11,%xmm3
+	xorps	%xmm12,%xmm4
+	xorps	%xmm13,%xmm5
+
+	call	_aesni_decrypt4
+
+	xorps	%xmm10,%xmm2
+	movdqa	%xmm14,%xmm10
+	xorps	%xmm11,%xmm3
+	movdqa	%xmm15,%xmm11
+	xorps	%xmm12,%xmm4
+	movups	%xmm2,(%rsi)
+	xorps	%xmm13,%xmm5
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	leaq	64(%rsi),%rsi
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_done:
+	andq	$15,%r9
+	jz	.Lxts_dec_ret
+.Lxts_dec_done2:
+	movq	%r9,%rdx
+	movq	%r11,%rcx
+	movl	%r10d,%eax
+
+	movups	(%rdi),%xmm2
+	xorps	%xmm11,%xmm2
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_13:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_13	
+.byte	102,15,56,223,209
+	xorps	%xmm11,%xmm2
+	movups	%xmm2,(%rsi)
+
+.Lxts_dec_steal:
+	movzbl	16(%rdi),%eax
+	movzbl	(%rsi),%ecx
+	leaq	1(%rdi),%rdi
+	movb	%al,(%rsi)
+	movb	%cl,16(%rsi)
+	leaq	1(%rsi),%rsi
+	subq	$1,%rdx
+	jnz	.Lxts_dec_steal
+
+	subq	%r9,%rsi
+	movq	%r11,%rcx
+	movl	%r10d,%eax
+
+	movups	(%rsi),%xmm2
+	xorps	%xmm10,%xmm2
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_14:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_14	
+.byte	102,15,56,223,209
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+
+.Lxts_dec_ret:
+	leaq	104(%rsp),%rsp
+.Lxts_dec_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_xts_decrypt,.-aesni_xts_decrypt
+.globl	aesni_cbc_encrypt
+.type	aesni_cbc_encrypt,@function
+.align	16
+aesni_cbc_encrypt:
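+# aesni_cbc_encrypt(in %rdi, out %rsi, len %rdx, key %rcx, ivec %r8,
+# enc %r9d): CBC encryption is inherently serial, one block at a time;
+# decryption runs eight blocks in parallel.  The updated IV is stored
+# back to (%r8) on exit.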
+	testq	%rdx,%rdx
+	jz	.Lcbc_ret
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	testl	%r9d,%r9d
+	jz	.Lcbc_decrypt
+
+	movups	(%r8),%xmm2
+	movl	%r10d,%eax
+	cmpq	$16,%rdx
+	jb	.Lcbc_enc_tail
+	subq	$16,%rdx
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movups	(%rdi),%xmm3
+	leaq	16(%rdi),%rdi
+
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm3
+	leaq	32(%rcx),%rcx
+	xorps	%xmm3,%xmm2
+.Loop_enc1_15:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_15	
+.byte	102,15,56,221,209
+	movl	%r10d,%eax
+	movq	%r11,%rcx
+	movups	%xmm2,0(%rsi)
+	leaq	16(%rsi),%rsi
+	subq	$16,%rdx
+	jnc	.Lcbc_enc_loop
+	addq	$16,%rdx
+	jnz	.Lcbc_enc_tail
+	movups	%xmm2,(%r8)
+	jmp	.Lcbc_ret
+
+.Lcbc_enc_tail:
+	movq	%rdx,%rcx
+	xchgq	%rdi,%rsi
+.long	0x9066A4F3	
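+# (the .long above encodes rep movsb plus a 2-byte nop: copy the
+# %rdx-byte tail from the input to the output buffer)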
+	movl	$16,%ecx
+	subq	%rdx,%rcx
+	xorl	%eax,%eax
+.long	0x9066AAF3	
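+# (rep stosb plus nop: zero-pad the copied block out to 16 bytes before
+# it is encrypted as the final block)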
+	leaq	-16(%rdi),%rdi
+	movl	%r10d,%eax
+	movq	%rdi,%rsi
+	movq	%r11,%rcx
+	xorq	%rdx,%rdx
+	jmp	.Lcbc_enc_loop	
+
+.align	16
+.Lcbc_decrypt:
+	movups	(%r8),%xmm9
+	movl	%r10d,%eax
+	cmpq	$112,%rdx
+	jbe	.Lcbc_dec_tail
+	shrl	$1,%r10d
+	subq	$112,%rdx
+	movl	%r10d,%eax
+	movaps	%xmm9,-24(%rsp)
+	jmp	.Lcbc_dec_loop8_enter
+.align	16
+.Lcbc_dec_loop8:
+	movaps	%xmm0,-24(%rsp)
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+	movups	(%rcx),%xmm0
+	movups	(%rdi),%xmm2
+	movups	16(%rdi),%xmm3
+	movups	16(%rcx),%xmm1
+
+	leaq	32(%rcx),%rcx
+	movdqu	32(%rdi),%xmm4
+	xorps	%xmm0,%xmm2
+	movdqu	48(%rdi),%xmm5
+	xorps	%xmm0,%xmm3
+	movdqu	64(%rdi),%xmm6
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm4
+	movdqu	80(%rdi),%xmm7
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+	movdqu	96(%rdi),%xmm8
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm6
+	movdqu	112(%rdi),%xmm9
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm7
+	decl	%eax
+.byte	102,15,56,222,241
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm9
+	movups	(%rcx),%xmm0
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	16(%rcx),%xmm1
+
+	call	.Ldec_loop8_enter
+
+	movups	(%rdi),%xmm1
+	movups	16(%rdi),%xmm0
+	xorps	-24(%rsp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%rdi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%rdi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%rdi),%xmm1
+	xorps	%xmm0,%xmm6
+	movups	80(%rdi),%xmm0
+	xorps	%xmm1,%xmm7
+	movups	96(%rdi),%xmm1
+	xorps	%xmm0,%xmm8
+	movups	112(%rdi),%xmm0
+	xorps	%xmm1,%xmm9
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm6,64(%rsi)
+	movq	%r11,%rcx
+	movups	%xmm7,80(%rsi)
+	leaq	128(%rdi),%rdi
+	movups	%xmm8,96(%rsi)
+	leaq	112(%rsi),%rsi
+	subq	$128,%rdx
+	ja	.Lcbc_dec_loop8
+
+	movaps	%xmm9,%xmm2
+	movaps	%xmm0,%xmm9
+	addq	$112,%rdx
+	jle	.Lcbc_dec_tail_collected
+	movups	%xmm2,(%rsi)
+	leal	1(%r10,%r10,1),%eax
+	leaq	16(%rsi),%rsi
+.Lcbc_dec_tail:
+	movups	(%rdi),%xmm2
+	movaps	%xmm2,%xmm8
+	cmpq	$16,%rdx
+	jbe	.Lcbc_dec_one
+
+	movups	16(%rdi),%xmm3
+	movaps	%xmm3,%xmm7
+	cmpq	$32,%rdx
+	jbe	.Lcbc_dec_two
+
+	movups	32(%rdi),%xmm4
+	movaps	%xmm4,%xmm6
+	cmpq	$48,%rdx
+	jbe	.Lcbc_dec_three
+
+	movups	48(%rdi),%xmm5
+	cmpq	$64,%rdx
+	jbe	.Lcbc_dec_four
+
+	movups	64(%rdi),%xmm6
+	cmpq	$80,%rdx
+	jbe	.Lcbc_dec_five
+
+	movups	80(%rdi),%xmm7
+	cmpq	$96,%rdx
+	jbe	.Lcbc_dec_six
+
+	movups	96(%rdi),%xmm8
+	movaps	%xmm9,-24(%rsp)
+	call	_aesni_decrypt8
+	movups	(%rdi),%xmm1
+	movups	16(%rdi),%xmm0
+	xorps	-24(%rsp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%rdi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%rdi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%rdi),%xmm1
+	xorps	%xmm0,%xmm6
+	movups	80(%rdi),%xmm0
+	xorps	%xmm1,%xmm7
+	movups	96(%rdi),%xmm9
+	xorps	%xmm0,%xmm8
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	leaq	96(%rsi),%rsi
+	movaps	%xmm8,%xmm2
+	subq	$112,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_16:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_16	
+.byte	102,15,56,223,209
+	xorps	%xmm9,%xmm2
+	movaps	%xmm8,%xmm9
+	subq	$16,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_two:
+	xorps	%xmm4,%xmm4
+	call	_aesni_decrypt3
+	xorps	%xmm9,%xmm2
+	xorps	%xmm8,%xmm3
+	movups	%xmm2,(%rsi)
+	movaps	%xmm7,%xmm9
+	movaps	%xmm3,%xmm2
+	leaq	16(%rsi),%rsi
+	subq	$32,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_three:
+	call	_aesni_decrypt3
+	xorps	%xmm9,%xmm2
+	xorps	%xmm8,%xmm3
+	movups	%xmm2,(%rsi)
+	xorps	%xmm7,%xmm4
+	movups	%xmm3,16(%rsi)
+	movaps	%xmm6,%xmm9
+	movaps	%xmm4,%xmm2
+	leaq	32(%rsi),%rsi
+	subq	$48,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_four:
+	call	_aesni_decrypt4
+	xorps	%xmm9,%xmm2
+	movups	48(%rdi),%xmm9
+	xorps	%xmm8,%xmm3
+	movups	%xmm2,(%rsi)
+	xorps	%xmm7,%xmm4
+	movups	%xmm3,16(%rsi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm4,32(%rsi)
+	movaps	%xmm5,%xmm2
+	leaq	48(%rsi),%rsi
+	subq	$64,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	16(%rdi),%xmm1
+	movups	32(%rdi),%xmm0
+	xorps	%xmm9,%xmm2
+	xorps	%xmm8,%xmm3
+	xorps	%xmm1,%xmm4
+	movups	48(%rdi),%xmm1
+	xorps	%xmm0,%xmm5
+	movups	64(%rdi),%xmm9
+	xorps	%xmm1,%xmm6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	leaq	64(%rsi),%rsi
+	movaps	%xmm6,%xmm2
+	subq	$80,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_six:
+	call	_aesni_decrypt6
+	movups	16(%rdi),%xmm1
+	movups	32(%rdi),%xmm0
+	xorps	%xmm9,%xmm2
+	xorps	%xmm8,%xmm3
+	xorps	%xmm1,%xmm4
+	movups	48(%rdi),%xmm1
+	xorps	%xmm0,%xmm5
+	movups	64(%rdi),%xmm0
+	xorps	%xmm1,%xmm6
+	movups	80(%rdi),%xmm9
+	xorps	%xmm0,%xmm7
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	leaq	80(%rsi),%rsi
+	movaps	%xmm7,%xmm2
+	subq	$96,%rdx
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_tail_collected:
+	andq	$15,%rdx
+	movups	%xmm9,(%r8)
+	jnz	.Lcbc_dec_tail_partial
+	movups	%xmm2,(%rsi)
+	jmp	.Lcbc_dec_ret
+.align	16
+.Lcbc_dec_tail_partial:
+	movaps	%xmm2,-24(%rsp)
+	movq	$16,%rcx
+	movq	%rsi,%rdi
+	subq	%rdx,%rcx
+	leaq	-24(%rsp),%rsi
+.long	0x9066A4F3	
+
+.Lcbc_dec_ret:
+.Lcbc_ret:
+	.byte	0xf3,0xc3
+.size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
+.globl	aesni_set_decrypt_key
+.type	aesni_set_decrypt_key,@function
+.align	16
+aesni_set_decrypt_key:
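+# aesni_set_decrypt_key(userKey %rdi, bits %esi, key %rdx): expands the
+# encryption schedule via __aesni_set_encrypt_key, then reverses the order
+# of the round keys and applies aesimc (.byte 102,15,56,219,...) to the
+# inner ones for the equivalent inverse cipher.  0x48,0x83,0xEC,0x08 is
+# subq $8,%rsp encoded as raw bytes.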
+.byte	0x48,0x83,0xEC,0x08	
+	call	__aesni_set_encrypt_key
+	shll	$4,%esi
+	testl	%eax,%eax
+	jnz	.Ldec_key_ret
+	leaq	16(%rdx,%rsi,1),%rdi
+
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+	movups	%xmm0,(%rdi)
+	movups	%xmm1,(%rdx)
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+
+.Ldec_key_inverse:
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+	movups	%xmm0,16(%rdi)
+	movups	%xmm1,-16(%rdx)
+	cmpq	%rdx,%rdi
+	ja	.Ldec_key_inverse
+
+	movups	(%rdx),%xmm0
+.byte	102,15,56,219,192
+	movups	%xmm0,(%rdi)
+.Ldec_key_ret:
+	addq	$8,%rsp
+	.byte	0xf3,0xc3
+.LSEH_end_set_decrypt_key:
+.size	aesni_set_decrypt_key,.-aesni_set_decrypt_key
+.globl	aesni_set_encrypt_key
+.type	aesni_set_encrypt_key,@function
+.align	16
+aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
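+# aesni_set_encrypt_key(userKey %rdi, bits %esi, key %rdx): returns 0 on
+# success, -1 on a NULL pointer, -2 on a key size other than 128/192/256.
+# The .byte 102,15,58,223,... sequences are aeskeygenassist with the round
+# constant as the immediate; note the stored count (9/11/13) is the loop
+# count used by the cipher routines, one less than the AES round number.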
+.byte	0x48,0x83,0xEC,0x08	
+	movq	$-1,%rax
+	testq	%rdi,%rdi
+	jz	.Lenc_key_ret
+	testq	%rdx,%rdx
+	jz	.Lenc_key_ret
+
+	movups	(%rdi),%xmm0
+	xorps	%xmm4,%xmm4
+	leaq	16(%rdx),%rax
+	cmpl	$256,%esi
+	je	.L14rounds
+	cmpl	$192,%esi
+	je	.L12rounds
+	cmpl	$128,%esi
+	jne	.Lbad_keybits
+
+.L10rounds:
+	movl	$9,%esi
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,200,1
+	call	.Lkey_expansion_128_cold
+.byte	102,15,58,223,200,2
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,4
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,8
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,16
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,32
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,64
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,128
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,27
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,54
+	call	.Lkey_expansion_128
+	movups	%xmm0,(%rax)
+	movl	%esi,80(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L12rounds:
+	movq	16(%rdi),%xmm2
+	movl	$11,%esi
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,202,1
+	call	.Lkey_expansion_192a_cold
+.byte	102,15,58,223,202,2
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,4
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,8
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,16
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,32
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,64
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,128
+	call	.Lkey_expansion_192b
+	movups	%xmm0,(%rax)
+	movl	%esi,48(%rax)
+	xorq	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L14rounds:
+	movups	16(%rdi),%xmm2
+	movl	$13,%esi
+	leaq	16(%rax),%rax
+	movups	%xmm0,(%rdx)
+	movups	%xmm2,16(%rdx)
+.byte	102,15,58,223,202,1
+	call	.Lkey_expansion_256a_cold
+.byte	102,15,58,223,200,1
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,2
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,2
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,4
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,4
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,8
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,8
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,16
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,16
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,32
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,32
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,64
+	call	.Lkey_expansion_256a
+	movups	%xmm0,(%rax)
+	movl	%esi,16(%rax)
+	xorq	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.Lbad_keybits:
+	movq	$-2,%rax
+.Lenc_key_ret:
+	addq	$8,%rsp
+	.byte	0xf3,0xc3
+.LSEH_end_set_encrypt_key:
+
+.align	16
+.Lkey_expansion_128:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	.byte	0xf3,0xc3
+
+.align	16
+.Lkey_expansion_192a:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_192a_cold:
+	movaps	%xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	.byte	0xf3,0xc3
+
+.align	16
+.Lkey_expansion_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%rax)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%rax)
+	leaq	32(%rax),%rax
+	jmp	.Lkey_expansion_192b_warm
+
+.align	16
+.Lkey_expansion_256a:
+	movups	%xmm2,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	.byte	0xf3,0xc3
+
+.align	16
+.Lkey_expansion_256b:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	.byte	0xf3,0xc3
+.size	aesni_set_encrypt_key,.-aesni_set_encrypt_key
+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long	6,6,6,0
+.Lincrement64:
+.long	1,0,0,0
+.Lxts_magic:
+.long	0x87,0,1,0
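+# .Lbswap_mask reverses byte order for pshufb; .Lincrement32/64 step the
+# CTR and CCM counters; .Lxts_magic is the GF(2^128) reduction constant
+# (x^128 + x^7 + x^2 + x + 1, i.e. 0x87) for the XTS tweak.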
+
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
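+# (the string above decodes to "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>")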
+.align	64
diff --git a/crypto/aes/asm/bsaes-x86_64.S b/crypto/aes/asm/bsaes-x86_64.S
new file mode 100644
index 0000000..6ceb3da
--- /dev/null
+++ b/crypto/aes/asm/bsaes-x86_64.S
@@ -0,0 +1,2561 @@
+.text	
+
+
+
+
+.type	_bsaes_encrypt8,@function
+.align	64
+_bsaes_encrypt8:
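+# Internal: bit-sliced AES in the style of Kasper and Schwabe.  Eight
+# blocks (%xmm15, %xmm0-%xmm6) are transposed into bit planes so the S-box
+# and MixColumns become pure logic operations, avoiding data-dependent
+# table lookups (constant time).  Key schedule at %rax, loop counter in
+# %r10d.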
+	leaq	.LBS0(%rip),%r11
+
+	movdqa	(%rax),%xmm8
+	leaq	16(%rax),%rax
+	movdqa	80(%r11),%xmm7
+	pxor	%xmm8,%xmm15
+	pxor	%xmm8,%xmm0
+.byte	102,68,15,56,0,255
+	pxor	%xmm8,%xmm1
+.byte	102,15,56,0,199
+	pxor	%xmm8,%xmm2
+.byte	102,15,56,0,207
+	pxor	%xmm8,%xmm3
+.byte	102,15,56,0,215
+	pxor	%xmm8,%xmm4
+.byte	102,15,56,0,223
+	pxor	%xmm8,%xmm5
+.byte	102,15,56,0,231
+	pxor	%xmm8,%xmm6
+.byte	102,15,56,0,239
+.byte	102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+	movdqa	0(%r11),%xmm7
+	movdqa	16(%r11),%xmm8
+	movdqa	%xmm5,%xmm9
+	psrlq	$1,%xmm5
+	movdqa	%xmm3,%xmm10
+	psrlq	$1,%xmm3
+	pxor	%xmm6,%xmm5
+	pxor	%xmm4,%xmm3
+	pand	%xmm7,%xmm5
+	pand	%xmm7,%xmm3
+	pxor	%xmm5,%xmm6
+	psllq	$1,%xmm5
+	pxor	%xmm3,%xmm4
+	psllq	$1,%xmm3
+	pxor	%xmm9,%xmm5
+	pxor	%xmm10,%xmm3
+	movdqa	%xmm1,%xmm9
+	psrlq	$1,%xmm1
+	movdqa	%xmm15,%xmm10
+	psrlq	$1,%xmm15
+	pxor	%xmm2,%xmm1
+	pxor	%xmm0,%xmm15
+	pand	%xmm7,%xmm1
+	pand	%xmm7,%xmm15
+	pxor	%xmm1,%xmm2
+	psllq	$1,%xmm1
+	pxor	%xmm15,%xmm0
+	psllq	$1,%xmm15
+	pxor	%xmm9,%xmm1
+	pxor	%xmm10,%xmm15
+	movdqa	32(%r11),%xmm7
+	movdqa	%xmm4,%xmm9
+	psrlq	$2,%xmm4
+	movdqa	%xmm3,%xmm10
+	psrlq	$2,%xmm3
+	pxor	%xmm6,%xmm4
+	pxor	%xmm5,%xmm3
+	pand	%xmm8,%xmm4
+	pand	%xmm8,%xmm3
+	pxor	%xmm4,%xmm6
+	psllq	$2,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$2,%xmm3
+	pxor	%xmm9,%xmm4
+	pxor	%xmm10,%xmm3
+	movdqa	%xmm0,%xmm9
+	psrlq	$2,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$2,%xmm15
+	pxor	%xmm2,%xmm0
+	pxor	%xmm1,%xmm15
+	pand	%xmm8,%xmm0
+	pand	%xmm8,%xmm15
+	pxor	%xmm0,%xmm2
+	psllq	$2,%xmm0
+	pxor	%xmm15,%xmm1
+	psllq	$2,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	movdqa	%xmm2,%xmm9
+	psrlq	$4,%xmm2
+	movdqa	%xmm1,%xmm10
+	psrlq	$4,%xmm1
+	pxor	%xmm6,%xmm2
+	pxor	%xmm5,%xmm1
+	pand	%xmm7,%xmm2
+	pand	%xmm7,%xmm1
+	pxor	%xmm2,%xmm6
+	psllq	$4,%xmm2
+	pxor	%xmm1,%xmm5
+	psllq	$4,%xmm1
+	pxor	%xmm9,%xmm2
+	pxor	%xmm10,%xmm1
+	movdqa	%xmm0,%xmm9
+	psrlq	$4,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$4,%xmm15
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pand	%xmm7,%xmm0
+	pand	%xmm7,%xmm15
+	pxor	%xmm0,%xmm4
+	psllq	$4,%xmm0
+	pxor	%xmm15,%xmm3
+	psllq	$4,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	decl	%r10d
+	jmp	.Lenc_sbox
+.align	16
+.Lenc_loop:
+	pxor	0(%rax),%xmm15
+	pxor	16(%rax),%xmm0
+.byte	102,68,15,56,0,255
+	pxor	32(%rax),%xmm1
+.byte	102,15,56,0,199
+	pxor	48(%rax),%xmm2
+.byte	102,15,56,0,207
+	pxor	64(%rax),%xmm3
+.byte	102,15,56,0,215
+	pxor	80(%rax),%xmm4
+.byte	102,15,56,0,223
+	pxor	96(%rax),%xmm5
+.byte	102,15,56,0,231
+	pxor	112(%rax),%xmm6
+.byte	102,15,56,0,239
+	leaq	128(%rax),%rax
+.byte	102,15,56,0,247
+.Lenc_sbox:
+	pxor	%xmm5,%xmm4
+	pxor	%xmm0,%xmm1
+	pxor	%xmm15,%xmm2
+	pxor	%xmm1,%xmm5
+	pxor	%xmm15,%xmm4
+
+	pxor	%xmm2,%xmm5
+	pxor	%xmm6,%xmm2
+	pxor	%xmm4,%xmm6
+	pxor	%xmm3,%xmm2
+	pxor	%xmm4,%xmm3
+	pxor	%xmm0,%xmm2
+
+	pxor	%xmm6,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm6,%xmm10
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm4,%xmm8
+	movdqa	%xmm1,%xmm12
+	movdqa	%xmm5,%xmm11
+
+	pxor	%xmm3,%xmm10
+	pxor	%xmm1,%xmm9
+	pxor	%xmm2,%xmm8
+	movdqa	%xmm10,%xmm13
+	pxor	%xmm3,%xmm12
+	movdqa	%xmm9,%xmm7
+	pxor	%xmm15,%xmm11
+	movdqa	%xmm10,%xmm14
+
+	por	%xmm8,%xmm9
+	por	%xmm11,%xmm10
+	pxor	%xmm7,%xmm14
+	pand	%xmm11,%xmm13
+	pxor	%xmm8,%xmm11
+	pand	%xmm8,%xmm7
+	pand	%xmm11,%xmm14
+	movdqa	%xmm2,%xmm11
+	pxor	%xmm15,%xmm11
+	pand	%xmm11,%xmm12
+	pxor	%xmm12,%xmm10
+	pxor	%xmm12,%xmm9
+	movdqa	%xmm6,%xmm12
+	movdqa	%xmm4,%xmm11
+	pxor	%xmm0,%xmm12
+	pxor	%xmm5,%xmm11
+	movdqa	%xmm12,%xmm8
+	pand	%xmm11,%xmm12
+	por	%xmm11,%xmm8
+	pxor	%xmm12,%xmm7
+	pxor	%xmm14,%xmm10
+	pxor	%xmm13,%xmm9
+	pxor	%xmm14,%xmm8
+	movdqa	%xmm1,%xmm11
+	pxor	%xmm13,%xmm7
+	movdqa	%xmm3,%xmm12
+	pxor	%xmm13,%xmm8
+	movdqa	%xmm0,%xmm13
+	pand	%xmm2,%xmm11
+	movdqa	%xmm6,%xmm14
+	pand	%xmm15,%xmm12
+	pand	%xmm4,%xmm13
+	por	%xmm5,%xmm14
+	pxor	%xmm11,%xmm10
+	pxor	%xmm12,%xmm9
+	pxor	%xmm13,%xmm8
+	pxor	%xmm14,%xmm7
+
+
+
+
+
+	movdqa	%xmm10,%xmm11
+	pand	%xmm8,%xmm10
+	pxor	%xmm9,%xmm11
+
+	movdqa	%xmm7,%xmm13
+	movdqa	%xmm11,%xmm14
+	pxor	%xmm10,%xmm13
+	pand	%xmm13,%xmm14
+
+	movdqa	%xmm8,%xmm12
+	pxor	%xmm9,%xmm14
+	pxor	%xmm7,%xmm12
+
+	pxor	%xmm9,%xmm10
+
+	pand	%xmm10,%xmm12
+
+	movdqa	%xmm13,%xmm9
+	pxor	%xmm7,%xmm12
+
+	pxor	%xmm12,%xmm9
+	pxor	%xmm12,%xmm8
+
+	pand	%xmm7,%xmm9
+
+	pxor	%xmm9,%xmm13
+	pxor	%xmm9,%xmm8
+
+	pand	%xmm14,%xmm13
+
+	pxor	%xmm11,%xmm13
+	movdqa	%xmm5,%xmm11
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm14,%xmm9
+	pxor	%xmm13,%xmm9
+	pand	%xmm5,%xmm9
+	pxor	%xmm4,%xmm5
+	pand	%xmm14,%xmm4
+	pand	%xmm13,%xmm5
+	pxor	%xmm4,%xmm5
+	pxor	%xmm9,%xmm4
+	pxor	%xmm15,%xmm11
+	pxor	%xmm2,%xmm7
+	pxor	%xmm12,%xmm14
+	pxor	%xmm8,%xmm13
+	movdqa	%xmm14,%xmm10
+	movdqa	%xmm12,%xmm9
+	pxor	%xmm13,%xmm10
+	pxor	%xmm8,%xmm9
+	pand	%xmm11,%xmm10
+	pand	%xmm15,%xmm9
+	pxor	%xmm7,%xmm11
+	pxor	%xmm2,%xmm15
+	pand	%xmm14,%xmm7
+	pand	%xmm12,%xmm2
+	pand	%xmm13,%xmm11
+	pand	%xmm8,%xmm15
+	pxor	%xmm11,%xmm7
+	pxor	%xmm2,%xmm15
+	pxor	%xmm10,%xmm11
+	pxor	%xmm9,%xmm2
+	pxor	%xmm11,%xmm5
+	pxor	%xmm11,%xmm15
+	pxor	%xmm7,%xmm4
+	pxor	%xmm7,%xmm2
+
+	movdqa	%xmm6,%xmm11
+	movdqa	%xmm0,%xmm7
+	pxor	%xmm3,%xmm11
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm14,%xmm10
+	movdqa	%xmm12,%xmm9
+	pxor	%xmm13,%xmm10
+	pxor	%xmm8,%xmm9
+	pand	%xmm11,%xmm10
+	pand	%xmm3,%xmm9
+	pxor	%xmm7,%xmm11
+	pxor	%xmm1,%xmm3
+	pand	%xmm14,%xmm7
+	pand	%xmm12,%xmm1
+	pand	%xmm13,%xmm11
+	pand	%xmm8,%xmm3
+	pxor	%xmm11,%xmm7
+	pxor	%xmm1,%xmm3
+	pxor	%xmm10,%xmm11
+	pxor	%xmm9,%xmm1
+	pxor	%xmm12,%xmm14
+	pxor	%xmm8,%xmm13
+	movdqa	%xmm14,%xmm10
+	pxor	%xmm13,%xmm10
+	pand	%xmm6,%xmm10
+	pxor	%xmm0,%xmm6
+	pand	%xmm14,%xmm0
+	pand	%xmm13,%xmm6
+	pxor	%xmm0,%xmm6
+	pxor	%xmm10,%xmm0
+	pxor	%xmm11,%xmm6
+	pxor	%xmm11,%xmm3
+	pxor	%xmm7,%xmm0
+	pxor	%xmm7,%xmm1
+	pxor	%xmm15,%xmm6
+	pxor	%xmm5,%xmm0
+	pxor	%xmm6,%xmm3
+	pxor	%xmm15,%xmm5
+	pxor	%xmm0,%xmm15
+
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	pxor	%xmm2,%xmm1
+	pxor	%xmm4,%xmm2
+	pxor	%xmm4,%xmm3
+
+	pxor	%xmm2,%xmm5
+	decl	%r10d
+	jl	.Lenc_done
+	pshufd	$147,%xmm15,%xmm7
+	pshufd	$147,%xmm0,%xmm8
+	pxor	%xmm7,%xmm15
+	pshufd	$147,%xmm3,%xmm9
+	pxor	%xmm8,%xmm0
+	pshufd	$147,%xmm5,%xmm10
+	pxor	%xmm9,%xmm3
+	pshufd	$147,%xmm2,%xmm11
+	pxor	%xmm10,%xmm5
+	pshufd	$147,%xmm6,%xmm12
+	pxor	%xmm11,%xmm2
+	pshufd	$147,%xmm1,%xmm13
+	pxor	%xmm12,%xmm6
+	pshufd	$147,%xmm4,%xmm14
+	pxor	%xmm13,%xmm1
+	pxor	%xmm14,%xmm4
+
+	pxor	%xmm15,%xmm8
+	pxor	%xmm4,%xmm7
+	pxor	%xmm4,%xmm8
+	pshufd	$78,%xmm15,%xmm15
+	pxor	%xmm0,%xmm9
+	pshufd	$78,%xmm0,%xmm0
+	pxor	%xmm2,%xmm12
+	pxor	%xmm7,%xmm15
+	pxor	%xmm6,%xmm13
+	pxor	%xmm8,%xmm0
+	pxor	%xmm5,%xmm11
+	pshufd	$78,%xmm2,%xmm7
+	pxor	%xmm1,%xmm14
+	pshufd	$78,%xmm6,%xmm8
+	pxor	%xmm3,%xmm10
+	pshufd	$78,%xmm5,%xmm2
+	pxor	%xmm4,%xmm10
+	pshufd	$78,%xmm4,%xmm6
+	pxor	%xmm4,%xmm11
+	pshufd	$78,%xmm1,%xmm5
+	pxor	%xmm11,%xmm7
+	pshufd	$78,%xmm3,%xmm1
+	pxor	%xmm12,%xmm8
+
+	pxor	%xmm10,%xmm2
+	pxor	%xmm14,%xmm6
+	pxor	%xmm13,%xmm5
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm9,%xmm1
+	movdqa	%xmm8,%xmm4
+	movdqa	48(%r11),%xmm7
+	jnz	.Lenc_loop
+	movdqa	64(%r11),%xmm7
+	jmp	.Lenc_loop
+.align	16
+.Lenc_done:
+	movdqa	0(%r11),%xmm7
+	movdqa	16(%r11),%xmm8
+	movdqa	%xmm1,%xmm9
+	psrlq	$1,%xmm1
+	movdqa	%xmm2,%xmm10
+	psrlq	$1,%xmm2
+	pxor	%xmm4,%xmm1
+	pxor	%xmm6,%xmm2
+	pand	%xmm7,%xmm1
+	pand	%xmm7,%xmm2
+	pxor	%xmm1,%xmm4
+	psllq	$1,%xmm1
+	pxor	%xmm2,%xmm6
+	psllq	$1,%xmm2
+	pxor	%xmm9,%xmm1
+	pxor	%xmm10,%xmm2
+	movdqa	%xmm3,%xmm9
+	psrlq	$1,%xmm3
+	movdqa	%xmm15,%xmm10
+	psrlq	$1,%xmm15
+	pxor	%xmm5,%xmm3
+	pxor	%xmm0,%xmm15
+	pand	%xmm7,%xmm3
+	pand	%xmm7,%xmm15
+	pxor	%xmm3,%xmm5
+	psllq	$1,%xmm3
+	pxor	%xmm15,%xmm0
+	psllq	$1,%xmm15
+	pxor	%xmm9,%xmm3
+	pxor	%xmm10,%xmm15
+	movdqa	32(%r11),%xmm7
+	movdqa	%xmm6,%xmm9
+	psrlq	$2,%xmm6
+	movdqa	%xmm2,%xmm10
+	psrlq	$2,%xmm2
+	pxor	%xmm4,%xmm6
+	pxor	%xmm1,%xmm2
+	pand	%xmm8,%xmm6
+	pand	%xmm8,%xmm2
+	pxor	%xmm6,%xmm4
+	psllq	$2,%xmm6
+	pxor	%xmm2,%xmm1
+	psllq	$2,%xmm2
+	pxor	%xmm9,%xmm6
+	pxor	%xmm10,%xmm2
+	movdqa	%xmm0,%xmm9
+	psrlq	$2,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$2,%xmm15
+	pxor	%xmm5,%xmm0
+	pxor	%xmm3,%xmm15
+	pand	%xmm8,%xmm0
+	pand	%xmm8,%xmm15
+	pxor	%xmm0,%xmm5
+	psllq	$2,%xmm0
+	pxor	%xmm15,%xmm3
+	psllq	$2,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	movdqa	%xmm5,%xmm9
+	psrlq	$4,%xmm5
+	movdqa	%xmm3,%xmm10
+	psrlq	$4,%xmm3
+	pxor	%xmm4,%xmm5
+	pxor	%xmm1,%xmm3
+	pand	%xmm7,%xmm5
+	pand	%xmm7,%xmm3
+	pxor	%xmm5,%xmm4
+	psllq	$4,%xmm5
+	pxor	%xmm3,%xmm1
+	psllq	$4,%xmm3
+	pxor	%xmm9,%xmm5
+	pxor	%xmm10,%xmm3
+	movdqa	%xmm0,%xmm9
+	psrlq	$4,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$4,%xmm15
+	pxor	%xmm6,%xmm0
+	pxor	%xmm2,%xmm15
+	pand	%xmm7,%xmm0
+	pand	%xmm7,%xmm15
+	pxor	%xmm0,%xmm6
+	psllq	$4,%xmm0
+	pxor	%xmm15,%xmm2
+	psllq	$4,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	movdqa	(%rax),%xmm7
+	pxor	%xmm7,%xmm3
+	pxor	%xmm7,%xmm5
+	pxor	%xmm7,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm7,%xmm1
+	pxor	%xmm7,%xmm4
+	pxor	%xmm7,%xmm15
+	pxor	%xmm7,%xmm0
+	.byte	0xf3,0xc3
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+
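+# _bsaes_decrypt8: decryption analogue of _bsaes_encrypt8 with the
+# same register conventions (blocks in %xmm15,%xmm0-%xmm6, schedule
+# in %rax, rounds in %r10d) and the inverse S-box network.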
+.type	_bsaes_decrypt8,@function
+.align	64
+_bsaes_decrypt8:
+	leaq	.LBS0(%rip),%r11
+
+	movdqa	(%rax),%xmm8
+	leaq	16(%rax),%rax
+	movdqa	-48(%r11),%xmm7
+	pxor	%xmm8,%xmm15
+	pxor	%xmm8,%xmm0
+.byte	102,68,15,56,0,255
+	pxor	%xmm8,%xmm1
+.byte	102,15,56,0,199
+	pxor	%xmm8,%xmm2
+.byte	102,15,56,0,207
+	pxor	%xmm8,%xmm3
+.byte	102,15,56,0,215
+	pxor	%xmm8,%xmm4
+.byte	102,15,56,0,223
+	pxor	%xmm8,%xmm5
+.byte	102,15,56,0,231
+	pxor	%xmm8,%xmm6
+.byte	102,15,56,0,239
+.byte	102,15,56,0,247
+	movdqa	0(%r11),%xmm7
+	movdqa	16(%r11),%xmm8
+	movdqa	%xmm5,%xmm9
+	psrlq	$1,%xmm5
+	movdqa	%xmm3,%xmm10
+	psrlq	$1,%xmm3
+	pxor	%xmm6,%xmm5
+	pxor	%xmm4,%xmm3
+	pand	%xmm7,%xmm5
+	pand	%xmm7,%xmm3
+	pxor	%xmm5,%xmm6
+	psllq	$1,%xmm5
+	pxor	%xmm3,%xmm4
+	psllq	$1,%xmm3
+	pxor	%xmm9,%xmm5
+	pxor	%xmm10,%xmm3
+	movdqa	%xmm1,%xmm9
+	psrlq	$1,%xmm1
+	movdqa	%xmm15,%xmm10
+	psrlq	$1,%xmm15
+	pxor	%xmm2,%xmm1
+	pxor	%xmm0,%xmm15
+	pand	%xmm7,%xmm1
+	pand	%xmm7,%xmm15
+	pxor	%xmm1,%xmm2
+	psllq	$1,%xmm1
+	pxor	%xmm15,%xmm0
+	psllq	$1,%xmm15
+	pxor	%xmm9,%xmm1
+	pxor	%xmm10,%xmm15
+	movdqa	32(%r11),%xmm7
+	movdqa	%xmm4,%xmm9
+	psrlq	$2,%xmm4
+	movdqa	%xmm3,%xmm10
+	psrlq	$2,%xmm3
+	pxor	%xmm6,%xmm4
+	pxor	%xmm5,%xmm3
+	pand	%xmm8,%xmm4
+	pand	%xmm8,%xmm3
+	pxor	%xmm4,%xmm6
+	psllq	$2,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$2,%xmm3
+	pxor	%xmm9,%xmm4
+	pxor	%xmm10,%xmm3
+	movdqa	%xmm0,%xmm9
+	psrlq	$2,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$2,%xmm15
+	pxor	%xmm2,%xmm0
+	pxor	%xmm1,%xmm15
+	pand	%xmm8,%xmm0
+	pand	%xmm8,%xmm15
+	pxor	%xmm0,%xmm2
+	psllq	$2,%xmm0
+	pxor	%xmm15,%xmm1
+	psllq	$2,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	movdqa	%xmm2,%xmm9
+	psrlq	$4,%xmm2
+	movdqa	%xmm1,%xmm10
+	psrlq	$4,%xmm1
+	pxor	%xmm6,%xmm2
+	pxor	%xmm5,%xmm1
+	pand	%xmm7,%xmm2
+	pand	%xmm7,%xmm1
+	pxor	%xmm2,%xmm6
+	psllq	$4,%xmm2
+	pxor	%xmm1,%xmm5
+	psllq	$4,%xmm1
+	pxor	%xmm9,%xmm2
+	pxor	%xmm10,%xmm1
+	movdqa	%xmm0,%xmm9
+	psrlq	$4,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$4,%xmm15
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pand	%xmm7,%xmm0
+	pand	%xmm7,%xmm15
+	pxor	%xmm0,%xmm4
+	psllq	$4,%xmm0
+	pxor	%xmm15,%xmm3
+	psllq	$4,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	decl	%r10d
+	jmp	.Ldec_sbox
+.align	16
+.Ldec_loop:
+	pxor	0(%rax),%xmm15
+	pxor	16(%rax),%xmm0
+.byte	102,68,15,56,0,255
+	pxor	32(%rax),%xmm1
+.byte	102,15,56,0,199
+	pxor	48(%rax),%xmm2
+.byte	102,15,56,0,207
+	pxor	64(%rax),%xmm3
+.byte	102,15,56,0,215
+	pxor	80(%rax),%xmm4
+.byte	102,15,56,0,223
+	pxor	96(%rax),%xmm5
+.byte	102,15,56,0,231
+	pxor	112(%rax),%xmm6
+.byte	102,15,56,0,239
+	leaq	128(%rax),%rax
+.byte	102,15,56,0,247
+.Ldec_sbox:
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm6,%xmm3
+	pxor	%xmm6,%xmm1
+	pxor	%xmm3,%xmm5
+	pxor	%xmm5,%xmm6
+	pxor	%xmm6,%xmm0
+
+	pxor	%xmm0,%xmm15
+	pxor	%xmm4,%xmm1
+	pxor	%xmm15,%xmm2
+	pxor	%xmm15,%xmm4
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm2,%xmm10
+	movdqa	%xmm6,%xmm9
+	movdqa	%xmm0,%xmm8
+	movdqa	%xmm3,%xmm12
+	movdqa	%xmm4,%xmm11
+
+	pxor	%xmm15,%xmm10
+	pxor	%xmm3,%xmm9
+	pxor	%xmm5,%xmm8
+	movdqa	%xmm10,%xmm13
+	pxor	%xmm15,%xmm12
+	movdqa	%xmm9,%xmm7
+	pxor	%xmm1,%xmm11
+	movdqa	%xmm10,%xmm14
+
+	por	%xmm8,%xmm9
+	por	%xmm11,%xmm10
+	pxor	%xmm7,%xmm14
+	pand	%xmm11,%xmm13
+	pxor	%xmm8,%xmm11
+	pand	%xmm8,%xmm7
+	pand	%xmm11,%xmm14
+	movdqa	%xmm5,%xmm11
+	pxor	%xmm1,%xmm11
+	pand	%xmm11,%xmm12
+	pxor	%xmm12,%xmm10
+	pxor	%xmm12,%xmm9
+	movdqa	%xmm2,%xmm12
+	movdqa	%xmm0,%xmm11
+	pxor	%xmm6,%xmm12
+	pxor	%xmm4,%xmm11
+	movdqa	%xmm12,%xmm8
+	pand	%xmm11,%xmm12
+	por	%xmm11,%xmm8
+	pxor	%xmm12,%xmm7
+	pxor	%xmm14,%xmm10
+	pxor	%xmm13,%xmm9
+	pxor	%xmm14,%xmm8
+	movdqa	%xmm3,%xmm11
+	pxor	%xmm13,%xmm7
+	movdqa	%xmm15,%xmm12
+	pxor	%xmm13,%xmm8
+	movdqa	%xmm6,%xmm13
+	pand	%xmm5,%xmm11
+	movdqa	%xmm2,%xmm14
+	pand	%xmm1,%xmm12
+	pand	%xmm0,%xmm13
+	por	%xmm4,%xmm14
+	pxor	%xmm11,%xmm10
+	pxor	%xmm12,%xmm9
+	pxor	%xmm13,%xmm8
+	pxor	%xmm14,%xmm7
+
+
+
+
+
+	movdqa	%xmm10,%xmm11
+	pand	%xmm8,%xmm10
+	pxor	%xmm9,%xmm11
+
+	movdqa	%xmm7,%xmm13
+	movdqa	%xmm11,%xmm14
+	pxor	%xmm10,%xmm13
+	pand	%xmm13,%xmm14
+
+	movdqa	%xmm8,%xmm12
+	pxor	%xmm9,%xmm14
+	pxor	%xmm7,%xmm12
+
+	pxor	%xmm9,%xmm10
+
+	pand	%xmm10,%xmm12
+
+	movdqa	%xmm13,%xmm9
+	pxor	%xmm7,%xmm12
+
+	pxor	%xmm12,%xmm9
+	pxor	%xmm12,%xmm8
+
+	pand	%xmm7,%xmm9
+
+	pxor	%xmm9,%xmm13
+	pxor	%xmm9,%xmm8
+
+	pand	%xmm14,%xmm13
+
+	pxor	%xmm11,%xmm13
+	movdqa	%xmm4,%xmm11
+	movdqa	%xmm0,%xmm7
+	movdqa	%xmm14,%xmm9
+	pxor	%xmm13,%xmm9
+	pand	%xmm4,%xmm9
+	pxor	%xmm0,%xmm4
+	pand	%xmm14,%xmm0
+	pand	%xmm13,%xmm4
+	pxor	%xmm0,%xmm4
+	pxor	%xmm9,%xmm0
+	pxor	%xmm1,%xmm11
+	pxor	%xmm5,%xmm7
+	pxor	%xmm12,%xmm14
+	pxor	%xmm8,%xmm13
+	movdqa	%xmm14,%xmm10
+	movdqa	%xmm12,%xmm9
+	pxor	%xmm13,%xmm10
+	pxor	%xmm8,%xmm9
+	pand	%xmm11,%xmm10
+	pand	%xmm1,%xmm9
+	pxor	%xmm7,%xmm11
+	pxor	%xmm5,%xmm1
+	pand	%xmm14,%xmm7
+	pand	%xmm12,%xmm5
+	pand	%xmm13,%xmm11
+	pand	%xmm8,%xmm1
+	pxor	%xmm11,%xmm7
+	pxor	%xmm5,%xmm1
+	pxor	%xmm10,%xmm11
+	pxor	%xmm9,%xmm5
+	pxor	%xmm11,%xmm4
+	pxor	%xmm11,%xmm1
+	pxor	%xmm7,%xmm0
+	pxor	%xmm7,%xmm5
+
+	movdqa	%xmm2,%xmm11
+	movdqa	%xmm6,%xmm7
+	pxor	%xmm15,%xmm11
+	pxor	%xmm3,%xmm7
+	movdqa	%xmm14,%xmm10
+	movdqa	%xmm12,%xmm9
+	pxor	%xmm13,%xmm10
+	pxor	%xmm8,%xmm9
+	pand	%xmm11,%xmm10
+	pand	%xmm15,%xmm9
+	pxor	%xmm7,%xmm11
+	pxor	%xmm3,%xmm15
+	pand	%xmm14,%xmm7
+	pand	%xmm12,%xmm3
+	pand	%xmm13,%xmm11
+	pand	%xmm8,%xmm15
+	pxor	%xmm11,%xmm7
+	pxor	%xmm3,%xmm15
+	pxor	%xmm10,%xmm11
+	pxor	%xmm9,%xmm3
+	pxor	%xmm12,%xmm14
+	pxor	%xmm8,%xmm13
+	movdqa	%xmm14,%xmm10
+	pxor	%xmm13,%xmm10
+	pand	%xmm2,%xmm10
+	pxor	%xmm6,%xmm2
+	pand	%xmm14,%xmm6
+	pand	%xmm13,%xmm2
+	pxor	%xmm6,%xmm2
+	pxor	%xmm10,%xmm6
+	pxor	%xmm11,%xmm2
+	pxor	%xmm11,%xmm15
+	pxor	%xmm7,%xmm6
+	pxor	%xmm7,%xmm3
+	pxor	%xmm6,%xmm0
+	pxor	%xmm4,%xmm5
+
+	pxor	%xmm0,%xmm3
+	pxor	%xmm6,%xmm1
+	pxor	%xmm6,%xmm4
+	pxor	%xmm1,%xmm3
+	pxor	%xmm15,%xmm6
+	pxor	%xmm4,%xmm3
+	pxor	%xmm5,%xmm2
+	pxor	%xmm0,%xmm5
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm15,%xmm3
+	pxor	%xmm2,%xmm6
+	decl	%r10d
+	jl	.Ldec_done
+
+	pshufd	$147,%xmm4,%xmm14
+	movdqa	%xmm5,%xmm9
+	pxor	%xmm6,%xmm4
+	pxor	%xmm6,%xmm5
+	pshufd	$147,%xmm15,%xmm7
+	movdqa	%xmm6,%xmm12
+	pxor	%xmm15,%xmm6
+	pxor	%xmm0,%xmm15
+	pshufd	$147,%xmm0,%xmm8
+	pxor	%xmm5,%xmm0
+	pxor	%xmm2,%xmm15
+	pxor	%xmm3,%xmm0
+	pshufd	$147,%xmm3,%xmm10
+	pxor	%xmm15,%xmm5
+	pxor	%xmm4,%xmm3
+	pxor	%xmm2,%xmm4
+	pshufd	$147,%xmm2,%xmm13
+	movdqa	%xmm1,%xmm11
+	pxor	%xmm1,%xmm2
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm3
+	pxor	%xmm12,%xmm2
+	pxor	%xmm9,%xmm3
+	pxor	%xmm11,%xmm3
+	pshufd	$147,%xmm12,%xmm12
+
+	pxor	%xmm4,%xmm6
+	pxor	%xmm7,%xmm4
+	pxor	%xmm8,%xmm6
+	pshufd	$147,%xmm9,%xmm9
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm6
+	pxor	%xmm14,%xmm4
+	pshufd	$147,%xmm11,%xmm11
+	pxor	%xmm13,%xmm14
+	pxor	%xmm4,%xmm6
+
+	pxor	%xmm7,%xmm5
+	pshufd	$147,%xmm7,%xmm7
+	pxor	%xmm8,%xmm15
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm15
+	pshufd	$147,%xmm8,%xmm8
+	pxor	%xmm9,%xmm5
+	pxor	%xmm9,%xmm3
+	pxor	%xmm14,%xmm15
+	pshufd	$147,%xmm9,%xmm9
+	pxor	%xmm10,%xmm5
+	pxor	%xmm10,%xmm1
+	pxor	%xmm10,%xmm0
+	pshufd	$147,%xmm10,%xmm10
+	pxor	%xmm11,%xmm2
+	pxor	%xmm11,%xmm3
+	pxor	%xmm14,%xmm2
+	pxor	%xmm12,%xmm5
+	pxor	%xmm11,%xmm0
+	pxor	%xmm12,%xmm14
+
+	pxor	%xmm14,%xmm3
+	pshufd	$147,%xmm11,%xmm11
+	pxor	%xmm14,%xmm1
+	pxor	%xmm14,%xmm0
+
+	pxor	%xmm12,%xmm14
+	pshufd	$147,%xmm12,%xmm12
+	pxor	%xmm13,%xmm14
+
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm11,%xmm2
+	pshufd	$147,%xmm13,%xmm13
+	pxor	%xmm7,%xmm15
+	pxor	%xmm12,%xmm2
+	pxor	%xmm9,%xmm15
+	pshufd	$147,%xmm14,%xmm14
+
+	pxor	%xmm6,%xmm5
+	pxor	%xmm8,%xmm6
+	pxor	%xmm7,%xmm4
+	pxor	%xmm7,%xmm5
+	pxor	%xmm12,%xmm6
+	pxor	%xmm12,%xmm4
+	pxor	%xmm14,%xmm6
+	pshufd	$147,%xmm7,%xmm7
+	pxor	%xmm13,%xmm4
+	pxor	%xmm6,%xmm5
+	pxor	%xmm8,%xmm0
+	pshufd	$147,%xmm8,%xmm8
+
+	pxor	%xmm14,%xmm2
+	pxor	%xmm9,%xmm0
+	pxor	%xmm9,%xmm3
+	pshufd	$147,%xmm9,%xmm9
+	pxor	%xmm13,%xmm15
+	pxor	%xmm10,%xmm13
+	pxor	%xmm2,%xmm0
+	pxor	%xmm13,%xmm5
+
+	pxor	%xmm13,%xmm1
+	pxor	%xmm12,%xmm3
+	pxor	%xmm11,%xmm1
+	pshufd	$147,%xmm11,%xmm11
+	pxor	%xmm13,%xmm3
+	pxor	%xmm14,%xmm1
+	pxor	%xmm10,%xmm13
+
+	pshufd	$147,%xmm12,%xmm12
+	pshufd	$147,%xmm13,%xmm13
+	pshufd	$147,%xmm14,%xmm14
+	pshufd	$147,%xmm10,%xmm10
+
+
+	pxor	%xmm6,%xmm0
+	pxor	%xmm6,%xmm8
+	pxor	%xmm12,%xmm7
+	pxor	%xmm12,%xmm8
+	pxor	%xmm7,%xmm5
+	pxor	%xmm4,%xmm7
+	pxor	%xmm13,%xmm8
+	pxor	%xmm14,%xmm13
+	pxor	%xmm8,%xmm0
+	pxor	%xmm11,%xmm2
+	pxor	%xmm0,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm5,%xmm10
+	pxor	%xmm9,%xmm3
+	pxor	%xmm15,%xmm9
+	pxor	%xmm14,%xmm10
+	pxor	%xmm3,%xmm12
+	pxor	%xmm13,%xmm9
+	pxor	%xmm13,%xmm12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm2,%xmm14
+
+	movdqa	%xmm7,%xmm15
+	movdqa	%xmm8,%xmm0
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm10,%xmm2
+	movdqa	%xmm11,%xmm3
+	movdqa	%xmm12,%xmm4
+	movdqa	%xmm13,%xmm5
+	movdqa	%xmm14,%xmm6
+	movdqa	-16(%r11),%xmm7
+	jnz	.Ldec_loop
+	movdqa	-32(%r11),%xmm7
+	jmp	.Ldec_loop
+.align	16
+.Ldec_done:
+	movdqa	0(%r11),%xmm7
+	movdqa	16(%r11),%xmm8
+	movdqa	%xmm2,%xmm9
+	psrlq	$1,%xmm2
+	movdqa	%xmm1,%xmm10
+	psrlq	$1,%xmm1
+	pxor	%xmm4,%xmm2
+	pxor	%xmm6,%xmm1
+	pand	%xmm7,%xmm2
+	pand	%xmm7,%xmm1
+	pxor	%xmm2,%xmm4
+	psllq	$1,%xmm2
+	pxor	%xmm1,%xmm6
+	psllq	$1,%xmm1
+	pxor	%xmm9,%xmm2
+	pxor	%xmm10,%xmm1
+	movdqa	%xmm5,%xmm9
+	psrlq	$1,%xmm5
+	movdqa	%xmm15,%xmm10
+	psrlq	$1,%xmm15
+	pxor	%xmm3,%xmm5
+	pxor	%xmm0,%xmm15
+	pand	%xmm7,%xmm5
+	pand	%xmm7,%xmm15
+	pxor	%xmm5,%xmm3
+	psllq	$1,%xmm5
+	pxor	%xmm15,%xmm0
+	psllq	$1,%xmm15
+	pxor	%xmm9,%xmm5
+	pxor	%xmm10,%xmm15
+	movdqa	32(%r11),%xmm7
+	movdqa	%xmm6,%xmm9
+	psrlq	$2,%xmm6
+	movdqa	%xmm1,%xmm10
+	psrlq	$2,%xmm1
+	pxor	%xmm4,%xmm6
+	pxor	%xmm2,%xmm1
+	pand	%xmm8,%xmm6
+	pand	%xmm8,%xmm1
+	pxor	%xmm6,%xmm4
+	psllq	$2,%xmm6
+	pxor	%xmm1,%xmm2
+	psllq	$2,%xmm1
+	pxor	%xmm9,%xmm6
+	pxor	%xmm10,%xmm1
+	movdqa	%xmm0,%xmm9
+	psrlq	$2,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$2,%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm15
+	pand	%xmm8,%xmm0
+	pand	%xmm8,%xmm15
+	pxor	%xmm0,%xmm3
+	psllq	$2,%xmm0
+	pxor	%xmm15,%xmm5
+	psllq	$2,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	movdqa	%xmm3,%xmm9
+	psrlq	$4,%xmm3
+	movdqa	%xmm5,%xmm10
+	psrlq	$4,%xmm5
+	pxor	%xmm4,%xmm3
+	pxor	%xmm2,%xmm5
+	pand	%xmm7,%xmm3
+	pand	%xmm7,%xmm5
+	pxor	%xmm3,%xmm4
+	psllq	$4,%xmm3
+	pxor	%xmm5,%xmm2
+	psllq	$4,%xmm5
+	pxor	%xmm9,%xmm3
+	pxor	%xmm10,%xmm5
+	movdqa	%xmm0,%xmm9
+	psrlq	$4,%xmm0
+	movdqa	%xmm15,%xmm10
+	psrlq	$4,%xmm15
+	pxor	%xmm6,%xmm0
+	pxor	%xmm1,%xmm15
+	pand	%xmm7,%xmm0
+	pand	%xmm7,%xmm15
+	pxor	%xmm0,%xmm6
+	psllq	$4,%xmm0
+	pxor	%xmm15,%xmm1
+	psllq	$4,%xmm15
+	pxor	%xmm9,%xmm0
+	pxor	%xmm10,%xmm15
+	movdqa	(%rax),%xmm7
+	pxor	%xmm7,%xmm5
+	pxor	%xmm7,%xmm3
+	pxor	%xmm7,%xmm1
+	pxor	%xmm7,%xmm6
+	pxor	%xmm7,%xmm2
+	pxor	%xmm7,%xmm4
+	pxor	%xmm7,%xmm15
+	pxor	%xmm7,%xmm0
+	.byte	0xf3,0xc3
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
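+# _bsaes_key_convert: converts the standard AES key schedule at %rcx
+# into the bit-sliced form consumed by the eight-block routines
+# above, emitting 128 bytes per round at %rax; %r10d is the round
+# count.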
+.type	_bsaes_key_convert,@function
+.align	16
+_bsaes_key_convert:
+	leaq	.Lmasks(%rip),%r11
+	movdqu	(%rcx),%xmm7
+	leaq	16(%rcx),%rcx
+	movdqa	0(%r11),%xmm0
+	movdqa	16(%r11),%xmm1
+	movdqa	32(%r11),%xmm2
+	movdqa	48(%r11),%xmm3
+	movdqa	64(%r11),%xmm4
+	pcmpeqd	%xmm5,%xmm5
+
+	movdqu	(%rcx),%xmm6
+	movdqa	%xmm7,(%rax)
+	leaq	16(%rax),%rax
+	decl	%r10d
+	jmp	.Lkey_loop
+.align	16
+.Lkey_loop:
+.byte	102,15,56,0,244
+
+	movdqa	%xmm0,%xmm8
+	movdqa	%xmm1,%xmm9
+
+	pand	%xmm6,%xmm8
+	pand	%xmm6,%xmm9
+	movdqa	%xmm2,%xmm10
+	pcmpeqb	%xmm0,%xmm8
+	psllq	$4,%xmm0
+	movdqa	%xmm3,%xmm11
+	pcmpeqb	%xmm1,%xmm9
+	psllq	$4,%xmm1
+
+	pand	%xmm6,%xmm10
+	pand	%xmm6,%xmm11
+	movdqa	%xmm0,%xmm12
+	pcmpeqb	%xmm2,%xmm10
+	psllq	$4,%xmm2
+	movdqa	%xmm1,%xmm13
+	pcmpeqb	%xmm3,%xmm11
+	psllq	$4,%xmm3
+
+	movdqa	%xmm2,%xmm14
+	movdqa	%xmm3,%xmm15
+	pxor	%xmm5,%xmm8
+	pxor	%xmm5,%xmm9
+
+	pand	%xmm6,%xmm12
+	pand	%xmm6,%xmm13
+	movdqa	%xmm8,0(%rax)
+	pcmpeqb	%xmm0,%xmm12
+	psrlq	$4,%xmm0
+	movdqa	%xmm9,16(%rax)
+	pcmpeqb	%xmm1,%xmm13
+	psrlq	$4,%xmm1
+	leaq	16(%rcx),%rcx
+
+	pand	%xmm6,%xmm14
+	pand	%xmm6,%xmm15
+	movdqa	%xmm10,32(%rax)
+	pcmpeqb	%xmm2,%xmm14
+	psrlq	$4,%xmm2
+	movdqa	%xmm11,48(%rax)
+	pcmpeqb	%xmm3,%xmm15
+	psrlq	$4,%xmm3
+	movdqu	(%rcx),%xmm6
+
+	pxor	%xmm5,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm12,64(%rax)
+	movdqa	%xmm13,80(%rax)
+	movdqa	%xmm14,96(%rax)
+	movdqa	%xmm15,112(%rax)
+	leaq	128(%rax),%rax
+	decl	%r10d
+	jnz	.Lkey_loop
+
+	movdqa	80(%r11),%xmm7
+
+	.byte	0xf3,0xc3
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+
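+# bsaes_cbc_encrypt follows the usual AES_cbc_encrypt convention:
+#   void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+#                          size_t length, const AES_KEY *key,
+#                          unsigned char ivec[16], int enc);
+# Only bulk decryption is bit-sliced here; encryption (enc != 0) and
+# inputs shorter than 128 bytes are handed off to asm_AES_cbc_encrypt.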
+.globl	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,@function
+.align	16
+bsaes_cbc_encrypt:
+	cmpl	$0,%r9d
+	jne	asm_AES_cbc_encrypt
+	cmpq	$128,%rdx
+	jb	asm_AES_cbc_encrypt
+
+	movq	%rsp,%rax
+.Lcbc_dec_prologue:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	leaq	-72(%rsp),%rsp
+	movq	%rsp,%rbp
+	movl	240(%rcx),%eax
+	movq	%rdi,%r12
+	movq	%rsi,%r13
+	movq	%rdx,%r14
+	movq	%rcx,%r15
+	movq	%r8,%rbx
+	shrq	$4,%r14
+
+	movl	%eax,%edx
+	shlq	$7,%rax
+	subq	$96,%rax
+	subq	%rax,%rsp
+
+	movq	%rsp,%rax
+	movq	%r15,%rcx
+	movl	%edx,%r10d
+	call	_bsaes_key_convert
+	pxor	(%rsp),%xmm7
+	movdqa	%xmm6,(%rax)
+	movdqa	%xmm7,(%rsp)
+
+	movdqu	(%rbx),%xmm14
+	subq	$8,%r14
+.Lcbc_dec_loop:
+	movdqu	0(%r12),%xmm15
+	movdqu	16(%r12),%xmm0
+	movdqu	32(%r12),%xmm1
+	movdqu	48(%r12),%xmm2
+	movdqu	64(%r12),%xmm3
+	movdqu	80(%r12),%xmm4
+	movq	%rsp,%rax
+	movdqu	96(%r12),%xmm5
+	movl	%edx,%r10d
+	movdqu	112(%r12),%xmm6
+	movdqa	%xmm14,32(%rbp)
+
+	call	_bsaes_decrypt8
+
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm0
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm5
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm9,%xmm3
+	movdqu	64(%r12),%xmm11
+	pxor	%xmm10,%xmm1
+	movdqu	80(%r12),%xmm12
+	pxor	%xmm11,%xmm6
+	movdqu	96(%r12),%xmm13
+	pxor	%xmm12,%xmm2
+	movdqu	112(%r12),%xmm14
+	pxor	%xmm13,%xmm4
+	movdqu	%xmm15,0(%r13)
+	leaq	128(%r12),%r12
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	movdqu	%xmm3,48(%r13)
+	movdqu	%xmm1,64(%r13)
+	movdqu	%xmm6,80(%r13)
+	movdqu	%xmm2,96(%r13)
+	movdqu	%xmm4,112(%r13)
+	leaq	128(%r13),%r13
+	subq	$8,%r14
+	jnc	.Lcbc_dec_loop
+
+	addq	$8,%r14
+	jz	.Lcbc_dec_done
+
+	movdqu	0(%r12),%xmm15
+	movq	%rsp,%rax
+	movl	%edx,%r10d
+	cmpq	$2,%r14
+	jb	.Lcbc_dec_one
+	movdqu	16(%r12),%xmm0
+	je	.Lcbc_dec_two
+	movdqu	32(%r12),%xmm1
+	cmpq	$4,%r14
+	jb	.Lcbc_dec_three
+	movdqu	48(%r12),%xmm2
+	je	.Lcbc_dec_four
+	movdqu	64(%r12),%xmm3
+	cmpq	$6,%r14
+	jb	.Lcbc_dec_five
+	movdqu	80(%r12),%xmm4
+	je	.Lcbc_dec_six
+	movdqu	96(%r12),%xmm5
+	movdqa	%xmm14,32(%rbp)
+	call	_bsaes_decrypt8
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm0
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm5
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm9,%xmm3
+	movdqu	64(%r12),%xmm11
+	pxor	%xmm10,%xmm1
+	movdqu	80(%r12),%xmm12
+	pxor	%xmm11,%xmm6
+	movdqu	96(%r12),%xmm14
+	pxor	%xmm12,%xmm2
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	movdqu	%xmm3,48(%r13)
+	movdqu	%xmm1,64(%r13)
+	movdqu	%xmm6,80(%r13)
+	movdqu	%xmm2,96(%r13)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_six:
+	movdqa	%xmm14,32(%rbp)
+	call	_bsaes_decrypt8
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm0
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm5
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm9,%xmm3
+	movdqu	64(%r12),%xmm11
+	pxor	%xmm10,%xmm1
+	movdqu	80(%r12),%xmm14
+	pxor	%xmm11,%xmm6
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	movdqu	%xmm3,48(%r13)
+	movdqu	%xmm1,64(%r13)
+	movdqu	%xmm6,80(%r13)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_five:
+	movdqa	%xmm14,32(%rbp)
+	call	_bsaes_decrypt8
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm0
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm5
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm9,%xmm3
+	movdqu	64(%r12),%xmm14
+	pxor	%xmm10,%xmm1
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	movdqu	%xmm3,48(%r13)
+	movdqu	%xmm1,64(%r13)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_four:
+	movdqa	%xmm14,32(%rbp)
+	call	_bsaes_decrypt8
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm0
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm5
+	movdqu	48(%r12),%xmm14
+	pxor	%xmm9,%xmm3
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	movdqu	%xmm3,48(%r13)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_three:
+	movdqa	%xmm14,32(%rbp)
+	call	_bsaes_decrypt8
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm0
+	movdqu	32(%r12),%xmm14
+	pxor	%xmm8,%xmm5
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_two:
+	movdqa	%xmm14,32(%rbp)
+	call	_bsaes_decrypt8
+	pxor	32(%rbp),%xmm15
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm14
+	pxor	%xmm7,%xmm0
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_one:
+	leaq	(%r12),%rdi
+	leaq	32(%rbp),%rsi
+	leaq	(%r15),%rdx
+	call	asm_AES_decrypt		
+	pxor	32(%rbp),%xmm14
+	movdqu	%xmm14,(%r13)
+	movdqa	%xmm15,%xmm14
+
+.Lcbc_dec_done:
+	movdqu	%xmm14,(%rbx)
+	leaq	(%rsp),%rax
+	pxor	%xmm0,%xmm0
+.Lcbc_dec_bzero:
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm0,16(%rax)
+	leaq	32(%rax),%rax
+	cmpq	%rax,%rbp
+	ja	.Lcbc_dec_bzero
+
+	leaq	(%rbp),%rsp
+	movq	72(%rsp),%r15
+	movq	80(%rsp),%r14
+	movq	88(%rsp),%r13
+	movq	96(%rsp),%r12
+	movq	104(%rsp),%rbx
+	movq	112(%rsp),%rax
+	leaq	120(%rsp),%rsp
+	movq	%rax,%rbp
+.Lcbc_dec_epilogue:
+	.byte	0xf3,0xc3
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
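+# bsaes_ctr32_encrypt_blocks: CTR mode with a 32-bit big-endian
+# counter in the last four bytes of the IV,
+#   void bsaes_ctr32_encrypt_blocks(const unsigned char *in,
+#                                   unsigned char *out, size_t blocks,
+#                                   const AES_KEY *key,
+#                                   const unsigned char ivec[16]);
+# Eight blocks are keystreamed per iteration; anything shorter falls
+# through to the one-block-at-a-time .Lctr_enc_short path.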
+.globl	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,@function
+.align	16
+bsaes_ctr32_encrypt_blocks:
+	movq	%rsp,%rax
+.Lctr_enc_prologue:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	leaq	-72(%rsp),%rsp
+	movq	%rsp,%rbp
+	movdqu	(%r8),%xmm0
+	movl	240(%rcx),%eax
+	movq	%rdi,%r12
+	movq	%rsi,%r13
+	movq	%rdx,%r14
+	movq	%rcx,%r15
+	movdqa	%xmm0,32(%rbp)
+	cmpq	$8,%rdx
+	jb	.Lctr_enc_short
+
+	movl	%eax,%ebx
+	shlq	$7,%rax
+	subq	$96,%rax
+	subq	%rax,%rsp
+
+	movq	%rsp,%rax
+	movq	%r15,%rcx
+	movl	%ebx,%r10d
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7
+	movdqa	%xmm7,(%rax)
+
+	movdqa	(%rsp),%xmm8
+	leaq	.LADD1(%rip),%r11
+	movdqa	32(%rbp),%xmm15
+	movdqa	-32(%r11),%xmm7
+.byte	102,68,15,56,0,199
+.byte	102,68,15,56,0,255
+	movdqa	%xmm8,(%rsp)
+	jmp	.Lctr_enc_loop
+.align	16
+.Lctr_enc_loop:
+	movdqa	%xmm15,32(%rbp)
+	movdqa	%xmm15,%xmm0
+	movdqa	%xmm15,%xmm1
+	paddd	0(%r11),%xmm0
+	movdqa	%xmm15,%xmm2
+	paddd	16(%r11),%xmm1
+	movdqa	%xmm15,%xmm3
+	paddd	32(%r11),%xmm2
+	movdqa	%xmm15,%xmm4
+	paddd	48(%r11),%xmm3
+	movdqa	%xmm15,%xmm5
+	paddd	64(%r11),%xmm4
+	movdqa	%xmm15,%xmm6
+	paddd	80(%r11),%xmm5
+	paddd	96(%r11),%xmm6
+
+
+
+	movdqa	(%rsp),%xmm8
+	leaq	16(%rsp),%rax
+	movdqa	-16(%r11),%xmm7
+	pxor	%xmm8,%xmm15
+	pxor	%xmm8,%xmm0
+.byte	102,68,15,56,0,255
+	pxor	%xmm8,%xmm1
+.byte	102,15,56,0,199
+	pxor	%xmm8,%xmm2
+.byte	102,15,56,0,207
+	pxor	%xmm8,%xmm3
+.byte	102,15,56,0,215
+	pxor	%xmm8,%xmm4
+.byte	102,15,56,0,223
+	pxor	%xmm8,%xmm5
+.byte	102,15,56,0,231
+	pxor	%xmm8,%xmm6
+.byte	102,15,56,0,239
+	leaq	.LBS0(%rip),%r11
+.byte	102,15,56,0,247
+	movl	%ebx,%r10d
+
+	call	_bsaes_encrypt8_bitslice
+
+	subq	$8,%r14
+	jc	.Lctr_enc_loop_done
+
+	movdqu	0(%r12),%xmm7
+	movdqu	16(%r12),%xmm8
+	movdqu	32(%r12),%xmm9
+	movdqu	48(%r12),%xmm10
+	movdqu	64(%r12),%xmm11
+	movdqu	80(%r12),%xmm12
+	movdqu	96(%r12),%xmm13
+	movdqu	112(%r12),%xmm14
+	leaq	128(%r12),%r12
+	pxor	%xmm15,%xmm7
+	movdqa	32(%rbp),%xmm15
+	pxor	%xmm8,%xmm0
+	movdqu	%xmm7,0(%r13)
+	pxor	%xmm9,%xmm3
+	movdqu	%xmm0,16(%r13)
+	pxor	%xmm10,%xmm5
+	movdqu	%xmm3,32(%r13)
+	pxor	%xmm11,%xmm2
+	movdqu	%xmm5,48(%r13)
+	pxor	%xmm12,%xmm6
+	movdqu	%xmm2,64(%r13)
+	pxor	%xmm13,%xmm1
+	movdqu	%xmm6,80(%r13)
+	pxor	%xmm14,%xmm4
+	movdqu	%xmm1,96(%r13)
+	leaq	.LADD1(%rip),%r11
+	movdqu	%xmm4,112(%r13)
+	leaq	128(%r13),%r13
+	paddd	112(%r11),%xmm15
+	jnz	.Lctr_enc_loop
+
+	jmp	.Lctr_enc_done
+.align	16
+.Lctr_enc_loop_done:
+	addq	$8,%r14
+	movdqu	0(%r12),%xmm7
+	pxor	%xmm7,%xmm15
+	movdqu	%xmm15,0(%r13)
+	cmpq	$2,%r14
+	jb	.Lctr_enc_done
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm8,%xmm0
+	movdqu	%xmm0,16(%r13)
+	je	.Lctr_enc_done
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm9,%xmm3
+	movdqu	%xmm3,32(%r13)
+	cmpq	$4,%r14
+	jb	.Lctr_enc_done
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm10,%xmm5
+	movdqu	%xmm5,48(%r13)
+	je	.Lctr_enc_done
+	movdqu	64(%r12),%xmm11
+	pxor	%xmm11,%xmm2
+	movdqu	%xmm2,64(%r13)
+	cmpq	$6,%r14
+	jb	.Lctr_enc_done
+	movdqu	80(%r12),%xmm12
+	pxor	%xmm12,%xmm6
+	movdqu	%xmm6,80(%r13)
+	je	.Lctr_enc_done
+	movdqu	96(%r12),%xmm13
+	pxor	%xmm13,%xmm1
+	movdqu	%xmm1,96(%r13)
+	jmp	.Lctr_enc_done
+
+.align	16
+.Lctr_enc_short:
+	leaq	32(%rbp),%rdi
+	leaq	48(%rbp),%rsi
+	leaq	(%r15),%rdx
+	call	asm_AES_encrypt
+	movdqu	(%r12),%xmm0
+	leaq	16(%r12),%r12
+	movl	44(%rbp),%eax
+	bswapl	%eax
+	pxor	48(%rbp),%xmm0
+	incl	%eax
+	movdqu	%xmm0,(%r13)
+	bswapl	%eax
+	leaq	16(%r13),%r13
+	movl	%eax,44(%rsp)
+	decq	%r14
+	jnz	.Lctr_enc_short
+
+.Lctr_enc_done:
+	leaq	(%rsp),%rax
+	pxor	%xmm0,%xmm0
+.Lctr_enc_bzero:
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm0,16(%rax)
+	leaq	32(%rax),%rax
+	cmpq	%rax,%rbp
+	ja	.Lctr_enc_bzero
+
+	leaq	(%rbp),%rsp
+	movq	72(%rsp),%r15
+	movq	80(%rsp),%r14
+	movq	88(%rsp),%r13
+	movq	96(%rsp),%r12
+	movq	104(%rsp),%rbx
+	movq	112(%rsp),%rax
+	leaq	120(%rsp),%rsp
+	movq	%rax,%rbp
+.Lctr_enc_epilogue:
+	.byte	0xf3,0xc3
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
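+# bsaes_xts_encrypt: XTS-AES encryption,
+#   void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+#                          size_t len, const AES_KEY *key1,
+#                          const AES_KEY *key2, const unsigned char iv[16]);
+# The initial tweak is obtained by encrypting iv with key2 (the
+# asm_AES_encrypt call below), then multiplied by x in GF(2^128) via
+# .Lxts_magic between batches; a trailing partial block is handled by
+# ciphertext stealing at .Lxts_enc_steal.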
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,@function
+.align	16
+bsaes_xts_encrypt:
+	movq	%rsp,%rax
+.Lxts_enc_prologue:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	leaq	-72(%rsp),%rsp
+	movq	%rsp,%rbp
+	movq	%rdi,%r12
+	movq	%rsi,%r13
+	movq	%rdx,%r14
+	movq	%rcx,%r15
+
+	leaq	(%r9),%rdi
+	leaq	32(%rbp),%rsi
+	leaq	(%r8),%rdx
+	call	asm_AES_encrypt		
+
+	movl	240(%r15),%eax
+	movq	%r14,%rbx
+
+	movl	%eax,%edx
+	shlq	$7,%rax
+	subq	$96,%rax
+	subq	%rax,%rsp
+
+	movq	%rsp,%rax
+	movq	%r15,%rcx
+	movl	%edx,%r10d
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7
+	movdqa	%xmm7,(%rax)
+
+	andq	$-16,%r14
+	subq	$128,%rsp
+	movdqa	32(%rbp),%xmm6
+
+	pxor	%xmm14,%xmm14
+	movdqa	.Lxts_magic(%rip),%xmm12
+	pcmpgtd	%xmm6,%xmm14
+
+	subq	$128,%r14
+	jc	.Lxts_enc_short
+	jmp	.Lxts_enc_loop
+
+.align	16
+.Lxts_enc_loop:
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm15
+	movdqa	%xmm6,0(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm6,16(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	0(%r12),%xmm7
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm1
+	movdqa	%xmm6,32(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm15
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm6,48(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm0
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm3
+	movdqa	%xmm6,64(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm9,%xmm1
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm6,80(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	64(%r12),%xmm11
+	pxor	%xmm10,%xmm2
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm5
+	movdqa	%xmm6,96(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	80(%r12),%xmm12
+	pxor	%xmm11,%xmm3
+	movdqu	96(%r12),%xmm13
+	pxor	%xmm12,%xmm4
+	movdqu	112(%r12),%xmm14
+	leaq	128(%r12),%r12
+	movdqa	%xmm6,112(%rsp)
+	pxor	%xmm13,%xmm5
+	leaq	128(%rsp),%rax
+	pxor	%xmm14,%xmm6
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm3
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm5
+	movdqu	%xmm3,32(%r13)
+	pxor	64(%rsp),%xmm2
+	movdqu	%xmm5,48(%r13)
+	pxor	80(%rsp),%xmm6
+	movdqu	%xmm2,64(%r13)
+	pxor	96(%rsp),%xmm1
+	movdqu	%xmm6,80(%r13)
+	pxor	112(%rsp),%xmm4
+	movdqu	%xmm1,96(%r13)
+	movdqu	%xmm4,112(%r13)
+	leaq	128(%r13),%r13
+
+	movdqa	112(%rsp),%xmm6
+	pxor	%xmm14,%xmm14
+	movdqa	.Lxts_magic(%rip),%xmm12
+	pcmpgtd	%xmm6,%xmm14
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+
+	subq	$128,%r14
+	jnc	.Lxts_enc_loop
+
+.Lxts_enc_short:
+	addq	$128,%r14
+	jz	.Lxts_enc_done
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm15
+	movdqa	%xmm6,0(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm6,16(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	0(%r12),%xmm7
+	cmpq	$16,%r14
+	je	.Lxts_enc_1
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm1
+	movdqa	%xmm6,32(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	16(%r12),%xmm8
+	cmpq	$32,%r14
+	je	.Lxts_enc_2
+	pxor	%xmm7,%xmm15
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm6,48(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	32(%r12),%xmm9
+	cmpq	$48,%r14
+	je	.Lxts_enc_3
+	pxor	%xmm8,%xmm0
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm3
+	movdqa	%xmm6,64(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	48(%r12),%xmm10
+	cmpq	$64,%r14
+	je	.Lxts_enc_4
+	pxor	%xmm9,%xmm1
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm6,80(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	64(%r12),%xmm11
+	cmpq	$80,%r14
+	je	.Lxts_enc_5
+	pxor	%xmm10,%xmm2
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm5
+	movdqa	%xmm6,96(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	80(%r12),%xmm12
+	cmpq	$96,%r14
+	je	.Lxts_enc_6
+	pxor	%xmm11,%xmm3
+	movdqu	96(%r12),%xmm13
+	pxor	%xmm12,%xmm4
+	movdqa	%xmm6,112(%rsp)
+	leaq	112(%r12),%r12
+	pxor	%xmm13,%xmm5
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm3
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm5
+	movdqu	%xmm3,32(%r13)
+	pxor	64(%rsp),%xmm2
+	movdqu	%xmm5,48(%r13)
+	pxor	80(%rsp),%xmm6
+	movdqu	%xmm2,64(%r13)
+	pxor	96(%rsp),%xmm1
+	movdqu	%xmm6,80(%r13)
+	movdqu	%xmm1,96(%r13)
+	leaq	112(%r13),%r13
+
+	movdqa	112(%rsp),%xmm6
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_6:
+	pxor	%xmm11,%xmm3
+	leaq	96(%r12),%r12
+	pxor	%xmm12,%xmm4
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm3
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm5
+	movdqu	%xmm3,32(%r13)
+	pxor	64(%rsp),%xmm2
+	movdqu	%xmm5,48(%r13)
+	pxor	80(%rsp),%xmm6
+	movdqu	%xmm2,64(%r13)
+	movdqu	%xmm6,80(%r13)
+	leaq	96(%r13),%r13
+
+	movdqa	96(%rsp),%xmm6
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_5:
+	pxor	%xmm10,%xmm2
+	leaq	80(%r12),%r12
+	pxor	%xmm11,%xmm3
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm3
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm5
+	movdqu	%xmm3,32(%r13)
+	pxor	64(%rsp),%xmm2
+	movdqu	%xmm5,48(%r13)
+	movdqu	%xmm2,64(%r13)
+	leaq	80(%r13),%r13
+
+	movdqa	80(%rsp),%xmm6
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_4:
+	pxor	%xmm9,%xmm1
+	leaq	64(%r12),%r12
+	pxor	%xmm10,%xmm2
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm3
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm5
+	movdqu	%xmm3,32(%r13)
+	movdqu	%xmm5,48(%r13)
+	leaq	64(%r13),%r13
+
+	movdqa	64(%rsp),%xmm6
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_3:
+	pxor	%xmm8,%xmm0
+	leaq	48(%r12),%r12
+	pxor	%xmm9,%xmm1
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm3
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm3,32(%r13)
+	leaq	48(%r13),%r13
+
+	movdqa	48(%rsp),%xmm6
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_2:
+	pxor	%xmm7,%xmm15
+	leaq	32(%r12),%r12
+	pxor	%xmm8,%xmm0
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_encrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	leaq	32(%r13),%r13
+
+	movdqa	32(%rsp),%xmm6
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_1:
+	pxor	%xmm15,%xmm7
+	leaq	16(%r12),%r12
+	movdqa	%xmm7,32(%rbp)
+	leaq	32(%rbp),%rdi
+	leaq	32(%rbp),%rsi
+	leaq	(%r15),%rdx
+	call	asm_AES_encrypt		
+	pxor	32(%rbp),%xmm15
+
+
+
+
+
+	movdqu	%xmm15,0(%r13)
+	leaq	16(%r13),%r13
+
+	movdqa	16(%rsp),%xmm6
+
+.Lxts_enc_done:
+	andl	$15,%ebx
+	jz	.Lxts_enc_ret
+	movq	%r13,%rdx
+
+.Lxts_enc_steal:
+	movzbl	(%r12),%eax
+	movzbl	-16(%rdx),%ecx
+	leaq	1(%r12),%r12
+	movb	%al,-16(%rdx)
+	movb	%cl,0(%rdx)
+	leaq	1(%rdx),%rdx
+	subl	$1,%ebx
+	jnz	.Lxts_enc_steal
+
+	movdqu	-16(%r13),%xmm15
+	leaq	32(%rbp),%rdi
+	pxor	%xmm6,%xmm15
+	leaq	32(%rbp),%rsi
+	movdqa	%xmm15,32(%rbp)
+	leaq	(%r15),%rdx
+	call	asm_AES_encrypt		
+	pxor	32(%rbp),%xmm6
+	movdqu	%xmm6,-16(%r13)
+
+.Lxts_enc_ret:
+	leaq	(%rsp),%rax
+	pxor	%xmm0,%xmm0
+.Lxts_enc_bzero:
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm0,16(%rax)
+	leaq	32(%rax),%rax
+	cmpq	%rax,%rbp
+	ja	.Lxts_enc_bzero
+
+	leaq	(%rbp),%rsp
+	movq	72(%rsp),%r15
+	movq	80(%rsp),%r14
+	movq	88(%rsp),%r13
+	movq	96(%rsp),%r12
+	movq	104(%rsp),%rbx
+	movq	112(%rsp),%rax
+	leaq	120(%rsp),%rsp
+	movq	%rax,%rbp
+.Lxts_enc_epilogue:
+	.byte	0xf3,0xc3
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
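+# bsaes_xts_decrypt: decryption analogue of bsaes_xts_encrypt with
+# the same argument layout.  The extra tweak juggling before
+# .Lxts_dec_steal exists because ciphertext stealing consumes the
+# last two tweaks out of order on the decrypt side.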
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,@function
+.align	16
+bsaes_xts_decrypt:
+	movq	%rsp,%rax
+.Lxts_dec_prologue:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	leaq	-72(%rsp),%rsp
+	movq	%rsp,%rbp
+	movq	%rdi,%r12
+	movq	%rsi,%r13
+	movq	%rdx,%r14
+	movq	%rcx,%r15
+
+	leaq	(%r9),%rdi
+	leaq	32(%rbp),%rsi
+	leaq	(%r8),%rdx
+	call	asm_AES_encrypt		
+
+	movl	240(%r15),%eax
+	movq	%r14,%rbx
+
+	movl	%eax,%edx
+	shlq	$7,%rax
+	subq	$96,%rax
+	subq	%rax,%rsp
+
+	movq	%rsp,%rax
+	movq	%r15,%rcx
+	movl	%edx,%r10d
+	call	_bsaes_key_convert
+	pxor	(%rsp),%xmm7
+	movdqa	%xmm6,(%rax)
+	movdqa	%xmm7,(%rsp)
+
+	xorl	%eax,%eax
+	andq	$-16,%r14
+	testl	$15,%ebx
+	setnz	%al
+	shlq	$4,%rax
+	subq	%rax,%r14
+
+	subq	$128,%rsp
+	movdqa	32(%rbp),%xmm6
+
+	pxor	%xmm14,%xmm14
+	movdqa	.Lxts_magic(%rip),%xmm12
+	pcmpgtd	%xmm6,%xmm14
+
+	subq	$128,%r14
+	jc	.Lxts_dec_short
+	jmp	.Lxts_dec_loop
+
+.align	16
+.Lxts_dec_loop:
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm15
+	movdqa	%xmm6,0(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm6,16(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	0(%r12),%xmm7
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm1
+	movdqa	%xmm6,32(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	16(%r12),%xmm8
+	pxor	%xmm7,%xmm15
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm6,48(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	32(%r12),%xmm9
+	pxor	%xmm8,%xmm0
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm3
+	movdqa	%xmm6,64(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	48(%r12),%xmm10
+	pxor	%xmm9,%xmm1
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm6,80(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	64(%r12),%xmm11
+	pxor	%xmm10,%xmm2
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm5
+	movdqa	%xmm6,96(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	80(%r12),%xmm12
+	pxor	%xmm11,%xmm3
+	movdqu	96(%r12),%xmm13
+	pxor	%xmm12,%xmm4
+	movdqu	112(%r12),%xmm14
+	leaq	128(%r12),%r12
+	movdqa	%xmm6,112(%rsp)
+	pxor	%xmm13,%xmm5
+	leaq	128(%rsp),%rax
+	pxor	%xmm14,%xmm6
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm5
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm3
+	movdqu	%xmm5,32(%r13)
+	pxor	64(%rsp),%xmm1
+	movdqu	%xmm3,48(%r13)
+	pxor	80(%rsp),%xmm6
+	movdqu	%xmm1,64(%r13)
+	pxor	96(%rsp),%xmm2
+	movdqu	%xmm6,80(%r13)
+	pxor	112(%rsp),%xmm4
+	movdqu	%xmm2,96(%r13)
+	movdqu	%xmm4,112(%r13)
+	leaq	128(%r13),%r13
+
+	movdqa	112(%rsp),%xmm6
+	pxor	%xmm14,%xmm14
+	movdqa	.Lxts_magic(%rip),%xmm12
+	pcmpgtd	%xmm6,%xmm14
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+
+	subq	$128,%r14
+	jnc	.Lxts_dec_loop
+
+.Lxts_dec_short:
+	addq	$128,%r14
+	jz	.Lxts_dec_done
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm15
+	movdqa	%xmm6,0(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm6,16(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	0(%r12),%xmm7
+	cmpq	$16,%r14
+	je	.Lxts_dec_1
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm1
+	movdqa	%xmm6,32(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	16(%r12),%xmm8
+	cmpq	$32,%r14
+	je	.Lxts_dec_2
+	pxor	%xmm7,%xmm15
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm6,48(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	32(%r12),%xmm9
+	cmpq	$48,%r14
+	je	.Lxts_dec_3
+	pxor	%xmm8,%xmm0
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm3
+	movdqa	%xmm6,64(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	48(%r12),%xmm10
+	cmpq	$64,%r14
+	je	.Lxts_dec_4
+	pxor	%xmm9,%xmm1
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm6,80(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	64(%r12),%xmm11
+	cmpq	$80,%r14
+	je	.Lxts_dec_5
+	pxor	%xmm10,%xmm2
+	pshufd	$19,%xmm14,%xmm13
+	pxor	%xmm14,%xmm14
+	movdqa	%xmm6,%xmm5
+	movdqa	%xmm6,96(%rsp)
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	pcmpgtd	%xmm6,%xmm14
+	pxor	%xmm13,%xmm6
+	movdqu	80(%r12),%xmm12
+	cmpq	$96,%r14
+	je	.Lxts_dec_6
+	pxor	%xmm11,%xmm3
+	movdqu	96(%r12),%xmm13
+	pxor	%xmm12,%xmm4
+	movdqa	%xmm6,112(%rsp)
+	leaq	112(%r12),%r12
+	pxor	%xmm13,%xmm5
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm5
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm3
+	movdqu	%xmm5,32(%r13)
+	pxor	64(%rsp),%xmm1
+	movdqu	%xmm3,48(%r13)
+	pxor	80(%rsp),%xmm6
+	movdqu	%xmm1,64(%r13)
+	pxor	96(%rsp),%xmm2
+	movdqu	%xmm6,80(%r13)
+	movdqu	%xmm2,96(%r13)
+	leaq	112(%r13),%r13
+
+	movdqa	112(%rsp),%xmm6
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_6:
+	pxor	%xmm11,%xmm3
+	leaq	96(%r12),%r12
+	pxor	%xmm12,%xmm4
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm5
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm3
+	movdqu	%xmm5,32(%r13)
+	pxor	64(%rsp),%xmm1
+	movdqu	%xmm3,48(%r13)
+	pxor	80(%rsp),%xmm6
+	movdqu	%xmm1,64(%r13)
+	movdqu	%xmm6,80(%r13)
+	leaq	96(%r13),%r13
+
+	movdqa	96(%rsp),%xmm6
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_5:
+	pxor	%xmm10,%xmm2
+	leaq	80(%r12),%r12
+	pxor	%xmm11,%xmm3
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm5
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm3
+	movdqu	%xmm5,32(%r13)
+	pxor	64(%rsp),%xmm1
+	movdqu	%xmm3,48(%r13)
+	movdqu	%xmm1,64(%r13)
+	leaq	80(%r13),%r13
+
+	movdqa	80(%rsp),%xmm6
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_4:
+	pxor	%xmm9,%xmm1
+	leaq	64(%r12),%r12
+	pxor	%xmm10,%xmm2
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm5
+	movdqu	%xmm0,16(%r13)
+	pxor	48(%rsp),%xmm3
+	movdqu	%xmm5,32(%r13)
+	movdqu	%xmm3,48(%r13)
+	leaq	64(%r13),%r13
+
+	movdqa	64(%rsp),%xmm6
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_3:
+	pxor	%xmm8,%xmm0
+	leaq	48(%r12),%r12
+	pxor	%xmm9,%xmm1
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	pxor	32(%rsp),%xmm5
+	movdqu	%xmm0,16(%r13)
+	movdqu	%xmm5,32(%r13)
+	leaq	48(%r13),%r13
+
+	movdqa	48(%rsp),%xmm6
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_2:
+	pxor	%xmm7,%xmm15
+	leaq	32(%r12),%r12
+	pxor	%xmm8,%xmm0
+	leaq	128(%rsp),%rax
+	movl	%edx,%r10d
+
+	call	_bsaes_decrypt8
+
+	pxor	0(%rsp),%xmm15
+	pxor	16(%rsp),%xmm0
+	movdqu	%xmm15,0(%r13)
+	movdqu	%xmm0,16(%r13)
+	leaq	32(%r13),%r13
+
+	movdqa	32(%rsp),%xmm6
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_1:
+	pxor	%xmm15,%xmm7
+	leaq	16(%r12),%r12
+	movdqa	%xmm7,32(%rbp)
+	leaq	32(%rbp),%rdi
+	leaq	32(%rbp),%rsi
+	leaq	(%r15),%rdx
+	call	asm_AES_decrypt		
+	pxor	32(%rbp),%xmm15
+
+
+
+
+
+	movdqu	%xmm15,0(%r13)
+	leaq	16(%r13),%r13
+
+	movdqa	16(%rsp),%xmm6
+
+.Lxts_dec_done:
+	andl	$15,%ebx
+	jz	.Lxts_dec_ret
+
+	pxor	%xmm14,%xmm14
+	movdqa	.Lxts_magic(%rip),%xmm12
+	pcmpgtd	%xmm6,%xmm14
+	pshufd	$19,%xmm14,%xmm13
+	movdqa	%xmm6,%xmm5
+	paddq	%xmm6,%xmm6
+	pand	%xmm12,%xmm13
+	movdqu	(%r12),%xmm15
+	pxor	%xmm13,%xmm6
+
+	leaq	32(%rbp),%rdi
+	pxor	%xmm6,%xmm15
+	leaq	32(%rbp),%rsi
+	movdqa	%xmm15,32(%rbp)
+	leaq	(%r15),%rdx
+	call	asm_AES_decrypt		
+	pxor	32(%rbp),%xmm6
+	movq	%r13,%rdx
+	movdqu	%xmm6,(%r13)
+
+.Lxts_dec_steal:
+	movzbl	16(%r12),%eax
+	movzbl	(%rdx),%ecx
+	leaq	1(%r12),%r12
+	movb	%al,(%rdx)
+	movb	%cl,16(%rdx)
+	leaq	1(%rdx),%rdx
+	subl	$1,%ebx
+	jnz	.Lxts_dec_steal
+
+	movdqu	(%r13),%xmm15
+	leaq	32(%rbp),%rdi
+	pxor	%xmm5,%xmm15
+	leaq	32(%rbp),%rsi
+	movdqa	%xmm15,32(%rbp)
+	leaq	(%r15),%rdx
+	call	asm_AES_decrypt		
+	pxor	32(%rbp),%xmm5
+	movdqu	%xmm5,(%r13)
+
+.Lxts_dec_ret:
+	leaq	(%rsp),%rax
+	pxor	%xmm0,%xmm0
+.Lxts_dec_bzero:
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm0,16(%rax)
+	leaq	32(%rax),%rax
+	cmpq	%rax,%rbp
+	ja	.Lxts_dec_bzero
+
+	leaq	(%rbp),%rsp
+	movq	72(%rsp),%r15
+	movq	80(%rsp),%r14
+	movq	88(%rsp),%r13
+	movq	96(%rsp),%r12
+	movq	104(%rsp),%rbx
+	movq	112(%rsp),%rax
+	leaq	120(%rsp),%rsp
+	movq	%rax,%rbp
+.Lxts_dec_epilogue:
+	.byte	0xf3,0xc3
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
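+# Constant pool: .LBS0/.LBS1/.LBS2 are the 0x55.../0x33.../0x0f...
+# masks driving the swap-move bit-interleaving above, the
+# .LM0*/.LISR*/.LSR* quads look to be pshufb masks for the (inverse)
+# ShiftRows variants, .LADD1-.LADD8 are the CTR counter increments,
+# .Lxts_magic is the GF(2^128) reduction constant, and
+# .Lmasks/.LM0/.L63 serve _bsaes_key_convert.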
+.type	_bsaes_const,@object
+.align	64
+_bsaes_const:
+.LM0ISR:
+.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:
+.quad	0x5555555555555555, 0x5555555555555555
+.LBS1:
+.quad	0x3333333333333333, 0x3333333333333333
+.LBS2:
+.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:
+.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:
+.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+.quad	0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:
+.quad	0x0000000000000000, 0x0000000100000000
+.LADD2:
+.quad	0x0000000000000000, 0x0000000200000000
+.LADD3:
+.quad	0x0000000000000000, 0x0000000300000000
+.LADD4:
+.quad	0x0000000000000000, 0x0000000400000000
+.LADD5:
+.quad	0x0000000000000000, 0x0000000500000000
+.LADD6:
+.quad	0x0000000000000000, 0x0000000600000000
+.LADD7:
+.quad	0x0000000000000000, 0x0000000700000000
+.LADD8:
+.quad	0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+.long	0x87,0,1,0
+.Lmasks:
+.quad	0x0101010101010101, 0x0101010101010101
+.quad	0x0202020202020202, 0x0202020202020202
+.quad	0x0404040404040404, 0x0404040404040404
+.quad	0x0808080808080808, 0x0808080808080808
+.LM0:
+.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+.quad	0x6363636363636363, 0x6363636363636363
+.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.align	64
+.size	_bsaes_const,.-_bsaes_const
diff --git a/crypto/aes/asm/vpaes-x86_64.S b/crypto/aes/asm/vpaes-x86_64.S
new file mode 100644
index 0000000..2b68e61
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86_64.S
@@ -0,0 +1,828 @@
+.text	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
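+# vpaes: constant-time "vector permutation" AES after Mike Hamburg's
+# SSSE3 construction.  _vpaes_encrypt_core encrypts the single block
+# in %xmm0 using the schedule at %rdx and assumes the table registers
+# %xmm9-%xmm15 were loaded by _vpaes_preheat.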
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_ipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movdqu	(%r9),%xmm5
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm5,%xmm2
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align	16
+.Lenc_loop:
+
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm15,%xmm5
+.byte	102,15,56,0,234
+	movdqa	-64(%r11,%r10,1),%xmm1
+	movdqa	%xmm14,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm5,%xmm2
+	movdqa	(%r11,%r10,1),%xmm4
+	movdqa	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andq	$48,%r11
+	pxor	%xmm3,%xmm0
+	subq	$1,%rax
+
+.Lenc_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm5
+.byte	102,15,56,0,232
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+	movdqu	(%r9),%xmm5
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	jnz	.Lenc_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%r11,%r10,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	.byte	0xf3,0xc3
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
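+# _vpaes_decrypt_core: decrypts the block in %xmm0 with the schedule
+# at %rdx, walking the inverse-round tables at .Lk_dsbd and onward;
+# same _vpaes_preheat requirement as the encrypt core.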
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+	movq	%rdx,%r9
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_dipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movq	%rax,%r11
+	psrld	$4,%xmm1
+	movdqu	(%r9),%xmm5
+	shlq	$4,%r11
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_dipt+16(%rip),%xmm0
+	xorq	$48,%r11
+	leaq	.Lk_dsbd(%rip),%r10
+.byte	102,15,56,0,193
+	andq	$48,%r11
+	pxor	%xmm5,%xmm2
+	movdqa	.Lk_mc_forward+48(%rip),%xmm5
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	addq	%r10,%r11
+	jmp	.Ldec_entry
+
+.align	16
+.Ldec_loop:
+
+
+
+	movdqa	-32(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	-16(%r10),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+	addq	$16,%r9
+
+.byte	102,15,56,0,197
+	movdqa	0(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	16(%r10),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+	subq	$1,%rax
+
+.byte	102,15,56,0,197
+	movdqa	32(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	48(%r10),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+.byte	102,15,56,0,197
+	movdqa	64(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	80(%r10),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+.byte	102,15,58,15,237,12
+
+.Ldec_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqu	(%r9),%xmm0
+	jnz	.Ldec_loop
+
+
+	movdqa	96(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%r10),%xmm0
+	movdqa	-352(%r11),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	.byte	0xf3,0xc3
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
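+# _vpaes_schedule_core: shared key-schedule generator.  Judging by
+# the entry points further down: %rdi = user key, %esi = key length
+# in bits, %rdx = output schedule, %rcx = direction (zero for
+# encryption, nonzero for decryption), %r8 = initial .Lk_sr rotation
+# offset.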
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+
+
+
+
+
+	call	_vpaes_preheat		
+	movdqa	.Lk_rcon(%rip),%xmm8
+	movdqu	(%rdi),%xmm0
+
+
+	movdqa	%xmm0,%xmm3
+	leaq	.Lk_ipt(%rip),%r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+
+	leaq	.Lk_sr(%rip),%r10
+	testq	%rcx,%rcx
+	jnz	.Lschedule_am_decrypting
+
+
+	movdqu	%xmm0,(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%rdx)
+	xorq	$48,%r8
+
+.Lschedule_go:
+	cmpl	$192,%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+	movl	$10,%esi
+
+.Loop_schedule_128:
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	
+	jmp	.Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0
+	call	_vpaes_schedule_transform	
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0
+	call	_vpaes_schedule_transform	
+	movl	$7,%esi
+
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle	
+	movdqa	%xmm0,%xmm6
+
+
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	
+
+
+	pshufd	$255,%xmm0,%xmm0
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm6,%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,%xmm7
+
+	jmp	.Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_mangle_last:
+
+	leaq	.Lk_deskew(%rip),%r11
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,193
+	leaq	.Lk_opt(%rip),%r11
+	addq	$32,%rdx
+
+.Lschedule_mangle_last_dec:
+	addq	$-16,%rdx
+	pxor	.Lk_s63(%rip),%xmm0
+	call	_vpaes_schedule_transform 
+	movdqu	%xmm0,(%rdx)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
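+# _vpaes_schedule_192_smear: smears the half-width 192-bit schedule
+# state, folding pieces of %xmm7 and the low half of %xmm6 back into
+# %xmm6 and clearing its high quadword for the next round.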
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	$128,%xmm6,%xmm0
+	pxor	%xmm0,%xmm6
+	pshufd	$254,%xmm7,%xmm0
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	pxor	%xmm1,%xmm1
+	movhlps	%xmm1,%xmm6
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
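+# _vpaes_schedule_round: one full key-schedule round.  The palignr
+# .byte sequences rotate the round constant out of %xmm8 into %xmm7,
+# the pshufd $255/palignr $1 pair performs RotWord on the last word
+# of %xmm0, and execution falls through into the SubBytes/smear logic
+# at _vpaes_schedule_low_round (which AES-256 also calls directly for
+# its extra half-rounds).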
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+
+	pxor	%xmm1,%xmm1
+.byte	102,65,15,58,15,200,15
+.byte	102,69,15,58,15,192,15
+	pxor	%xmm1,%xmm7
+
+
+	pshufd	$255,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	.Lk_s63(%rip),%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
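+# _vpaes_schedule_transform: applies a linear basis change to %xmm0
+# via the lo/hi lookup pair at (%r11) and 16(%r11), with %xmm9 as the
+# 0x0f mask splitting each byte into nibbles.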
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	(%r11),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%r11),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
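+# _vpaes_schedule_mangle: derives the next stored round key from
+# %xmm0 and writes it to (%rdx).  The forward path smears with
+# .Lk_s63 and the .Lk_mc_forward shuffle; the decrypting path instead
+# runs the key through the inverse tables beginning at .Lk_dksd, and
+# both finish with the .Lk_sr rotation tracked in %r8.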
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,%xmm4
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_dec
+
+
+	addq	$16,%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+
+	leaq	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm4
+
+	movdqa	0(%r11),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	32(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	64(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	96(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+
+	addq	$-16,%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	addq	$-16,%r8
+	andq	$48,%r8
+	movdqu	%xmm3,(%rdx)
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
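+/*
+ * int vpaes_set_encrypt_key(const unsigned char *key, int bits,
+ *                           AES_KEY *out);
+ * Stores the round count, bits/32 + 5, at 240(%rdx) and runs
+ * _vpaes_schedule_core with %rcx = 0 (encrypt direction). Always
+ * returns 0.
+ */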
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+
+	movl	$0,%ecx
+	movl	$48,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
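+/*
+ * vpaes_set_decrypt_key: same arguments as vpaes_set_encrypt_key,
+ * but %rdx is advanced to the end of the schedule (decrypt round
+ * keys are generated in reverse) and %rcx = 1 selects the decrypt
+ * paths in _vpaes_schedule_core; %r8 seeds the .Lk_sr index from the
+ * key size. Always returns 0.
+ */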
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+	shll	$4,%eax
+	leaq	16(%rdx,%rax,1),%rdx
+
+	movl	$1,%ecx
+	movl	%esi,%r8d
+	shrl	$1,%r8d
+	andl	$32,%r8d
+	xorl	$32,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
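+/*
+ * void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+ *                    const AES_KEY *key);
+ * One block: load, preheat the table registers, run the SSSE3
+ * encrypt core, store.
+ */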
+.globl	vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.size	vpaes_encrypt,.-vpaes_encrypt
+
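+/*
+ * vpaes_decrypt: decrypt-side twin of vpaes_encrypt; one block
+ * through _vpaes_decrypt_core.
+ */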
+.globl	vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.size	vpaes_decrypt,.-vpaes_decrypt
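+/*
+ * void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ *                        size_t len, const AES_KEY *key,
+ *                        unsigned char *ivec, int enc);
+ * CBC mode, one block per iteration. The chaining value lives in
+ * %xmm6 (%xmm7 keeps the saved ciphertext on the decrypt path) and
+ * is written back to (%r8) when done; lengths below 16 bytes bail
+ * out through .Lcbc_abort.
+ */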
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+	xchgq	%rcx,%rdx
+	subq	$16,%rcx
+	jc	.Lcbc_abort
+	movdqu	(%r8),%xmm6
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	cmpl	$0,%r9d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	(%rdi),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	(%rdi),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,(%r8)
+.Lcbc_abort:
+	.byte	0xf3,0xc3
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
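+/*
+ * _vpaes_preheat: caches the hot constants in registers for the
+ * encrypt/decrypt cores: %xmm9 = .Lk_s0F nibble mask, %xmm10/%xmm11 =
+ * .Lk_inv, %xmm13/%xmm12 = .Lk_sb1, %xmm15/%xmm14 = .Lk_sb2.
+ */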
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+	leaq	.Lk_s0F(%rip),%r10
+	movdqa	-32(%r10),%xmm10
+	movdqa	-16(%r10),%xmm11
+	movdqa	0(%r10),%xmm9
+	movdqa	48(%r10),%xmm13
+	movdqa	64(%r10),%xmm12
+	movdqa	80(%r10),%xmm15
+	movdqa	96(%r10),%xmm14
+	.byte	0xf3,0xc3
+.size	_vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
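+/*
+ * _vpaes_consts: the shared lookup tables: nibble-inversion tables,
+ * input/output transforms, MixColumns rotations (.Lk_mc_forward/
+ * .Lk_mc_backward), ShiftRows permutations (.Lk_sr), the round
+ * constants, and the decrypt key-schedule and s-box tables. The
+ * trailing .byte string is the module's identification banner.
+ */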
+.type	_vpaes_consts,@object
+.align	64
+_vpaes_consts:
+.Lk_inv:
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
diff --git a/crypto/bn/asm/modexp512-x86_64.S b/crypto/bn/asm/modexp512-x86_64.S
new file mode 100644
index 0000000..6cccafb
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.S
@@ -0,0 +1,1773 @@
+.text	
+
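+/*
+ * MULADD_128x512: multiply-accumulate building block for the 512-bit
+ * modular exponentiation. Multiplies the 8-qword operand at (%rsi)
+ * first by the multiplier word passed in %rbp and then by 8(%rdi),
+ * accumulating into %r8-%r15 with the carry chained through %rbx;
+ * the two completed low words are stored at 0(%rcx) and 8(%rcx).
+ */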
+.type	MULADD_128x512,@function
+.align	16
+MULADD_128x512:
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	movq	%r8,0(%rcx)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+	movq	8(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	movq	%r9,8(%rcx)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+	.byte	0xf3,0xc3
+.size	MULADD_128x512,.-MULADD_128x512
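+/*
+ * mont_reduce: Montgomery reduction of the 1024-bit intermediate in
+ * the shared stack frame back down to 512 bits: an inlined
+ * multiply-accumulate pass plus two MULADD_128x512 calls, followed
+ * by a branch-free conditional subtract of the modulus (the
+ * sbbq/andq mask in %rsi). The reduced value is stored through the
+ * pointer kept at 144(%rsp).
+ */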
+.type	mont_reduce,@function
+.align	16
+mont_reduce:
+	leaq	192(%rsp),%rdi
+	movq	32(%rsp),%rsi
+	addq	$576,%rsi
+	leaq	520(%rsp),%rcx
+
+	movq	96(%rcx),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	movq	(%rcx),%r8
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	movq	%r8,0(%rdi)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	movq	8(%rcx),%r9
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	movq	16(%rcx),%r10
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	movq	24(%rcx),%r11
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	movq	32(%rcx),%r12
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	movq	40(%rcx),%r13
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	movq	48(%rcx),%r14
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	movq	56(%rcx),%r15
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+	movq	104(%rcx),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	movq	%r9,8(%rdi)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+	movq	112(%rcx),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	movq	%r10,16(%rdi)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	120(%rcx),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	movq	%r11,24(%rdi)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	xorq	%rax,%rax
+
+	addq	64(%rcx),%r8
+	adcq	72(%rcx),%r9
+	adcq	80(%rcx),%r10
+	adcq	88(%rcx),%r11
+	adcq	$0,%rax
+
+
+
+
+	movq	%r8,64(%rdi)
+	movq	%r9,72(%rdi)
+	movq	%r10,%rbp
+	movq	%r11,88(%rdi)
+
+	movq	%rax,384(%rsp)
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+
+
+
+
+
+
+
+
+	addq	$80,%rdi
+
+	addq	$64,%rsi
+	leaq	296(%rsp),%rcx
+
+	call	MULADD_128x512			
+
+	movq	384(%rsp),%rax
+
+
+	addq	-16(%rdi),%r8
+	adcq	-8(%rdi),%r9
+	movq	%r8,64(%rcx)
+	movq	%r9,72(%rcx)
+
+	adcq	%rax,%rax
+	movq	%rax,384(%rsp)
+
+	leaq	192(%rsp),%rdi
+	addq	$64,%rsi
+
+
+
+
+
+	movq	(%rsi),%r8
+	movq	8(%rsi),%rbx
+
+	movq	(%rcx),%rax
+	mulq	%r8
+	movq	%rax,%rbp
+	movq	%rdx,%r9
+
+	movq	8(%rcx),%rax
+	mulq	%r8
+	addq	%rax,%r9
+
+	movq	(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r9
+
+	movq	%r9,8(%rdi)
+
+
+	subq	$192,%rsi
+
+	movq	(%rcx),%r8
+	movq	8(%rcx),%r9
+
+	call	MULADD_128x512			
+
+
+
+
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%rbx
+	movq	16(%rsi),%rdi
+	movq	24(%rsi),%rdx
+
+
+	movq	384(%rsp),%rbp
+
+	addq	64(%rcx),%r8
+	adcq	72(%rcx),%r9
+
+
+	adcq	%rbp,%rbp
+
+
+
+	shlq	$3,%rbp
+	movq	32(%rsp),%rcx
+	addq	%rcx,%rbp
+
+
+	xorq	%rsi,%rsi
+
+	addq	0(%rbp),%r10
+	adcq	64(%rbp),%r11
+	adcq	128(%rbp),%r12
+	adcq	192(%rbp),%r13
+	adcq	256(%rbp),%r14
+	adcq	320(%rbp),%r15
+	adcq	384(%rbp),%r8
+	adcq	448(%rbp),%r9
+
+
+
+	sbbq	$0,%rsi
+
+
+	andq	%rsi,%rax
+	andq	%rsi,%rbx
+	andq	%rsi,%rdi
+	andq	%rsi,%rdx
+
+	movq	$1,%rbp
+	subq	%rax,%r10
+	sbbq	%rbx,%r11
+	sbbq	%rdi,%r12
+	sbbq	%rdx,%r13
+
+
+
+
+	sbbq	$0,%rbp
+
+
+
+	addq	$512,%rcx
+	movq	32(%rcx),%rax
+	movq	40(%rcx),%rbx
+	movq	48(%rcx),%rdi
+	movq	56(%rcx),%rdx
+
+
+
+	andq	%rsi,%rax
+	andq	%rsi,%rbx
+	andq	%rsi,%rdi
+	andq	%rsi,%rdx
+
+
+
+	subq	$1,%rbp
+
+	sbbq	%rax,%r14
+	sbbq	%rbx,%r15
+	sbbq	%rdi,%r8
+	sbbq	%rdx,%r9
+
+
+
+	movq	144(%rsp),%rsi
+	movq	%r10,0(%rsi)
+	movq	%r11,8(%rsi)
+	movq	%r12,16(%rsi)
+	movq	%r13,24(%rsi)
+	movq	%r14,32(%rsi)
+	movq	%r15,40(%rsi)
+	movq	%r8,48(%rsi)
+	movq	%r9,56(%rsi)
+
+	.byte	0xf3,0xc3
+.size	mont_reduce,.-mont_reduce
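+/*
+ * mont_mul_a3b: 512 x 512-bit schoolbook multiplication. The
+ * multiplicand is expected both in %r10-%r15,%r8,%r9 and at (%rsi),
+ * the multiplier at (%rdi); the 1024-bit product is laid down at
+ * 520(%rsp) onward, and control then continues (via jmp) into
+ * mont_reduce.
+ */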
+.type	mont_mul_a3b,@function
+.align	16
+mont_mul_a3b:
+
+
+
+
+	movq	0(%rdi),%rbp
+
+	movq	%r10,%rax
+	mulq	%rbp
+	movq	%rax,520(%rsp)
+	movq	%rdx,%r10
+	movq	%r11,%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	movq	%r12,%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%r13,%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	movq	%r14,%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	movq	%r15,%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	movq	%r8,%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+	movq	%r9,%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+	movq	8(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	movq	%r10,528(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	16(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	movq	%r11,536(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	movq	24(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	movq	%r12,544(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	32(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	movq	%r13,552(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	movq	40(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%r14,560(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	movq	48(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movq	%r15,568(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	movq	56(%rdi),%rbp
+	movq	0(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r8
+	adcq	$0,%rdx
+	movq	%r8,576(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r9
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	16(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	addq	%rbx,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	24(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	32(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	40(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	48(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	movq	56(%rsi),%rax
+	mulq	%rbp
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+	movq	%r9,584(%rsp)
+	movq	%r10,592(%rsp)
+	movq	%r11,600(%rsp)
+	movq	%r12,608(%rsp)
+	movq	%r13,616(%rsp)
+	movq	%r14,624(%rsp)
+	movq	%r15,632(%rsp)
+	movq	%r8,640(%rsp)
+
+
+
+
+
+	jmp	mont_reduce
+
+
+.size	mont_mul_a3b,.-mont_mul_a3b
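+/*
+ * sqr_reduce: 512-bit squaring, roughly half the multiplies of a
+ * general multiplication: the off-diagonal products are computed
+ * once, doubled with the add/adc chains, and the squared diagonal
+ * terms are folded in. The 1024-bit square starts at 520(%rsp) and
+ * is handed straight to mont_reduce.
+ */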
+.type	sqr_reduce,@function
+.align	16
+sqr_reduce:
+	movq	16(%rsp),%rcx
+
+
+
+	movq	%r10,%rbx
+
+	movq	%r11,%rax
+	mulq	%rbx
+	movq	%rax,528(%rsp)
+	movq	%rdx,%r10
+	movq	%r12,%rax
+	mulq	%rbx
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	movq	%r13,%rax
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%r14,%rax
+	mulq	%rbx
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	movq	%r15,%rax
+	mulq	%rbx
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	movq	%r8,%rax
+	mulq	%rbx
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rsi
+
+	movq	%r10,536(%rsp)
+
+
+
+
+
+	movq	8(%rcx),%rbx
+
+	movq	16(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	movq	%r11,544(%rsp)
+
+	movq	%rdx,%r10
+	movq	24(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%r10,%r12
+	adcq	$0,%rdx
+	movq	%r12,552(%rsp)
+
+	movq	%rdx,%r10
+	movq	32(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+
+	movq	%rdx,%r10
+	movq	40(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%r10,%r14
+	adcq	$0,%rdx
+
+	movq	%rdx,%r10
+	movq	%r8,%rax
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+
+	movq	%rdx,%r10
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%rsi
+	adcq	$0,%rdx
+	addq	%r10,%rsi
+	adcq	$0,%rdx
+
+	movq	%rdx,%r11
+
+
+
+
+	movq	16(%rcx),%rbx
+
+	movq	24(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	movq	%r13,560(%rsp)
+
+	movq	%rdx,%r10
+	movq	32(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	addq	%r10,%r14
+	adcq	$0,%rdx
+	movq	%r14,568(%rsp)
+
+	movq	%rdx,%r10
+	movq	40(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+
+	movq	%rdx,%r10
+	movq	%r8,%rax
+	mulq	%rbx
+	addq	%rax,%rsi
+	adcq	$0,%rdx
+	addq	%r10,%rsi
+	adcq	$0,%rdx
+
+	movq	%rdx,%r10
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%r10,%r11
+	adcq	$0,%rdx
+
+	movq	%rdx,%r12
+
+
+
+
+
+	movq	24(%rcx),%rbx
+
+	movq	32(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movq	%r15,576(%rsp)
+
+	movq	%rdx,%r10
+	movq	40(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%rsi
+	adcq	$0,%rdx
+	addq	%r10,%rsi
+	adcq	$0,%rdx
+	movq	%rsi,584(%rsp)
+
+	movq	%rdx,%r10
+	movq	%r8,%rax
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%r10,%r11
+	adcq	$0,%rdx
+
+	movq	%rdx,%r10
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%r10,%r12
+	adcq	$0,%rdx
+
+	movq	%rdx,%r15
+
+
+
+
+	movq	32(%rcx),%rbx
+
+	movq	40(%rcx),%rax
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	movq	%r11,592(%rsp)
+
+	movq	%rdx,%r10
+	movq	%r8,%rax
+	mulq	%rbx
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	addq	%r10,%r12
+	adcq	$0,%rdx
+	movq	%r12,600(%rsp)
+
+	movq	%rdx,%r10
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+
+	movq	%rdx,%r11
+
+
+
+
+	movq	40(%rcx),%rbx
+
+	movq	%r8,%rax
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movq	%r15,608(%rsp)
+
+	movq	%rdx,%r10
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	$0,%rdx
+	addq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r11,616(%rsp)
+
+	movq	%rdx,%r12
+
+
+
+
+	movq	%r8,%rbx
+
+	movq	%r9,%rax
+	mulq	%rbx
+	addq	%rax,%r12
+	adcq	$0,%rdx
+	movq	%r12,624(%rsp)
+
+	movq	%rdx,632(%rsp)
+
+
+	movq	528(%rsp),%r10
+	movq	536(%rsp),%r11
+	movq	544(%rsp),%r12
+	movq	552(%rsp),%r13
+	movq	560(%rsp),%r14
+	movq	568(%rsp),%r15
+
+	movq	24(%rcx),%rax
+	mulq	%rax
+	movq	%rax,%rdi
+	movq	%rdx,%r8
+
+	addq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	$0,%r8
+
+	movq	0(%rcx),%rax
+	mulq	%rax
+	movq	%rax,520(%rsp)
+	movq	%rdx,%rbx
+
+	movq	8(%rcx),%rax
+	mulq	%rax
+
+	addq	%rbx,%r10
+	adcq	%rax,%r11
+	adcq	$0,%rdx
+
+	movq	%rdx,%rbx
+	movq	%r10,528(%rsp)
+	movq	%r11,536(%rsp)
+
+	movq	16(%rcx),%rax
+	mulq	%rax
+
+	addq	%rbx,%r12
+	adcq	%rax,%r13
+	adcq	$0,%rdx
+
+	movq	%rdx,%rbx
+
+	movq	%r12,544(%rsp)
+	movq	%r13,552(%rsp)
+
+	xorq	%rbp,%rbp
+	addq	%rbx,%r14
+	adcq	%rdi,%r15
+	adcq	$0,%rbp
+
+	movq	%r14,560(%rsp)
+	movq	%r15,568(%rsp)
+
+
+
+
+	movq	576(%rsp),%r10
+	movq	584(%rsp),%r11
+	movq	592(%rsp),%r12
+	movq	600(%rsp),%r13
+	movq	608(%rsp),%r14
+	movq	616(%rsp),%r15
+	movq	624(%rsp),%rdi
+	movq	632(%rsp),%rsi
+
+	movq	%r9,%rax
+	mulq	%rax
+	movq	%rax,%r9
+	movq	%rdx,%rbx
+
+	addq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	%rdi,%rdi
+	adcq	%rsi,%rsi
+	adcq	$0,%rbx
+
+	addq	%rbp,%r10
+
+	movq	32(%rcx),%rax
+	mulq	%rax
+
+	addq	%r8,%r10
+	adcq	%rax,%r11
+	adcq	$0,%rdx
+
+	movq	%rdx,%rbp
+
+	movq	%r10,576(%rsp)
+	movq	%r11,584(%rsp)
+
+	movq	40(%rcx),%rax
+	mulq	%rax
+
+	addq	%rbp,%r12
+	adcq	%rax,%r13
+	adcq	$0,%rdx
+
+	movq	%rdx,%rbp
+
+	movq	%r12,592(%rsp)
+	movq	%r13,600(%rsp)
+
+	movq	48(%rcx),%rax
+	mulq	%rax
+
+	addq	%rbp,%r14
+	adcq	%rax,%r15
+	adcq	$0,%rdx
+
+	movq	%r14,608(%rsp)
+	movq	%r15,616(%rsp)
+
+	addq	%rdx,%rdi
+	adcq	%r9,%rsi
+	adcq	$0,%rbx
+
+	movq	%rdi,624(%rsp)
+	movq	%rsi,632(%rsp)
+	movq	%rbx,640(%rsp)
+
+	jmp	mont_reduce
+
+
+.size	sqr_reduce,.-sqr_reduce
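+/*
+ * mod_exp_512: 512-bit modular exponentiation. By the register and
+ * stack usage: %rdi = result, %rsi = base, %rdx = exponent, %rcx =
+ * modulus data (the final subtract reads the modulus at %rcx + 512).
+ * Builds a 32-entry table of powers, stored as 16-bit slices strided
+ * across the table (loop_0/loop_1) so that, evidently, each gather
+ * (loop_2/loop_3) touches the same cache lines whatever the window
+ * value; the exponent is then consumed in 5-bit windows, five
+ * sqr_reduce calls plus one mont_mul_a3b per window, ending with a
+ * cmov-based conditional subtract. Note that loop_0, loop_1, etc.
+ * lack the .L prefix, so they end up as visible symbols.
+ */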
+.globl	mod_exp_512
+.type	mod_exp_512,@function
+mod_exp_512:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+	movq	%rsp,%r8
+	subq	$2688,%rsp
+	andq	$-64,%rsp
+
+
+	movq	%r8,0(%rsp)
+	movq	%rdi,8(%rsp)
+	movq	%rsi,16(%rsp)
+	movq	%rcx,24(%rsp)
+.Lbody:
+
+
+
+	pxor	%xmm4,%xmm4
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqa	%xmm4,512(%rsp)
+	movdqa	%xmm4,528(%rsp)
+	movdqa	%xmm4,608(%rsp)
+	movdqa	%xmm4,624(%rsp)
+	movdqa	%xmm0,544(%rsp)
+	movdqa	%xmm1,560(%rsp)
+	movdqa	%xmm2,576(%rsp)
+	movdqa	%xmm3,592(%rsp)
+
+
+	movdqu	0(%rdx),%xmm0
+	movdqu	16(%rdx),%xmm1
+	movdqu	32(%rdx),%xmm2
+	movdqu	48(%rdx),%xmm3
+
+	leaq	384(%rsp),%rbx
+	movq	%rbx,136(%rsp)
+	call	mont_reduce
+
+
+	leaq	448(%rsp),%rcx
+	xorq	%rax,%rax
+	movq	%rax,0(%rcx)
+	movq	%rax,8(%rcx)
+	movq	%rax,24(%rcx)
+	movq	%rax,32(%rcx)
+	movq	%rax,40(%rcx)
+	movq	%rax,48(%rcx)
+	movq	%rax,56(%rcx)
+	movq	%rax,128(%rsp)
+	movq	$1,16(%rcx)
+
+	leaq	640(%rsp),%rbp
+	movq	%rcx,%rsi
+	movq	%rbp,%rdi
+	movq	$8,%rax
+loop_0:
+	movq	(%rcx),%rbx
+	movw	%bx,(%rdi)
+	shrq	$16,%rbx
+	movw	%bx,64(%rdi)
+	shrq	$16,%rbx
+	movw	%bx,128(%rdi)
+	shrq	$16,%rbx
+	movw	%bx,192(%rdi)
+	leaq	8(%rcx),%rcx
+	leaq	256(%rdi),%rdi
+	decq	%rax
+	jnz	loop_0
+	movq	$31,%rax
+	movq	%rax,32(%rsp)
+	movq	%rbp,40(%rsp)
+
+	movq	%rsi,136(%rsp)
+	movq	0(%rsi),%r10
+	movq	8(%rsi),%r11
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+	movq	32(%rsi),%r14
+	movq	40(%rsi),%r15
+	movq	48(%rsi),%r8
+	movq	56(%rsi),%r9
+init_loop:
+	leaq	384(%rsp),%rdi
+	call	mont_mul_a3b
+	leaq	448(%rsp),%rsi
+	movq	40(%rsp),%rbp
+	addq	$2,%rbp
+	movq	%rbp,40(%rsp)
+	movq	%rsi,%rcx
+	movq	$8,%rax
+loop_1:
+	movq	(%rcx),%rbx
+	movw	%bx,(%rbp)
+	shrq	$16,%rbx
+	movw	%bx,64(%rbp)
+	shrq	$16,%rbx
+	movw	%bx,128(%rbp)
+	shrq	$16,%rbx
+	movw	%bx,192(%rbp)
+	leaq	8(%rcx),%rcx
+	leaq	256(%rbp),%rbp
+	decq	%rax
+	jnz	loop_1
+	movq	32(%rsp),%rax
+	subq	$1,%rax
+	movq	%rax,32(%rsp)
+	jne	init_loop
+
+
+
+	movdqa	%xmm0,64(%rsp)
+	movdqa	%xmm1,80(%rsp)
+	movdqa	%xmm2,96(%rsp)
+	movdqa	%xmm3,112(%rsp)
+
+
+
+
+
+	movl	126(%rsp),%eax
+	movq	%rax,%rdx
+	shrq	$11,%rax
+	andl	$2047,%edx
+	movl	%edx,126(%rsp)
+	leaq	640(%rsp,%rax,2),%rsi
+	movq	8(%rsp),%rdx
+	movq	$4,%rbp
+loop_2:
+	movzwq	192(%rsi),%rbx
+	movzwq	448(%rsi),%rax
+	shlq	$16,%rbx
+	shlq	$16,%rax
+	movw	128(%rsi),%bx
+	movw	384(%rsi),%ax
+	shlq	$16,%rbx
+	shlq	$16,%rax
+	movw	64(%rsi),%bx
+	movw	320(%rsi),%ax
+	shlq	$16,%rbx
+	shlq	$16,%rax
+	movw	0(%rsi),%bx
+	movw	256(%rsi),%ax
+	movq	%rbx,0(%rdx)
+	movq	%rax,8(%rdx)
+	leaq	512(%rsi),%rsi
+	leaq	16(%rdx),%rdx
+	subq	$1,%rbp
+	jnz	loop_2
+	movq	$505,48(%rsp)
+
+	movq	8(%rsp),%rcx
+	movq	%rcx,136(%rsp)
+	movq	0(%rcx),%r10
+	movq	8(%rcx),%r11
+	movq	16(%rcx),%r12
+	movq	24(%rcx),%r13
+	movq	32(%rcx),%r14
+	movq	40(%rcx),%r15
+	movq	48(%rcx),%r8
+	movq	56(%rcx),%r9
+	jmp	sqr_2
+
+main_loop_a3b:
+	call	sqr_reduce
+	call	sqr_reduce
+	call	sqr_reduce
+sqr_2:
+	call	sqr_reduce
+	call	sqr_reduce
+
+
+
+	movq	48(%rsp),%rcx
+	movq	%rcx,%rax
+	shrq	$4,%rax
+	movl	64(%rsp,%rax,2),%edx
+	andq	$15,%rcx
+	shrq	%cl,%rdx
+	andq	$31,%rdx
+
+	leaq	640(%rsp,%rdx,2),%rsi
+	leaq	448(%rsp),%rdx
+	movq	%rdx,%rdi
+	movq	$4,%rbp
+loop_3:
+	movzwq	192(%rsi),%rbx
+	movzwq	448(%rsi),%rax
+	shlq	$16,%rbx
+	shlq	$16,%rax
+	movw	128(%rsi),%bx
+	movw	384(%rsi),%ax
+	shlq	$16,%rbx
+	shlq	$16,%rax
+	movw	64(%rsi),%bx
+	movw	320(%rsi),%ax
+	shlq	$16,%rbx
+	shlq	$16,%rax
+	movw	0(%rsi),%bx
+	movw	256(%rsi),%ax
+	movq	%rbx,0(%rdx)
+	movq	%rax,8(%rdx)
+	leaq	512(%rsi),%rsi
+	leaq	16(%rdx),%rdx
+	subq	$1,%rbp
+	jnz	loop_3
+	movq	8(%rsp),%rsi
+	call	mont_mul_a3b
+
+
+
+	movq	48(%rsp),%rcx
+	subq	$5,%rcx
+	movq	%rcx,48(%rsp)
+	jge	main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+	movq	8(%rsp),%rdx
+	pxor	%xmm4,%xmm4
+	movdqu	0(%rdx),%xmm0
+	movdqu	16(%rdx),%xmm1
+	movdqu	32(%rdx),%xmm2
+	movdqu	48(%rdx),%xmm3
+	movdqa	%xmm4,576(%rsp)
+	movdqa	%xmm4,592(%rsp)
+	movdqa	%xmm4,608(%rsp)
+	movdqa	%xmm4,624(%rsp)
+	movdqa	%xmm0,512(%rsp)
+	movdqa	%xmm1,528(%rsp)
+	movdqa	%xmm2,544(%rsp)
+	movdqa	%xmm3,560(%rsp)
+	call	mont_reduce
+
+
+
+	movq	8(%rsp),%rax
+	movq	0(%rax),%r8
+	movq	8(%rax),%r9
+	movq	16(%rax),%r10
+	movq	24(%rax),%r11
+	movq	32(%rax),%r12
+	movq	40(%rax),%r13
+	movq	48(%rax),%r14
+	movq	56(%rax),%r15
+
+
+	movq	24(%rsp),%rbx
+	addq	$512,%rbx
+
+	subq	0(%rbx),%r8
+	sbbq	8(%rbx),%r9
+	sbbq	16(%rbx),%r10
+	sbbq	24(%rbx),%r11
+	sbbq	32(%rbx),%r12
+	sbbq	40(%rbx),%r13
+	sbbq	48(%rbx),%r14
+	sbbq	56(%rbx),%r15
+
+
+	movq	0(%rax),%rsi
+	movq	8(%rax),%rdi
+	movq	16(%rax),%rcx
+	movq	24(%rax),%rdx
+	cmovncq	%r8,%rsi
+	cmovncq	%r9,%rdi
+	cmovncq	%r10,%rcx
+	cmovncq	%r11,%rdx
+	movq	%rsi,0(%rax)
+	movq	%rdi,8(%rax)
+	movq	%rcx,16(%rax)
+	movq	%rdx,24(%rax)
+
+	movq	32(%rax),%rsi
+	movq	40(%rax),%rdi
+	movq	48(%rax),%rcx
+	movq	56(%rax),%rdx
+	cmovncq	%r12,%rsi
+	cmovncq	%r13,%rdi
+	cmovncq	%r14,%rcx
+	cmovncq	%r15,%rdx
+	movq	%rsi,32(%rax)
+	movq	%rdi,40(%rax)
+	movq	%rcx,48(%rax)
+	movq	%rdx,56(%rax)
+
+	movq	0(%rsp),%rsi
+	movq	0(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbx
+	movq	40(%rsi),%rbp
+	leaq	48(%rsi),%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	mod_exp_512,.-mod_exp_512
diff --git a/crypto/bn/asm/x86_64-gf2m.S b/crypto/bn/asm/x86_64-gf2m.S
new file mode 100644
index 0000000..ccd2ed7
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gf2m.S
@@ -0,0 +1,291 @@
+.text	
+
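+/*
+ * _mul_1x1: carry-less 64 x 64 -> 128-bit multiplication over GF(2)
+ * without PCLMULQDQ. Builds a 16-entry stack table of the multiples
+ * 0-15 of the operand (its top three bits are handled separately via
+ * the sarq-generated masks), then digests the multiplier in %rbp
+ * four bits at a time, merging the looked-up partial products
+ * through %xmm0/%xmm1 and %rcx/%rbx shifts. Result in %rdx:%rax.
+ */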
+.type	_mul_1x1,@function
+.align	16
+_mul_1x1:
+	subq	$128+8,%rsp
+	movq	$-1,%r9
+	leaq	(%rax,%rax,1),%rsi
+	shrq	$3,%r9
+	leaq	(,%rax,4),%rdi
+	andq	%rax,%r9
+	leaq	(,%rax,8),%r12
+	sarq	$63,%rax
+	leaq	(%r9,%r9,1),%r10
+	sarq	$63,%rsi
+	leaq	(,%r9,4),%r11
+	andq	%rbp,%rax
+	sarq	$63,%rdi
+	movq	%rax,%rdx
+	shlq	$63,%rax
+	andq	%rbp,%rsi
+	shrq	$1,%rdx
+	movq	%rsi,%rcx
+	shlq	$62,%rsi
+	andq	%rbp,%rdi
+	shrq	$2,%rcx
+	xorq	%rsi,%rax
+	movq	%rdi,%rbx
+	shlq	$61,%rdi
+	xorq	%rcx,%rdx
+	shrq	$3,%rbx
+	xorq	%rdi,%rax
+	xorq	%rbx,%rdx
+
+	movq	%r9,%r13
+	movq	$0,0(%rsp)
+	xorq	%r10,%r13
+	movq	%r9,8(%rsp)
+	movq	%r11,%r14
+	movq	%r10,16(%rsp)
+	xorq	%r12,%r14
+	movq	%r13,24(%rsp)
+
+	xorq	%r11,%r9
+	movq	%r11,32(%rsp)
+	xorq	%r11,%r10
+	movq	%r9,40(%rsp)
+	xorq	%r11,%r13
+	movq	%r10,48(%rsp)
+	xorq	%r14,%r9
+	movq	%r13,56(%rsp)
+	xorq	%r14,%r10
+
+	movq	%r12,64(%rsp)
+	xorq	%r14,%r13
+	movq	%r9,72(%rsp)
+	xorq	%r11,%r9
+	movq	%r10,80(%rsp)
+	xorq	%r11,%r10
+	movq	%r13,88(%rsp)
+
+	xorq	%r11,%r13
+	movq	%r14,96(%rsp)
+	movq	%r8,%rsi
+	movq	%r9,104(%rsp)
+	andq	%rbp,%rsi
+	movq	%r10,112(%rsp)
+	shrq	$4,%rbp
+	movq	%r13,120(%rsp)
+	movq	%r8,%rdi
+	andq	%rbp,%rdi
+	shrq	$4,%rbp
+
+	movq	(%rsp,%rsi,8),%xmm0
+	movq	%r8,%rsi
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$4,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$60,%rbx
+	xorq	%rcx,%rax
+	pslldq	$1,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$12,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$52,%rbx
+	xorq	%rcx,%rax
+	pslldq	$2,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$20,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$44,%rbx
+	xorq	%rcx,%rax
+	pslldq	$3,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$28,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$36,%rbx
+	xorq	%rcx,%rax
+	pslldq	$4,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$36,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$28,%rbx
+	xorq	%rcx,%rax
+	pslldq	$5,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$44,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$20,%rbx
+	xorq	%rcx,%rax
+	pslldq	$6,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%r8,%rdi
+	movq	%rcx,%rbx
+	shlq	$52,%rcx
+	andq	%rbp,%rdi
+	movq	(%rsp,%rsi,8),%xmm1
+	shrq	$12,%rbx
+	xorq	%rcx,%rax
+	pslldq	$7,%xmm1
+	movq	%r8,%rsi
+	shrq	$4,%rbp
+	xorq	%rbx,%rdx
+	andq	%rbp,%rsi
+	shrq	$4,%rbp
+	pxor	%xmm1,%xmm0
+	movq	(%rsp,%rdi,8),%rcx
+	movq	%rcx,%rbx
+	shlq	$60,%rcx
+.byte	102,72,15,126,198
+	shrq	$4,%rbx
+	xorq	%rcx,%rax
+	psrldq	$8,%xmm0
+	xorq	%rbx,%rdx
+.byte	102,72,15,126,199
+	xorq	%rsi,%rax
+	xorq	%rdi,%rdx
+
+	addq	$128+8,%rsp
+	.byte	0xf3,0xc3
+.Lend_mul_1x1:
+.size	_mul_1x1,.-_mul_1x1
+
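+/*
+ * void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0,
+ *                      BN_ULONG b1, BN_ULONG b0);
+ * 128 x 128-bit polynomial multiplication over GF(2) for the binary
+ * elliptic-curve code. When bit 33 of OPENSSL_ia32cap_P (PCLMULQDQ)
+ * is set, three carry-less multiplies are combined Karatsuba-style;
+ * otherwise .Lvanilla_mul_2x2 makes three _mul_1x1 calls and merges
+ * them the same way. The four result words go to (%rdi).
+ */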
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,@function
+.align	16
+bn_GF2m_mul_2x2:
+	movq	OPENSSL_ia32cap_P(%rip),%rax
+	btq	$33,%rax
+	jnc	.Lvanilla_mul_2x2
+
+.byte	102,72,15,110,198
+.byte	102,72,15,110,201
+.byte	102,72,15,110,210
+.byte	102,73,15,110,216
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm1,%xmm5
+.byte	102,15,58,68,193,0
+	pxor	%xmm2,%xmm4
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,68,211,0
+.byte	102,15,58,68,229,0
+	xorps	%xmm0,%xmm4
+	xorps	%xmm2,%xmm4
+	movdqa	%xmm4,%xmm5
+	pslldq	$8,%xmm4
+	psrldq	$8,%xmm5
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm0
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	.byte	0xf3,0xc3
+
+.align	16
+.Lvanilla_mul_2x2:
+	leaq	-136(%rsp),%rsp
+	movq	%r14,80(%rsp)
+	movq	%r13,88(%rsp)
+	movq	%r12,96(%rsp)
+	movq	%rbp,104(%rsp)
+	movq	%rbx,112(%rsp)
+.Lbody_mul_2x2:
+	movq	%rdi,32(%rsp)
+	movq	%rsi,40(%rsp)
+	movq	%rdx,48(%rsp)
+	movq	%rcx,56(%rsp)
+	movq	%r8,64(%rsp)
+
+	movq	$15,%r8
+	movq	%rsi,%rax
+	movq	%rcx,%rbp
+	call	_mul_1x1		
+	movq	%rax,16(%rsp)
+	movq	%rdx,24(%rsp)
+
+	movq	48(%rsp),%rax
+	movq	64(%rsp),%rbp
+	call	_mul_1x1		
+	movq	%rax,0(%rsp)
+	movq	%rdx,8(%rsp)
+
+	movq	40(%rsp),%rax
+	movq	56(%rsp),%rbp
+	xorq	48(%rsp),%rax
+	xorq	64(%rsp),%rbp
+	call	_mul_1x1		
+	movq	0(%rsp),%rbx
+	movq	8(%rsp),%rcx
+	movq	16(%rsp),%rdi
+	movq	24(%rsp),%rsi
+	movq	32(%rsp),%rbp
+
+	xorq	%rdx,%rax
+	xorq	%rcx,%rdx
+	xorq	%rbx,%rax
+	movq	%rbx,0(%rbp)
+	xorq	%rdi,%rdx
+	movq	%rsi,24(%rbp)
+	xorq	%rsi,%rax
+	xorq	%rsi,%rdx
+	xorq	%rdx,%rax
+	movq	%rdx,16(%rbp)
+	movq	%rax,8(%rbp)
+
+	movq	80(%rsp),%r14
+	movq	88(%rsp),%r13
+	movq	96(%rsp),%r12
+	movq	104(%rsp),%rbp
+	movq	112(%rsp),%rbx
+	leaq	136(%rsp),%rsp
+	.byte	0xf3,0xc3
+.Lend_mul_2x2:
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte	71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
diff --git a/crypto/bn/asm/x86_64-mont.S b/crypto/bn/asm/x86_64-mont.S
new file mode 100644
index 0000000..95e2905
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.S
@@ -0,0 +1,1374 @@
+.text	
+
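+/*
+ * int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ *                 const BN_ULONG *np, const BN_ULONG *n0, int num);
+ * Montgomery multiplication, rp = ap*bp*2^(-64*num) mod np. The
+ * entry point dispatches on size: the word-by-word loop below when
+ * num < 8 or num is not a multiple of 4, bn_mul4x_mont otherwise,
+ * and bn_sqr4x_mont when ap == bp.
+ */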
+.globl	bn_mul_mont
+.type	bn_mul_mont,@function
+.align	16
+bn_mul_mont:
+	testl	$3,%r9d
+	jnz	.Lmul_enter
+	cmpl	$8,%r9d
+	jb	.Lmul_enter
+	cmpq	%rsi,%rdx
+	jne	.Lmul4x_enter
+	jmp	.Lsqr4x_enter
+
+.align	16
+.Lmul_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	movl	%r9d,%r9d
+	leaq	2(%r9),%r10
+	movq	%rsp,%r11
+	negq	%r10
+	leaq	(%rsp,%r10,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%r11,8(%rsp,%r9,8)
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jl	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	movq	%r9,%r15
+	orq	%rcx,%rsi
+.align	16
+.Lcopy:
+	movq	(%rsi,%r14,8),%rax
+	movq	%r14,(%rsp,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul_mont,.-bn_mul_mont
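+/*
+ * bn_mul4x_mont: the same interleaved multiply-and-reduce as the
+ * generic loop, unrolled four limbs per iteration (the dispatch
+ * above guarantees num is a multiple of 4). The tail does the usual
+ * conditional subtract, with the copy-back done 32 bytes at a time
+ * through %xmm registers.
+ */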
+.type	bn_mul4x_mont,@function
+.align	16
+bn_mul4x_mont:
+.Lmul4x_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	movl	%r9d,%r9d
+	leaq	4(%r9),%r10
+	movq	%rsp,%r11
+	negq	%r10
+	leaq	(%rsp,%r10,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jl	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	movq	8(%rsp),%rdx
+	shrq	$2,%r9
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+	leaq	-1(%r9),%r15
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-1(%r9),%r15
+	orq	%rcx,%rsi
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
+	leaq	32(%r14),%r14
+	decq	%r15
+	jnz	.Lcopy4x
+
+	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul4x_mont,.-bn_mul4x_mont
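+/*
+ * bn_sqr4x_mont: dedicated squaring path, taken when ap == bp. The
+ * off-diagonal products are computed in .Lsqr4x_1st/.Lsqr4x_outer/
+ * .Lsqr4x_inner, then doubled while the diagonal squares are folded
+ * in (.Lsqr4x_shift_n_add); .Lsqr4x_mont_outer performs the
+ * Montgomery reduction, and the code ends with the conditional
+ * subtract and %xmm-based copy-out.
+ */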
+.type	bn_sqr4x_mont,@function
+.align	16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	shll	$3,%r9d
+	xorq	%r10,%r10
+	movq	%rsp,%r11
+	subq	%r9,%r10
+	movq	(%r8),%r8
+	leaq	-72(%rsp,%r10,2),%rsp
+	andq	$-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdi,32(%rsp)
+	movq	%rcx,40(%rsp)
+	movq	%r8,48(%rsp)
+	movq	%r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+	leaq	32(%r10),%rbp
+	leaq	(%rsi,%r9,1),%rsi
+
+	movq	%r9,%rcx
+
+
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	64(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	xorq	%r10,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	leaq	-16(%rbp),%rcx
+
+
+	movq	8(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	movq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+	jmp	.Lsqr4x_1st
+
+.align	16
+.Lsqr4x_1st:
+	movq	(%rsi,%rcx,1),%rbx
+	xorq	%r12,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,(%rdi,%rcx,1)
+
+
+	movq	8(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,8(%rdi,%rcx,1)
+
+	movq	16(%rsi,%rcx,1),%rbx
+	xorq	%r12,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,16(%rdi,%rcx,1)
+
+
+	movq	24(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	leaq	32(%rcx),%rcx
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_1st
+
+	xorq	%r12,%r12
+	addq	%r11,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	%rdx,%r12
+
+	movq	%r13,(%rdi)
+	leaq	16(%rbp),%rbp
+	movq	%r12,8(%rdi)
+	jmp	.Lsqr4x_outer
+
+.align	16
+.Lsqr4x_outer:
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	64(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	movq	-24(%rdi,%rbp,1),%r10
+	xorq	%r11,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	xorq	%r10,%r10
+	addq	-16(%rdi,%rbp,1),%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	leaq	-16(%rbp),%rcx
+	xorq	%r12,%r12
+
+
+	movq	8(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,8(%rdi,%rcx,1)
+
+	leaq	16(%rcx),%rcx
+	jmp	.Lsqr4x_inner
+
+.align	16
+.Lsqr4x_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	xorq	%r12,%r12
+	addq	(%rdi,%rcx,1),%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,(%rdi,%rcx,1)
+
+	movq	8(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_inner
+
+	xorq	%r12,%r12
+	addq	%r11,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	%rdx,%r12
+
+	movq	%r13,(%rdi)
+	movq	%r12,8(%rdi)
+
+	addq	$16,%rbp
+	jnz	.Lsqr4x_outer
+
+
+	movq	-32(%rsi),%r14
+	leaq	64(%rsp,%r9,2),%rdi
+	movq	-24(%rsi),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi),%rbx
+	movq	%rax,%r15
+
+	xorq	%r11,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-24(%rdi)
+
+	xorq	%r10,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	movq	%r11,-16(%rdi)
+
+	movq	-8(%rsi),%rbx
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+
+	xorq	%r11,%r11
+	addq	%r12,%r10
+	movq	%rdx,%r13
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	%rdx,%r11
+	movq	%r10,-8(%rdi)
+
+	xorq	%r12,%r12
+	addq	%r11,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	-16(%rsi),%rax
+	adcq	%rdx,%r12
+
+	movq	%r13,(%rdi)
+	movq	%r12,8(%rdi)
+
+	mulq	%rbx
+	addq	$16,%rbp
+	xorq	%r14,%r14
+	subq	%r9,%rbp
+	xorq	%r15,%r15
+
+	addq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rax,8(%rdi)
+	movq	%rdx,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	movq	-16(%rsi,%rbp,1),%rax
+	leaq	64(%rsp,%r9,2),%rdi
+	xorq	%r10,%r10
+	movq	-24(%rdi,%rbp,2),%r11
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi,%rbp,2),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi,%rbp,2)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi,%rbp,2),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi,%rbp,2)
+	adcq	%rdx,%r8
+	leaq	16(%rbp),%rbp
+	movq	%r8,-40(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	jmp	.Lsqr4x_shift_n_add
+
+.align	16
+.Lsqr4x_shift_n_add:
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi,%rbp,2),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi,%rbp,2)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi,%rbp,2),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi,%rbp,2)
+	adcq	%rdx,%r8
+
+	leaq	(%r14,%r10,2),%r12
+	movq	%r8,-8(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi,%rbp,2),%r11
+	adcq	%rax,%r12
+	movq	8(%rsi,%rbp,1),%rax
+	movq	%r12,0(%rdi,%rbp,2)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi,%rbp,2),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi,%rbp,2),%r11
+	adcq	%rax,%rbx
+	movq	16(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi,%rbp,2)
+	adcq	%rdx,%r8
+	movq	%r8,24(%rdi,%rbp,2)
+	sbbq	%r15,%r15
+	addq	$32,%rbp
+	jnz	.Lsqr4x_shift_n_add
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	mulq	%rax
+	negq	%r15
+	adcq	%rax,%rbx
+	adcq	%rdx,%r8
+	movq	%rbx,-16(%rdi)
+	movq	%r8,-8(%rdi)
+	movq	40(%rsp),%rsi
+	movq	48(%rsp),%r8
+	xorq	%rcx,%rcx
+	movq	%r9,0(%rsp)
+	subq	%r9,%rcx
+	movq	64(%rsp),%r10
+	movq	%r8,%r14
+	leaq	64(%rsp,%r9,2),%rax
+	leaq	64(%rsp,%r9,1),%rdi
+	movq	%rax,8(%rsp)
+	leaq	(%rsi,%r9,1),%rsi
+	xorq	%rbp,%rbp
+
+	movq	0(%rsi,%rcx,1),%rax
+	movq	8(%rsi,%rcx,1),%r9
+	imulq	%r10,%r14
+	movq	%rax,%rbx
+	jmp	.Lsqr4x_mont_outer
+
+.align	16
+.Lsqr4x_mont_outer:
+	xorq	%r11,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+	movq	%r8,%r15
+
+	xorq	%r10,%r10
+	addq	8(%rdi,%rcx,1),%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+
+	imulq	%r11,%r15
+
+	movq	16(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+	movq	%r12,8(%rdi,%rcx,1)
+
+	xorq	%r11,%r11
+	addq	16(%rdi,%rcx,1),%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+
+	movq	24(%rsi,%rcx,1),%r9
+	xorq	%r12,%r12
+	addq	%r10,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r12
+	movq	%r13,16(%rdi,%rcx,1)
+
+	xorq	%r10,%r10
+	addq	24(%rdi,%rcx,1),%r11
+	leaq	32(%rcx),%rcx
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	jmp	.Lsqr4x_mont_inner
+
+.align	16
+.Lsqr4x_mont_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+	movq	%r12,-8(%rdi,%rcx,1)
+
+	xorq	%r11,%r11
+	addq	(%rdi,%rcx,1),%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+
+	movq	8(%rsi,%rcx,1),%r9
+	xorq	%r12,%r12
+	addq	%r10,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r12
+	movq	%r13,(%rdi,%rcx,1)
+
+	xorq	%r10,%r10
+	addq	8(%rdi,%rcx,1),%r11
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+
+
+	movq	16(%rsi,%rcx,1),%rbx
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	%rdx,%r13
+	movq	%r12,8(%rdi,%rcx,1)
+
+	xorq	%r11,%r11
+	addq	16(%rdi,%rcx,1),%r10
+	adcq	$0,%r11
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+
+	movq	24(%rsi,%rcx,1),%r9
+	xorq	%r12,%r12
+	addq	%r10,%r13
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r12
+	movq	%r13,16(%rdi,%rcx,1)
+
+	xorq	%r10,%r10
+	addq	24(%rdi,%rcx,1),%r11
+	leaq	32(%rcx),%rcx
+	adcq	$0,%r10
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	%rdx,%r10
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_mont_inner
+
+	subq	0(%rsp),%rcx
+	movq	%r8,%r14
+
+	xorq	%r13,%r13
+	addq	%r11,%r12
+	adcq	$0,%r13
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	movq	%r12,-8(%rdi)
+
+	xorq	%r11,%r11
+	addq	(%rdi),%r10
+	adcq	$0,%r11
+	movq	0(%rsi,%rcx,1),%rbx
+	addq	%rbp,%r10
+	adcq	$0,%r11
+
+	imulq	16(%rdi,%rcx,1),%r14
+	xorq	%r12,%r12
+	movq	8(%rsi,%rcx,1),%r9
+	addq	%r10,%r13
+	movq	16(%rdi,%rcx,1),%r10
+	adcq	$0,%r12
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	adcq	%rdx,%r12
+	movq	%r13,(%rdi)
+
+	xorq	%rbp,%rbp
+	addq	8(%rdi),%r12
+	adcq	%rbp,%rbp
+	addq	%r11,%r12
+	leaq	16(%rdi),%rdi
+	adcq	$0,%rbp
+	movq	%r12,-8(%rdi)
+	cmpq	8(%rsp),%rdi
+	jb	.Lsqr4x_mont_outer
+
+	movq	0(%rsp),%r9
+	movq	%rbp,(%rdi)
+	movq	64(%rsp,%r9,1),%rax
+	leaq	64(%rsp,%r9,1),%rbx
+	movq	40(%rsp),%rsi
+	shrq	$5,%r9
+	movq	8(%rbx),%rdx
+	xorq	%rbp,%rbp
+
+	movq	32(%rsp),%rdi
+	subq	0(%rsi),%rax
+	movq	16(%rbx),%r10
+	movq	24(%rbx),%r11
+	sbbq	8(%rsi),%rdx
+	leaq	-1(%r9),%rcx
+	jmp	.Lsqr4x_sub
+.align	16
+.Lsqr4x_sub:
+	movq	%rax,0(%rdi,%rbp,8)
+	movq	%rdx,8(%rdi,%rbp,8)
+	sbbq	16(%rsi,%rbp,8),%r10
+	movq	32(%rbx,%rbp,8),%rax
+	movq	40(%rbx,%rbp,8),%rdx
+	sbbq	24(%rsi,%rbp,8),%r11
+	movq	%r10,16(%rdi,%rbp,8)
+	movq	%r11,24(%rdi,%rbp,8)
+	sbbq	32(%rsi,%rbp,8),%rax
+	movq	48(%rbx,%rbp,8),%r10
+	movq	56(%rbx,%rbp,8),%r11
+	sbbq	40(%rsi,%rbp,8),%rdx
+	leaq	4(%rbp),%rbp
+	decq	%rcx
+	jnz	.Lsqr4x_sub
+
+	movq	%rax,0(%rdi,%rbp,8)
+	movq	32(%rbx,%rbp,8),%rax
+	sbbq	16(%rsi,%rbp,8),%r10
+	movq	%rdx,8(%rdi,%rbp,8)
+	sbbq	24(%rsi,%rbp,8),%r11
+	movq	%r10,16(%rdi,%rbp,8)
+
+	sbbq	$0,%rax
+	movq	%r11,24(%rdi,%rbp,8)
+	xorq	%rbp,%rbp
+	andq	%rax,%rbx
+	notq	%rax
+	movq	%rdi,%rsi
+	andq	%rax,%rsi
+	leaq	-1(%r9),%rcx
+	orq	%rsi,%rbx
+
+	pxor	%xmm0,%xmm0
+	leaq	64(%rsp,%r9,8),%rsi
+	movdqu	(%rbx),%xmm1
+	leaq	(%rsi,%r9,8),%rsi
+	movdqa	%xmm0,64(%rsp)
+	movdqa	%xmm0,(%rsi)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lsqr4x_copy
+.align	16
+.Lsqr4x_copy:
+	movdqu	16(%rbx,%rbp,1),%xmm2
+	movdqu	32(%rbx,%rbp,1),%xmm1
+	movdqa	%xmm0,80(%rsp,%rbp,1)
+	movdqa	%xmm0,96(%rsp,%rbp,1)
+	movdqa	%xmm0,16(%rsi,%rbp,1)
+	movdqa	%xmm0,32(%rsi,%rbp,1)
+	movdqu	%xmm2,16(%rdi,%rbp,1)
+	movdqu	%xmm1,32(%rdi,%rbp,1)
+	leaq	32(%rbp),%rbp
+	decq	%rcx
+	jnz	.Lsqr4x_copy
+
+	movdqu	16(%rbx,%rbp,1),%xmm2
+	movdqa	%xmm0,80(%rsp,%rbp,1)
+	movdqa	%xmm0,16(%rsi,%rbp,1)
+	movdqu	%xmm2,16(%rdi,%rbp,1)
+	movq	56(%rsp),%rsi
+	movq	$1,%rax
+	movq	0(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lsqr4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
diff --git a/crypto/bn/asm/x86_64-mont5.S b/crypto/bn/asm/x86_64-mont5.S
new file mode 100644
index 0000000..49ec6ac
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.S
@@ -0,0 +1,784 @@
+.text	
+
+.globl	bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,@function
+.align	64
+bn_mul_mont_gather5:
+	testl	$3,%r9d
+	jnz	.Lmul_enter
+	cmpl	$8,%r9d
+	jb	.Lmul_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	movl	%r9d,%r9d
+	movl	8(%rsp),%r10d
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%rax
+	leaq	2(%r9),%r11
+	negq	%r11
+	leaq	(%rsp,%r11,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	%r10,%r11
+	shrq	$3,%r10
+	andq	$7,%r11
+	notq	%r10
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%r10
+	leaq	96(%r12,%r11,8),%r12
+	movq	0(%rax,%r10,8),%xmm4
+	movq	8(%rax,%r10,8),%xmm5
+	movq	16(%rax,%r10,8),%xmm6
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+.byte	102,72,15,126,195
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+.byte	102,72,15,126,195
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+.byte	102,72,15,126,195
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jl	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	movq	%r9,%r15
+	orq	%rcx,%rsi
+.align	16
+.Lcopy:
+	movq	(%rsi,%r14,8),%rax
+	movq	%r14,(%rsp,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type	bn_mul4x_mont_gather5,@function
+.align	16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+	movl	%r9d,%r9d
+	movl	8(%rsp),%r10d
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%rax
+	leaq	4(%r9),%r11
+	negq	%r11
+	leaq	(%rsp,%r11,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	%r10,%r11
+	shrq	$3,%r10
+	andq	$7,%r11
+	notq	%r10
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%r10
+	leaq	96(%r12,%r11,8),%r12
+	movq	0(%rax,%r10,8),%xmm4
+	movq	8(%rax,%r10,8),%xmm5
+	movq	16(%rax,%r10,8),%xmm6
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+.byte	102,72,15,126,195
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.byte	102,72,15,126,195
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	xorq	%r15,%r15
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-40(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.byte	102,72,15,126,195
+	movq	%rdi,-16(%rsp,%r15,8)
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jl	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	movq	8(%rsp),%rdx
+	shrq	$2,%r9
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+	leaq	-1(%r9),%r15
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-1(%r9),%r15
+	orq	%rcx,%rsi
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
+	leaq	32(%r14),%r14
+	decq	%r15
+	jnz	.Lcopy4x
+
+	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl	bn_scatter5
+.type	bn_scatter5,@function
+.align	16
+bn_scatter5:
+	cmpq	$0,%rsi
+	jz	.Lscatter_epilogue
+	leaq	(%rdx,%rcx,8),%rdx
+.Lscatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx
+	subq	$1,%rsi
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.type	bn_gather5,@function
+.align	16
+bn_gather5:
+	movq	%rcx,%r11
+	shrq	$3,%rcx
+	andq	$7,%r11
+	notq	%rcx
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%rcx
+	leaq	96(%rdx,%r11,8),%rdx
+	movq	0(%rax,%rcx,8),%xmm4
+	movq	8(%rax,%rcx,8),%xmm5
+	movq	16(%rax,%rcx,8),%xmm6
+	movq	24(%rax,%rcx,8),%xmm7
+	jmp	.Lgather
+.align	16
+.Lgather:
+	movq	-96(%rdx),%xmm0
+	movq	-32(%rdx),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%rdx),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%rdx),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%rdx),%rdx
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subq	$1,%rsi
+	jnz	.Lgather
+	.byte	0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size	bn_gather5,.-bn_gather5
+.align	64
+.Lmagic_masks:
+.long	0,0, 0,0, 0,0, -1,-1
+.long	0,0, 0,0, 0,0,  0,0
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/crypto/md5/asm/md5-x86_64.S b/crypto/md5/asm/md5-x86_64.S
new file mode 100644
index 0000000..235d5e4
--- /dev/null
+++ b/crypto/md5/asm/md5-x86_64.S
@@ -0,0 +1,668 @@
+.text	
+.align	16
+
+.globl	md5_block_asm_data_order
+.type	md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r14
+	pushq	%r15
+.Lprologue:
+
+
+
+
+	movq	%rdi,%rbp
+	shlq	$6,%rdx
+	leaq	(%rsi,%rdx,1),%rdi
+	movl	0(%rbp),%eax
+	movl	4(%rbp),%ebx
+	movl	8(%rbp),%ecx
+	movl	12(%rbp),%edx
+
+
+
+
+
+
+
+	cmpq	%rdi,%rsi
+	je	.Lend				
+
+
+.Lloop:
+	movl	%eax,%r8d
+	movl	%ebx,%r9d
+	movl	%ecx,%r14d
+	movl	%edx,%r15d
+	movl	0(%rsi),%r10d
+	movl	%edx,%r11d
+	xorl	%ecx,%r11d
+	leal	-680876936(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	4(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-389564586(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	606105819(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1044525330(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	-176418897(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	1200080426(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1473231341(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-45705983(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1770035416(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-1958414417(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-42063(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1990404162(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1804603682(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-40341101(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1502002290(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	1236535329(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	movl	4(%rsi),%r10d
+	movl	%edx,%r11d
+	movl	%edx,%r12d
+	notl	%r11d
+	leal	-165796510(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1069501632(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	643717713(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-373897302(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-701558691(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	38016083(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-660478335(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-405537848(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	568446438(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1019803690(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-187363961(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	1163531501(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-1444681467(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-51403784(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	1735328473(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-1926607734(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	movl	20(%rsi),%r10d
+	movl	%ecx,%r11d
+	leal	-378558(%rax,%r10,1),%eax
+	movl	32(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-2022574463(%rdx,%r10,1),%edx
+	movl	44(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	1839030562(%rcx,%r10,1),%ecx
+	movl	56(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-35309556(%rbx,%r10,1),%ebx
+	movl	4(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-1530992060(%rax,%r10,1),%eax
+	movl	16(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	1272893353(%rdx,%r10,1),%edx
+	movl	28(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-155497632(%rcx,%r10,1),%ecx
+	movl	40(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-1094730640(%rbx,%r10,1),%ebx
+	movl	52(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	681279174(%rax,%r10,1),%eax
+	movl	0(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-358537222(%rdx,%r10,1),%edx
+	movl	12(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-722521979(%rcx,%r10,1),%ecx
+	movl	24(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	76029189(%rbx,%r10,1),%ebx
+	movl	36(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-640364487(%rax,%r10,1),%eax
+	movl	48(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-421815835(%rdx,%r10,1),%edx
+	movl	60(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	530742520(%rcx,%r10,1),%ecx
+	movl	8(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-995338651(%rbx,%r10,1),%ebx
+	movl	0(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	movl	0(%rsi),%r10d
+	movl	$4294967295,%r11d
+	xorl	%edx,%r11d
+	leal	-198630844(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	28(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	1126891415(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	56(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1416354905(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	20(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-57434055(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	48(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1700485571(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	12(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1894986606(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	40(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1051523(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	4(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-2054922799(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	32(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1873313359(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	60(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-30611744(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	24(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1560198380(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	52(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	1309151649(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	16(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	-145523070(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	44(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1120210379(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	8(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	718787259(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	36(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-343485551(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	0(%rsi),%r10d
+	movl	$4294967295,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+
+	addl	%r8d,%eax
+	addl	%r9d,%ebx
+	addl	%r14d,%ecx
+	addl	%r15d,%edx
+
+
+	addq	$64,%rsi
+	cmpq	%rdi,%rsi
+	jb	.Lloop				
+
+
+.Lend:
+	movl	%eax,0(%rbp)
+	movl	%ebx,4(%rbp)
+	movl	%ecx,8(%rbp)
+	movl	%edx,12(%rbp)
+
+	movq	(%rsp),%r15
+	movq	8(%rsp),%r14
+	movq	16(%rsp),%r12
+	movq	24(%rsp),%rbx
+	movq	32(%rsp),%rbp
+	addq	$40,%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	md5_block_asm_data_order,.-md5_block_asm_data_order
diff --git a/crypto/modes/asm/ghash-x86_64.S b/crypto/modes/asm/ghash-x86_64.S
new file mode 100644
index 0000000..62d39c6
--- /dev/null
+++ b/crypto/modes/asm/ghash-x86_64.S
@@ -0,0 +1,1026 @@
+.text	
+
+.globl	gcm_gmult_4bit
+.type	gcm_gmult_4bit,@function
+.align	16
+gcm_gmult_4bit:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+.Lgmult_prologue:
+
+	movzbq	15(%rdi),%r8
+	leaq	.Lrem_4bit(%rip),%r11
+	xorq	%rax,%rax
+	xorq	%rbx,%rbx
+	movb	%r8b,%al
+	movb	%r8b,%bl
+	shlb	$4,%al
+	movq	$14,%rcx
+	movq	8(%rsi,%rax,1),%r8
+	movq	(%rsi,%rax,1),%r9
+	andb	$240,%bl
+	movq	%r8,%rdx
+	jmp	.Loop1
+
+.align	16
+.Loop1:
+	shrq	$4,%r8
+	andq	$15,%rdx
+	movq	%r9,%r10
+	movb	(%rdi,%rcx,1),%al
+	shrq	$4,%r9
+	xorq	8(%rsi,%rbx,1),%r8
+	shlq	$60,%r10
+	xorq	(%rsi,%rbx,1),%r9
+	movb	%al,%bl
+	xorq	(%r11,%rdx,8),%r9
+	movq	%r8,%rdx
+	shlb	$4,%al
+	xorq	%r10,%r8
+	decq	%rcx
+	js	.Lbreak1
+
+	shrq	$4,%r8
+	andq	$15,%rdx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	xorq	8(%rsi,%rax,1),%r8
+	shlq	$60,%r10
+	xorq	(%rsi,%rax,1),%r9
+	andb	$240,%bl
+	xorq	(%r11,%rdx,8),%r9
+	movq	%r8,%rdx
+	xorq	%r10,%r8
+	jmp	.Loop1
+
+.align	16
+.Lbreak1:
+	shrq	$4,%r8
+	andq	$15,%rdx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	xorq	8(%rsi,%rax,1),%r8
+	shlq	$60,%r10
+	xorq	(%rsi,%rax,1),%r9
+	andb	$240,%bl
+	xorq	(%r11,%rdx,8),%r9
+	movq	%r8,%rdx
+	xorq	%r10,%r8
+
+	shrq	$4,%r8
+	andq	$15,%rdx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	xorq	8(%rsi,%rbx,1),%r8
+	shlq	$60,%r10
+	xorq	(%rsi,%rbx,1),%r9
+	xorq	%r10,%r8
+	xorq	(%r11,%rdx,8),%r9
+
+	bswapq	%r8
+	bswapq	%r9
+	movq	%r8,8(%rdi)
+	movq	%r9,(%rdi)
+
+	movq	16(%rsp),%rbx
+	leaq	24(%rsp),%rsp
+.Lgmult_epilogue:
+	.byte	0xf3,0xc3
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+.globl	gcm_ghash_4bit
+.type	gcm_ghash_4bit,@function
+.align	16
+gcm_ghash_4bit:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$280,%rsp
+.Lghash_prologue:
+	movq	%rdx,%r14
+	movq	%rcx,%r15
+	subq	$-128,%rsi
+	leaq	16+128(%rsp),%rbp
+	xorl	%edx,%edx
+	movq	0+0-128(%rsi),%r8
+	movq	0+8-128(%rsi),%rax
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	16+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	16+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,0(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,0(%rbp)
+	movq	32+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,0-128(%rbp)
+	movq	32+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,1(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,8(%rbp)
+	movq	48+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,8-128(%rbp)
+	movq	48+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,2(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,16(%rbp)
+	movq	64+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,16-128(%rbp)
+	movq	64+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,3(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,24(%rbp)
+	movq	80+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,24-128(%rbp)
+	movq	80+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,4(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,32(%rbp)
+	movq	96+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,32-128(%rbp)
+	movq	96+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,5(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,40(%rbp)
+	movq	112+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,40-128(%rbp)
+	movq	112+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,6(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,48(%rbp)
+	movq	128+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,48-128(%rbp)
+	movq	128+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,7(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,56(%rbp)
+	movq	144+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,56-128(%rbp)
+	movq	144+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,8(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,64(%rbp)
+	movq	160+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,64-128(%rbp)
+	movq	160+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,9(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,72(%rbp)
+	movq	176+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,72-128(%rbp)
+	movq	176+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,10(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,80(%rbp)
+	movq	192+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,80-128(%rbp)
+	movq	192+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,11(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,88(%rbp)
+	movq	208+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,88-128(%rbp)
+	movq	208+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,12(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,96(%rbp)
+	movq	224+0-128(%rsi),%r8
+	shlb	$4,%dl
+	movq	%rax,96-128(%rbp)
+	movq	224+8-128(%rsi),%rax
+	shlq	$60,%r10
+	movb	%dl,13(%rsp)
+	orq	%r10,%rbx
+	movb	%al,%dl
+	shrq	$4,%rax
+	movq	%r8,%r10
+	shrq	$4,%r8
+	movq	%r9,104(%rbp)
+	movq	240+0-128(%rsi),%r9
+	shlb	$4,%dl
+	movq	%rbx,104-128(%rbp)
+	movq	240+8-128(%rsi),%rbx
+	shlq	$60,%r10
+	movb	%dl,14(%rsp)
+	orq	%r10,%rax
+	movb	%bl,%dl
+	shrq	$4,%rbx
+	movq	%r9,%r10
+	shrq	$4,%r9
+	movq	%r8,112(%rbp)
+	shlb	$4,%dl
+	movq	%rax,112-128(%rbp)
+	shlq	$60,%r10
+	movb	%dl,15(%rsp)
+	orq	%r10,%rbx
+	movq	%r9,120(%rbp)
+	movq	%rbx,120-128(%rbp)
+	addq	$-128,%rsi
+	movq	8(%rdi),%r8
+	movq	0(%rdi),%r9
+	addq	%r14,%r15
+	leaq	.Lrem_8bit(%rip),%r11
+	jmp	.Louter_loop
+.align	16
+.Louter_loop:
+	xorq	(%r14),%r9
+	movq	8(%r14),%rdx
+	leaq	16(%r14),%r14
+	xorq	%r8,%rdx
+	movq	%r9,(%rdi)
+	movq	%rdx,8(%rdi)
+	shrq	$32,%rdx
+	xorq	%rax,%rax
+	roll	$8,%edx
+	movb	%dl,%al
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	shrl	$4,%ebx
+	roll	$8,%edx
+	movq	8(%rsi,%rax,1),%r8
+	movq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	movl	8(%rdi),%edx
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	movl	4(%rdi),%edx
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	movl	0(%rdi),%edx
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	shrl	$4,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r12,2),%r12
+	movzbl	%dl,%ebx
+	shlb	$4,%al
+	movzbq	(%rsp,%rcx,1),%r13
+	shrl	$4,%ebx
+	shlq	$48,%r12
+	xorq	%r8,%r13
+	movq	%r9,%r10
+	xorq	%r12,%r9
+	shrq	$8,%r8
+	movzbq	%r13b,%r13
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rcx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rcx,8),%r9
+	roll	$8,%edx
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	movb	%dl,%al
+	xorq	%r10,%r8
+	movzwq	(%r11,%r13,2),%r13
+	movzbl	%dl,%ecx
+	shlb	$4,%al
+	movzbq	(%rsp,%rbx,1),%r12
+	andl	$240,%ecx
+	shlq	$48,%r13
+	xorq	%r8,%r12
+	movq	%r9,%r10
+	xorq	%r13,%r9
+	shrq	$8,%r8
+	movzbq	%r12b,%r12
+	movl	-4(%rdi),%edx
+	shrq	$8,%r9
+	xorq	-128(%rbp,%rbx,8),%r8
+	shlq	$56,%r10
+	xorq	(%rbp,%rbx,8),%r9
+	movzwq	(%r11,%r12,2),%r12
+	xorq	8(%rsi,%rax,1),%r8
+	xorq	(%rsi,%rax,1),%r9
+	shlq	$48,%r12
+	xorq	%r10,%r8
+	xorq	%r12,%r9
+	movzbq	%r8b,%r13
+	shrq	$4,%r8
+	movq	%r9,%r10
+	shlb	$4,%r13b
+	shrq	$4,%r9
+	xorq	8(%rsi,%rcx,1),%r8
+	movzwq	(%r11,%r13,2),%r13
+	shlq	$60,%r10
+	xorq	(%rsi,%rcx,1),%r9
+	xorq	%r10,%r8
+	shlq	$48,%r13
+	bswapq	%r8
+	xorq	%r13,%r9
+	bswapq	%r9
+	cmpq	%r15,%r14
+	jb	.Louter_loop
+	movq	%r8,8(%rdi)
+	movq	%r9,(%rdi)
+
+	leaq	280(%rsp),%rsi
+	movq	0(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lghash_epilogue:
+	.byte	0xf3,0xc3
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+.globl	gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+	movdqu	(%rsi),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+
+
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+
+
+	pand	.L0x1c2_polynomial(%rip),%xmm5
+	pxor	%xmm5,%xmm2
+
+
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	movdqu	%xmm2,(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	.byte	0xf3,0xc3
+.size	gcm_init_clmul,.-gcm_init_clmul
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lbswap_mask(%rip),%xmm5
+	movdqu	(%rsi),%xmm2
+.byte	102,15,56,0,197
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%rdi)
+	.byte	0xf3,0xc3
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl	gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	16
+gcm_ghash_clmul:
+	movdqa	.Lbswap_mask(%rip),%xmm5
+
+	movdqu	(%rdi),%xmm0
+	movdqu	(%rsi),%xmm2
+.byte	102,15,56,0,197
+
+	subq	$16,%rcx
+	jz	.Lodd_tail
+
+	movdqu	16(%rsi),%xmm8
+
+
+
+
+
+	movdqu	(%rdx),%xmm3
+	movdqu	16(%rdx),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm6,%xmm7
+	pshufd	$78,%xmm6,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm6,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,242,0
+.byte	102,15,58,68,250,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm6,%xmm3
+	pxor	%xmm7,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm7
+	pxor	%xmm4,%xmm6
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm8,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm8,%xmm4
+
+	leaq	32(%rdx),%rdx
+	subq	$32,%rcx
+	jbe	.Leven_tail
+
+.Lmod_loop:
+.byte	102,65,15,58,68,192,0
+.byte	102,65,15,58,68,200,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqu	(%rdx),%xmm3
+	pxor	%xmm6,%xmm0
+	pxor	%xmm7,%xmm1
+
+	movdqu	16(%rdx),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+
+	movdqa	%xmm6,%xmm7
+	pshufd	$78,%xmm6,%xmm9
+	pshufd	$78,%xmm2,%xmm10
+	pxor	%xmm6,%xmm9
+	pxor	%xmm2,%xmm10
+	pxor	%xmm3,%xmm1
+
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+.byte	102,15,58,68,242,0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+
+.byte	102,15,58,68,250,17
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+
+.byte	102,69,15,58,68,202,0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm8,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm8,%xmm4
+
+	pxor	%xmm6,%xmm9
+	pxor	%xmm7,%xmm9
+	movdqa	%xmm9,%xmm10
+	psrldq	$8,%xmm9
+	pslldq	$8,%xmm10
+	pxor	%xmm9,%xmm7
+	pxor	%xmm10,%xmm6
+
+	leaq	32(%rdx),%rdx
+	subq	$32,%rcx
+	ja	.Lmod_loop
+
+.Leven_tail:
+.byte	102,65,15,58,68,192,0
+.byte	102,65,15,58,68,200,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm6,%xmm0
+	pxor	%xmm7,%xmm1
+
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	testq	%rcx,%rcx
+	jnz	.Ldone
+
+.Lodd_tail:
+	movdqu	(%rdx),%xmm3
+.byte	102,15,56,0,221
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$5,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm4
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm4
+	pxor	%xmm3,%xmm0
+	pxor	%xmm4,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm0
+.Ldone:
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%rdi)
+	.byte	0xf3,0xc3
+.LSEH_end_gcm_ghash_clmul:
+.size	gcm_ghash_clmul,.-gcm_ghash_clmul
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align	64
+.type	.Lrem_4bit,@object
+.Lrem_4bit:
+.long	0,0,0,471859200,0,943718400,0,610271232
+.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type	.Lrem_8bit,@object
+.Lrem_8bit:
+.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
diff --git a/crypto/rc4/asm/rc4-md5-x86_64.S b/crypto/rc4/asm/rc4-md5-x86_64.S
new file mode 100644
index 0000000..aab3c6d
--- /dev/null
+++ b/crypto/rc4/asm/rc4-md5-x86_64.S
@@ -0,0 +1,1259 @@
+.text	
+.align	16
+
+.globl	rc4_md5_enc
+.type	rc4_md5_enc,@function
+rc4_md5_enc:
+	cmpq	$0,%r9
+	je	.Labort
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$40,%rsp
+.Lbody:
+	movq	%rcx,%r11
+	movq	%r9,%r12
+	movq	%rsi,%r13
+	movq	%rdx,%r14
+	movq	%r8,%r15
+	xorq	%rbp,%rbp
+	xorq	%rcx,%rcx
+
+	leaq	8(%rdi),%rdi
+	movb	-8(%rdi),%bpl
+	movb	-4(%rdi),%cl
+
+	incb	%bpl
+	subq	%r13,%r14
+	movl	(%rdi,%rbp,4),%eax
+	addb	%al,%cl
+	leaq	(%rdi,%rbp,4),%rsi
+	shlq	$6,%r12
+	addq	%r15,%r12
+	movq	%r12,16(%rsp)
+
+	movq	%r11,24(%rsp)
+	movl	0(%r11),%r8d
+	movl	4(%r11),%r9d
+	movl	8(%r11),%r10d
+	movl	12(%r11),%r11d
+	jmp	.Loop
+
+.align	16
+.Loop:
+	movl	%r8d,0(%rsp)
+	movl	%r9d,4(%rsp)
+	movl	%r10d,8(%rsp)
+	movl	%r11d,%r12d
+	movl	%r11d,12(%rsp)
+	pxor	%xmm0,%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	0(%r15),%r8d
+	addb	%dl,%al
+	movl	4(%rsi),%ebx
+	addl	$3614090360,%r8d
+	xorl	%r11d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,0(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$7,%r8d
+	movl	%r10d,%r12d
+	movd	(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	pxor	%xmm1,%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	4(%r15),%r11d
+	addb	%dl,%bl
+	movl	8(%rsi),%eax
+	addl	$3905402710,%r11d
+	xorl	%r10d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,4(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$12,%r11d
+	movl	%r9d,%r12d
+	movd	(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	8(%r15),%r10d
+	addb	%dl,%al
+	movl	12(%rsi),%ebx
+	addl	$606105819,%r10d
+	xorl	%r9d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,8(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$17,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$1,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	12(%r15),%r9d
+	addb	%dl,%bl
+	movl	16(%rsi),%eax
+	addl	$3250441966,%r9d
+	xorl	%r8d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,12(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$22,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$1,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	16(%r15),%r8d
+	addb	%dl,%al
+	movl	20(%rsi),%ebx
+	addl	$4118548399,%r8d
+	xorl	%r11d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,16(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$7,%r8d
+	movl	%r10d,%r12d
+	pinsrw	$2,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	20(%r15),%r11d
+	addb	%dl,%bl
+	movl	24(%rsi),%eax
+	addl	$1200080426,%r11d
+	xorl	%r10d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,20(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$12,%r11d
+	movl	%r9d,%r12d
+	pinsrw	$2,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	24(%r15),%r10d
+	addb	%dl,%al
+	movl	28(%rsi),%ebx
+	addl	$2821735955,%r10d
+	xorl	%r9d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,24(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$17,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$3,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	28(%r15),%r9d
+	addb	%dl,%bl
+	movl	32(%rsi),%eax
+	addl	$4249261313,%r9d
+	xorl	%r8d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,28(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$22,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$3,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	32(%r15),%r8d
+	addb	%dl,%al
+	movl	36(%rsi),%ebx
+	addl	$1770035416,%r8d
+	xorl	%r11d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,32(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$7,%r8d
+	movl	%r10d,%r12d
+	pinsrw	$4,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	36(%r15),%r11d
+	addb	%dl,%bl
+	movl	40(%rsi),%eax
+	addl	$2336552879,%r11d
+	xorl	%r10d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,36(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$12,%r11d
+	movl	%r9d,%r12d
+	pinsrw	$4,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	40(%r15),%r10d
+	addb	%dl,%al
+	movl	44(%rsi),%ebx
+	addl	$4294925233,%r10d
+	xorl	%r9d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,40(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$17,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$5,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	44(%r15),%r9d
+	addb	%dl,%bl
+	movl	48(%rsi),%eax
+	addl	$2304563134,%r9d
+	xorl	%r8d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,44(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$22,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$5,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	48(%r15),%r8d
+	addb	%dl,%al
+	movl	52(%rsi),%ebx
+	addl	$1804603682,%r8d
+	xorl	%r11d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,48(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$7,%r8d
+	movl	%r10d,%r12d
+	pinsrw	$6,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	52(%r15),%r11d
+	addb	%dl,%bl
+	movl	56(%rsi),%eax
+	addl	$4254626195,%r11d
+	xorl	%r10d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,52(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$12,%r11d
+	movl	%r9d,%r12d
+	pinsrw	$6,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	56(%r15),%r10d
+	addb	%dl,%al
+	movl	60(%rsi),%ebx
+	addl	$2792965006,%r10d
+	xorl	%r9d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,56(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$17,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$7,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movdqu	(%r13),%xmm2
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	60(%r15),%r9d
+	addb	%dl,%bl
+	movl	64(%rsi),%eax
+	addl	$1236535329,%r9d
+	xorl	%r8d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,60(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$22,%r9d
+	movl	%r10d,%r12d
+	pinsrw	$7,(%rdi,%rbx,4),%xmm1
+
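+# MD5 round 2: G(b,c,d) = (b&d)|(c&~d), constants T[17..32],
+# rotations 5/9/14/20; the RC4 interleave continues unchanged.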
+	addl	%r10d,%r9d
+	psllq	$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+	pxor	%xmm0,%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	4(%r15),%r8d
+	addb	%dl,%al
+	movl	68(%rsi),%ebx
+	addl	$4129170786,%r8d
+	xorl	%r10d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,64(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$5,%r8d
+	movl	%r9d,%r12d
+	movd	(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	pxor	%xmm1,%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	24(%r15),%r11d
+	addb	%dl,%bl
+	movl	72(%rsi),%eax
+	addl	$3225465664,%r11d
+	xorl	%r9d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,68(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$9,%r11d
+	movl	%r8d,%r12d
+	movd	(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	44(%r15),%r10d
+	addb	%dl,%al
+	movl	76(%rsi),%ebx
+	addl	$643717713,%r10d
+	xorl	%r8d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,72(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$14,%r10d
+	movl	%r11d,%r12d
+	pinsrw	$1,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	0(%r15),%r9d
+	addb	%dl,%bl
+	movl	80(%rsi),%eax
+	addl	$3921069994,%r9d
+	xorl	%r11d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,76(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$20,%r9d
+	movl	%r10d,%r12d
+	pinsrw	$1,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	20(%r15),%r8d
+	addb	%dl,%al
+	movl	84(%rsi),%ebx
+	addl	$3593408605,%r8d
+	xorl	%r10d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,80(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$5,%r8d
+	movl	%r9d,%r12d
+	pinsrw	$2,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	40(%r15),%r11d
+	addb	%dl,%bl
+	movl	88(%rsi),%eax
+	addl	$38016083,%r11d
+	xorl	%r9d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,84(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$9,%r11d
+	movl	%r8d,%r12d
+	pinsrw	$2,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	60(%r15),%r10d
+	addb	%dl,%al
+	movl	92(%rsi),%ebx
+	addl	$3634488961,%r10d
+	xorl	%r8d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,88(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$14,%r10d
+	movl	%r11d,%r12d
+	pinsrw	$3,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	16(%r15),%r9d
+	addb	%dl,%bl
+	movl	96(%rsi),%eax
+	addl	$3889429448,%r9d
+	xorl	%r11d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,92(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$20,%r9d
+	movl	%r10d,%r12d
+	pinsrw	$3,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	36(%r15),%r8d
+	addb	%dl,%al
+	movl	100(%rsi),%ebx
+	addl	$568446438,%r8d
+	xorl	%r10d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,96(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$5,%r8d
+	movl	%r9d,%r12d
+	pinsrw	$4,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	56(%r15),%r11d
+	addb	%dl,%bl
+	movl	104(%rsi),%eax
+	addl	$3275163606,%r11d
+	xorl	%r9d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,100(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$9,%r11d
+	movl	%r8d,%r12d
+	pinsrw	$4,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	12(%r15),%r10d
+	addb	%dl,%al
+	movl	108(%rsi),%ebx
+	addl	$4107603335,%r10d
+	xorl	%r8d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,104(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$14,%r10d
+	movl	%r11d,%r12d
+	pinsrw	$5,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	32(%r15),%r9d
+	addb	%dl,%bl
+	movl	112(%rsi),%eax
+	addl	$1163531501,%r9d
+	xorl	%r11d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,108(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$20,%r9d
+	movl	%r10d,%r12d
+	pinsrw	$5,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r11d,%r12d
+	addl	52(%r15),%r8d
+	addb	%dl,%al
+	movl	116(%rsi),%ebx
+	addl	$2850285829,%r8d
+	xorl	%r10d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,112(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$5,%r8d
+	movl	%r9d,%r12d
+	pinsrw	$6,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r10d,%r12d
+	addl	8(%r15),%r11d
+	addb	%dl,%bl
+	movl	120(%rsi),%eax
+	addl	$4243563512,%r11d
+	xorl	%r9d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,116(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$9,%r11d
+	movl	%r8d,%r12d
+	pinsrw	$6,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	andl	%r9d,%r12d
+	addl	28(%r15),%r10d
+	addb	%dl,%al
+	movl	124(%rsi),%ebx
+	addl	$1735328473,%r10d
+	xorl	%r8d,%r12d
+	movzbl	%al,%eax
+	movl	%edx,120(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$14,%r10d
+	movl	%r11d,%r12d
+	pinsrw	$7,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movdqu	16(%r13),%xmm3
+	addb	$32,%bpl
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	andl	%r8d,%r12d
+	addl	48(%r15),%r9d
+	addb	%dl,%bl
+	movl	0(%rdi,%rbp,4),%eax
+	addl	$2368359562,%r9d
+	xorl	%r11d,%r12d
+	movzbl	%bl,%ebx
+	movl	%edx,124(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$20,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$7,(%rdi,%rbx,4),%xmm1
+
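+# MD5 round 3: H(b,c,d) = b^c^d, constants T[33..48], rotations 4/11/16/23.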
+	addl	%r10d,%r9d
+	movq	%rcx,%rsi
+	xorq	%rcx,%rcx
+	movb	%sil,%cl
+	leaq	(%rdi,%rbp,4),%rsi
+	psllq	$8,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pxor	%xmm0,%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r9d,%r12d
+	addl	20(%r15),%r8d
+	addb	%dl,%al
+	movl	4(%rsi),%ebx
+	addl	$4294588738,%r8d
+	movzbl	%al,%eax
+	addl	%r12d,%r8d
+	movl	%edx,0(%rsi)
+	addb	%bl,%cl
+	roll	$4,%r8d
+	movl	%r10d,%r12d
+	movd	(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	pxor	%xmm1,%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r8d,%r12d
+	addl	32(%r15),%r11d
+	addb	%dl,%bl
+	movl	8(%rsi),%eax
+	addl	$2272392833,%r11d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r11d
+	movl	%edx,4(%rsi)
+	addb	%al,%cl
+	roll	$11,%r11d
+	movl	%r9d,%r12d
+	movd	(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r11d,%r12d
+	addl	44(%r15),%r10d
+	addb	%dl,%al
+	movl	12(%rsi),%ebx
+	addl	$1839030562,%r10d
+	movzbl	%al,%eax
+	addl	%r12d,%r10d
+	movl	%edx,8(%rsi)
+	addb	%bl,%cl
+	roll	$16,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$1,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r10d,%r12d
+	addl	56(%r15),%r9d
+	addb	%dl,%bl
+	movl	16(%rsi),%eax
+	addl	$4259657740,%r9d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r9d
+	movl	%edx,12(%rsi)
+	addb	%al,%cl
+	roll	$23,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$1,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r9d,%r12d
+	addl	4(%r15),%r8d
+	addb	%dl,%al
+	movl	20(%rsi),%ebx
+	addl	$2763975236,%r8d
+	movzbl	%al,%eax
+	addl	%r12d,%r8d
+	movl	%edx,16(%rsi)
+	addb	%bl,%cl
+	roll	$4,%r8d
+	movl	%r10d,%r12d
+	pinsrw	$2,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r8d,%r12d
+	addl	16(%r15),%r11d
+	addb	%dl,%bl
+	movl	24(%rsi),%eax
+	addl	$1272893353,%r11d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r11d
+	movl	%edx,20(%rsi)
+	addb	%al,%cl
+	roll	$11,%r11d
+	movl	%r9d,%r12d
+	pinsrw	$2,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r11d,%r12d
+	addl	28(%r15),%r10d
+	addb	%dl,%al
+	movl	28(%rsi),%ebx
+	addl	$4139469664,%r10d
+	movzbl	%al,%eax
+	addl	%r12d,%r10d
+	movl	%edx,24(%rsi)
+	addb	%bl,%cl
+	roll	$16,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$3,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r10d,%r12d
+	addl	40(%r15),%r9d
+	addb	%dl,%bl
+	movl	32(%rsi),%eax
+	addl	$3200236656,%r9d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r9d
+	movl	%edx,28(%rsi)
+	addb	%al,%cl
+	roll	$23,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$3,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r9d,%r12d
+	addl	52(%r15),%r8d
+	addb	%dl,%al
+	movl	36(%rsi),%ebx
+	addl	$681279174,%r8d
+	movzbl	%al,%eax
+	addl	%r12d,%r8d
+	movl	%edx,32(%rsi)
+	addb	%bl,%cl
+	roll	$4,%r8d
+	movl	%r10d,%r12d
+	pinsrw	$4,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r8d,%r12d
+	addl	0(%r15),%r11d
+	addb	%dl,%bl
+	movl	40(%rsi),%eax
+	addl	$3936430074,%r11d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r11d
+	movl	%edx,36(%rsi)
+	addb	%al,%cl
+	roll	$11,%r11d
+	movl	%r9d,%r12d
+	pinsrw	$4,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r11d,%r12d
+	addl	12(%r15),%r10d
+	addb	%dl,%al
+	movl	44(%rsi),%ebx
+	addl	$3572445317,%r10d
+	movzbl	%al,%eax
+	addl	%r12d,%r10d
+	movl	%edx,40(%rsi)
+	addb	%bl,%cl
+	roll	$16,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$5,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r10d,%r12d
+	addl	24(%r15),%r9d
+	addb	%dl,%bl
+	movl	48(%rsi),%eax
+	addl	$76029189,%r9d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r9d
+	movl	%edx,44(%rsi)
+	addb	%al,%cl
+	roll	$23,%r9d
+	movl	%r11d,%r12d
+	pinsrw	$5,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r9d,%r12d
+	addl	36(%r15),%r8d
+	addb	%dl,%al
+	movl	52(%rsi),%ebx
+	addl	$3654602809,%r8d
+	movzbl	%al,%eax
+	addl	%r12d,%r8d
+	movl	%edx,48(%rsi)
+	addb	%bl,%cl
+	roll	$4,%r8d
+	movl	%r10d,%r12d
+	pinsrw	$6,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r8d,%r12d
+	addl	48(%r15),%r11d
+	addb	%dl,%bl
+	movl	56(%rsi),%eax
+	addl	$3873151461,%r11d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r11d
+	movl	%edx,52(%rsi)
+	addb	%al,%cl
+	roll	$11,%r11d
+	movl	%r9d,%r12d
+	pinsrw	$6,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	xorl	%r11d,%r12d
+	addl	60(%r15),%r10d
+	addb	%dl,%al
+	movl	60(%rsi),%ebx
+	addl	$530742520,%r10d
+	movzbl	%al,%eax
+	addl	%r12d,%r10d
+	movl	%edx,56(%rsi)
+	addb	%bl,%cl
+	roll	$16,%r10d
+	movl	%r8d,%r12d
+	pinsrw	$7,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movdqu	32(%r13),%xmm4
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	xorl	%r10d,%r12d
+	addl	8(%r15),%r9d
+	addb	%dl,%bl
+	movl	64(%rsi),%eax
+	addl	$3299628645,%r9d
+	movzbl	%bl,%ebx
+	addl	%r12d,%r9d
+	movl	%edx,60(%rsi)
+	addb	%al,%cl
+	roll	$23,%r9d
+	movl	$-1,%r12d
+	pinsrw	$7,(%rdi,%rbx,4),%xmm1
+
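+# MD5 round 4: I(b,c,d) = c^(b|~d), constants T[49..64], rotations
+# 6/10/15/21; %r12d is preloaded with -1 so a single xorl yields ~d.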
+	addl	%r10d,%r9d
+	psllq	$8,%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm4
+	pxor	%xmm0,%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r9d,%r12d
+	addl	0(%r15),%r8d
+	addb	%dl,%al
+	movl	68(%rsi),%ebx
+	addl	$4096336452,%r8d
+	movzbl	%al,%eax
+	xorl	%r10d,%r12d
+	movl	%edx,64(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$6,%r8d
+	movl	$-1,%r12d
+	movd	(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	pxor	%xmm1,%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r8d,%r12d
+	addl	28(%r15),%r11d
+	addb	%dl,%bl
+	movl	72(%rsi),%eax
+	addl	$1126891415,%r11d
+	movzbl	%bl,%ebx
+	xorl	%r9d,%r12d
+	movl	%edx,68(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$10,%r11d
+	movl	$-1,%r12d
+	movd	(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r11d,%r12d
+	addl	56(%r15),%r10d
+	addb	%dl,%al
+	movl	76(%rsi),%ebx
+	addl	$2878612391,%r10d
+	movzbl	%al,%eax
+	xorl	%r8d,%r12d
+	movl	%edx,72(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$15,%r10d
+	movl	$-1,%r12d
+	pinsrw	$1,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r10d,%r12d
+	addl	20(%r15),%r9d
+	addb	%dl,%bl
+	movl	80(%rsi),%eax
+	addl	$4237533241,%r9d
+	movzbl	%bl,%ebx
+	xorl	%r11d,%r12d
+	movl	%edx,76(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$21,%r9d
+	movl	$-1,%r12d
+	pinsrw	$1,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r9d,%r12d
+	addl	48(%r15),%r8d
+	addb	%dl,%al
+	movl	84(%rsi),%ebx
+	addl	$1700485571,%r8d
+	movzbl	%al,%eax
+	xorl	%r10d,%r12d
+	movl	%edx,80(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$6,%r8d
+	movl	$-1,%r12d
+	pinsrw	$2,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r8d,%r12d
+	addl	12(%r15),%r11d
+	addb	%dl,%bl
+	movl	88(%rsi),%eax
+	addl	$2399980690,%r11d
+	movzbl	%bl,%ebx
+	xorl	%r9d,%r12d
+	movl	%edx,84(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$10,%r11d
+	movl	$-1,%r12d
+	pinsrw	$2,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r11d,%r12d
+	addl	40(%r15),%r10d
+	addb	%dl,%al
+	movl	92(%rsi),%ebx
+	addl	$4293915773,%r10d
+	movzbl	%al,%eax
+	xorl	%r8d,%r12d
+	movl	%edx,88(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$15,%r10d
+	movl	$-1,%r12d
+	pinsrw	$3,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r10d,%r12d
+	addl	4(%r15),%r9d
+	addb	%dl,%bl
+	movl	96(%rsi),%eax
+	addl	$2240044497,%r9d
+	movzbl	%bl,%ebx
+	xorl	%r11d,%r12d
+	movl	%edx,92(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$21,%r9d
+	movl	$-1,%r12d
+	pinsrw	$3,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r9d,%r12d
+	addl	32(%r15),%r8d
+	addb	%dl,%al
+	movl	100(%rsi),%ebx
+	addl	$1873313359,%r8d
+	movzbl	%al,%eax
+	xorl	%r10d,%r12d
+	movl	%edx,96(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$6,%r8d
+	movl	$-1,%r12d
+	pinsrw	$4,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r8d,%r12d
+	addl	60(%r15),%r11d
+	addb	%dl,%bl
+	movl	104(%rsi),%eax
+	addl	$4264355552,%r11d
+	movzbl	%bl,%ebx
+	xorl	%r9d,%r12d
+	movl	%edx,100(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$10,%r11d
+	movl	$-1,%r12d
+	pinsrw	$4,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r11d,%r12d
+	addl	24(%r15),%r10d
+	addb	%dl,%al
+	movl	108(%rsi),%ebx
+	addl	$2734768916,%r10d
+	movzbl	%al,%eax
+	xorl	%r8d,%r12d
+	movl	%edx,104(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$15,%r10d
+	movl	$-1,%r12d
+	pinsrw	$5,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r10d,%r12d
+	addl	52(%r15),%r9d
+	addb	%dl,%bl
+	movl	112(%rsi),%eax
+	addl	$1309151649,%r9d
+	movzbl	%bl,%ebx
+	xorl	%r11d,%r12d
+	movl	%edx,108(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$21,%r9d
+	movl	$-1,%r12d
+	pinsrw	$5,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r11d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r9d,%r12d
+	addl	16(%r15),%r8d
+	addb	%dl,%al
+	movl	116(%rsi),%ebx
+	addl	$4149444226,%r8d
+	movzbl	%al,%eax
+	xorl	%r10d,%r12d
+	movl	%edx,112(%rsi)
+	addl	%r12d,%r8d
+	addb	%bl,%cl
+	roll	$6,%r8d
+	movl	$-1,%r12d
+	pinsrw	$6,(%rdi,%rax,4),%xmm0
+
+	addl	%r9d,%r8d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r10d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r8d,%r12d
+	addl	44(%r15),%r11d
+	addb	%dl,%bl
+	movl	120(%rsi),%eax
+	addl	$3174756917,%r11d
+	movzbl	%bl,%ebx
+	xorl	%r9d,%r12d
+	movl	%edx,116(%rsi)
+	addl	%r12d,%r11d
+	addb	%al,%cl
+	roll	$10,%r11d
+	movl	$-1,%r12d
+	pinsrw	$6,(%rdi,%rbx,4),%xmm1
+
+	addl	%r8d,%r11d
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r9d,%r12d
+	movl	%eax,(%rdi,%rcx,4)
+	orl	%r11d,%r12d
+	addl	8(%r15),%r10d
+	addb	%dl,%al
+	movl	124(%rsi),%ebx
+	addl	$718787259,%r10d
+	movzbl	%al,%eax
+	xorl	%r8d,%r12d
+	movl	%edx,120(%rsi)
+	addl	%r12d,%r10d
+	addb	%bl,%cl
+	roll	$15,%r10d
+	movl	$-1,%r12d
+	pinsrw	$7,(%rdi,%rax,4),%xmm0
+
+	addl	%r11d,%r10d
+	movdqu	48(%r13),%xmm5
+	addb	$32,%bpl
+	movl	(%rdi,%rcx,4),%edx
+	xorl	%r8d,%r12d
+	movl	%ebx,(%rdi,%rcx,4)
+	orl	%r10d,%r12d
+	addl	36(%r15),%r9d
+	addb	%dl,%bl
+	movl	0(%rdi,%rbp,4),%eax
+	addl	$3951481745,%r9d
+	movzbl	%bl,%ebx
+	xorl	%r11d,%r12d
+	movl	%edx,124(%rsi)
+	addl	%r12d,%r9d
+	addb	%al,%cl
+	roll	$21,%r9d
+	movl	$-1,%r12d
+	pinsrw	$7,(%rdi,%rbx,4),%xmm1
+
+	addl	%r10d,%r9d
+	movq	%rbp,%rsi
+	xorq	%rbp,%rbp
+	movb	%sil,%bpl
+	movq	%rcx,%rsi
+	xorq	%rcx,%rcx
+	movb	%sil,%cl
+	leaq	(%rdi,%rbp,4),%rsi
+	psllq	$8,%xmm1
+	pxor	%xmm0,%xmm5
+	pxor	%xmm1,%xmm5
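+# All 64 steps done: fold this block into the MD5 state saved at 0..12(%rsp).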
+	addl	0(%rsp),%r8d
+	addl	4(%rsp),%r9d
+	addl	8(%rsp),%r10d
+	addl	12(%rsp),%r11d
+
+	movdqu	%xmm2,(%r14,%r13,1)
+	movdqu	%xmm3,16(%r14,%r13,1)
+	movdqu	%xmm4,32(%r14,%r13,1)
+	movdqu	%xmm5,48(%r14,%r13,1)
+	leaq	64(%r15),%r15
+	leaq	64(%r13),%r13
+	cmpq	16(%rsp),%r15
+	jb	.Loop
+
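+# Store the final MD5 state through the pointer saved at 24(%rsp) and write
+# the RC4 indices back into the key (adjusted for the one-ahead prefetch).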
+	movq	24(%rsp),%r12
+	subb	%al,%cl
+	movl	%r8d,0(%r12)
+	movl	%r9d,4(%r12)
+	movl	%r10d,8(%r12)
+	movl	%r11d,12(%r12)
+	subb	$1,%bpl
+	movl	%ebp,-8(%rdi)
+	movl	%ecx,-4(%rdi)
+
+	movq	40(%rsp),%r15
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r13
+	movq	64(%rsp),%r12
+	movq	72(%rsp),%rbp
+	movq	80(%rsp),%rbx
+	leaq	88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+	.byte	0xf3,0xc3
+.size	rc4_md5_enc,.-rc4_md5_enc
diff --git a/crypto/rc4/asm/rc4-x86_64.S b/crypto/rc4/asm/rc4-x86_64.S
new file mode 100644
index 0000000..af16158
--- /dev/null
+++ b/crypto/rc4/asm/rc4-x86_64.S
@@ -0,0 +1,615 @@
+.text	
+
+
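+# RC4(RC4_KEY *key /* %rdi */, size_t len /* %rsi */,
+#     const unsigned char *in /* %rdx */, unsigned char *out /* %rcx */);
+# returns immediately when len is zero.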
+.globl	RC4
+.type	RC4,@function
+.align	16
+RC4:	orq	%rsi,%rsi
+	jne	.Lentry
+	.byte	0xf3,0xc3
+.Lentry:
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+.Lprologue:
+	movq	%rsi,%r11
+	movq	%rdx,%r12
+	movq	%rcx,%r13
+	xorq	%r10,%r10
+	xorq	%rcx,%rcx
+
+	leaq	8(%rdi),%rdi
+	movb	-8(%rdi),%r10b
+	movb	-4(%rdi),%cl
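+# A -1 marker at offset 256 of the S array (planted by the byte-layout key
+# schedule below) routes us to the RC4_CHAR code path.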
+	cmpl	$-1,256(%rdi)
+	je	.LRC4_CHAR
+	movl	OPENSSL_ia32cap_P(%rip),%r8d
+	xorq	%rbx,%rbx
+	incb	%r10b
+	subq	%r10,%rbx
+	subq	%r12,%r13
+	movl	(%rdi,%r10,4),%eax
+	testq	$-16,%r11
+	jz	.Lloop1
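+# Bit 30 of OPENSSL_ia32cap_P is the "Intel CPU" hint; Intel cores take the
+# 16-bytes-per-iteration SSE path at .Lintel.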
+	btl	$30,%r8d
+	jc	.Lintel
+	andq	$7,%rbx
+	leaq	1(%r10),%rsi
+	jz	.Loop8
+	subq	%rbx,%r11
+.Loop8_warmup:
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	%edx,(%rdi,%r10,4)
+	addb	%dl,%al
+	incb	%r10b
+	movl	(%rdi,%rax,4),%edx
+	movl	(%rdi,%r10,4),%eax
+	xorb	(%r12),%dl
+	movb	%dl,(%r13,%r12,1)
+	leaq	1(%r12),%r12
+	decq	%rbx
+	jnz	.Loop8_warmup
+
+	leaq	1(%r10),%rsi
+	jmp	.Loop8
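+# 8 bytes per iteration: each unrolled step performs the RC4 swap and
+# rotates one keystream byte into %r8; the assembled quadword is XORed
+# against the input at the bottom (%r13 holds out - in).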
+.align	16
+.Loop8:
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	0(%rdi,%rsi,4),%ebx
+	rorq	$8,%r8
+	movl	%edx,0(%rdi,%r10,4)
+	addb	%al,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	%bl,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	movl	4(%rdi,%rsi,4),%eax
+	rorq	$8,%r8
+	movl	%edx,4(%rdi,%r10,4)
+	addb	%bl,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	8(%rdi,%rsi,4),%ebx
+	rorq	$8,%r8
+	movl	%edx,8(%rdi,%r10,4)
+	addb	%al,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	%bl,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	movl	12(%rdi,%rsi,4),%eax
+	rorq	$8,%r8
+	movl	%edx,12(%rdi,%r10,4)
+	addb	%bl,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	16(%rdi,%rsi,4),%ebx
+	rorq	$8,%r8
+	movl	%edx,16(%rdi,%r10,4)
+	addb	%al,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	%bl,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	movl	20(%rdi,%rsi,4),%eax
+	rorq	$8,%r8
+	movl	%edx,20(%rdi,%r10,4)
+	addb	%bl,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	24(%rdi,%rsi,4),%ebx
+	rorq	$8,%r8
+	movl	%edx,24(%rdi,%r10,4)
+	addb	%al,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	$8,%sil
+	addb	%bl,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	movl	-4(%rdi,%rsi,4),%eax
+	rorq	$8,%r8
+	movl	%edx,28(%rdi,%r10,4)
+	addb	%bl,%dl
+	movb	(%rdi,%rdx,4),%r8b
+	addb	$8,%r10b
+	rorq	$8,%r8
+	subq	$8,%r11
+
+	xorq	(%r12),%r8
+	movq	%r8,(%r13,%r12,1)
+	leaq	8(%r12),%r12
+
+	testq	$-8,%r11
+	jnz	.Loop8
+	cmpq	$0,%r11
+	jne	.Lloop1
+	jmp	.Lexit
+
+.align	16
+.Lintel:
+	testq	$-32,%r11
+	jz	.Lloop1
+	andq	$15,%rbx
+	jz	.Loop16_is_hot
+	subq	%rbx,%r11
+.Loop16_warmup:
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	%edx,(%rdi,%r10,4)
+	addb	%dl,%al
+	incb	%r10b
+	movl	(%rdi,%rax,4),%edx
+	movl	(%rdi,%r10,4),%eax
+	xorb	(%r12),%dl
+	movb	%dl,(%r13,%r12,1)
+	leaq	1(%r12),%r12
+	decq	%rbx
+	jnz	.Loop16_warmup
+
+	movq	%rcx,%rbx
+	xorq	%rcx,%rcx
+	movb	%bl,%cl
+
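+# 16 bytes per iteration: keystream bytes land in %xmm0 (even positions)
+# and %xmm1 (odd positions) via pinsrw; psllq $8 aligns the odd bytes and
+# two pxors merge them with the 16 input bytes in %xmm2.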
+.Loop16_is_hot:
+	leaq	(%rdi,%r10,4),%rsi
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	pxor	%xmm0,%xmm0
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	4(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,0(%rsi)
+	addb	%bl,%cl
+	pinsrw	$0,(%rdi,%rax,4),%xmm0
+	jmp	.Loop16_enter
+.align	16
+.Loop16:
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	pxor	%xmm0,%xmm2
+	psllq	$8,%xmm1
+	pxor	%xmm0,%xmm0
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	4(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,0(%rsi)
+	pxor	%xmm1,%xmm2
+	addb	%bl,%cl
+	pinsrw	$0,(%rdi,%rax,4),%xmm0
+	movdqu	%xmm2,(%r13,%r12,1)
+	leaq	16(%r12),%r12
+.Loop16_enter:
+	movl	(%rdi,%rcx,4),%edx
+	pxor	%xmm1,%xmm1
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	8(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,4(%rsi)
+	addb	%al,%cl
+	pinsrw	$0,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	12(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,8(%rsi)
+	addb	%bl,%cl
+	pinsrw	$1,(%rdi,%rax,4),%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	16(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,12(%rsi)
+	addb	%al,%cl
+	pinsrw	$1,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	20(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,16(%rsi)
+	addb	%bl,%cl
+	pinsrw	$2,(%rdi,%rax,4),%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	24(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,20(%rsi)
+	addb	%al,%cl
+	pinsrw	$2,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	28(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,24(%rsi)
+	addb	%bl,%cl
+	pinsrw	$3,(%rdi,%rax,4),%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	32(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,28(%rsi)
+	addb	%al,%cl
+	pinsrw	$3,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	36(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,32(%rsi)
+	addb	%bl,%cl
+	pinsrw	$4,(%rdi,%rax,4),%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	40(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,36(%rsi)
+	addb	%al,%cl
+	pinsrw	$4,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	44(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,40(%rsi)
+	addb	%bl,%cl
+	pinsrw	$5,(%rdi,%rax,4),%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	48(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,44(%rsi)
+	addb	%al,%cl
+	pinsrw	$5,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	52(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,48(%rsi)
+	addb	%bl,%cl
+	pinsrw	$6,(%rdi,%rax,4),%xmm0
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movl	56(%rsi),%eax
+	movzbl	%bl,%ebx
+	movl	%edx,52(%rsi)
+	addb	%al,%cl
+	pinsrw	$6,(%rdi,%rbx,4),%xmm1
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	addb	%dl,%al
+	movl	60(%rsi),%ebx
+	movzbl	%al,%eax
+	movl	%edx,56(%rsi)
+	addb	%bl,%cl
+	pinsrw	$7,(%rdi,%rax,4),%xmm0
+	addb	$16,%r10b
+	movdqu	(%r12),%xmm2
+	movl	(%rdi,%rcx,4),%edx
+	movl	%ebx,(%rdi,%rcx,4)
+	addb	%dl,%bl
+	movzbl	%bl,%ebx
+	movl	%edx,60(%rsi)
+	leaq	(%rdi,%r10,4),%rsi
+	pinsrw	$7,(%rdi,%rbx,4),%xmm1
+	movl	(%rsi),%eax
+	movq	%rcx,%rbx
+	xorq	%rcx,%rcx
+	subq	$16,%r11
+	movb	%bl,%cl
+	testq	$-16,%r11
+	jnz	.Loop16
+
+	psllq	$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,(%r13,%r12,1)
+	leaq	16(%r12),%r12
+
+	cmpq	$0,%r11
+	jne	.Lloop1
+	jmp	.Lexit
+
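+# Tail: the classic one-byte-at-a-time RC4 loop.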
+.align	16
+.Lloop1:
+	addb	%al,%cl
+	movl	(%rdi,%rcx,4),%edx
+	movl	%eax,(%rdi,%rcx,4)
+	movl	%edx,(%rdi,%r10,4)
+	addb	%dl,%al
+	incb	%r10b
+	movl	(%rdi,%rax,4),%edx
+	movl	(%rdi,%r10,4),%eax
+	xorb	(%r12),%dl
+	movb	%dl,(%r13,%r12,1)
+	leaq	1(%r12),%r12
+	decq	%r11
+	jnz	.Lloop1
+	jmp	.Lexit
+
+.align	16
+.LRC4_CHAR:
+	addb	$1,%r10b
+	movzbl	(%rdi,%r10,1),%eax
+	testq	$-8,%r11
+	jz	.Lcloop1
+	jmp	.Lcloop8
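+# RC4_CHAR layout, 8 bytes per iteration.  The .LcmovN fixups handle the
+# aliasing case j == i+1, where the prefetched S[i+1] is stale and must be
+# replaced by the value just stored.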
+.align	16
+.Lcloop8:
+	movl	(%r12),%r8d
+	movl	4(%r12),%r9d
+	addb	%al,%cl
+	leaq	1(%r10),%rsi
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%sil,%esi
+	movzbl	(%rdi,%rsi,1),%ebx
+	movb	%al,(%rdi,%rcx,1)
+	cmpq	%rsi,%rcx
+	movb	%dl,(%rdi,%r10,1)
+	jne	.Lcmov0			
+	movq	%rax,%rbx
+.Lcmov0:
+	addb	%al,%dl
+	xorb	(%rdi,%rdx,1),%r8b
+	rorl	$8,%r8d
+	addb	%bl,%cl
+	leaq	1(%rsi),%r10
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%r10b,%r10d
+	movzbl	(%rdi,%r10,1),%eax
+	movb	%bl,(%rdi,%rcx,1)
+	cmpq	%r10,%rcx
+	movb	%dl,(%rdi,%rsi,1)
+	jne	.Lcmov1			
+	movq	%rbx,%rax
+.Lcmov1:
+	addb	%bl,%dl
+	xorb	(%rdi,%rdx,1),%r8b
+	rorl	$8,%r8d
+	addb	%al,%cl
+	leaq	1(%r10),%rsi
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%sil,%esi
+	movzbl	(%rdi,%rsi,1),%ebx
+	movb	%al,(%rdi,%rcx,1)
+	cmpq	%rsi,%rcx
+	movb	%dl,(%rdi,%r10,1)
+	jne	.Lcmov2			
+	movq	%rax,%rbx
+.Lcmov2:
+	addb	%al,%dl
+	xorb	(%rdi,%rdx,1),%r8b
+	rorl	$8,%r8d
+	addb	%bl,%cl
+	leaq	1(%rsi),%r10
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%r10b,%r10d
+	movzbl	(%rdi,%r10,1),%eax
+	movb	%bl,(%rdi,%rcx,1)
+	cmpq	%r10,%rcx
+	movb	%dl,(%rdi,%rsi,1)
+	jne	.Lcmov3			
+	movq	%rbx,%rax
+.Lcmov3:
+	addb	%bl,%dl
+	xorb	(%rdi,%rdx,1),%r8b
+	rorl	$8,%r8d
+	addb	%al,%cl
+	leaq	1(%r10),%rsi
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%sil,%esi
+	movzbl	(%rdi,%rsi,1),%ebx
+	movb	%al,(%rdi,%rcx,1)
+	cmpq	%rsi,%rcx
+	movb	%dl,(%rdi,%r10,1)
+	jne	.Lcmov4			
+	movq	%rax,%rbx
+.Lcmov4:
+	addb	%al,%dl
+	xorb	(%rdi,%rdx,1),%r9b
+	rorl	$8,%r9d
+	addb	%bl,%cl
+	leaq	1(%rsi),%r10
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%r10b,%r10d
+	movzbl	(%rdi,%r10,1),%eax
+	movb	%bl,(%rdi,%rcx,1)
+	cmpq	%r10,%rcx
+	movb	%dl,(%rdi,%rsi,1)
+	jne	.Lcmov5			
+	movq	%rbx,%rax
+.Lcmov5:
+	addb	%bl,%dl
+	xorb	(%rdi,%rdx,1),%r9b
+	rorl	$8,%r9d
+	addb	%al,%cl
+	leaq	1(%r10),%rsi
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%sil,%esi
+	movzbl	(%rdi,%rsi,1),%ebx
+	movb	%al,(%rdi,%rcx,1)
+	cmpq	%rsi,%rcx
+	movb	%dl,(%rdi,%r10,1)
+	jne	.Lcmov6			
+	movq	%rax,%rbx
+.Lcmov6:
+	addb	%al,%dl
+	xorb	(%rdi,%rdx,1),%r9b
+	rorl	$8,%r9d
+	addb	%bl,%cl
+	leaq	1(%rsi),%r10
+	movzbl	(%rdi,%rcx,1),%edx
+	movzbl	%r10b,%r10d
+	movzbl	(%rdi,%r10,1),%eax
+	movb	%bl,(%rdi,%rcx,1)
+	cmpq	%r10,%rcx
+	movb	%dl,(%rdi,%rsi,1)
+	jne	.Lcmov7			
+	movq	%rbx,%rax
+.Lcmov7:
+	addb	%bl,%dl
+	xorb	(%rdi,%rdx,1),%r9b
+	rorl	$8,%r9d
+	leaq	-8(%r11),%r11
+	movl	%r8d,(%r13)
+	leaq	8(%r12),%r12
+	movl	%r9d,4(%r13)
+	leaq	8(%r13),%r13
+
+	testq	$-8,%r11
+	jnz	.Lcloop8
+	cmpq	$0,%r11
+	jne	.Lcloop1
+	jmp	.Lexit
+.align	16
+.Lcloop1:
+	addb	%al,%cl
+	movzbl	%cl,%ecx
+	movzbl	(%rdi,%rcx,1),%edx
+	movb	%al,(%rdi,%rcx,1)
+	movb	%dl,(%rdi,%r10,1)
+	addb	%al,%dl
+	addb	$1,%r10b
+	movzbl	%dl,%edx
+	movzbl	%r10b,%r10d
+	movzbl	(%rdi,%rdx,1),%edx
+	movzbl	(%rdi,%r10,1),%eax
+	xorb	(%r12),%dl
+	leaq	1(%r12),%r12
+	movb	%dl,(%r13)
+	leaq	1(%r13),%r13
+	subq	$1,%r11
+	jnz	.Lcloop1
+	jmp	.Lexit
+
+.align	16
+.Lexit:
+	subb	$1,%r10b
+	movl	%r10d,-8(%rdi)
+	movl	%ecx,-4(%rdi)
+
+	movq	(%rsp),%r13
+	movq	8(%rsp),%r12
+	movq	16(%rsp),%rbx
+	addq	$24,%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	RC4,.-RC4
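+# Standard RC4 key schedule (KSA).  Bit 20 of OPENSSL_ia32cap_P hints that
+# the byte-array (RC4_CHAR) layout is preferable on this core; that path
+# plants the -1 marker tested by RC4 above.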
+.globl	private_RC4_set_key
+.type	private_RC4_set_key,@function
+.align	16
+private_RC4_set_key:
+	leaq	8(%rdi),%rdi
+	leaq	(%rdx,%rsi,1),%rdx
+	negq	%rsi
+	movq	%rsi,%rcx
+	xorl	%eax,%eax
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+
+	movl	OPENSSL_ia32cap_P(%rip),%r8d
+	btl	$20,%r8d
+	jc	.Lc1stloop
+	jmp	.Lw1stloop
+
+.align	16
+.Lw1stloop:
+	movl	%eax,(%rdi,%rax,4)
+	addb	$1,%al
+	jnc	.Lw1stloop
+
+	xorq	%r9,%r9
+	xorq	%r8,%r8
+.align	16
+.Lw2ndloop:
+	movl	(%rdi,%r9,4),%r10d
+	addb	(%rdx,%rsi,1),%r8b
+	addb	%r10b,%r8b
+	addq	$1,%rsi
+	movl	(%rdi,%r8,4),%r11d
+	cmovzq	%rcx,%rsi
+	movl	%r10d,(%rdi,%r8,4)
+	movl	%r11d,(%rdi,%r9,4)
+	addb	$1,%r9b
+	jnc	.Lw2ndloop
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	movb	%al,(%rdi,%rax,1)
+	addb	$1,%al
+	jnc	.Lc1stloop
+
+	xorq	%r9,%r9
+	xorq	%r8,%r8
+.align	16
+.Lc2ndloop:
+	movb	(%rdi,%r9,1),%r10b
+	addb	(%rdx,%rsi,1),%r8b
+	addb	%r10b,%r8b
+	addq	$1,%rsi
+	movb	(%rdi,%r8,1),%r11b
+	jnz	.Lcnowrap
+	movq	%rcx,%rsi
+.Lcnowrap:
+	movb	%r10b,(%rdi,%r8,1)
+	movb	%r11b,(%rdi,%r9,1)
+	addb	$1,%r9b
+	jnc	.Lc2ndloop
+	movl	$-1,256(%rdi)
+
+.align	16
+.Lexit_key:
+	xorl	%eax,%eax
+	movl	%eax,-8(%rdi)
+	movl	%eax,-4(%rdi)
+	.byte	0xf3,0xc3
+.size	private_RC4_set_key,.-private_RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,@function
+.align	16
+RC4_options:
+	leaq	.Lopts(%rip),%rax
+	movl	OPENSSL_ia32cap_P(%rip),%edx
+	btl	$20,%edx
+	jc	.L8xchar
+	btl	$30,%edx
+	jnc	.Ldone
+	addq	$25,%rax
+	.byte	0xf3,0xc3
+.L8xchar:
+	addq	$12,%rax
+.Ldone:
+	.byte	0xf3,0xc3
+.align	64
+.Lopts:
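+# The strings below decode to "rc4(8x,int)", "rc4(8x,char)", "rc4(16x,int)"
+# and "RC4 for x86_64, CRYPTOGAMS by <appro@openssl.org>".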
+.byte	114,99,52,40,56,120,44,105,110,116,41,0
+.byte	114,99,52,40,56,120,44,99,104,97,114,41,0
+.byte	114,99,52,40,49,54,120,44,105,110,116,41,0
+.byte	82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+.size	RC4_options,.-RC4_options
diff --git a/crypto/sha/asm/sha1-x86_64.S b/crypto/sha/asm/sha1-x86_64.S
new file mode 100644
index 0000000..3922e20
--- /dev/null
+++ b/crypto/sha/asm/sha1-x86_64.S
@@ -0,0 +1,2486 @@
+.text	
+
+
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,@function
+.align	16
+sha1_block_data_order:
+	movl	OPENSSL_ia32cap_P+0(%rip),%r9d
+	movl	OPENSSL_ia32cap_P+4(%rip),%r8d
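+# Bit 9 (mask 512) of the CPUID.1:ECX feature word is SSSE3; take the
+# SSSE3 code path when it is available.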
+	testl	$512,%r8d
+	jz	.Lialu
+	jmp	_ssse3_shortcut
+
+.align	16
+.Lialu:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	movq	%rsp,%r11
+	movq	%rdi,%r8
+	subq	$72,%rsp
+	movq	%rsi,%r9
+	andq	$-64,%rsp
+	movq	%rdx,%r10
+	movq	%r11,64(%rsp)
+.Lprologue:
+
+	movl	0(%r8),%esi
+	movl	4(%r8),%edi
+	movl	8(%r8),%r11d
+	movl	12(%r8),%r12d
+	movl	16(%r8),%r13d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
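+# Rounds 0-15, K = 0x5A827999 (1518500249): byte-swap each message word
+# into the stack-based schedule; Ch(b,c,d) is computed as ((c^d)&b)^d.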
+	movl	0(%r9),%edx
+	bswapl	%edx
+	movl	%edx,0(%rsp)
+	movl	%r11d,%eax
+	movl	4(%r9),%ebp
+	movl	%esi,%ecx
+	xorl	%r12d,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%r13,1),%r13d
+	andl	%edi,%eax
+	movl	%ebp,4(%rsp)
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	%edi,%eax
+	movl	8(%r9),%edx
+	movl	%r13d,%ecx
+	xorl	%r11d,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%r12,1),%r12d
+	andl	%esi,%eax
+	movl	%edx,8(%rsp)
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	%esi,%eax
+	movl	12(%r9),%ebp
+	movl	%r12d,%ecx
+	xorl	%edi,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%r11,1),%r11d
+	andl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	%r13d,%eax
+	movl	16(%r9),%edx
+	movl	%r11d,%ecx
+	xorl	%esi,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%rdi,1),%edi
+	andl	%r12d,%eax
+	movl	%edx,16(%rsp)
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	%r12d,%eax
+	movl	20(%r9),%ebp
+	movl	%edi,%ecx
+	xorl	%r13d,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%rsi,1),%esi
+	andl	%r11d,%eax
+	movl	%ebp,20(%rsp)
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	%r11d,%eax
+	movl	24(%r9),%edx
+	movl	%esi,%ecx
+	xorl	%r12d,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%r13,1),%r13d
+	andl	%edi,%eax
+	movl	%edx,24(%rsp)
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	%edi,%eax
+	movl	28(%r9),%ebp
+	movl	%r13d,%ecx
+	xorl	%r11d,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%r12,1),%r12d
+	andl	%esi,%eax
+	movl	%ebp,28(%rsp)
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	%esi,%eax
+	movl	32(%r9),%edx
+	movl	%r12d,%ecx
+	xorl	%edi,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%r11,1),%r11d
+	andl	%r13d,%eax
+	movl	%edx,32(%rsp)
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	%r13d,%eax
+	movl	36(%r9),%ebp
+	movl	%r11d,%ecx
+	xorl	%esi,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%rdi,1),%edi
+	andl	%r12d,%eax
+	movl	%ebp,36(%rsp)
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	%r12d,%eax
+	movl	40(%r9),%edx
+	movl	%edi,%ecx
+	xorl	%r13d,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%rsi,1),%esi
+	andl	%r11d,%eax
+	movl	%edx,40(%rsp)
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	%r11d,%eax
+	movl	44(%r9),%ebp
+	movl	%esi,%ecx
+	xorl	%r12d,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%r13,1),%r13d
+	andl	%edi,%eax
+	movl	%ebp,44(%rsp)
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	%edi,%eax
+	movl	48(%r9),%edx
+	movl	%r13d,%ecx
+	xorl	%r11d,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%r12,1),%r12d
+	andl	%esi,%eax
+	movl	%edx,48(%rsp)
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	%esi,%eax
+	movl	52(%r9),%ebp
+	movl	%r12d,%ecx
+	xorl	%edi,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%r11,1),%r11d
+	andl	%r13d,%eax
+	movl	%ebp,52(%rsp)
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	%r13d,%eax
+	movl	56(%r9),%edx
+	movl	%r11d,%ecx
+	xorl	%esi,%eax
+	bswapl	%edx
+	roll	$5,%ecx
+	leal	1518500249(%rbp,%rdi,1),%edi
+	andl	%r12d,%eax
+	movl	%edx,56(%rsp)
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	%r12d,%eax
+	movl	60(%r9),%ebp
+	movl	%edi,%ecx
+	xorl	%r13d,%eax
+	bswapl	%ebp
+	roll	$5,%ecx
+	leal	1518500249(%rdx,%rsi,1),%esi
+	andl	%r11d,%eax
+	movl	%ebp,60(%rsp)
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
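+# Rounds 16-19: same K and Ch, but now the schedule is updated in place:
+# W[t] = rol(W[t-3]^W[t-8]^W[t-14]^W[t-16], 1).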
+	movl	0(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	8(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%edx
+	andl	%edi,%eax
+	leal	1518500249(%rbp,%r13,1),%r13d
+	xorl	52(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$1,%edx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	movl	%edx,0(%rsp)
+	addl	%eax,%r13d
+	movl	4(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	12(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%ebp
+	andl	%esi,%eax
+	leal	1518500249(%rdx,%r12,1),%r12d
+	xorl	56(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$1,%ebp
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	movl	%ebp,4(%rsp)
+	addl	%eax,%r12d
+	movl	8(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	andl	%r13d,%eax
+	leal	1518500249(%rbp,%r11,1),%r11d
+	xorl	60(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$1,%edx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	movl	%edx,8(%rsp)
+	addl	%eax,%r11d
+	movl	12(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	xorl	0(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$1,%ebp
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	movl	%ebp,12(%rsp)
+	addl	%eax,%edi
+	movl	16(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%edx
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	xorl	4(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$1,%edx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	movl	%edx,16(%rsp)
+	addl	%eax,%esi
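+# Rounds 20-39, K = 0x6ED9EBA1 (1859775393): Parity(b,c,d) = b^c^d.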
+	movl	20(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%r13,1),%r13d
+	xorl	52(%rsp),%ebp
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	8(%rsp),%ebp
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	movl	%ebp,20(%rsp)
+	movl	24(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	32(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%r12,1),%r12d
+	xorl	56(%rsp),%edx
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	12(%rsp),%edx
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	movl	%edx,24(%rsp)
+	movl	28(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	36(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%r11,1),%r11d
+	xorl	60(%rsp),%ebp
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	16(%rsp),%ebp
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	movl	%ebp,28(%rsp)
+	movl	32(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%rdi,1),%edi
+	xorl	0(%rsp),%edx
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	20(%rsp),%edx
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	movl	%edx,32(%rsp)
+	movl	36(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	4(%rsp),%ebp
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	24(%rsp),%ebp
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	movl	%ebp,36(%rsp)
+	movl	40(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	48(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%r13,1),%r13d
+	xorl	8(%rsp),%edx
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	28(%rsp),%edx
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	movl	%edx,40(%rsp)
+	movl	44(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	52(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	12(%rsp),%ebp
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	32(%rsp),%ebp
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	movl	%ebp,44(%rsp)
+	movl	48(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	56(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	16(%rsp),%edx
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	36(%rsp),%edx
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	movl	%edx,48(%rsp)
+	movl	52(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%rdi,1),%edi
+	xorl	20(%rsp),%ebp
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	40(%rsp),%ebp
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%ebp,52(%rsp)
+	movl	56(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%rsi,1),%esi
+	xorl	24(%rsp),%edx
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	44(%rsp),%edx
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	movl	%edx,56(%rsp)
+	movl	60(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%r13,1),%r13d
+	xorl	28(%rsp),%ebp
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	48(%rsp),%ebp
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	movl	%ebp,60(%rsp)
+	movl	0(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%r12,1),%r12d
+	xorl	32(%rsp),%edx
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	52(%rsp),%edx
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	movl	%edx,0(%rsp)
+	movl	4(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%r11,1),%r11d
+	xorl	36(%rsp),%ebp
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	56(%rsp),%ebp
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	movl	%ebp,4(%rsp)
+	movl	8(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%rdi,1),%edi
+	xorl	40(%rsp),%edx
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	60(%rsp),%edx
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	movl	%edx,8(%rsp)
+	movl	12(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	44(%rsp),%ebp
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	0(%rsp),%ebp
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	movl	%ebp,12(%rsp)
+	movl	16(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%r13,1),%r13d
+	xorl	48(%rsp),%edx
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	4(%rsp),%edx
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	movl	%edx,16(%rsp)
+	movl	20(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	52(%rsp),%ebp
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	8(%rsp),%ebp
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	movl	%ebp,20(%rsp)
+	movl	24(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	56(%rsp),%edx
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	12(%rsp),%edx
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	movl	%edx,24(%rsp)
+	movl	28(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rdx,%rdi,1),%edi
+	xorl	60(%rsp),%ebp
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	16(%rsp),%ebp
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%ebp,28(%rsp)
+	movl	32(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	1859775393(%rbp,%rsi,1),%esi
+	xorl	0(%rsp),%edx
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	20(%rsp),%edx
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	movl	%edx,32(%rsp)
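+# Rounds 40-59, K = 0x8F1BBCDC (-1894007588): Maj(b,c,d) is computed
+# as (b&c) + ((b^c)&d).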
+	movl	36(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%r11d,%ebx
+	xorl	44(%rsp),%ebp
+	andl	%r12d,%eax
+	movl	%esi,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r12d,%ebx
+	leal	-1894007588(%rdx,%r13,1),%r13d
+	roll	$5,%ecx
+	xorl	24(%rsp),%ebp
+	addl	%eax,%r13d
+	andl	%edi,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%r13d
+	roll	$30,%edi
+	movl	%ebp,36(%rsp)
+	addl	%ecx,%r13d
+	movl	40(%rsp),%edx
+	movl	%edi,%eax
+	movl	%edi,%ebx
+	xorl	48(%rsp),%edx
+	andl	%r11d,%eax
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%edx
+	xorl	%r11d,%ebx
+	leal	-1894007588(%rbp,%r12,1),%r12d
+	roll	$5,%ecx
+	xorl	28(%rsp),%edx
+	addl	%eax,%r12d
+	andl	%esi,%ebx
+	roll	$1,%edx
+	addl	%ebx,%r12d
+	roll	$30,%esi
+	movl	%edx,40(%rsp)
+	addl	%ecx,%r12d
+	movl	44(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%esi,%ebx
+	xorl	52(%rsp),%ebp
+	andl	%edi,%eax
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%ebp
+	xorl	%edi,%ebx
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	addl	%eax,%r11d
+	andl	%r13d,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%r11d
+	roll	$30,%r13d
+	movl	%ebp,44(%rsp)
+	addl	%ecx,%r11d
+	movl	48(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r13d,%ebx
+	xorl	56(%rsp),%edx
+	andl	%esi,%eax
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%esi,%ebx
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	roll	$5,%ecx
+	xorl	36(%rsp),%edx
+	addl	%eax,%edi
+	andl	%r12d,%ebx
+	roll	$1,%edx
+	addl	%ebx,%edi
+	roll	$30,%r12d
+	movl	%edx,48(%rsp)
+	addl	%ecx,%edi
+	movl	52(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%r12d,%ebx
+	xorl	60(%rsp),%ebp
+	andl	%r13d,%eax
+	movl	%edi,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r13d,%ebx
+	leal	-1894007588(%rdx,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	40(%rsp),%ebp
+	addl	%eax,%esi
+	andl	%r11d,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%esi
+	roll	$30,%r11d
+	movl	%ebp,52(%rsp)
+	addl	%ecx,%esi
+	movl	56(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r11d,%ebx
+	xorl	0(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%esi,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%r12d,%ebx
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	roll	$5,%ecx
+	xorl	44(%rsp),%edx
+	addl	%eax,%r13d
+	andl	%edi,%ebx
+	roll	$1,%edx
+	addl	%ebx,%r13d
+	roll	$30,%edi
+	movl	%edx,56(%rsp)
+	addl	%ecx,%r13d
+	movl	60(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edi,%ebx
+	xorl	4(%rsp),%ebp
+	andl	%r11d,%eax
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%r11d,%ebx
+	leal	-1894007588(%rdx,%r12,1),%r12d
+	roll	$5,%ecx
+	xorl	48(%rsp),%ebp
+	addl	%eax,%r12d
+	andl	%esi,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%r12d
+	roll	$30,%esi
+	movl	%ebp,60(%rsp)
+	addl	%ecx,%r12d
+	movl	0(%rsp),%edx
+	movl	%esi,%eax
+	movl	%esi,%ebx
+	xorl	8(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%edx
+	xorl	%edi,%ebx
+	leal	-1894007588(%rbp,%r11,1),%r11d
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	addl	%eax,%r11d
+	andl	%r13d,%ebx
+	roll	$1,%edx
+	addl	%ebx,%r11d
+	roll	$30,%r13d
+	movl	%edx,0(%rsp)
+	addl	%ecx,%r11d
+	movl	4(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r13d,%ebx
+	xorl	12(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%ebp
+	xorl	%esi,%ebx
+	leal	-1894007588(%rdx,%rdi,1),%edi
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	addl	%eax,%edi
+	andl	%r12d,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%edi
+	roll	$30,%r12d
+	movl	%ebp,4(%rsp)
+	addl	%ecx,%edi
+	movl	8(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r12d,%ebx
+	xorl	16(%rsp),%edx
+	andl	%r13d,%eax
+	movl	%edi,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%r13d,%ebx
+	leal	-1894007588(%rbp,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	60(%rsp),%edx
+	addl	%eax,%esi
+	andl	%r11d,%ebx
+	roll	$1,%edx
+	addl	%ebx,%esi
+	roll	$30,%r11d
+	movl	%edx,8(%rsp)
+	addl	%ecx,%esi
+	movl	12(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%r11d,%ebx
+	xorl	20(%rsp),%ebp
+	andl	%r12d,%eax
+	movl	%esi,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%r12d,%ebx
+	leal	-1894007588(%rdx,%r13,1),%r13d
+	roll	$5,%ecx
+	xorl	0(%rsp),%ebp
+	addl	%eax,%r13d
+	andl	%edi,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%r13d
+	roll	$30,%edi
+	movl	%ebp,12(%rsp)
+	addl	%ecx,%r13d
+	movl	16(%rsp),%edx
+	movl	%edi,%eax
+	movl	%edi,%ebx
+	xorl	24(%rsp),%edx
+	andl	%r11d,%eax
+	movl	%r13d,%ecx
+	xorl	48(%rsp),%edx
+	xorl	%r11d,%ebx
+	leal	-1894007588(%rbp,%r12,1),%r12d
+	roll	$5,%ecx
+	xorl	4(%rsp),%edx
+	addl	%eax,%r12d
+	andl	%esi,%ebx
+	roll	$1,%edx
+	addl	%ebx,%r12d
+	roll	$30,%esi
+	movl	%edx,16(%rsp)
+	addl	%ecx,%r12d
+	movl	20(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%esi,%ebx
+	xorl	28(%rsp),%ebp
+	andl	%edi,%eax
+	movl	%r12d,%ecx
+	xorl	52(%rsp),%ebp
+	xorl	%edi,%ebx
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	roll	$5,%ecx
+	xorl	8(%rsp),%ebp
+	addl	%eax,%r11d
+	andl	%r13d,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%r11d
+	roll	$30,%r13d
+	movl	%ebp,20(%rsp)
+	addl	%ecx,%r11d
+	movl	24(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r13d,%ebx
+	xorl	32(%rsp),%edx
+	andl	%esi,%eax
+	movl	%r11d,%ecx
+	xorl	56(%rsp),%edx
+	xorl	%esi,%ebx
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	addl	%eax,%edi
+	andl	%r12d,%ebx
+	roll	$1,%edx
+	addl	%ebx,%edi
+	roll	$30,%r12d
+	movl	%edx,24(%rsp)
+	addl	%ecx,%edi
+	movl	28(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%r12d,%ebx
+	xorl	36(%rsp),%ebp
+	andl	%r13d,%eax
+	movl	%edi,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%r13d,%ebx
+	leal	-1894007588(%rdx,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	addl	%eax,%esi
+	andl	%r11d,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%esi
+	roll	$30,%r11d
+	movl	%ebp,28(%rsp)
+	addl	%ecx,%esi
+	movl	32(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r11d,%ebx
+	xorl	40(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%esi,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%r12d,%ebx
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	roll	$5,%ecx
+	xorl	20(%rsp),%edx
+	addl	%eax,%r13d
+	andl	%edi,%ebx
+	roll	$1,%edx
+	addl	%ebx,%r13d
+	roll	$30,%edi
+	movl	%edx,32(%rsp)
+	addl	%ecx,%r13d
+	movl	36(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edi,%ebx
+	xorl	44(%rsp),%ebp
+	andl	%r11d,%eax
+	movl	%r13d,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r11d,%ebx
+	leal	-1894007588(%rdx,%r12,1),%r12d
+	roll	$5,%ecx
+	xorl	24(%rsp),%ebp
+	addl	%eax,%r12d
+	andl	%esi,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%r12d
+	roll	$30,%esi
+	movl	%ebp,36(%rsp)
+	addl	%ecx,%r12d
+	movl	40(%rsp),%edx
+	movl	%esi,%eax
+	movl	%esi,%ebx
+	xorl	48(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r12d,%ecx
+	xorl	8(%rsp),%edx
+	xorl	%edi,%ebx
+	leal	-1894007588(%rbp,%r11,1),%r11d
+	roll	$5,%ecx
+	xorl	28(%rsp),%edx
+	addl	%eax,%r11d
+	andl	%r13d,%ebx
+	roll	$1,%edx
+	addl	%ebx,%r11d
+	roll	$30,%r13d
+	movl	%edx,40(%rsp)
+	addl	%ecx,%r11d
+	movl	44(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r13d,%ebx
+	xorl	52(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r11d,%ecx
+	xorl	12(%rsp),%ebp
+	xorl	%esi,%ebx
+	leal	-1894007588(%rdx,%rdi,1),%edi
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	addl	%eax,%edi
+	andl	%r12d,%ebx
+	roll	$1,%ebp
+	addl	%ebx,%edi
+	roll	$30,%r12d
+	movl	%ebp,44(%rsp)
+	addl	%ecx,%edi
+	movl	48(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r12d,%ebx
+	xorl	56(%rsp),%edx
+	andl	%r13d,%eax
+	movl	%edi,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%r13d,%ebx
+	leal	-1894007588(%rbp,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	36(%rsp),%edx
+	addl	%eax,%esi
+	andl	%r11d,%ebx
+	roll	$1,%edx
+	addl	%ebx,%esi
+	roll	$30,%r11d
+	movl	%edx,48(%rsp)
+	addl	%ecx,%esi
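+# Rounds 60-79, K = 0xCA62C1D6 (-899497514): Parity again.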
+	movl	52(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	20(%rsp),%ebp
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	40(%rsp),%ebp
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	movl	%ebp,52(%rsp)
+	movl	56(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	24(%rsp),%edx
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	44(%rsp),%edx
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	movl	%edx,56(%rsp)
+	movl	60(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%r11,1),%r11d
+	xorl	28(%rsp),%ebp
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	48(%rsp),%ebp
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	movl	%ebp,60(%rsp)
+	movl	0(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	8(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%rdi,1),%edi
+	xorl	32(%rsp),%edx
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	52(%rsp),%edx
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	movl	%edx,0(%rsp)
+	movl	4(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	12(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%rsi,1),%esi
+	xorl	36(%rsp),%ebp
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	56(%rsp),%ebp
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	movl	%ebp,4(%rsp)
+	movl	8(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%r13,1),%r13d
+	xorl	40(%rsp),%edx
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	60(%rsp),%edx
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	movl	%edx,8(%rsp)
+	movl	12(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%r12,1),%r12d
+	xorl	44(%rsp),%ebp
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	0(%rsp),%ebp
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	movl	%ebp,12(%rsp)
+	movl	16(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%r11,1),%r11d
+	xorl	48(%rsp),%edx
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	4(%rsp),%edx
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	movl	%edx,16(%rsp)
+	movl	20(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	52(%rsp),%ebp
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	8(%rsp),%ebp
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%ebp,20(%rsp)
+	movl	24(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	32(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%rsi,1),%esi
+	xorl	56(%rsp),%edx
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	12(%rsp),%edx
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	movl	%edx,24(%rsp)
+	movl	28(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	36(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	60(%rsp),%ebp
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	16(%rsp),%ebp
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	movl	%ebp,28(%rsp)
+	movl	32(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	0(%rsp),%edx
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	20(%rsp),%edx
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	movl	%edx,32(%rsp)
+	movl	36(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%r11,1),%r11d
+	xorl	4(%rsp),%ebp
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	24(%rsp),%ebp
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	movl	%ebp,36(%rsp)
+	movl	40(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	48(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%rdi,1),%edi
+	xorl	8(%rsp),%edx
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	28(%rsp),%edx
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	movl	%edx,40(%rsp)
+	movl	44(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	52(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%rsi,1),%esi
+	xorl	12(%rsp),%ebp
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	xorl	32(%rsp),%ebp
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	movl	%ebp,44(%rsp)
+	movl	48(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	56(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%r13,1),%r13d
+	xorl	16(%rsp),%edx
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	xorl	36(%rsp),%edx
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	movl	%edx,48(%rsp)
+	movl	52(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%r12,1),%r12d
+	xorl	20(%rsp),%ebp
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	xorl	40(%rsp),%ebp
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	movl	56(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rbp,%r11,1),%r11d
+	xorl	24(%rsp),%edx
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	xorl	44(%rsp),%edx
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	movl	60(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	28(%rsp),%ebp
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	xorl	48(%rsp),%ebp
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	%r11d,%eax
+	leal	-899497514(%rbp,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
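+# Add the working variables back into the hash state and loop over the
+# remaining 64-byte blocks.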
+	addl	0(%r8),%esi
+	addl	4(%r8),%edi
+	addl	8(%r8),%r11d
+	addl	12(%r8),%r12d
+	addl	16(%r8),%r13d
+	movl	%esi,0(%r8)
+	movl	%edi,4(%r8)
+	movl	%r11d,8(%r8)
+	movl	%r12d,12(%r8)
+	movl	%r13d,16(%r8)
+
+	subq	$1,%r10
+	leaq	64(%r9),%r9
+	jnz	.Lloop
+
+	movq	64(%rsp),%rsi
+	movq	(%rsi),%r13
+	movq	8(%rsi),%r12
+	movq	16(%rsi),%rbp
+	movq	24(%rsi),%rbx
+	leaq	32(%rsi),%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	sha1_block_data_order,.-sha1_block_data_order
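+# SSSE3 variant: the message schedule is computed four words at a time in
+# XMM registers; SSSE3 opcodes are emitted as raw .byte sequences so the
+# file still assembles with pre-SSSE3 toolchains.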
+.type	sha1_block_data_order_ssse3,@function
+.align	16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	leaq	-64(%rsp),%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX(%rip),%r11
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+
+	movdqa	64(%r11),%xmm6
+	movdqa	0(%r11),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
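+# .byte 102,15,56,0,198 is pshufb %xmm6,%xmm0: %xmm6 holds the big-endian
+# shuffle mask loaded from 64(%r11).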
+.byte	102,15,56,0,198
+	addq	$64,%r9
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+.byte	102,15,56,0,222
+	paddd	%xmm9,%xmm0
+	paddd	%xmm9,%xmm1
+	paddd	%xmm9,%xmm2
+	movdqa	%xmm0,0(%rsp)
+	psubd	%xmm9,%xmm0
+	movdqa	%xmm1,16(%rsp)
+	psubd	%xmm9,%xmm1
+	movdqa	%xmm2,32(%rsp)
+	psubd	%xmm9,%xmm2
+	jmp	.Loop_ssse3
+.align	16
+.Loop_ssse3:
+	movdqa	%xmm1,%xmm4
+	addl	0(%rsp),%ebp
+	xorl	%edx,%ecx
+	movdqa	%xmm3,%xmm8
+.byte	102,15,58,15,224,8
+	movl	%eax,%edi
+	roll	$5,%eax
+	paddd	%xmm3,%xmm9
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	psrldq	$4,%xmm8
+	xorl	%edx,%esi
+	addl	%eax,%ebp
+	pxor	%xmm0,%xmm4
+	rorl	$2,%ebx
+	addl	%esi,%ebp
+	pxor	%xmm2,%xmm8
+	addl	4(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	pxor	%xmm8,%xmm4
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	movdqa	%xmm9,48(%rsp)
+	xorl	%ecx,%edi
+	addl	%ebp,%edx
+	movdqa	%xmm4,%xmm10
+	movdqa	%xmm4,%xmm8
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	8(%rsp),%ecx
+	xorl	%ebx,%eax
+	pslldq	$12,%xmm10
+	paddd	%xmm4,%xmm4
+	movl	%edx,%edi
+	roll	$5,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	psrld	$31,%xmm8
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	movdqa	%xmm10,%xmm9
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	psrld	$30,%xmm10
+	por	%xmm8,%xmm4
+	addl	12(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm4
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	movdqa	0(%r11),%xmm10
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	pxor	%xmm9,%xmm4
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	movdqa	%xmm2,%xmm5
+	addl	16(%rsp),%eax
+	xorl	%ebp,%edx
+	movdqa	%xmm4,%xmm9
+.byte	102,15,58,15,233,8
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	paddd	%xmm4,%xmm10
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	psrldq	$4,%xmm9
+	xorl	%ebp,%esi
+	addl	%ebx,%eax
+	pxor	%xmm1,%xmm5
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	pxor	%xmm3,%xmm9
+	addl	20(%rsp),%ebp
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	roll	$5,%eax
+	pxor	%xmm9,%xmm5
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	movdqa	%xmm10,0(%rsp)
+	xorl	%edx,%edi
+	addl	%eax,%ebp
+	movdqa	%xmm5,%xmm8
+	movdqa	%xmm5,%xmm9
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	addl	24(%rsp),%edx
+	xorl	%ecx,%ebx
+	pslldq	$12,%xmm8
+	paddd	%xmm5,%xmm5
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	psrld	$31,%xmm9
+	xorl	%ecx,%esi
+	addl	%ebp,%edx
+	movdqa	%xmm8,%xmm10
+	rorl	$7,%eax
+	addl	%esi,%edx
+	psrld	$30,%xmm8
+	por	%xmm9,%xmm5
+	addl	28(%rsp),%ecx
+	xorl	%ebx,%eax
+	movl	%edx,%esi
+	roll	$5,%edx
+	pslld	$2,%xmm10
+	pxor	%xmm8,%xmm5
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	movdqa	16(%r11),%xmm8
+	xorl	%ebx,%edi
+	addl	%edx,%ecx
+	pxor	%xmm10,%xmm5
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	movdqa	%xmm3,%xmm6
+	addl	32(%rsp),%ebx
+	xorl	%eax,%ebp
+	movdqa	%xmm5,%xmm10
+.byte	102,15,58,15,242,8
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	paddd	%xmm5,%xmm8
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	psrldq	$4,%xmm10
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	pxor	%xmm2,%xmm6
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	pxor	%xmm4,%xmm10
+	addl	36(%rsp),%eax
+	xorl	%ebp,%edx
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	pxor	%xmm10,%xmm6
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	movdqa	%xmm8,16(%rsp)
+	xorl	%ebp,%edi
+	addl	%ebx,%eax
+	movdqa	%xmm6,%xmm9
+	movdqa	%xmm6,%xmm10
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	addl	40(%rsp),%ebp
+	xorl	%edx,%ecx
+	pslldq	$12,%xmm9
+	paddd	%xmm6,%xmm6
+	movl	%eax,%edi
+	roll	$5,%eax
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	psrld	$31,%xmm10
+	xorl	%edx,%esi
+	addl	%eax,%ebp
+	movdqa	%xmm9,%xmm8
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	psrld	$30,%xmm9
+	por	%xmm10,%xmm6
+	addl	44(%rsp),%edx
+	xorl	%ecx,%ebx
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	pslld	$2,%xmm8
+	pxor	%xmm9,%xmm6
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	movdqa	16(%r11),%xmm9
+	xorl	%ecx,%edi
+	addl	%ebp,%edx
+	pxor	%xmm8,%xmm6
+	rorl	$7,%eax
+	addl	%edi,%edx
+	movdqa	%xmm4,%xmm7
+	addl	48(%rsp),%ecx
+	xorl	%ebx,%eax
+	movdqa	%xmm6,%xmm8
+.byte	102,15,58,15,251,8
+	movl	%edx,%edi
+	roll	$5,%edx
+	paddd	%xmm6,%xmm9
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	psrldq	$4,%xmm8
+	xorl	%ebx,%esi
+	addl	%edx,%ecx
+	pxor	%xmm3,%xmm7
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	pxor	%xmm5,%xmm8
+	addl	52(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	pxor	%xmm8,%xmm7
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	movdqa	%xmm9,32(%rsp)
+	xorl	%eax,%edi
+	addl	%ecx,%ebx
+	movdqa	%xmm7,%xmm10
+	movdqa	%xmm7,%xmm8
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	addl	56(%rsp),%eax
+	xorl	%ebp,%edx
+	pslldq	$12,%xmm10
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	psrld	$31,%xmm8
+	xorl	%ebp,%esi
+	addl	%ebx,%eax
+	movdqa	%xmm10,%xmm9
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	psrld	$30,%xmm10
+	por	%xmm8,%xmm7
+	addl	60(%rsp),%ebp
+	xorl	%edx,%ecx
+	movl	%eax,%esi
+	roll	$5,%eax
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm7
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	movdqa	16(%r11),%xmm10
+	xorl	%edx,%edi
+	addl	%eax,%ebp
+	pxor	%xmm9,%xmm7
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	movdqa	%xmm7,%xmm9
+	addl	0(%rsp),%edx
+	pxor	%xmm4,%xmm0
+.byte	102,68,15,58,15,206,8
+	xorl	%ecx,%ebx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	pxor	%xmm1,%xmm0
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm7,%xmm10
+	xorl	%ecx,%esi
+	addl	%ebp,%edx
+	pxor	%xmm9,%xmm0
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	4(%rsp),%ecx
+	xorl	%ebx,%eax
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm10,48(%rsp)
+	movl	%edx,%esi
+	roll	$5,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	pslld	$2,%xmm0
+	xorl	%ebx,%edi
+	addl	%edx,%ecx
+	psrld	$30,%xmm9
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	addl	8(%rsp),%ebx
+	xorl	%eax,%ebp
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	por	%xmm9,%xmm0
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	movdqa	%xmm0,%xmm10
+	xorl	%eax,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edx
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	xorl	%ebp,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	addl	16(%rsp),%ebp
+	pxor	%xmm5,%xmm1
+.byte	102,68,15,58,15,215,8
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	movdqa	%xmm8,%xmm9
+	paddd	%xmm0,%xmm8
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	pxor	%xmm10,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm1,%xmm10
+	movdqa	%xmm8,0(%rsp)
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm1
+	addl	24(%rsp),%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm10
+	movl	%edx,%edi
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	por	%xmm10,%xmm1
+	addl	28(%rsp),%ebx
+	xorl	%eax,%edi
+	movdqa	%xmm1,%xmm8
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	addl	32(%rsp),%eax
+	pxor	%xmm6,%xmm2
+.byte	102,68,15,58,15,192,8
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	movdqa	32(%r11),%xmm10
+	paddd	%xmm1,%xmm9
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	pxor	%xmm8,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm8
+	movdqa	%xmm9,16(%rsp)
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	pslld	$2,%xmm2
+	addl	40(%rsp),%edx
+	xorl	%ecx,%esi
+	psrld	$30,%xmm8
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	por	%xmm8,%xmm2
+	addl	44(%rsp),%ecx
+	xorl	%ebx,%edi
+	movdqa	%xmm2,%xmm9
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	addl	48(%rsp),%ebx
+	pxor	%xmm7,%xmm3
+.byte	102,68,15,58,15,201,8
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm2,%xmm10
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	pxor	%xmm9,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm9
+	movdqa	%xmm10,32(%rsp)
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	pslld	$2,%xmm3
+	addl	56(%rsp),%ebp
+	xorl	%edx,%esi
+	psrld	$30,%xmm9
+	movl	%eax,%edi
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	por	%xmm9,%xmm3
+	addl	60(%rsp),%edx
+	xorl	%ecx,%edi
+	movdqa	%xmm3,%xmm10
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	0(%rsp),%ecx
+	pxor	%xmm0,%xmm4
+.byte	102,68,15,58,15,210,8
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	movdqa	%xmm8,%xmm9
+	paddd	%xmm3,%xmm8
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	pxor	%xmm10,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm10
+	movdqa	%xmm8,48(%rsp)
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	pslld	$2,%xmm4
+	addl	8(%rsp),%eax
+	xorl	%ebp,%esi
+	psrld	$30,%xmm10
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	por	%xmm10,%xmm4
+	addl	12(%rsp),%ebp
+	xorl	%edx,%edi
+	movdqa	%xmm4,%xmm8
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	addl	16(%rsp),%edx
+	pxor	%xmm1,%xmm5
+.byte	102,68,15,58,15,195,8
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	pxor	%xmm6,%xmm5
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	movdqa	%xmm9,%xmm10
+	paddd	%xmm4,%xmm9
+	rorl	$7,%eax
+	addl	%esi,%edx
+	pxor	%xmm8,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm8
+	movdqa	%xmm9,0(%rsp)
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	pslld	$2,%xmm5
+	addl	24(%rsp),%ebx
+	xorl	%eax,%esi
+	psrld	$30,%xmm8
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	por	%xmm8,%xmm5
+	addl	28(%rsp),%eax
+	xorl	%ebp,%edi
+	movdqa	%xmm5,%xmm9
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	movl	%ecx,%edi
+	pxor	%xmm2,%xmm6
+.byte	102,68,15,58,15,204,8
+	xorl	%edx,%ecx
+	addl	32(%rsp),%ebp
+	andl	%edx,%edi
+	pxor	%xmm7,%xmm6
+	andl	%ecx,%esi
+	rorl	$7,%ebx
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm5,%xmm10
+	addl	%edi,%ebp
+	movl	%eax,%edi
+	pxor	%xmm9,%xmm6
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	movdqa	%xmm6,%xmm9
+	movdqa	%xmm10,16(%rsp)
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	36(%rsp),%edx
+	andl	%ecx,%esi
+	pslld	$2,%xmm6
+	andl	%ebx,%edi
+	rorl	$7,%eax
+	psrld	$30,%xmm9
+	addl	%esi,%edx
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	por	%xmm9,%xmm6
+	movl	%eax,%edi
+	xorl	%ebx,%eax
+	movdqa	%xmm6,%xmm10
+	addl	40(%rsp),%ecx
+	andl	%ebx,%edi
+	andl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	44(%rsp),%ebx
+	andl	%eax,%esi
+	andl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movl	%edx,%edi
+	pxor	%xmm3,%xmm7
+.byte	102,68,15,58,15,213,8
+	xorl	%ebp,%edx
+	addl	48(%rsp),%eax
+	andl	%ebp,%edi
+	pxor	%xmm0,%xmm7
+	andl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	48(%r11),%xmm9
+	paddd	%xmm6,%xmm8
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	pxor	%xmm10,%xmm7
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	movdqa	%xmm7,%xmm10
+	movdqa	%xmm8,32(%rsp)
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	52(%rsp),%ebp
+	andl	%edx,%esi
+	pslld	$2,%xmm7
+	andl	%ecx,%edi
+	rorl	$7,%ebx
+	psrld	$30,%xmm10
+	addl	%esi,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	por	%xmm10,%xmm7
+	movl	%ebx,%edi
+	xorl	%ecx,%ebx
+	movdqa	%xmm7,%xmm8
+	addl	56(%rsp),%edx
+	andl	%ecx,%edi
+	andl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	60(%rsp),%ecx
+	andl	%ebx,%esi
+	andl	%eax,%edi
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movl	%ebp,%edi
+	pxor	%xmm4,%xmm0
+.byte	102,68,15,58,15,198,8
+	xorl	%eax,%ebp
+	addl	0(%rsp),%ebx
+	andl	%eax,%edi
+	pxor	%xmm1,%xmm0
+	andl	%ebp,%esi
+	rorl	$7,%edx
+	movdqa	%xmm9,%xmm10
+	paddd	%xmm7,%xmm9
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	pxor	%xmm8,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movdqa	%xmm0,%xmm8
+	movdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	4(%rsp),%eax
+	andl	%ebp,%esi
+	pslld	$2,%xmm0
+	andl	%edx,%edi
+	rorl	$7,%ecx
+	psrld	$30,%xmm8
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	por	%xmm8,%xmm0
+	movl	%ecx,%edi
+	xorl	%edx,%ecx
+	movdqa	%xmm0,%xmm9
+	addl	8(%rsp),%ebp
+	andl	%edx,%edi
+	andl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	movl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	12(%rsp),%edx
+	andl	%ecx,%esi
+	andl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%esi,%edx
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	movl	%eax,%edi
+	pxor	%xmm5,%xmm1
+.byte	102,68,15,58,15,207,8
+	xorl	%ebx,%eax
+	addl	16(%rsp),%ecx
+	andl	%ebx,%edi
+	pxor	%xmm2,%xmm1
+	andl	%eax,%esi
+	rorl	$7,%ebp
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm0,%xmm10
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	pxor	%xmm9,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	movdqa	%xmm1,%xmm9
+	movdqa	%xmm10,0(%rsp)
+	movl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	20(%rsp),%ebx
+	andl	%eax,%esi
+	pslld	$2,%xmm1
+	andl	%ebp,%edi
+	rorl	$7,%edx
+	psrld	$30,%xmm9
+	addl	%esi,%ebx
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	por	%xmm9,%xmm1
+	movl	%edx,%edi
+	xorl	%ebp,%edx
+	movdqa	%xmm1,%xmm10
+	addl	24(%rsp),%eax
+	andl	%ebp,%edi
+	andl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	movl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	28(%rsp),%ebp
+	andl	%edx,%esi
+	andl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%edx,%ecx
+	addl	%eax,%ebp
+	movl	%ebx,%edi
+	pxor	%xmm6,%xmm2
+.byte	102,68,15,58,15,208,8
+	xorl	%ecx,%ebx
+	addl	32(%rsp),%edx
+	andl	%ecx,%edi
+	pxor	%xmm3,%xmm2
+	andl	%ebx,%esi
+	rorl	$7,%eax
+	movdqa	%xmm8,%xmm9
+	paddd	%xmm1,%xmm8
+	addl	%edi,%edx
+	movl	%ebp,%edi
+	pxor	%xmm10,%xmm2
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ecx,%ebx
+	addl	%ebp,%edx
+	movdqa	%xmm2,%xmm10
+	movdqa	%xmm8,16(%rsp)
+	movl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	36(%rsp),%ecx
+	andl	%ebx,%esi
+	pslld	$2,%xmm2
+	andl	%eax,%edi
+	rorl	$7,%ebp
+	psrld	$30,%xmm10
+	addl	%esi,%ecx
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebx,%eax
+	addl	%edx,%ecx
+	por	%xmm10,%xmm2
+	movl	%ebp,%edi
+	xorl	%eax,%ebp
+	movdqa	%xmm2,%xmm8
+	addl	40(%rsp),%ebx
+	andl	%eax,%edi
+	andl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%eax,%ebp
+	addl	%ecx,%ebx
+	movl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	44(%rsp),%eax
+	andl	%ebp,%esi
+	andl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ebp,%edx
+	addl	%ebx,%eax
+	addl	48(%rsp),%ebp
+	pxor	%xmm7,%xmm3
+.byte	102,68,15,58,15,193,8
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	movdqa	%xmm9,%xmm10
+	paddd	%xmm2,%xmm9
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	pxor	%xmm8,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm3,%xmm8
+	movdqa	%xmm9,32(%rsp)
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm3
+	addl	56(%rsp),%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm8
+	movl	%edx,%edi
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	por	%xmm8,%xmm3
+	addl	60(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	addl	0(%rsp),%eax
+	paddd	%xmm3,%xmm10
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	movdqa	%xmm10,48(%rsp)
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	4(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	12(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	cmpq	%r10,%r9
+	je	.Ldone_ssse3
+	movdqa	64(%r11),%xmm6
+	movdqa	0(%r11),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%eax,%esi
+.byte	102,15,56,0,206
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	paddd	%xmm9,%xmm0
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	movdqa	%xmm0,0(%rsp)
+	addl	20(%rsp),%eax
+	xorl	%ebp,%edi
+	psubd	%xmm9,%xmm0
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	addl	24(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%rsp),%ecx
+	xorl	%ebx,%esi
+.byte	102,15,56,0,214
+	movl	%edx,%edi
+	roll	$5,%edx
+	paddd	%xmm9,%xmm1
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	movdqa	%xmm1,16(%rsp)
+	addl	36(%rsp),%ebx
+	xorl	%eax,%edi
+	psubd	%xmm9,%xmm1
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	addl	40(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	44(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ecx,%esi
+.byte	102,15,56,0,222
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	paddd	%xmm9,%xmm2
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	movdqa	%xmm2,32(%rsp)
+	addl	52(%rsp),%ecx
+	xorl	%ebx,%edi
+	psubd	%xmm9,%xmm2
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	60(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+	addl	16(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	20(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	addl	24(%rsp),%ebp
+	xorl	%edx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	xorl	%ecx,%esi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%esi,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ebx,%edi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%rsp),%ecx
+	xorl	%ebx,%esi
+	movl	%edx,%edi
+	roll	$5,%edx
+	xorl	%eax,%esi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%esi,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	xorl	%ebp,%edi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%edi,%ebx
+	addl	40(%rsp),%eax
+	xorl	%ebp,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%esi,%eax
+	addl	44(%rsp),%ebp
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	xorl	%ecx,%edi
+	addl	%eax,%ebp
+	rorl	$7,%ebx
+	addl	%edi,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%esi
+	addl	%ebp,%edx
+	rorl	$7,%eax
+	addl	%esi,%edx
+	addl	52(%rsp),%ecx
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%ebp
+	addl	%edi,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	xorl	%ebp,%esi
+	addl	%ecx,%ebx
+	rorl	$7,%edx
+	addl	%esi,%ebx
+	addl	60(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	xorl	%edx,%edi
+	addl	%ebx,%eax
+	rorl	$7,%ecx
+	addl	%edi,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	leaq	64(%rsp),%rsi
+	movq	0(%rsi),%r12
+	movq	8(%rsi),%rbp
+	movq	16(%rsi),%rbx
+	leaq	24(%rsi),%rsp
+.Lepilogue_ssse3:
+	.byte	0xf3,0xc3
+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
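(Note, not part of the patch: the table just above is K_XX_XX — the four SHA-1 round constants 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 broadcast across four lanes for the SSSE3 path, followed by a byte-order shuffle mask for the pshufb instructions, and a trailing .byte string that decodes to the CRYPTOGAMS attribution banner. The scalar rounds at the top of this section use the last constant directly: the signed leal immediate -899497514 is 0xca62c1d6. For reference, a C sketch of the round-60..79 step that each unrolled block computes; rol32 and the s[] layout are local to this sketch, not anything in the generated file.

#include <stdint.h>

/* One SHA-1 round in the 60..79 range -- what each unrolled block above
 * computes.  0xca62c1d6 is the round constant; the assembly encodes it
 * as the signed immediate -899497514 in its leal instructions. */
static inline uint32_t rol32(uint32_t x, int n) {
    return (x << n) | (x >> (32 - n));
}

static void sha1_round_60_79(uint32_t s[5] /* a,b,c,d,e */, uint32_t w) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
    uint32_t t = rol32(a, 5) + (b ^ c ^ d) + e + w + 0xca62c1d6u;
    s[4] = d;            /* e' = d */
    s[3] = c;            /* d' = c */
    s[2] = rol32(b, 30); /* c' = b rotated left 30: the "roll $30" above */
    s[1] = a;            /* b' = a */
    s[0] = t;            /* a' = new working value */
}

The paired xorl instructions in each block are the b^c^d parity, and the roll $5 / addl chain is the rol32(a,5) term; the SSSE3 variant interleaves the same rounds with the pxor/psrld message-schedule updates.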
diff --git a/crypto/sha/asm/sha512-x86_64.S b/crypto/sha/asm/sha512-x86_64.S
new file mode 100644
index 0000000..db5b898
--- /dev/null
+++ b/crypto/sha/asm/sha512-x86_64.S
@@ -0,0 +1,1778 @@
+.text	
+
+.globl	sha256_block_data_order
+.type	sha256_block_data_order,@function
+.align	16
+sha256_block_data_order:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	shlq	$4,%rdx
+	subq	$64+32,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%r11,64+24(%rsp)
+.Lprologue:
+
+	leaq	K256(%rip),%rbp
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	xorq	%rdi,%rdi
+	movl	0(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+	movl	%r12d,0(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%eax,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r8d,%r15d
+	movl	%ebx,%r11d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	xorl	%ecx,%r11d
+	xorl	%eax,%r14d
+	addl	%r15d,%r12d
+	movl	%ebx,%r15d
+
+	rorl	$6,%r13d
+	andl	%eax,%r11d
+	andl	%ecx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r11d
+
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r11d
+
+	movl	4(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%r15d
+	movl	%r12d,4(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r11d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%edx,%r15d
+	movl	%eax,%r10d
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	xorl	%ebx,%r10d
+	xorl	%r11d,%r14d
+	addl	%r15d,%r12d
+	movl	%eax,%r15d
+
+	rorl	$6,%r13d
+	andl	%r11d,%r10d
+	andl	%ebx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r10d
+
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r10d
+
+	movl	8(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+	movl	%r12d,8(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r10d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ecx,%r15d
+	movl	%r11d,%r9d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	xorl	%eax,%r9d
+	xorl	%r10d,%r14d
+	addl	%r15d,%r12d
+	movl	%r11d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r10d,%r9d
+	andl	%eax,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r9d
+
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r9d
+
+	movl	12(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%r15d
+	movl	%r12d,12(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%r9d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ebx,%r15d
+	movl	%r10d,%r8d
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	xorl	%r11d,%r8d
+	xorl	%r9d,%r14d
+	addl	%r15d,%r12d
+	movl	%r10d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r9d,%r8d
+	andl	%r11d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r8d
+
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r8d
+
+	movl	16(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+	movl	%r12d,16(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%r8d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%eax,%r15d
+	movl	%r9d,%edx
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	xorl	%r10d,%edx
+	xorl	%r8d,%r14d
+	addl	%r15d,%r12d
+	movl	%r9d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r8d,%edx
+	andl	%r10d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%edx
+
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%edx
+
+	movl	20(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%r15d
+	movl	%r12d,20(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%edx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r11d,%r15d
+	movl	%r8d,%ecx
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	xorl	%r9d,%ecx
+	xorl	%edx,%r14d
+	addl	%r15d,%r12d
+	movl	%r8d,%r15d
+
+	rorl	$6,%r13d
+	andl	%edx,%ecx
+	andl	%r9d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ecx
+
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ecx
+
+	movl	24(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+	movl	%r12d,24(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%ecx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r10d,%r15d
+	movl	%edx,%ebx
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	xorl	%r8d,%ebx
+	xorl	%ecx,%r14d
+	addl	%r15d,%r12d
+	movl	%edx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ecx,%ebx
+	andl	%r8d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ebx
+
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ebx
+
+	movl	28(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%r15d
+	movl	%r12d,28(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%ebx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r9d,%r15d
+	movl	%ecx,%eax
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	xorl	%edx,%eax
+	xorl	%ebx,%r14d
+	addl	%r15d,%r12d
+	movl	%ecx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ebx,%eax
+	andl	%edx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%eax
+
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%eax
+
+	movl	32(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+	movl	%r12d,32(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%eax,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r8d,%r15d
+	movl	%ebx,%r11d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	xorl	%ecx,%r11d
+	xorl	%eax,%r14d
+	addl	%r15d,%r12d
+	movl	%ebx,%r15d
+
+	rorl	$6,%r13d
+	andl	%eax,%r11d
+	andl	%ecx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r11d
+
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r11d
+
+	movl	36(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%r15d
+	movl	%r12d,36(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r11d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%edx,%r15d
+	movl	%eax,%r10d
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	xorl	%ebx,%r10d
+	xorl	%r11d,%r14d
+	addl	%r15d,%r12d
+	movl	%eax,%r15d
+
+	rorl	$6,%r13d
+	andl	%r11d,%r10d
+	andl	%ebx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r10d
+
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r10d
+
+	movl	40(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+	movl	%r12d,40(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r10d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ecx,%r15d
+	movl	%r11d,%r9d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	xorl	%eax,%r9d
+	xorl	%r10d,%r14d
+	addl	%r15d,%r12d
+	movl	%r11d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r10d,%r9d
+	andl	%eax,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r9d
+
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r9d
+
+	movl	44(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%r15d
+	movl	%r12d,44(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%r9d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ebx,%r15d
+	movl	%r10d,%r8d
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	xorl	%r11d,%r8d
+	xorl	%r9d,%r14d
+	addl	%r15d,%r12d
+	movl	%r10d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r9d,%r8d
+	andl	%r11d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r8d
+
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r8d
+
+	movl	48(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+	movl	%r12d,48(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%r8d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%eax,%r15d
+	movl	%r9d,%edx
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	xorl	%r10d,%edx
+	xorl	%r8d,%r14d
+	addl	%r15d,%r12d
+	movl	%r9d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r8d,%edx
+	andl	%r10d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%edx
+
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%edx
+
+	movl	52(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%r15d
+	movl	%r12d,52(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%edx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r11d,%r15d
+	movl	%r8d,%ecx
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	xorl	%r9d,%ecx
+	xorl	%edx,%r14d
+	addl	%r15d,%r12d
+	movl	%r8d,%r15d
+
+	rorl	$6,%r13d
+	andl	%edx,%ecx
+	andl	%r9d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ecx
+
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ecx
+
+	movl	56(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+	movl	%r12d,56(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%ecx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r10d,%r15d
+	movl	%edx,%ebx
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	xorl	%r8d,%ebx
+	xorl	%ecx,%r14d
+	addl	%r15d,%r12d
+	movl	%edx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ecx,%ebx
+	andl	%r8d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ebx
+
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ebx
+
+	movl	60(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%r15d
+	movl	%r12d,60(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%ebx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r9d,%r15d
+	movl	%ecx,%eax
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	xorl	%edx,%eax
+	xorl	%ebx,%r14d
+	addl	%r15d,%r12d
+	movl	%ecx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ebx,%eax
+	andl	%edx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%eax
+
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%eax
+
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	movl	4(%rsp),%r13d
+	movl	56(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	36(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	0(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r14d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+	movl	%r12d,0(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%eax,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r8d,%r15d
+	movl	%ebx,%r11d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	xorl	%ecx,%r11d
+	xorl	%eax,%r14d
+	addl	%r15d,%r12d
+	movl	%ebx,%r15d
+
+	rorl	$6,%r13d
+	andl	%eax,%r11d
+	andl	%ecx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r11d
+
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r11d
+
+	movl	8(%rsp),%r13d
+	movl	60(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	40(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	4(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%r14d,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%r15d
+	movl	%r12d,4(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r11d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%edx,%r15d
+	movl	%eax,%r10d
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	xorl	%ebx,%r10d
+	xorl	%r11d,%r14d
+	addl	%r15d,%r12d
+	movl	%eax,%r15d
+
+	rorl	$6,%r13d
+	andl	%r11d,%r10d
+	andl	%ebx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r10d
+
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r10d
+
+	movl	12(%rsp),%r13d
+	movl	0(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	44(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	8(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r14d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+	movl	%r12d,8(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r10d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ecx,%r15d
+	movl	%r11d,%r9d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	xorl	%eax,%r9d
+	xorl	%r10d,%r14d
+	addl	%r15d,%r12d
+	movl	%r11d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r10d,%r9d
+	andl	%eax,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r9d
+
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r9d
+
+	movl	16(%rsp),%r13d
+	movl	4(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	48(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	12(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%r14d,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%r15d
+	movl	%r12d,12(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%r9d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ebx,%r15d
+	movl	%r10d,%r8d
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	xorl	%r11d,%r8d
+	xorl	%r9d,%r14d
+	addl	%r15d,%r12d
+	movl	%r10d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r9d,%r8d
+	andl	%r11d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r8d
+
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r8d
+
+	movl	20(%rsp),%r13d
+	movl	8(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	52(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	16(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r14d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+	movl	%r12d,16(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%r8d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%eax,%r15d
+	movl	%r9d,%edx
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	xorl	%r10d,%edx
+	xorl	%r8d,%r14d
+	addl	%r15d,%r12d
+	movl	%r9d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r8d,%edx
+	andl	%r10d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%edx
+
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%edx
+
+	movl	24(%rsp),%r13d
+	movl	12(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	56(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	20(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%r14d,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%r15d
+	movl	%r12d,20(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%edx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r11d,%r15d
+	movl	%r8d,%ecx
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	xorl	%r9d,%ecx
+	xorl	%edx,%r14d
+	addl	%r15d,%r12d
+	movl	%r8d,%r15d
+
+	rorl	$6,%r13d
+	andl	%edx,%ecx
+	andl	%r9d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ecx
+
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ecx
+
+	movl	28(%rsp),%r13d
+	movl	16(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	60(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	24(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r14d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+	movl	%r12d,24(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%ecx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r10d,%r15d
+	movl	%edx,%ebx
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	xorl	%r8d,%ebx
+	xorl	%ecx,%r14d
+	addl	%r15d,%r12d
+	movl	%edx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ecx,%ebx
+	andl	%r8d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ebx
+
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ebx
+
+	movl	32(%rsp),%r13d
+	movl	20(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	0(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	28(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%r14d,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%r15d
+	movl	%r12d,28(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%ebx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r9d,%r15d
+	movl	%ecx,%eax
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	xorl	%edx,%eax
+	xorl	%ebx,%r14d
+	addl	%r15d,%r12d
+	movl	%ecx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ebx,%eax
+	andl	%edx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%eax
+
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%eax
+
+	movl	36(%rsp),%r13d
+	movl	24(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	4(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	32(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r14d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+	movl	%r12d,32(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%eax,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r8d,%r15d
+	movl	%ebx,%r11d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r15d
+
+	xorl	%ecx,%r11d
+	xorl	%eax,%r14d
+	addl	%r15d,%r12d
+	movl	%ebx,%r15d
+
+	rorl	$6,%r13d
+	andl	%eax,%r11d
+	andl	%ecx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r11d
+
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r11d
+
+	movl	40(%rsp),%r13d
+	movl	28(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	8(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	36(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%r14d,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%r15d
+	movl	%r12d,36(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r11d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%edx,%r15d
+	movl	%eax,%r10d
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r15d
+
+	xorl	%ebx,%r10d
+	xorl	%r11d,%r14d
+	addl	%r15d,%r12d
+	movl	%eax,%r15d
+
+	rorl	$6,%r13d
+	andl	%r11d,%r10d
+	andl	%ebx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r10d
+
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r10d
+
+	movl	44(%rsp),%r13d
+	movl	32(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	12(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	40(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r14d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+	movl	%r12d,40(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r10d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ecx,%r15d
+	movl	%r11d,%r9d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r15d
+
+	xorl	%eax,%r9d
+	xorl	%r10d,%r14d
+	addl	%r15d,%r12d
+	movl	%r11d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r10d,%r9d
+	andl	%eax,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r9d
+
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r9d
+
+	movl	48(%rsp),%r13d
+	movl	36(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	16(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	44(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%r14d,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%r15d
+	movl	%r12d,44(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%r9d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%ebx,%r15d
+	movl	%r10d,%r8d
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r15d
+
+	xorl	%r11d,%r8d
+	xorl	%r9d,%r14d
+	addl	%r15d,%r12d
+	movl	%r10d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r9d,%r8d
+	andl	%r11d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%r8d
+
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%r8d
+
+	movl	52(%rsp),%r13d
+	movl	40(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	20(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	48(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r14d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+	movl	%r12d,48(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%r8d,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%eax,%r15d
+	movl	%r9d,%edx
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r15d
+
+	xorl	%r10d,%edx
+	xorl	%r8d,%r14d
+	addl	%r15d,%r12d
+	movl	%r9d,%r15d
+
+	rorl	$6,%r13d
+	andl	%r8d,%edx
+	andl	%r10d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%edx
+
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%edx
+
+	movl	56(%rsp),%r13d
+	movl	44(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	24(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	52(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%r14d,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%r15d
+	movl	%r12d,52(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%edx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r11d,%r15d
+	movl	%r8d,%ecx
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r15d
+
+	xorl	%r9d,%ecx
+	xorl	%edx,%r14d
+	addl	%r15d,%r12d
+	movl	%r8d,%r15d
+
+	rorl	$6,%r13d
+	andl	%edx,%ecx
+	andl	%r9d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ecx
+
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ecx
+
+	movl	60(%rsp),%r13d
+	movl	48(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	28(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	56(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r14d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+	movl	%r12d,56(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%ecx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r10d,%r15d
+	movl	%edx,%ebx
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r15d
+
+	xorl	%r8d,%ebx
+	xorl	%ecx,%r14d
+	addl	%r15d,%r12d
+	movl	%edx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ecx,%ebx
+	andl	%r8d,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%ebx
+
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%ebx
+
+	movl	0(%rsp),%r13d
+	movl	52(%rsp),%r14d
+	movl	%r13d,%r12d
+	movl	%r14d,%r15d
+
+	rorl	$11,%r12d
+	xorl	%r13d,%r12d
+	shrl	$3,%r13d
+
+	rorl	$7,%r12d
+	xorl	%r12d,%r13d
+	movl	32(%rsp),%r12d
+
+	rorl	$2,%r15d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	addl	%r13d,%r12d
+	xorl	%r15d,%r14d
+
+	addl	60(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%r14d,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%r15d
+	movl	%r12d,60(%rsp)
+
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%ebx,%r14d
+
+	addl	(%rbp,%rdi,4),%r12d
+	andl	%r9d,%r15d
+	movl	%ecx,%eax
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r15d
+
+	xorl	%edx,%eax
+	xorl	%ebx,%r14d
+	addl	%r15d,%r12d
+	movl	%ecx,%r15d
+
+	rorl	$6,%r13d
+	andl	%ebx,%eax
+	andl	%edx,%r15d
+
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+	addl	%r15d,%eax
+
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	leaq	1(%rdi),%rdi
+	addl	%r14d,%eax
+
+	cmpq	$64,%rdi
+	jb	.Lrounds_16_xx
+
+	movq	64+0(%rsp),%rdi
+	leaq	64(%rsi),%rsi
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop
+
+	movq	64+24(%rsp),%rsi
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	sha256_block_data_order,.-sha256_block_data_order
+.align	64
+.type	K256,@object
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
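(Note, not part of the patch: despite the sha512-x86_64.S path, the code emitted here is the 32-bit SHA-256 transform — sha256_block_data_order plus the K256 table just above. The unrolled rounds and the .Lrounds_16_xx schedule follow FIPS 180-4; a compact C reference for checking the ror/xor chains against, with names local to this sketch rather than taken from the generated file:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) {
    return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round as in FIPS 180-4; the unrolled blocks above compute
 * the same Sigma/Ch/Maj terms with rorl/xorl/andl chains, indexing K256
 * through (%rbp,%rdi,4). */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w) {
    uint32_t a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
    uint32_t S1  = ror32(e,6) ^ ror32(e,11) ^ ror32(e,25);
    uint32_t ch  = (e & f) ^ (~e & g);
    uint32_t t1  = h + S1 + ch + k + w;
    uint32_t S0  = ror32(a,2) ^ ror32(a,13) ^ ror32(a,22);
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    s[7]=g; s[6]=f; s[5]=e; s[4]=d+t1;
    s[3]=c; s[2]=b; s[1]=a; s[0]=t1+S0+maj;
}

/* Message schedule for rounds 16..63, matching .Lrounds_16_xx, over the
 * 16-word ring kept at 0..60(%rsp):
 * W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]. */
static uint32_t sha256_w(const uint32_t w[16], int t /* 16..63 */) {
    uint32_t x = w[(t-15) & 15], y = w[(t-2) & 15];
    uint32_t s0 = ror32(x,7)  ^ ror32(x,18) ^ (x >> 3);
    uint32_t s1 = ror32(y,17) ^ ror32(y,19) ^ (y >> 10);
    return s1 + w[(t-7) & 15] + s0 + w[(t-16) & 15];
}

One detail worth noting when reading the assembly: Maj is computed there as ((b^c)&a) + (b&c), with an addl rather than an xorl — safe because the two terms can never have a set bit in common.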
diff --git a/crypto/x86_64cpuid.S b/crypto/x86_64cpuid.S
new file mode 100644
index 0000000..562b03a
--- /dev/null
+++ b/crypto/x86_64cpuid.S
@@ -0,0 +1,234 @@
+
+.hidden	OPENSSL_cpuid_setup
+.section	.init
+	call	OPENSSL_cpuid_setup
+
+.hidden	OPENSSL_ia32cap_P
+.comm	OPENSSL_ia32cap_P,8,4
+
+.text	
+
+.globl	OPENSSL_atomic_add
+.type	OPENSSL_atomic_add,@function
+.align	16
+OPENSSL_atomic_add:
+	movl	(%rdi),%eax
+.Lspin:	leaq	(%rsi,%rax,1),%r8
+.byte	0xf0		
+	cmpxchgl	%r8d,(%rdi)
+	jne	.Lspin
+	movl	%r8d,%eax
+.byte	0x48,0x98	
+	.byte	0xf3,0xc3
+.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.globl	OPENSSL_rdtsc
+.type	OPENSSL_rdtsc,@function
+.align	16
+OPENSSL_rdtsc:
+	rdtsc
+	shlq	$32,%rdx
+	orq	%rdx,%rax
+	.byte	0xf3,0xc3
+.size	OPENSSL_rdtsc,.-OPENSSL_rdtsc
+
+.globl	OPENSSL_ia32_cpuid
+.type	OPENSSL_ia32_cpuid,@function
+.align	16
+OPENSSL_ia32_cpuid:
+	movq	%rbx,%r8
+
+	xorl	%eax,%eax
+	cpuid
+	movl	%eax,%r11d
+
+	xorl	%eax,%eax
+	cmpl	$1970169159,%ebx
+	setne	%al
+	movl	%eax,%r9d
+	cmpl	$1231384169,%edx
+	setne	%al
+	orl	%eax,%r9d
+	cmpl	$1818588270,%ecx
+	setne	%al
+	orl	%eax,%r9d
+	jz	.Lintel
+
+	cmpl	$1752462657,%ebx
+	setne	%al
+	movl	%eax,%r10d
+	cmpl	$1769238117,%edx
+	setne	%al
+	orl	%eax,%r10d
+	cmpl	$1145913699,%ecx
+	setne	%al
+	orl	%eax,%r10d
+	jnz	.Lintel
+
+
+	movl	$2147483648,%eax
+	cpuid
+	cmpl	$2147483649,%eax
+	jb	.Lintel
+	movl	%eax,%r10d
+	movl	$2147483649,%eax
+	cpuid
+	orl	%ecx,%r9d
+	andl	$2049,%r9d
+
+	cmpl	$2147483656,%r10d
+	jb	.Lintel
+
+	movl	$2147483656,%eax
+	cpuid
+	movzbq	%cl,%r10
+	incq	%r10
+
+	movl	$1,%eax
+	cpuid
+	btl	$28,%edx
+	jnc	.Lgeneric
+	shrl	$16,%ebx
+	cmpb	%r10b,%bl
+	ja	.Lgeneric
+	andl	$4026531839,%edx
+	jmp	.Lgeneric
+
+.Lintel:
+	cmpl	$4,%r11d
+	movl	$-1,%r10d
+	jb	.Lnocacheinfo
+
+	movl	$4,%eax
+	movl	$0,%ecx
+	cpuid
+	movl	%eax,%r10d
+	shrl	$14,%r10d
+	andl	$4095,%r10d
+
+.Lnocacheinfo:
+	movl	$1,%eax
+	cpuid
+	andl	$3220176895,%edx
+	cmpl	$0,%r9d
+	jne	.Lnotintel
+	orl	$1073741824,%edx
+	andb	$15,%ah
+	cmpb	$15,%ah
+	jne	.Lnotintel
+	orl	$1048576,%edx
+.Lnotintel:
+	btl	$28,%edx
+	jnc	.Lgeneric
+	andl	$4026531839,%edx
+	cmpl	$0,%r10d
+	je	.Lgeneric
+
+	orl	$268435456,%edx
+	shrl	$16,%ebx
+	cmpb	$1,%bl
+	ja	.Lgeneric
+	andl	$4026531839,%edx
+.Lgeneric:
+	andl	$2048,%r9d
+	andl	$4294965247,%ecx
+	orl	%ecx,%r9d
+
+	movl	%edx,%r10d
+	btl	$27,%r9d
+	jnc	.Lclear_avx
+	xorl	%ecx,%ecx
+.byte	0x0f,0x01,0xd0		
+	andl	$6,%eax
+	cmpl	$6,%eax
+	je	.Ldone
+.Lclear_avx:
+	movl	$4026525695,%eax
+	andl	%eax,%r9d
+.Ldone:
+	shlq	$32,%r9
+	movl	%r10d,%eax
+	movq	%r8,%rbx
+	orq	%r9,%rax
+	.byte	0xf3,0xc3
+.size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.globl	OPENSSL_cleanse
+.type	OPENSSL_cleanse,@function
+.align	16
+OPENSSL_cleanse:
+	xorq	%rax,%rax
+	cmpq	$15,%rsi
+	jae	.Lot
+	cmpq	$0,%rsi
+	je	.Lret
+.Little:
+	movb	%al,(%rdi)
+	subq	$1,%rsi
+	leaq	1(%rdi),%rdi
+	jnz	.Little
+.Lret:
+	.byte	0xf3,0xc3
+.align	16
+.Lot:
+	testq	$7,%rdi
+	jz	.Laligned
+	movb	%al,(%rdi)
+	leaq	-1(%rsi),%rsi
+	leaq	1(%rdi),%rdi
+	jmp	.Lot
+.Laligned:
+	movq	%rax,(%rdi)
+	leaq	-8(%rsi),%rsi
+	testq	$-8,%rsi
+	leaq	8(%rdi),%rdi
+	jnz	.Laligned
+	cmpq	$0,%rsi
+	jne	.Little
+	.byte	0xf3,0xc3
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
+.globl	OPENSSL_wipe_cpu
+.type	OPENSSL_wipe_cpu,@function
+.align	16
+OPENSSL_wipe_cpu:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	xorq	%rcx,%rcx
+	xorq	%rdx,%rdx
+	xorq	%rsi,%rsi
+	xorq	%rdi,%rdi
+	xorq	%r8,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	leaq	8(%rsp),%rax
+	.byte	0xf3,0xc3
+.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+.globl	OPENSSL_ia32_rdrand
+.type	OPENSSL_ia32_rdrand,@function
+.align	16
+OPENSSL_ia32_rdrand:
+	movl	$8,%ecx
+.Loop_rdrand:
+.byte	72,15,199,240
+	jc	.Lbreak_rdrand
+	loop	.Loop_rdrand
+.Lbreak_rdrand:
+	cmpq	$0,%rax
+	cmoveq	%rcx,%rax
+	.byte	0xf3,0xc3
+.size	OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
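(Note, not part of the patch: two behavioral points on the file above. OPENSSL_ia32_cpuid returns a 64-bit capability word — edx-derived feature bits in the low half, ecx-derived bits in the high half — which the .init-section call to OPENSSL_cpuid_setup records in OPENSSL_ia32cap_P. And OPENSSL_cleanse earns its guarantee simply by being assembly: its byte and qword stores cannot be optimized away the way a plain memset to a dying buffer can. A portable C approximation of the same cleanse contract, for comparison only — the volatile idiom is a common fallback, not OpenSSL's own C implementation:

#include <stddef.h>

/* Equivalent-behavior sketch of OPENSSL_cleanse: overwrite len bytes at p
 * with zeros through a volatile pointer so the compiler may not elide the
 * stores.  The assembly version above needs no such trick. */
static void cleanse_sketch(void *p, size_t len) {
    volatile unsigned char *q = (volatile unsigned char *)p;
    while (len--)
        *q++ = 0;
}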
diff --git a/import_openssl.sh b/import_openssl.sh
index c2eea5b..640e6ee 100755
--- a/import_openssl.sh
+++ b/import_openssl.sh
@@ -138,6 +138,12 @@
   perl "$1" elf -fPIC > "$OUT"
 }
 
+function gen_asm_x86_64 () {
+  local OUT
+  OUT=$(default_asm_file "$@")
+  perl "$1" elf > "$OUT"
+}
+
 function import() {
   declare -r OPENSSL_SOURCE=$1
 
@@ -205,6 +211,23 @@
   gen_asm_x86 crypto/des/asm/crypt586.pl
   gen_asm_x86 crypto/bf/asm/bf-586.pl
 
+  # Generate x86_64 asm
+  gen_asm_x86_64 crypto/x86_64cpuid.pl
+  gen_asm_x86_64 crypto/sha/asm/sha1-x86_64.pl
+  gen_asm_x86_64 crypto/sha/asm/sha512-x86_64.pl
+  gen_asm_x86_64 crypto/modes/asm/ghash-x86_64.pl
+  gen_asm_x86_64 crypto/aes/asm/aesni-x86_64.pl
+  gen_asm_x86_64 crypto/aes/asm/vpaes-x86_64.pl
+  gen_asm_x86_64 crypto/aes/asm/bsaes-x86_64.pl
+  gen_asm_x86_64 crypto/aes/asm/aes-x86_64.pl
+  gen_asm_x86_64 crypto/md5/asm/md5-x86_64.pl
+  gen_asm_x86_64 crypto/bn/asm/modexp512-x86_64.pl
+  gen_asm_x86_64 crypto/bn/asm/x86_64-mont.pl
+  gen_asm_x86_64 crypto/bn/asm/x86_64-gf2m.pl
+  gen_asm_x86_64 crypto/bn/asm/x86_64-mont5.pl
+  gen_asm_x86_64 crypto/rc4/asm/rc4-x86_64.pl
+  gen_asm_x86_64 crypto/rc4/asm/rc4-md5-x86_64.pl
+
   # Setup android.testssl directory
   mkdir android.testssl
   cat test/testssl | \
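(Note, not part of the patch: each script in the list above is run as perl <script> elf and redirected to the matching .S path by gen_asm_x86_64, so the checked-in files are exactly what the CRYPTOGAMS generators emit for ELF targets. From the prologues earlier in this patch, the SHA entry points take the chaining state, an input pointer, and a count of 64-byte blocks — sha1_block_data_order shifts the count left by 6, and sha256_block_data_order scales it into an end pointer. A hedged C view of those two symbols as a consumer might declare them; only the symbol names come from the patch, the array types and wrapper are illustrative:

#include <stddef.h>
#include <stdint.h>

/* .globl entry points from the generated files; per the SysV ABI
 * prologues: state in %rdi, input in %rsi, 64-byte block count in %rdx. */
extern void sha1_block_data_order(uint32_t state[5],
                                  const void *data, size_t num_blocks);
extern void sha256_block_data_order(uint32_t state[8],
                                    const void *data, size_t num_blocks);

/* Illustrative: run the generated SHA-256 transform over one block. */
static void sha256_compress_one(uint32_t state[8],
                                const unsigned char block[64]) {
    sha256_block_data_order(state, block, 1);
}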