| #!/usr/bin/env perl |
| |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| |
| # December 2005 |
| # |
| # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons |
| # for undertaken effort are multiple. First of all, UltraSPARC is not |
| # the whole SPARCv9 universe and other VIS-free implementations deserve |
| # optimized code as much. Secondly, newly introduced UltraSPARC T1, |
| # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, |
| # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with |
| # several integrated RSA/DSA accelerator circuits accessible through |
| # kernel driver [only(*)], but having decent user-land software |
| # implementation is important too. Finally, reasons like desire to |
| # experiment with dedicated squaring procedure. Yes, this module |
| # implements one, because it was easiest to draft it in SPARCv9 |
| # instructions... |
| |
| # (*) Engine accessing the driver in question is on my TODO list. |
| # For reference, acceleator is estimated to give 6 to 10 times |
| # improvement on single-threaded RSA sign. It should be noted |
| # that 6-10x improvement coefficient does not actually mean |
| # something extraordinary in terms of absolute [single-threaded] |
| # performance, as SPARCv9 instruction set is by all means least |
| # suitable for high performance crypto among other 64 bit |
| # platforms. 6-10x factor simply places T1 in same performance |
| # domain as say AMD64 and IA-64. Improvement of RSA verify don't |
| # appear impressive at all, but it's the sign operation which is |
| # far more critical/interesting. |
| |
| # You might notice that inner loops are modulo-scheduled:-) This has |
| # essentially negligible impact on UltraSPARC performance, it's |
| # Fujitsu SPARC64 V users who should notice and hopefully appreciate |
| # the advantage... Currently this module surpasses sparcv9a-mont.pl |
| # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a |
| # module still have hidden potential [see TODO list there], which is |
| # estimated to be larger than 20%... |
| |
| # int bn_mul_mont( |
| $rp="%i0"; # BN_ULONG *rp, |
| $ap="%i1"; # const BN_ULONG *ap, |
| $bp="%i2"; # const BN_ULONG *bp, |
| $np="%i3"; # const BN_ULONG *np, |
| $n0="%i4"; # const BN_ULONG *n0, |
| $num="%i5"; # int num); |
| |
| $bits=32; |
| for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } |
| if ($bits==64) { $bias=2047; $frame=192; } |
| else { $bias=0; $frame=128; } |
| |
| $car0="%o0"; |
| $car1="%o1"; |
| $car2="%o2"; # 1 bit |
| $acc0="%o3"; |
| $acc1="%o4"; |
| $mask="%g1"; # 32 bits, what a waste... |
| $tmp0="%g4"; |
| $tmp1="%g5"; |
| |
| $i="%l0"; |
| $j="%l1"; |
| $mul0="%l2"; |
| $mul1="%l3"; |
| $tp="%l4"; |
| $apj="%l5"; |
| $npj="%l6"; |
| $tpj="%l7"; |
| |
| $fname="bn_mul_mont_int"; |
| |
| $code=<<___; |
| .section ".text",#alloc,#execinstr |
| |
| .global $fname |
| .align 32 |
| $fname: |
| cmp %o5,4 ! 128 bits minimum |
| bge,pt %icc,.Lenter |
| sethi %hi(0xffffffff),$mask |
| retl |
| clr %o0 |
| .align 32 |
| .Lenter: |
| save %sp,-$frame,%sp |
| sll $num,2,$num ! num*=4 |
| or $mask,%lo(0xffffffff),$mask |
| ld [$n0],$n0 |
| cmp $ap,$bp |
| and $num,$mask,$num |
| ld [$bp],$mul0 ! bp[0] |
| nop |
| |
| add %sp,$bias,%o7 ! real top of stack |
| ld [$ap],$car0 ! ap[0] ! redundant in squaring context |
| sub %o7,$num,%o7 |
| ld [$ap+4],$apj ! ap[1] |
| and %o7,-1024,%o7 |
| ld [$np],$car1 ! np[0] |
| sub %o7,$bias,%sp ! alloca |
| ld [$np+4],$npj ! np[1] |
| be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont |
| mov 12,$j |
| |
| mulx $car0,$mul0,$car0 ! ap[0]*bp[0] |
| mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] |
| and $car0,$mask,$acc0 |
| add %sp,$bias+$frame,$tp |
| ld [$ap+8],$apj !prologue! |
| |
| mulx $n0,$acc0,$mul1 ! "t[0]"*n0 |
| and $mul1,$mask,$mul1 |
| |
| mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 |
| mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| ld [$np+8],$npj !prologue! |
| srlx $car1,32,$car1 |
| mov $tmp0,$acc0 !prologue! |
| |
| .L1st: |
| mulx $apj,$mul0,$tmp0 |
| mulx $npj,$mul1,$tmp1 |
| add $acc0,$car0,$car0 |
| ld [$ap+$j],$apj ! ap[j] |
| and $car0,$mask,$acc0 |
| add $acc1,$car1,$car1 |
| ld [$np+$j],$npj ! np[j] |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| add $j,4,$j ! j++ |
| mov $tmp0,$acc0 |
| st $car1,[$tp] |
| cmp $j,$num |
| mov $tmp1,$acc1 |
| srlx $car1,32,$car1 |
| bl %icc,.L1st |
| add $tp,4,$tp ! tp++ |
| !.L1st |
| |
| mulx $apj,$mul0,$tmp0 !epilogue! |
| mulx $npj,$mul1,$tmp1 |
| add $acc0,$car0,$car0 |
| and $car0,$mask,$acc0 |
| add $acc1,$car1,$car1 |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp] |
| srlx $car1,32,$car1 |
| |
| add $tmp0,$car0,$car0 |
| and $car0,$mask,$acc0 |
| add $tmp1,$car1,$car1 |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp+4] |
| srlx $car1,32,$car1 |
| |
| add $car0,$car1,$car1 |
| st $car1,[$tp+8] |
| srlx $car1,32,$car2 |
| |
| mov 4,$i ! i++ |
| ld [$bp+4],$mul0 ! bp[1] |
| .Louter: |
| add %sp,$bias+$frame,$tp |
| ld [$ap],$car0 ! ap[0] |
| ld [$ap+4],$apj ! ap[1] |
| ld [$np],$car1 ! np[0] |
| ld [$np+4],$npj ! np[1] |
| ld [$tp],$tmp1 ! tp[0] |
| ld [$tp+4],$tpj ! tp[1] |
| mov 12,$j |
| |
| mulx $car0,$mul0,$car0 |
| mulx $apj,$mul0,$tmp0 !prologue! |
| add $tmp1,$car0,$car0 |
| ld [$ap+8],$apj !prologue! |
| and $car0,$mask,$acc0 |
| |
| mulx $n0,$acc0,$mul1 |
| and $mul1,$mask,$mul1 |
| |
| mulx $car1,$mul1,$car1 |
| mulx $npj,$mul1,$acc1 !prologue! |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| ld [$np+8],$npj !prologue! |
| srlx $car1,32,$car1 |
| mov $tmp0,$acc0 !prologue! |
| |
| .Linner: |
| mulx $apj,$mul0,$tmp0 |
| mulx $npj,$mul1,$tmp1 |
| add $tpj,$car0,$car0 |
| ld [$ap+$j],$apj ! ap[j] |
| add $acc0,$car0,$car0 |
| add $acc1,$car1,$car1 |
| ld [$np+$j],$npj ! np[j] |
| and $car0,$mask,$acc0 |
| ld [$tp+8],$tpj ! tp[j] |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| add $j,4,$j ! j++ |
| mov $tmp0,$acc0 |
| st $car1,[$tp] ! tp[j-1] |
| srlx $car1,32,$car1 |
| mov $tmp1,$acc1 |
| cmp $j,$num |
| bl %icc,.Linner |
| add $tp,4,$tp ! tp++ |
| !.Linner |
| |
| mulx $apj,$mul0,$tmp0 !epilogue! |
| mulx $npj,$mul1,$tmp1 |
| add $tpj,$car0,$car0 |
| add $acc0,$car0,$car0 |
| ld [$tp+8],$tpj ! tp[j] |
| and $car0,$mask,$acc0 |
| add $acc1,$car1,$car1 |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp] ! tp[j-1] |
| srlx $car1,32,$car1 |
| |
| add $tpj,$car0,$car0 |
| add $tmp0,$car0,$car0 |
| and $car0,$mask,$acc0 |
| add $tmp1,$car1,$car1 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp+4] ! tp[j-1] |
| srlx $car0,32,$car0 |
| add $i,4,$i ! i++ |
| srlx $car1,32,$car1 |
| |
| add $car0,$car1,$car1 |
| cmp $i,$num |
| add $car2,$car1,$car1 |
| st $car1,[$tp+8] |
| |
| srlx $car1,32,$car2 |
| bl,a %icc,.Louter |
| ld [$bp+$i],$mul0 ! bp[i] |
| !.Louter |
| |
| add $tp,12,$tp |
| |
| .Ltail: |
| add $np,$num,$np |
| add $rp,$num,$rp |
| mov $tp,$ap |
| sub %g0,$num,%o7 ! k=-num |
| ba .Lsub |
| subcc %g0,%g0,%g0 ! clear %icc.c |
| .align 16 |
| .Lsub: |
| ld [$tp+%o7],%o0 |
| ld [$np+%o7],%o1 |
| subccc %o0,%o1,%o1 ! tp[j]-np[j] |
| add $rp,%o7,$i |
| add %o7,4,%o7 |
| brnz %o7,.Lsub |
| st %o1,[$i] |
| subc $car2,0,$car2 ! handle upmost overflow bit |
| and $tp,$car2,$ap |
| andn $rp,$car2,$np |
| or $ap,$np,$ap |
| sub %g0,$num,%o7 |
| |
| .Lcopy: |
| ld [$ap+%o7],%o0 ! copy or in-place refresh |
| st %g0,[$tp+%o7] ! zap tp |
| st %o0,[$rp+%o7] |
| add %o7,4,%o7 |
| brnz %o7,.Lcopy |
| nop |
| mov 1,%i0 |
| ret |
| restore |
| ___ |
| |
| ######## |
| ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over |
| ######## code without following dedicated squaring procedure. |
| ######## |
| $sbit="%i2"; # re-use $bp! |
| |
| $code.=<<___; |
| .align 32 |
| .Lbn_sqr_mont: |
| mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] |
| mulx $apj,$mul0,$tmp0 !prologue! |
| and $car0,$mask,$acc0 |
| add %sp,$bias+$frame,$tp |
| ld [$ap+8],$apj !prologue! |
| |
| mulx $n0,$acc0,$mul1 ! "t[0]"*n0 |
| srlx $car0,32,$car0 |
| and $mul1,$mask,$mul1 |
| |
| mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 |
| mulx $npj,$mul1,$acc1 !prologue! |
| and $car0,1,$sbit |
| ld [$np+8],$npj !prologue! |
| srlx $car0,1,$car0 |
| add $acc0,$car1,$car1 |
| srlx $car1,32,$car1 |
| mov $tmp0,$acc0 !prologue! |
| |
| .Lsqr_1st: |
| mulx $apj,$mul0,$tmp0 |
| mulx $npj,$mul1,$tmp1 |
| add $acc0,$car0,$car0 ! ap[j]*a0+c0 |
| add $acc1,$car1,$car1 |
| ld [$ap+$j],$apj ! ap[j] |
| and $car0,$mask,$acc0 |
| ld [$np+$j],$npj ! np[j] |
| srlx $car0,32,$car0 |
| add $acc0,$acc0,$acc0 |
| or $sbit,$acc0,$acc0 |
| mov $tmp1,$acc1 |
| srlx $acc0,32,$sbit |
| add $j,4,$j ! j++ |
| and $acc0,$mask,$acc0 |
| cmp $j,$num |
| add $acc0,$car1,$car1 |
| st $car1,[$tp] |
| mov $tmp0,$acc0 |
| srlx $car1,32,$car1 |
| bl %icc,.Lsqr_1st |
| add $tp,4,$tp ! tp++ |
| !.Lsqr_1st |
| |
| mulx $apj,$mul0,$tmp0 ! epilogue |
| mulx $npj,$mul1,$tmp1 |
| add $acc0,$car0,$car0 ! ap[j]*a0+c0 |
| add $acc1,$car1,$car1 |
| and $car0,$mask,$acc0 |
| srlx $car0,32,$car0 |
| add $acc0,$acc0,$acc0 |
| or $sbit,$acc0,$acc0 |
| srlx $acc0,32,$sbit |
| and $acc0,$mask,$acc0 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp] |
| srlx $car1,32,$car1 |
| |
| add $tmp0,$car0,$car0 ! ap[j]*a0+c0 |
| add $tmp1,$car1,$car1 |
| and $car0,$mask,$acc0 |
| srlx $car0,32,$car0 |
| add $acc0,$acc0,$acc0 |
| or $sbit,$acc0,$acc0 |
| srlx $acc0,32,$sbit |
| and $acc0,$mask,$acc0 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp+4] |
| srlx $car1,32,$car1 |
| |
| add $car0,$car0,$car0 |
| or $sbit,$car0,$car0 |
| add $car0,$car1,$car1 |
| st $car1,[$tp+8] |
| srlx $car1,32,$car2 |
| |
| ld [%sp+$bias+$frame],$tmp0 ! tp[0] |
| ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] |
| ld [%sp+$bias+$frame+8],$tpj ! tp[2] |
| ld [$ap+4],$mul0 ! ap[1] |
| ld [$ap+8],$apj ! ap[2] |
| ld [$np],$car1 ! np[0] |
| ld [$np+4],$npj ! np[1] |
| mulx $n0,$tmp0,$mul1 |
| |
| mulx $mul0,$mul0,$car0 |
| and $mul1,$mask,$mul1 |
| |
| mulx $car1,$mul1,$car1 |
| mulx $npj,$mul1,$acc1 |
| add $tmp0,$car1,$car1 |
| and $car0,$mask,$acc0 |
| ld [$np+8],$npj ! np[2] |
| srlx $car1,32,$car1 |
| add $tmp1,$car1,$car1 |
| srlx $car0,32,$car0 |
| add $acc0,$car1,$car1 |
| and $car0,1,$sbit |
| add $acc1,$car1,$car1 |
| srlx $car0,1,$car0 |
| mov 12,$j |
| st $car1,[%sp+$bias+$frame] ! tp[0]= |
| srlx $car1,32,$car1 |
| add %sp,$bias+$frame+4,$tp |
| |
| .Lsqr_2nd: |
| mulx $apj,$mul0,$acc0 |
| mulx $npj,$mul1,$acc1 |
| add $acc0,$car0,$car0 |
| add $tpj,$car1,$car1 |
| ld [$ap+$j],$apj ! ap[j] |
| and $car0,$mask,$acc0 |
| ld [$np+$j],$npj ! np[j] |
| srlx $car0,32,$car0 |
| add $acc1,$car1,$car1 |
| ld [$tp+8],$tpj ! tp[j] |
| add $acc0,$acc0,$acc0 |
| add $j,4,$j ! j++ |
| or $sbit,$acc0,$acc0 |
| srlx $acc0,32,$sbit |
| and $acc0,$mask,$acc0 |
| cmp $j,$num |
| add $acc0,$car1,$car1 |
| st $car1,[$tp] ! tp[j-1] |
| srlx $car1,32,$car1 |
| bl %icc,.Lsqr_2nd |
| add $tp,4,$tp ! tp++ |
| !.Lsqr_2nd |
| |
| mulx $apj,$mul0,$acc0 |
| mulx $npj,$mul1,$acc1 |
| add $acc0,$car0,$car0 |
| add $tpj,$car1,$car1 |
| and $car0,$mask,$acc0 |
| srlx $car0,32,$car0 |
| add $acc1,$car1,$car1 |
| add $acc0,$acc0,$acc0 |
| or $sbit,$acc0,$acc0 |
| srlx $acc0,32,$sbit |
| and $acc0,$mask,$acc0 |
| add $acc0,$car1,$car1 |
| st $car1,[$tp] ! tp[j-1] |
| srlx $car1,32,$car1 |
| |
| add $car0,$car0,$car0 |
| or $sbit,$car0,$car0 |
| add $car0,$car1,$car1 |
| add $car2,$car1,$car1 |
| st $car1,[$tp+4] |
| srlx $car1,32,$car2 |
| |
| ld [%sp+$bias+$frame],$tmp1 ! tp[0] |
| ld [%sp+$bias+$frame+4],$tpj ! tp[1] |
| ld [$ap+8],$mul0 ! ap[2] |
| ld [$np],$car1 ! np[0] |
| ld [$np+4],$npj ! np[1] |
| mulx $n0,$tmp1,$mul1 |
| and $mul1,$mask,$mul1 |
| mov 8,$i |
| |
| mulx $mul0,$mul0,$car0 |
| mulx $car1,$mul1,$car1 |
| and $car0,$mask,$acc0 |
| add $tmp1,$car1,$car1 |
| srlx $car0,32,$car0 |
| add %sp,$bias+$frame,$tp |
| srlx $car1,32,$car1 |
| and $car0,1,$sbit |
| srlx $car0,1,$car0 |
| mov 4,$j |
| |
| .Lsqr_outer: |
| .Lsqr_inner1: |
| mulx $npj,$mul1,$acc1 |
| add $tpj,$car1,$car1 |
| add $j,4,$j |
| ld [$tp+8],$tpj |
| cmp $j,$i |
| add $acc1,$car1,$car1 |
| ld [$np+$j],$npj |
| st $car1,[$tp] |
| srlx $car1,32,$car1 |
| bl %icc,.Lsqr_inner1 |
| add $tp,4,$tp |
| !.Lsqr_inner1 |
| |
| add $j,4,$j |
| ld [$ap+$j],$apj ! ap[j] |
| mulx $npj,$mul1,$acc1 |
| add $tpj,$car1,$car1 |
| ld [$np+$j],$npj ! np[j] |
| add $acc0,$car1,$car1 |
| ld [$tp+8],$tpj ! tp[j] |
| add $acc1,$car1,$car1 |
| st $car1,[$tp] |
| srlx $car1,32,$car1 |
| |
| add $j,4,$j |
| cmp $j,$num |
| be,pn %icc,.Lsqr_no_inner2 |
| add $tp,4,$tp |
| |
| .Lsqr_inner2: |
| mulx $apj,$mul0,$acc0 |
| mulx $npj,$mul1,$acc1 |
| add $tpj,$car1,$car1 |
| add $acc0,$car0,$car0 |
| ld [$ap+$j],$apj ! ap[j] |
| and $car0,$mask,$acc0 |
| ld [$np+$j],$npj ! np[j] |
| srlx $car0,32,$car0 |
| add $acc0,$acc0,$acc0 |
| ld [$tp+8],$tpj ! tp[j] |
| or $sbit,$acc0,$acc0 |
| add $j,4,$j ! j++ |
| srlx $acc0,32,$sbit |
| and $acc0,$mask,$acc0 |
| cmp $j,$num |
| add $acc0,$car1,$car1 |
| add $acc1,$car1,$car1 |
| st $car1,[$tp] ! tp[j-1] |
| srlx $car1,32,$car1 |
| bl %icc,.Lsqr_inner2 |
| add $tp,4,$tp ! tp++ |
| |
| .Lsqr_no_inner2: |
| mulx $apj,$mul0,$acc0 |
| mulx $npj,$mul1,$acc1 |
| add $tpj,$car1,$car1 |
| add $acc0,$car0,$car0 |
| and $car0,$mask,$acc0 |
| srlx $car0,32,$car0 |
| add $acc0,$acc0,$acc0 |
| or $sbit,$acc0,$acc0 |
| srlx $acc0,32,$sbit |
| and $acc0,$mask,$acc0 |
| add $acc0,$car1,$car1 |
| add $acc1,$car1,$car1 |
| st $car1,[$tp] ! tp[j-1] |
| srlx $car1,32,$car1 |
| |
| add $car0,$car0,$car0 |
| or $sbit,$car0,$car0 |
| add $car0,$car1,$car1 |
| add $car2,$car1,$car1 |
| st $car1,[$tp+4] |
| srlx $car1,32,$car2 |
| |
| add $i,4,$i ! i++ |
| ld [%sp+$bias+$frame],$tmp1 ! tp[0] |
| ld [%sp+$bias+$frame+4],$tpj ! tp[1] |
| ld [$ap+$i],$mul0 ! ap[j] |
| ld [$np],$car1 ! np[0] |
| ld [$np+4],$npj ! np[1] |
| mulx $n0,$tmp1,$mul1 |
| and $mul1,$mask,$mul1 |
| add $i,4,$tmp0 |
| |
| mulx $mul0,$mul0,$car0 |
| mulx $car1,$mul1,$car1 |
| and $car0,$mask,$acc0 |
| add $tmp1,$car1,$car1 |
| srlx $car0,32,$car0 |
| add %sp,$bias+$frame,$tp |
| srlx $car1,32,$car1 |
| and $car0,1,$sbit |
| srlx $car0,1,$car0 |
| |
| cmp $tmp0,$num ! i<num-1 |
| bl %icc,.Lsqr_outer |
| mov 4,$j |
| |
| .Lsqr_last: |
| mulx $npj,$mul1,$acc1 |
| add $tpj,$car1,$car1 |
| add $j,4,$j |
| ld [$tp+8],$tpj |
| cmp $j,$i |
| add $acc1,$car1,$car1 |
| ld [$np+$j],$npj |
| st $car1,[$tp] |
| srlx $car1,32,$car1 |
| bl %icc,.Lsqr_last |
| add $tp,4,$tp |
| !.Lsqr_last |
| |
| mulx $npj,$mul1,$acc1 |
| add $tpj,$car1,$car1 |
| add $acc0,$car1,$car1 |
| add $acc1,$car1,$car1 |
| st $car1,[$tp] |
| srlx $car1,32,$car1 |
| |
| add $car0,$car0,$car0 ! recover $car0 |
| or $sbit,$car0,$car0 |
| add $car0,$car1,$car1 |
| add $car2,$car1,$car1 |
| st $car1,[$tp+4] |
| srlx $car1,32,$car2 |
| |
| ba .Ltail |
| add $tp,8,$tp |
| .type $fname,#function |
| .size $fname,(.-$fname) |
| .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" |
| .align 32 |
| ___ |
| $code =~ s/\`([^\`]*)\`/eval($1)/gem; |
| print $code; |
| close STDOUT; |