#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.

# Directory part of the path this script was started with, including the
# trailing separator; used further down to locate opensslconf.h.  The
# capture is only consumed when the match succeeds, so $dir is the empty
# string (never undefined or a stale $1) when $0 carries no directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> [<output file>]
$flavour = shift;	# a flavour containing "64" selects the 64-bit ABI
$output = shift;	# file that receives the generated assembly

# Send everything printed at the end of the script to the output file.
# Without an output argument STDOUT is left alone.  A file that cannot be
# created is a hard error: carrying on would quietly write the assembly
# to the inherited STDOUT and leave the requested file missing or stale.
if (defined($output) && $output ne "") {
    open(STDOUT,">",$output) || die "can't open $output: $!";
}

# ABI-dependent parameters, picked by the flavour given on the command line:
#
#   $LEVEL         operand of the .LEVEL directive
#   $SIZE_T        size of a general register save slot, in bytes
#   $FRAME_MARKER  size of the frame marker area, in bytes
#   $SAVED_RP      distance below %sp at which the prologue stores %r2
#   $PUSH/$POP     store/load of one register-sized value
#   $PUSHMA        store used by the prologue to allocate the frame
#   $POPMB         load used by the epilogue to release the frame
($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP,$PUSH,$PUSHMA,$POP,$POPMB) =
    ($flavour =~ /64/)
    ? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
    : ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");

$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                [+ argument transfer]

# Return the size in bytes of one RC4 key-schedule element as configured in
# the header CONF_PATH: 1 when the first "#define RC4_INT ..." line ends in
# "char", 4 for any other definition.  Returns 1 (RC4_CHAR) when the header
# cannot be opened or contains no RC4_INT definition.
#
# Trailing white space after the type is ignored, so a header with CR/LF
# line endings or trailing blanks is not mistaken for a non-char RC4_INT.
sub _rc4_int_size {
    my ($conf_path) = @_;
    my $size = 1;			# defaults to RC4_CHAR

    open(my $conf,"<",$conf_path) || return $size;
    while (my $line = <$conf>) {
	if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
	    $size = ($1 =~ /char\s*$/) ? 1 : 4;
	    last;			# only the first definition counts
	}
    }
    close($conf);

    return $size;
}

# The header is looked up two directories above this script.
$SZ=_rc4_int_size((defined($dir) ? $dir : "")."../../opensslconf.h");

# Mnemonics used to access key-schedule elements of $SZ bytes:
#
#   $LD   load at a constant displacement
#   $LDX  load at base + index
#   $MKX  compute the address of an indexed element into a register
#   $ST   store at a constant displacement
#
# First row is RC4_CHAR, second row is RC4_INT (~5% faster than RC4_CHAR
# on PA-7100LC).
($LD,$LDX,$MKX,$ST) = ($SZ==1)
    ? ("ldb", "ldbx",   "addl",    "stb")
    : ("ldw", "ldwx,s", "sh2addl", "stw");

# Registers holding the arguments on entry to the generated routines.
($key,$len,$inp,$out) = ("%r26","%r25","%r24","%r23");

# RC4 state kept in registers.  The @XX/@TX pairs are swapped after every
# round emitted by unrolledloopbody.
@XX=("%r19","%r20");		# index x: current, next
@TX=("%r21","%r22");		# key[x] for the matching @XX entry
$YY="%r28";			# index y
$TY="%r29";			# key[y], then the key-stream index

# Temporaries.  %r2 through %r6 are stored by RC4's prologue and reloaded
# before it returns.
($acc,$ix,$iy,$dat0,$dat1,$rem) = ("%r1","%r2","%r3","%r4","%r5","%r6");
$mask="%r31";			# loaded with 0xff

# Append four RC4 rounds (enough key stream for one output word) to $code.
#
# One round, with x=$XX[0], key[x]=$TX[0] and y=$YY on entry:
#   - preloads the next x and its key[x] into $XX[1]/$TX[1];
#   - stores key[x] to key[y] and key[y] to key[x] (the swap);
#   - leaves the key-stream index (key[x]+key[y])&0xff in $TY;
#   - advances y by the next key[x].
# When the next x equals y, the preloaded key[x] predates the store to
# key[y]; the comclr/copy pair then substitutes the value just stored.
# From the second round on, the key-stream byte selected by the previous
# round is loaded into $dat1 and deposited into $acc, 8 bits further along
# each time (the first deposit is a zdep).  The byte selected by the fourth
# round is not handled here; the caller loads and merges it.
#
# The `...` spans are expanded by the substitution at the end of the file.
# $i is deliberately a global: its value is interpolated into those spans.
sub unrolledloopbody {
    $i = 0;
    while ($i < 4) {
	$code .= <<___;
	ldo 1($XX[0]),$XX[1]
	`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
	and $mask,$XX[1],$XX[1]
	$LDX $YY($key),$TY
	$MKX $YY,$key,$ix
	$LDX $XX[1]($key),$TX[1]
	$MKX $XX[0],$key,$iy
	$ST $TX[0],0($ix)
	comclr,<> $XX[1],$YY,%r0 ; conditional
	copy $TX[0],$TX[1] ; move
	`sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
	$ST $TY,0($iy)
	addl $TX[0],$TY,$TY
	addl $TX[1],$YY,$YY
	and $mask,$TY,$TY
	and $mask,$YY,$YY
___
	# "rotate" registers: this round's "next" pair becomes the
	# following round's "current" pair
	@TX[0,1] = @TX[1,0];
	@XX[0,1] = @XX[1,0];
	$i++;
    }
}

# Append a one-byte-per-iteration RC4 loop to $code.
#
#   1st argument: label placed at the top of the loop
#   2nd argument: register holding the iteration count; the loop decrements
#                 it and repeats until it reaches zero
#
# Each iteration swaps key[x] and key[y], steps x and reloads key[x] for the
# following iteration, XORs the key-stream byte with the input byte found at
# $inp+$out, stores the result through $out, and advances $out and y.
sub foldedloop {
    my $loop_label = shift;
    my $counter    = shift;

    $code .= <<___;
$loop_label
	$MKX $YY,$key,$iy
	$LDX $YY($key),$TY
	$MKX $XX[0],$key,$ix
	$ST $TX[0],0($iy)
	ldo 1($XX[0]),$XX[0]
	$ST $TY,0($ix)
	addl $TX[0],$TY,$TY
	ldbx $inp($out),$dat1
	and $mask,$TY,$TY
	and $mask,$XX[0],$XX[0]
	$LDX $TY($key),$acc
	$LDX $XX[0]($key),$TX[0]
	ldo 1($out),$out
	xor $dat1,$acc,$acc
	addl $TX[0],$YY,$YY
	stb $acc,-1($out)
	addib,<> -1,$counter,$loop_label ; $counter is always small
	and $mask,$YY,$YY
___
}

# ---------------------------------------------------------------------
# RC4 routine.
#
# Prologue: store %r2, allocate the frame while storing %r3, store
# %r4-%r6.  A zero $len returns straight away through L$abort.  $inp is
# turned into the distance from $out, so a single pointer is advanced.
# x and y are loaded from the first two elements at $key, and $key is
# moved past them.  The "warm up" steps x and y once ahead of the loops;
# the "chill out" code at L$done steps them back before storing them.
# Short inputs (6 >= $len, unsigned) go directly to the byte loop L$oop1.
# ---------------------------------------------------------------------
$code = <<___;
	.LEVEL $LEVEL
	.SPACE \$TEXT\$
	.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
	$PUSHMA %r3,$FRAME(%sp)
	$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*= 0,$len,L\$abort
	sub $inp,$out,$inp ; distance between $inp and $out

	$LD `0*$SZ`($key),$XX[0]
	$LD `1*$SZ`($key),$YY
	ldo `2*$SZ`($key),$key

	ldi 0xff,$mask
	ldi 3,$dat0

	ldo 1($XX[0]),$XX[0] ; warm up loop
	and $mask,$XX[0],$XX[0]
	$LDX $XX[0]($key),$TX[0]
	addl $TX[0],$YY,$YY
	cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
	and $mask,$YY,$YY

	and,<> $out,$dat0,$rem ; is $out aligned?
	b L\$alignedout
	subi 4,$rem,$rem
	sub $len,$rem,$len
___
foldedloop("L\$alignout",$rem);	# process till $out is aligned

# With $out word-aligned, the low two bits of the $inp distance tell whether
# the input is word-aligned as well.  If not, input words are read from
# aligned addresses and adjacent words are merged with vshd, using a shift
# amount loaded into %cr11.
$code .= <<___;
L\$alignedout ; $len is at least 4 here
	and,<> $inp,$dat0,$acc ; is $inp aligned?
	b L\$oop4
	sub $inp,$acc,$rem ; align $inp

	sh3addl $acc,%r0,$acc
	subi 32,$acc,$acc
	mtctl $acc,%cr11 ; load %sar with vshd align factor
	ldwx $rem($out),$dat0
	ldo 4($rem),$rem
L\$oop4misalignedinp
___
unrolledloopbody();

# Tail of the misaligned-input word loop: merge the fourth key-stream byte,
# XOR with the realigned input word, store, and repeat while 3 << $len
# (unsigned).  The word loop for aligned input starts at L$oop4.
$code .= <<___;
	$LDX $TY($key),$ix
	ldwx $rem($out),$dat1
	ldo -4($len),$len
	or $ix,$acc,$acc ; last piece, no need to dep
	vshd $dat0,$dat1,$iy ; align data
	copy $dat1,$dat0
	xor $iy,$acc,$acc
	stw $acc,0($out)
	cmpib,*<< 3,$len,L\$oop4misalignedinp
	ldo 4($out),$out
	cmpib,*= 0,$len,L\$done
	nop
	b L\$oop1
	nop

	.ALIGN 8
L\$oop4
___
unrolledloopbody();

# Tail of the aligned-input word loop; leftover bytes fall through to the
# byte loop emitted after it.
$code .= <<___;
	$LDX $TY($key),$ix
	ldwx $inp($out),$dat0
	ldo -4($len),$len
	or $ix,$acc,$acc ; last piece, no need to dep
	xor $dat0,$acc,$acc
	stw $acc,0($out)
	cmpib,*<< 3,$len,L\$oop4
	ldo 4($out),$out
	cmpib,*= 0,$len,L\$done
	nop
___
foldedloop("L\$oop1",$len);

# Epilogue: reload %r2, undo the warm-up step, store x and y back into the
# two elements preceding $key, reload %r4-%r6, and return while reloading
# %r3 and releasing the frame.
$code .= <<___;
L\$done
	$POP `-$FRAME-$SAVED_RP`(%sp),%r2
	ldo -1($XX[0]),$XX[0] ; chill out loop
	sub $YY,$TX[0],$YY
	and $mask,$XX[0],$XX[0]
	and $mask,$YY,$YY
	$ST $XX[0],`-2*$SZ`($key)
	$ST $YY,`-1*$SZ`($key)
	$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv (%r2)
	.EXIT
	$POPMB -$FRAME(%sp),%r3
	.PROCEND
___

# ---------------------------------------------------------------------
# private_RC4_set_key and RC4_options.
#
# private_RC4_set_key, with $key, $len and $inp holding its three arguments:
#   - stores zero to the first two elements at $key (x and y) and moves
#     $key past them;
#   - L$1st fills the 256 entries that follow with 0..255;
#   - L$2nd runs i over 0..255: j = (j + key[i] + next key byte) & 0xff,
#     then key[i] and key[j] are exchanged.  The key bytes are addressed
#     through a negative index (%r23) relative to the end of the key
#     material; the index is reset to -$len whenever it wraps to zero.
#
# RC4_options returns in %r28 the address of the first string at L$opts,
# computed relative to L$pic so that no absolute address is needed.
#
# Array elements are interpolated as \$XX[0], \$TX[1], ... rather than as
# single-element slices (\@XX[0]); the generated text is the same.
# ---------------------------------------------------------------------
$code.=<<___;

	.EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN 8
private_RC4_set_key
	.PROC
	.CALLINFO NO_CALLS
	.ENTRY
	$ST %r0,`0*$SZ`($key)
	$ST %r0,`1*$SZ`($key)
	ldo `2*$SZ`($key),$key
	copy %r0,$XX[0]
L\$1st
	$ST $XX[0],0($key)
	ldo 1($XX[0]),$XX[0]
	bb,>= $XX[0],`31-8`,L\$1st ; $XX[0]<256
	ldo $SZ($key),$key

	ldo `-256*$SZ`($key),$key ; rewind $key
	addl $len,$inp,$inp ; $inp to point at the end
	sub %r0,$len,%r23 ; inverse index
	copy %r0,$XX[0]
	copy %r0,$XX[1]
	ldi 0xff,$mask

L\$2nd
	$LDX $XX[0]($key),$TX[0]
	ldbx %r23($inp),$TX[1]
	addi,nuv 1,%r23,%r23 ; increment and conditional
	sub %r0,$len,%r23 ; inverse index
	addl $TX[0],$XX[1],$XX[1]
	addl $TX[1],$XX[1],$XX[1]
	and $mask,$XX[1],$XX[1]
	$MKX $XX[0],$key,$TY
	$LDX $XX[1]($key),$TX[1]
	$MKX $XX[1],$key,$YY
	ldo 1($XX[0]),$XX[0]
	$ST $TX[0],0($YY)
	bb,>= $XX[0],`31-8`,L\$2nd ; $XX[0]<256
	$ST $TX[1],0($TY)

	bv,n (%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT RC4_options,ENTRY
	.ALIGN 8
RC4_options
	.PROC
	.CALLINFO NO_CALLS
	.ENTRY
	blr %r0,%r28
	ldi 3,%r1
L\$pic
	andcm %r28,%r1,%r28
	bv (%r2)
	.EXIT
	ldo L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN 8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Return TEXT with every `...` span replaced by the value of the Perl
# expression it encloses (a span whose expression yields nothing becomes
# empty).  Dies with the offending expression when one fails to evaluate,
# instead of quietly emitting an empty operand into the assembly.
sub _expand_backticks {
    my ($text) = @_;
    $text =~ s{\`([^\`]*)\`}{
	my $expr = $1;
	my $value = eval $expr;
	die "cannot evaluate '$expr': $@" if $@;
	$value;
    }gem;
    return $text;
}

$code = _expand_backticks($code);

# The 32-bit flavour uses the comib mnemonic in place of cmpib,*.
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);

# Buffered write errors (full disk, closed pipe) only surface on print or
# close, so both are checked.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";