#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x that of gcc 3.4.6 on z10. The coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 50% better than code generated by gcc 4.3.

# Command line: <flavour> [other arguments ...] [<output file>]
#
# The first argument selects the build flavour.  A flavour containing
# "31" or "32" selects 4-byte register save slots and the plain stm/lm
# instructions; anything else (including no flavour at all) selects
# 8-byte slots and the "g"-suffixed stmg/lmg forms.
$flavour = shift;

if ($flavour =~ /3[12]/) {
    $SIZE_T=4;
    $g="";
} else {
    $SIZE_T=8;
    $g="g";
}

# Consume arguments until one looks like a file name ("name.ext") or the
# argument list runs out; that argument, if any, names the output file.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually found, and stop
# with a diagnostic rather than carrying on after a failed open.  The
# three-argument form keeps the name from ever being parsed as a mode.
if ($output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}

$rp="%r14";     # register the generated routines branch to on return
$sp="%r15";     # base register of the register save area
$code=<<___;
.text

___

# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
#
# Appends the RC4() routine to $code.  As the generated loads and stores
# show, the key structure keeps the x index in byte 0, the y index in
# byte 1 and the byte-wide state table from offset 2 onwards.
{
# %r0/%r1 are scratch (accumulator and group counter); %r2..%r5 carry
# the four arguments in order.
($acc,$cnt,$key,$len,$inp,$out) = map { "%r$_" } (0..5);

# Working set in %r6..%r11, saved by the prologue and restored by the
# epilogue.  @XX and @TX are two-element register pairs holding the
# current/next x index and the table bytes found at those indices.
@XX = ("%r6","%r7");
@TX = ("%r8","%r9");
($YY,$TY) = ("%r10","%r11");

# Entry point: save the working registers.
$code.=<<___;
.globl RC4
.type RC4,\@function
.align 64
RC4:
stm${g} %r6,%r11,6*$SIZE_T($sp)
___

# Emitted for the 31/32-bit flavours only, ahead of the 64-bit
# arithmetic performed on the length below.
if ($flavour =~ /3[12]/) {
    $code.=<<___;
llgfr $len,$len
___
}

# Load x and y, advance x by one (mod 256), fetch the table byte at the
# new x and compute the number of 8-byte groups (len >> 3).  With no
# complete group, control goes straight to the byte-wise tail.
$code.=<<___;
llgc $XX[0],0($key)
llgc $YY,1($key)
la $XX[0],1($XX[0])
nill $XX[0],0xff
srlg $cnt,$len,3
ltgr $cnt,$cnt
llgc $TX[0],2($XX[0],$key)
jz .Lshort
j .Loop8

.align 64
.Loop8:
___

# Eight unrolled steps per loop iteration.  From the second step on,
# each step first appends the key-stream byte selected by the previous
# step's $TY to $acc: a plain load for the first byte, shift-and-insert
# for the following ones.
for my $i (0 .. 7) {
    # y += table[x] (mod 256); next x = x + 1 (mod 256).
    $code.=<<___;
la $YY,0($YY,$TX[0]) # $i
nill $YY,255
la $XX[1],1($XX[0])
nill $XX[1],255
___

    if ($i == 1) {
        $code.=<<___;
llgc $acc,2($TY,$key)
___
    }
    elsif ($i > 1) {
        $code.=<<___;
sllg $acc,$acc,8
ic $acc,2($TY,$key)
___
    }

    # Swap table[x] and table[y] and pre-load table[next x]; when next x
    # equals y the value is taken from the register that was just stored
    # there.  $TY ends up as the table index of this step's output byte.
    $code.=<<___;
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
llgc $TX[1],2($XX[1],$key)
stc $TY,2($XX[0],$key)
cr $XX[1],$YY
jne .Lcmov$i
la $TX[1],0($TX[0])
.Lcmov$i:
la $TY,0($TY,$TX[0])
nill $TY,255
___

    # The "next" registers become the "current" ones for the next step.
    @TX = @TX[1,0];
    @XX = @XX[1,0];
}

# Finish the group: fetch 8 input bytes, append the eighth key-stream
# byte, XOR, store 8 output bytes and loop while groups remain.
$code.=<<___;
lg $TX[1],0($inp)
sllg $acc,$acc,8
la $inp,8($inp)
ic $acc,2($TY,$key)
xgr $acc,$TX[1]
stg $acc,0($out)
la $out,8($out)
brctg $cnt,.Loop8

___

# Tail: reduce len to len & 7; nothing left means we are done.
$code.=<<___;
.Lshort:
lghi $acc,7
ngr $len,$acc
jz .Lexit
j .Loop1

___

# Remaining bytes, one per iteration.
$code.=<<___;
.align 16
.Loop1:
la $YY,0($YY,$TX[0])
nill $YY,255
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
stc $TY,2($XX[0],$key)
ar $TY,$TX[0]
ahi $XX[0],1
nill $TY,255
nill $XX[0],255
llgc $acc,0($inp)
la $inp,1($inp)
llgc $TY,2($TY,$key)
llgc $TX[0],2($XX[0],$key)
xr $acc,$TY
stc $acc,0($out)
la $out,1($out)
brct $len,.Loop1

___

# x was kept one step ahead, so step it back before writing x and y to
# the key structure; then restore the saved registers and return.
$code.=<<___;
.Lexit:
ahi $XX[0],-1
stc $XX[0],0($key)
stc $YY,1($key)
lm${g} %r6,%r11,6*$SIZE_T($sp)
br $rp
.size RC4,.-RC4
.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}

# void private_RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
#
# Appends the key-setup routine to $code.  Note that the symbol emitted
# is private_RC4_set_key.
{
# Register roles, %r0 through %r8 in order.  %r2..%r4 are the incoming
# arguments; %r6..%r8 are saved on entry and restored on exit.
($cnt,$idx,$key,$len,$inp,$acc,$dat,$ikey,$iinp) = map { "%r$_" } (0..8);

# Save registers, store a zero halfword over bytes 0..1 of the key
# structure, then fill the table at offset 2 with 0,1,...,255.
$code.=<<___;
.globl private_RC4_set_key
.type private_RC4_set_key,\@function
.align 64
private_RC4_set_key:
stm${g} %r6,%r8,6*$SIZE_T($sp)
lhi $cnt,256
la $idx,0(%r0)
sth $idx,0($key)
.align 4
.L1stloop:
stc $idx,2($idx,$key)
la $idx,1($idx)
brct $cnt,.L1stloop

___

# Mixing pass.  $ikey runs from -256 upwards and addresses the table as
# 2+256($ikey,$key); the tml on its low 8 bits, issued right after the
# increment, drives the jz that ends the loop.  $iinp walks the key
# bytes and, together with $cnt, is reset each time len bytes have been
# consumed.  Each step does idx = (idx + table[i] + inp[iinp]) & 255 and
# swaps table[i] with table[idx].
$code.=<<___;
lghi $ikey,-256
lr $cnt,$len
la $iinp,0(%r0)
la $idx,0(%r0)
.align 16
.L2ndloop:
llgc $acc,2+256($ikey,$key)
llgc $dat,0($iinp,$inp)
la $idx,0($idx,$acc)
la $ikey,1($ikey)
la $idx,0($idx,$dat)
nill $idx,255
la $iinp,1($iinp)
tml $ikey,255
llgc $dat,2($idx,$key)
stc $dat,2+256-1($ikey,$key)
stc $acc,2($idx,$key)
jz .Ldone
brct $cnt,.L2ndloop
lr $cnt,$len
la $iinp,0(%r0)
j .L2ndloop
___

# Restore the saved registers and return.
$code.=<<___;
.Ldone:
lm${g} %r6,%r8,6*$SIZE_T($sp)
br $rp
.size private_RC4_set_key,.-private_RC4_set_key

___
}

# const char *RC4_options()
#
# Appends RC4_options() to $code: the routine loads the address of the
# .Loptions label into %r2 and branches to %r14.  The label marks the
# string "rc4(8x,char)", which is placed in .rodata.
$code.=<<___;
.globl RC4_options
.type RC4_options,\@function
.align 16
RC4_options:
larl %r2,.Loptions
br %r14
.size RC4_options,.-RC4_options
.section .rodata
.Loptions:
.align 8
.string "rc4(8x,char)"
___

# Emit everything that was accumulated.  Output is buffered, so a write
# failure may only become visible when the handle is closed; check the
# close so that a truncated assembly file is never reported as success.
print $code;
close STDOUT or die "error closing STDOUT: $!";