1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# SHA256/512 block procedure for PA-RISC. 11 12# June 2009. 13# 14# SHA256 performance is >75% better than gcc 3.2 generated code on 15# PA-7100LC. Compared to code generated by vendor compiler this 16# implementation is almost 70% faster in 64-bit build, but delivers 17# virtually same performance in 32-bit build on PA-8600. 18# 19# SHA512 performance is >2.9x better than gcc 3.2 generated code on 20# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the 21# code is executed on PA-RISC 2.0 processor and switches to 64-bit 22# code path delivering adequate performance even in "blended" 32-bit 23# build. Though 64-bit code is not any faster than code generated by 24# vendor compiler on PA-8600... 25# 26# Special thanks to polarhome.com for providing HP-UX account. 27 28$flavour = shift; 29$output = shift; 30open STDOUT,">$output"; 31 32if ($flavour =~ /64/) { 33 $LEVEL ="2.0W"; 34 $SIZE_T =8; 35 $FRAME_MARKER =80; 36 $SAVED_RP =16; 37 $PUSH ="std"; 38 $PUSHMA ="std,ma"; 39 $POP ="ldd"; 40 $POPMB ="ldd,mb"; 41} else { 42 $LEVEL ="1.0"; 43 $SIZE_T =4; 44 $FRAME_MARKER =48; 45 $SAVED_RP =20; 46 $PUSH ="stw"; 47 $PUSHMA ="stwm"; 48 $POP ="ldw"; 49 $POPMB ="ldwm"; 50} 51 52if ($output =~ /512/) { 53 $func="sha512_block_data_order"; 54 $SZ=8; 55 @Sigma0=(28,34,39); 56 @Sigma1=(14,18,41); 57 @sigma0=(1, 8, 7); 58 @sigma1=(19,61, 6); 59 $rounds=80; 60 $LAST10BITS=0x017; 61 $LD="ldd"; 62 $LDM="ldd,ma"; 63 $ST="std"; 64} else { 65 $func="sha256_block_data_order"; 66 $SZ=4; 67 @Sigma0=( 2,13,22); 68 @Sigma1=( 6,11,25); 69 @sigma0=( 7,18, 3); 70 @sigma1=(17,19,10); 71 $rounds=64; 72 $LAST10BITS=0x0f2; 73 $LD="ldw"; 74 $LDM="ldwm"; 75 $ST="stw"; 76} 77 78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker 79 # [+ argument transfer] 80$XOFF=16*$SZ+32; # local variables 81$FRAME+=$XOFF; 82$XOFF+=$FRAME_MARKER; # distance between %sp and local variables 83 84$ctx="%r26"; # zapped by $a0 85$inp="%r25"; # zapped by $a1 86$num="%r24"; # zapped by $t0 87 88$a0 ="%r26"; 89$a1 ="%r25"; 90$t0 ="%r24"; 91$t1 ="%r29"; 92$Tbl="%r31"; 93 94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); 95 96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", 97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); 98 99sub ROUND_00_15 { 100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 101$code.=<<___; 102 _ror $e,$Sigma1[0],$a0 103 and $f,$e,$t0 104 _ror $e,$Sigma1[1],$a1 105 addl $t1,$h,$h 106 andcm $g,$e,$t1 107 xor $a1,$a0,$a0 108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 109 or $t0,$t1,$t1 ; Ch(e,f,g) 110 addl @X[$i%16],$h,$h 111 xor $a0,$a1,$a1 ; Sigma1(e) 112 addl $t1,$h,$h 113 _ror $a,$Sigma0[0],$a0 114 addl $a1,$h,$h 115 116 _ror $a,$Sigma0[1],$a1 117 and $a,$b,$t0 118 and $a,$c,$t1 119 xor $a1,$a0,$a0 120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 121 xor $t1,$t0,$t0 122 and $b,$c,$t1 123 xor $a0,$a1,$a1 ; Sigma0(a) 124 addl $h,$d,$d 125 xor $t1,$t0,$t0 ; Maj(a,b,c) 126 `"$LDM $SZ($Tbl),$t1" if ($i<15)` 127 addl $a1,$h,$h 128 addl $t0,$h,$h 129 130___ 131} 132 133sub ROUND_16_xx { 134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 135$i-=16; 136$code.=<<___; 137 _ror @X[($i+1)%16],$sigma0[0],$a0 138 _ror @X[($i+1)%16],$sigma0[1],$a1 139 addl @X[($i+9)%16],@X[$i],@X[$i] 140 _ror @X[($i+14)%16],$sigma1[0],$t0 141 _ror @X[($i+14)%16],$sigma1[1],$t1 142 xor $a1,$a0,$a0 143 _shr @X[($i+1)%16],$sigma0[2],$a1 144 xor $t1,$t0,$t0 145 _shr @X[($i+14)%16],$sigma1[2],$t1 146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) 147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) 148 $LDM $SZ($Tbl),$t1 149 addl $a0,@X[$i],@X[$i] 150 addl $t0,@X[$i],@X[$i] 151___ 152$code.=<<___ if ($i==15); 153 extru $t1,31,10,$a1 154 comiclr,<> $LAST10BITS,$a1,%r0 155 ldo 1($Tbl),$Tbl ; signal end of $Tbl 156___ 157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); 158} 159 160$code=<<___; 161 .LEVEL $LEVEL 162 .text 163 164 .section .rodata 165 .ALIGN 64 166L\$table 167___ 168$code.=<<___ if ($SZ==8); 169 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd 170 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc 171 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 172 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 173 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe 174 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 175 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 176 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 177 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 178 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 179 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 180 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 181 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 182 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 183 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 184 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 185 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 186 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df 187 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 188 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b 189 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 190 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 191 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 192 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 193 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 194 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 195 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb 196 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 197 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 198 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec 199 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 200 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b 201 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 202 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 203 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 204 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b 205 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 206 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c 207 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a 208 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 209___ 210$code.=<<___ if ($SZ==4); 211 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 212 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 213 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 214 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 215 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 216 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 217 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 218 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 219 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 220 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 221 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 222 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 223 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 224 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 225 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 226 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 227___ 228$code.=<<___; 229 .previous 230 231 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR 232 .ALIGN 64 233$func 234 .PROC 235 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 236 .ENTRY 237 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 238 $PUSHMA %r3,$FRAME(%sp) 239 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) 240 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) 241 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) 242 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) 243 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) 244 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) 245 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) 246 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) 247 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) 248 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) 249 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) 250 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) 251 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) 252 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) 253 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) 254 255 _shl $num,`log(16*$SZ)/log(2)`,$num 256 addl $inp,$num,$num ; $num to point at the end of $inp 257 258 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments 259 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) 260 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) 261 262#ifdef __PIC__ 263 addil LT'L\$table, %r19 264 ldw RT'L\$table(%r1), $Tbl 265#else 266 ldil L'L\$table, %t1 267 ldo R'L\$table(%t1), $Tbl 268#endif 269___ 270$code.=<<___ if ($SZ==8 && $SIZE_T==4); 271#ifndef __OpenBSD__ 272___ 273$code.=<<___ if ($SZ==8 && $SIZE_T==4); 274 ldi 31,$t1 275 mtctl $t1,%cr11 276 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 277 b L\$parisc1 278 nop 279___ 280$code.=<<___; 281 $LD `0*$SZ`($ctx),$A ; load context 282 $LD `1*$SZ`($ctx),$B 283 $LD `2*$SZ`($ctx),$C 284 $LD `3*$SZ`($ctx),$D 285 $LD `4*$SZ`($ctx),$E 286 $LD `5*$SZ`($ctx),$F 287 $LD `6*$SZ`($ctx),$G 288 $LD `7*$SZ`($ctx),$H 289 290 extru $inp,31,`log($SZ)/log(2)`,$t0 291 sh3addl $t0,%r0,$t0 292 subi `8*$SZ`,$t0,$t0 293 mtctl $t0,%cr11 ; load %sar with align factor 294 295L\$oop 296 ldi `$SZ-1`,$t0 297 $LDM $SZ($Tbl),$t1 298 andcm $inp,$t0,$t0 ; align $inp 299___ 300 for ($i=0;$i<15;$i++) { # load input block 301 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } 302$code.=<<___; 303 cmpb,*= $inp,$t0,L\$aligned 304 $LD `$SZ*15`($t0),@X[15] 305 $LD `$SZ*16`($t0),@X[16] 306___ 307 for ($i=0;$i<16;$i++) { # align data 308 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } 309$code.=<<___; 310L\$aligned 311 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD 312___ 313 314for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } 315$code.=<<___; 316L\$rounds 317 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD 318___ 319for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } 320$code.=<<___; 321 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? 322 nop 323 324 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments 325 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp 326 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num 327 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl 328 329 $LD `0*$SZ`($ctx),@X[0] ; load context 330 $LD `1*$SZ`($ctx),@X[1] 331 $LD `2*$SZ`($ctx),@X[2] 332 $LD `3*$SZ`($ctx),@X[3] 333 $LD `4*$SZ`($ctx),@X[4] 334 $LD `5*$SZ`($ctx),@X[5] 335 addl @X[0],$A,$A 336 $LD `6*$SZ`($ctx),@X[6] 337 addl @X[1],$B,$B 338 $LD `7*$SZ`($ctx),@X[7] 339 ldo `16*$SZ`($inp),$inp ; advance $inp 340 341 $ST $A,`0*$SZ`($ctx) ; save context 342 addl @X[2],$C,$C 343 $ST $B,`1*$SZ`($ctx) 344 addl @X[3],$D,$D 345 $ST $C,`2*$SZ`($ctx) 346 addl @X[4],$E,$E 347 $ST $D,`3*$SZ`($ctx) 348 addl @X[5],$F,$F 349 $ST $E,`4*$SZ`($ctx) 350 addl @X[6],$G,$G 351 $ST $F,`5*$SZ`($ctx) 352 addl @X[7],$H,$H 353 $ST $G,`6*$SZ`($ctx) 354 $ST $H,`7*$SZ`($ctx) 355 356 cmpb,*<>,n $inp,$num,L\$oop 357 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp 358___ 359if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 360{{ 361$code.=<<___; 362 b L\$done 363 nop 364 365 .ALIGN 64 366L\$parisc1 367___ 368$code.=<<___ if ($SZ==8 && $SIZE_T==4); 369#endif 370___ 371 372@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, 373 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = 374 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", 375 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); 376$a0 ="%r17"; 377$a1 ="%r18"; 378$a2 ="%r19"; 379$a3 ="%r20"; 380$t0 ="%r21"; 381$t1 ="%r22"; 382$t2 ="%r28"; 383$t3 ="%r29"; 384$Tbl="%r31"; 385 386@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx 387 388sub ROUND_00_15_pa1 { 389my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, 390 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; 391my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; 392 393$code.=<<___ if (!$flag); 394 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi 395 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] 396___ 397$code.=<<___; 398 shd $ehi,$elo,$Sigma1[0],$t0 399 add $Xlo,$hlo,$hlo 400 shd $elo,$ehi,$Sigma1[0],$t1 401 addc $Xhi,$hhi,$hhi ; h += X[i] 402 shd $ehi,$elo,$Sigma1[1],$t2 403 ldwm 8($Tbl),$Xhi 404 shd $elo,$ehi,$Sigma1[1],$t3 405 ldw -4($Tbl),$Xlo ; load K[i] 406 xor $t2,$t0,$t0 407 xor $t3,$t1,$t1 408 and $flo,$elo,$a0 409 and $fhi,$ehi,$a1 410 shd $ehi,$elo,$Sigma1[2],$t2 411 andcm $glo,$elo,$a2 412 shd $elo,$ehi,$Sigma1[2],$t3 413 andcm $ghi,$ehi,$a3 414 xor $t2,$t0,$t0 415 xor $t3,$t1,$t1 ; Sigma1(e) 416 add $Xlo,$hlo,$hlo 417 xor $a2,$a0,$a0 418 addc $Xhi,$hhi,$hhi ; h += K[i] 419 xor $a3,$a1,$a1 ; Ch(e,f,g) 420 421 add $t0,$hlo,$hlo 422 shd $ahi,$alo,$Sigma0[0],$t0 423 addc $t1,$hhi,$hhi ; h += Sigma1(e) 424 shd $alo,$ahi,$Sigma0[0],$t1 425 add $a0,$hlo,$hlo 426 shd $ahi,$alo,$Sigma0[1],$t2 427 addc $a1,$hhi,$hhi ; h += Ch(e,f,g) 428 shd $alo,$ahi,$Sigma0[1],$t3 429 430 xor $t2,$t0,$t0 431 xor $t3,$t1,$t1 432 shd $ahi,$alo,$Sigma0[2],$t2 433 and $alo,$blo,$a0 434 shd $alo,$ahi,$Sigma0[2],$t3 435 and $ahi,$bhi,$a1 436 xor $t2,$t0,$t0 437 xor $t3,$t1,$t1 ; Sigma0(a) 438 439 and $alo,$clo,$a2 440 and $ahi,$chi,$a3 441 xor $a2,$a0,$a0 442 add $hlo,$dlo,$dlo 443 xor $a3,$a1,$a1 444 addc $hhi,$dhi,$dhi ; d += h 445 and $blo,$clo,$a2 446 add $t0,$hlo,$hlo 447 and $bhi,$chi,$a3 448 addc $t1,$hhi,$hhi ; h += Sigma0(a) 449 xor $a2,$a0,$a0 450 add $a0,$hlo,$hlo 451 xor $a3,$a1,$a1 ; Maj(a,b,c) 452 addc $a1,$hhi,$hhi ; h += Maj(a,b,c) 453 454___ 455$code.=<<___ if ($i==15 && $flag); 456 extru $Xlo,31,10,$Xlo 457 comiclr,= $LAST10BITS,$Xlo,%r0 458 b L\$rounds_pa1 459 nop 460___ 461push(@X,shift(@X)); push(@X,shift(@X)); 462} 463 464sub ROUND_16_xx_pa1 { 465my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; 466my ($i)=shift; 467$i-=16; 468$code.=<<___; 469 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi 470 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] 471 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 472 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] 473 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 474 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] 475 shd $Xnhi,$Xnlo,$sigma0[0],$t0 476 shd $Xnlo,$Xnhi,$sigma0[0],$t1 477 add $a0,$Xlo,$Xlo 478 shd $Xnhi,$Xnlo,$sigma0[1],$t2 479 addc $a1,$Xhi,$Xhi 480 shd $Xnlo,$Xnhi,$sigma0[1],$t3 481 xor $t2,$t0,$t0 482 shd $Xnhi,$Xnlo,$sigma0[2],$t2 483 xor $t3,$t1,$t1 484 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 485 xor $t2,$t0,$t0 486 shd $a3,$a2,$sigma1[0],$a0 487 xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) 488 shd $a2,$a3,$sigma1[0],$a1 489 add $t0,$Xlo,$Xlo 490 shd $a3,$a2,$sigma1[1],$t2 491 addc $t1,$Xhi,$Xhi 492 shd $a2,$a3,$sigma1[1],$t3 493 xor $t2,$a0,$a0 494 shd $a3,$a2,$sigma1[2],$t2 495 xor $t3,$a1,$a1 496 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 497 xor $t2,$a0,$a0 498 xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) 499 add $a0,$Xlo,$Xlo 500 addc $a1,$Xhi,$Xhi 501 502 stw $Xhi,`-$XOFF+8*($i%16)`(%sp) 503 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) 504___ 505&ROUND_00_15_pa1($i,@_,1); 506} 507$code.=<<___; 508 ldw `0*4`($ctx),$Ahi ; load context 509 ldw `1*4`($ctx),$Alo 510 ldw `2*4`($ctx),$Bhi 511 ldw `3*4`($ctx),$Blo 512 ldw `4*4`($ctx),$Chi 513 ldw `5*4`($ctx),$Clo 514 ldw `6*4`($ctx),$Dhi 515 ldw `7*4`($ctx),$Dlo 516 ldw `8*4`($ctx),$Ehi 517 ldw `9*4`($ctx),$Elo 518 ldw `10*4`($ctx),$Fhi 519 ldw `11*4`($ctx),$Flo 520 ldw `12*4`($ctx),$Ghi 521 ldw `13*4`($ctx),$Glo 522 ldw `14*4`($ctx),$Hhi 523 ldw `15*4`($ctx),$Hlo 524 525 extru $inp,31,2,$t0 526 sh3addl $t0,%r0,$t0 527 subi 32,$t0,$t0 528 mtctl $t0,%cr11 ; load %sar with align factor 529 530L\$oop_pa1 531 extru $inp,31,2,$a3 532 comib,= 0,$a3,L\$aligned_pa1 533 sub $inp,$a3,$inp 534 535 ldw `0*4`($inp),$X[0] 536 ldw `1*4`($inp),$X[1] 537 ldw `2*4`($inp),$t2 538 ldw `3*4`($inp),$t3 539 ldw `4*4`($inp),$a0 540 ldw `5*4`($inp),$a1 541 ldw `6*4`($inp),$a2 542 ldw `7*4`($inp),$a3 543 vshd $X[0],$X[1],$X[0] 544 vshd $X[1],$t2,$X[1] 545 stw $X[0],`-$XOFF+0*4`(%sp) 546 ldw `8*4`($inp),$t0 547 vshd $t2,$t3,$t2 548 stw $X[1],`-$XOFF+1*4`(%sp) 549 ldw `9*4`($inp),$t1 550 vshd $t3,$a0,$t3 551___ 552{ 553my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); 554for ($i=2;$i<=(128/4-8);$i++) { 555$code.=<<___; 556 stw $t[0],`-$XOFF+$i*4`(%sp) 557 ldw `(8+$i)*4`($inp),$t[0] 558 vshd $t[1],$t[2],$t[1] 559___ 560push(@t,shift(@t)); 561} 562for (;$i<(128/4-1);$i++) { 563$code.=<<___; 564 stw $t[0],`-$XOFF+$i*4`(%sp) 565 vshd $t[1],$t[2],$t[1] 566___ 567push(@t,shift(@t)); 568} 569$code.=<<___; 570 b L\$collected_pa1 571 stw $t[0],`-$XOFF+$i*4`(%sp) 572 573___ 574} 575$code.=<<___; 576L\$aligned_pa1 577 ldw `0*4`($inp),$X[0] 578 ldw `1*4`($inp),$X[1] 579 ldw `2*4`($inp),$t2 580 ldw `3*4`($inp),$t3 581 ldw `4*4`($inp),$a0 582 ldw `5*4`($inp),$a1 583 ldw `6*4`($inp),$a2 584 ldw `7*4`($inp),$a3 585 stw $X[0],`-$XOFF+0*4`(%sp) 586 ldw `8*4`($inp),$t0 587 stw $X[1],`-$XOFF+1*4`(%sp) 588 ldw `9*4`($inp),$t1 589___ 590{ 591my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); 592for ($i=2;$i<(128/4-8);$i++) { 593$code.=<<___; 594 stw $t[0],`-$XOFF+$i*4`(%sp) 595 ldw `(8+$i)*4`($inp),$t[0] 596___ 597push(@t,shift(@t)); 598} 599for (;$i<128/4;$i++) { 600$code.=<<___; 601 stw $t[0],`-$XOFF+$i*4`(%sp) 602___ 603push(@t,shift(@t)); 604} 605$code.="L\$collected_pa1\n"; 606} 607 608for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } 609$code.="L\$rounds_pa1\n"; 610for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } 611 612$code.=<<___; 613 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments 614 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp 615 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num 616 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl 617 618 ldw `0*4`($ctx),$t1 ; update context 619 ldw `1*4`($ctx),$t0 620 ldw `2*4`($ctx),$t3 621 ldw `3*4`($ctx),$t2 622 ldw `4*4`($ctx),$a1 623 ldw `5*4`($ctx),$a0 624 ldw `6*4`($ctx),$a3 625 add $t0,$Alo,$Alo 626 ldw `7*4`($ctx),$a2 627 addc $t1,$Ahi,$Ahi 628 ldw `8*4`($ctx),$t1 629 add $t2,$Blo,$Blo 630 ldw `9*4`($ctx),$t0 631 addc $t3,$Bhi,$Bhi 632 ldw `10*4`($ctx),$t3 633 add $a0,$Clo,$Clo 634 ldw `11*4`($ctx),$t2 635 addc $a1,$Chi,$Chi 636 ldw `12*4`($ctx),$a1 637 add $a2,$Dlo,$Dlo 638 ldw `13*4`($ctx),$a0 639 addc $a3,$Dhi,$Dhi 640 ldw `14*4`($ctx),$a3 641 add $t0,$Elo,$Elo 642 ldw `15*4`($ctx),$a2 643 addc $t1,$Ehi,$Ehi 644 stw $Ahi,`0*4`($ctx) 645 add $t2,$Flo,$Flo 646 stw $Alo,`1*4`($ctx) 647 addc $t3,$Fhi,$Fhi 648 stw $Bhi,`2*4`($ctx) 649 add $a0,$Glo,$Glo 650 stw $Blo,`3*4`($ctx) 651 addc $a1,$Ghi,$Ghi 652 stw $Chi,`4*4`($ctx) 653 add $a2,$Hlo,$Hlo 654 stw $Clo,`5*4`($ctx) 655 addc $a3,$Hhi,$Hhi 656 stw $Dhi,`6*4`($ctx) 657 ldo `16*$SZ`($inp),$inp ; advance $inp 658 stw $Dlo,`7*4`($ctx) 659 stw $Ehi,`8*4`($ctx) 660 stw $Elo,`9*4`($ctx) 661 stw $Fhi,`10*4`($ctx) 662 stw $Flo,`11*4`($ctx) 663 stw $Ghi,`12*4`($ctx) 664 stw $Glo,`13*4`($ctx) 665 stw $Hhi,`14*4`($ctx) 666 comb,= $inp,$num,L\$done 667 stw $Hlo,`15*4`($ctx) 668 b L\$oop_pa1 669 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp 670L\$done 671___ 672}} 673$code.=<<___; 674 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 675 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 676 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 677 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 678 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 679 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 680 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 681 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 682 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 683 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 684 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 685 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 686 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 687 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 688 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 689 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 690 bv (%r2) 691 .EXIT 692 $POPMB -$FRAME(%sp),%r3 693 .PROCEND 694___ 695 696# Explicitly encode PA-RISC 2.0 instructions used in this module, so 697# that it can be compiled with .LEVEL 1.0. It should be noted that I 698# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 699# directive... 700 701my $ldd = sub { 702 my ($mod,$args) = @_; 703 my $orig = "ldd$mod\t$args"; 704 705 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices 706 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); 707 $opcode|=(1<<3) if ($mod =~ /^,m/); 708 $opcode|=(1<<2) if ($mod =~ /^,mb/); 709 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 710 } 711 else { "\t".$orig; } 712}; 713 714my $std = sub { 715 my ($mod,$args) = @_; 716 my $orig = "std$mod\t$args"; 717 718 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices 719 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); 720 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 721 } 722 else { "\t".$orig; } 723}; 724 725my $extrd = sub { 726 my ($mod,$args) = @_; 727 my $orig = "extrd$mod\t$args"; 728 729 # I only have ",u" completer, it's implicitly encoded... 730 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 731 { my $opcode=(0x36<<26)|($1<<21)|($4<<16); 732 my $len=32-$3; 733 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos 734 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len 735 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 736 } 737 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 738 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); 739 my $len=32-$2; 740 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len 741 $opcode |= (1<<13) if ($mod =~ /,\**=/); 742 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 743 } 744 else { "\t".$orig; } 745}; 746 747my $shrpd = sub { 748 my ($mod,$args) = @_; 749 my $orig = "shrpd$mod\t$args"; 750 751 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 752 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; 753 my $cpos=63-$3; 754 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa 755 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 756 } 757 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 758 { sprintf "\t.WORD\t0x%08x\t; %s", 759 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; 760 } 761 else { "\t".$orig; } 762}; 763 764sub assemble { 765 my ($mnemonic,$mod,$args)=@_; 766 my $opcode = eval("\$$mnemonic"); 767 768 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; 769} 770 771foreach (split("\n",$code)) { 772 s/\`([^\`]*)\`/eval $1/ge; 773 774 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ 775 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 776 : sprintf("shd\t%$1,%$2,%d",$3)/e or 777 # translate made up instructons: _ror, _shr, _align, _shl 778 s/_ror(\s+)(%r[0-9]+),/ 779 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or 780 781 s/_shr(\s+%r[0-9]+),([0-9]+),/ 782 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) 783 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or 784 785 s/_align(\s+%r[0-9]+,%r[0-9]+),/ 786 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or 787 788 s/_shl(\s+%r[0-9]+),([0-9]+),/ 789 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) 790 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; 791 792 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); 793 794 s/cmpb,\*/comb,/ if ($SIZE_T==4); 795 796 s/\bbv\b/bve/ if ($SIZE_T==8); 797 798 print $_,"\n"; 799} 800 801close STDOUT; 802