#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.

# --------------------------------------------------------------------
# Command line: <flavour> <output>
#   $flavour - any value containing "64" selects the 64-bit (2.0W)
#              settings below; everything else selects the 32-bit ones.
#   $output  - file that receives the generated assembly.
# --------------------------------------------------------------------
$flavour = shift;
$output  = shift;

# Redirect STDOUT to the requested file.  The redirect is checked so a
# bad path aborts the build instead of silently leaving the assembly
# out of the requested file.  Without an output name the text goes to
# the inherited STDOUT.
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}

# Per-flavour parameters interpolated into the assembly templates:
# architecture level, pointer size, frame-marker size, offset of the
# saved return pointer, and the store/load mnemonics used by the
# prologue and epilogue.
if ($flavour =~ /64/) {
    $LEVEL        = "2.0W";
    $SIZE_T       = 8;
    $FRAME_MARKER = 80;
    $SAVED_RP     = 16;
    $PUSH         = "std";
    $PUSHMA       = "std,ma";
    $POP          = "ldd";
    $POPMB        = "ldd,mb";
    $NREGS        = 6;
} else {
    $LEVEL        = "1.0";      #"\n\t.ALLOW\t2.0";
    $SIZE_T       = 4;
    $FRAME_MARKER = 48;
    $SAVED_RP     = 20;
    $PUSH         = "stw";
    $PUSHMA       = "stwm";
    $POP          = "ldw";
    $POPMB        = "ldwm";
    $NREGS        = 11;
}

$FRAME = 10*$SIZE_T + $FRAME_MARKER;    # NREGS saved regs + frame marker
                                        #                 [+ argument transfer]

################# volatile registers
$Xi       = "%r26";     # argument block
$Htbl     = "%r25";
$inp      = "%r24";
$len      = "%r23";
$Hhh      = $Htbl;      # variables
$Hll      = "%r22";
$Zhh      = "%r21";
$Zll      = "%r20";
$cnt      = "%r19";
$rem_4bit = "%r28";
$rem      = "%r29";
$mask0xf0 = "%r31";

################# preserved registers
$Thh  = "%r1";
$Tll  = "%r2";
$nlo  = "%r3";
$nhi  = "%r4";
$byte = "%r5";
if ($SIZE_T==4) {       # extra 32-bit halves, used only by the PA-RISC 1.0 paths
    $Zhl = "%r6";
    $Zlh = "%r7";
    $Hhl = "%r8";
    $Hlh = "%r9";
    $Thl = "%r10";
    $Tlh = "%r11";
}
$rem2 = "%r6";          # used in PA-RISC 2.0 code

# --------------------------------------------------------------------
# Assembly templates.  Text is accumulated in $code; expressions in
# `backticks` are evaluated by the output loop at the end of the file.
# Chunks guarded by "if ($SIZE_T==4)" are emitted for the 32-bit
# flavour only.
#
# NOTE(review): gcm_gmult_4bit declares ENTRY_GR=$NREGS while
# gcm_ghash_4bit hard-codes ENTRY_GR=11 even though both prologues save
# the same registers -- confirm whether that is intended for the 64-bit
# flavour.
# --------------------------------------------------------------------

# gcm_gmult_4bit: prologue.
$code.=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent address of the L$rem_4bit table.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour only: run-time selection between the PA-RISC 2.0 code
# that follows and the L$parisc1_gmult fallback.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___

# gcm_gmult_4bit: PA-RISC 2.0 path.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv	-1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___

# gcm_gmult_4bit: PA-RISC 1.0 path (32-bit flavour only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit: epilogue.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# End of gcm_gmult_4bit; gcm_ghash_4bit: prologue.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent address of the L$rem_4bit table.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour only: run-time selection between the PA-RISC 2.0 code
# that follows and the L$parisc1_ghash fallback.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___

# gcm_ghash_4bit: PA-RISC 2.0 path.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv	-1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<>	$inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___

# gcm_ghash_4bit: PA-RISC 1.0 path (32-bit flavour only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit: epilogue.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# End of gcm_ghash_4bit, followed by the shared L$rem_4bit table.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
	.ALIGN	64
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
#
# Each encoder receives ($mod, $args): the completer text that follows
# the mnemonic and the operand string.  It returns the line to emit,
# either a .WORD directive carrying the hand-assembled opcode with the
# original instruction as a trailing comment, or the instruction
# unchanged when the operands match none of the handled forms.  The
# variable names matter: assemble() locates an encoder by the name of
# its mnemonic.

# ldd: register-indexed form (format 4) and short-displacement form
# (format 5, with optional ",m*" / ",mb" completers).
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {           # format 4
        my ($index, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {       # format 5
        my ($disp, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);            # encode offset
        $opcode |= (1<<5)  if ($mod =~ /^,m/);
        $opcode |= (1<<13) if ($mod =~ /^,mb/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    else {
        return "\t".$orig;
    }
};

# std: register stored at displacement(base).
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {          # format 3 suffices
        my ($source, $disp, $base) = ($1, $2, $3);
        my $opcode = (0x1c<<26)|($base<<21)|($source<<16)
                   | (($disp&0x1FF8)<<1)|(($disp>>13)&1);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    else {
        return "\t".$orig;
    }
};

# Hand-assemblers for further PA-RISC 2.0 instructions.  Each takes
# ($mod, $args), the completer text and the operand string, and returns
# the line to emit: a .WORD directive with the computed opcode and the
# original instruction as a comment, or the instruction unchanged when
# the operands match none of the handled forms.

# extrd: fixed-position form (format 15) and %sar form (format 12).
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {       # format 15
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x36<<26)|($source<<21)|($target<<16);
        my $len    = 32-$width;
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);               # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);                    # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {        # format 12
        my ($source, $width, $target) = ($1, $2, $3);
        my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        my $len    = 32-$width;
        $opcode |= (($len&0x20)<<3)|($len&0x1f);                    # encode len
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    else {
        return "\t".$orig;
    }
};

# shrpd: fixed shift amount (format 14) and %sar shift amount (format 11).
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {     # format 14
        my ($src1, $src2, $shift, $target) = ($1, $2, $3, $4);
        my $opcode = (0x34<<26)|($src2<<21)|($src1<<16)|(1<<10)|$target;
        my $cpos   = 63-$shift;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);             # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {      # format 11
        my ($src1, $src2, $target) = ($1, $2, $3);
        return sprintf "\t.WORD\t0x%08x\t; %s",
                       (0x34<<26)|($src2<<21)|($src1<<16)|(1<<9)|$target, $orig;
    }
    else {
        return "\t".$orig;
    }
};

# depd: fixed-position form (format 16).
my $depd = sub {
    my ($mod, $args) = @_;
    my $orig = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {       # format 16
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x3c<<26)|($target<<21)|($source<<16);
        my $cpos   = 63-$pos;
        my $len    = 32-$width;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);             # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);                    # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    else {
        return "\t".$orig;
    }
};

# Mnemonic -> encoder.  This replaces looking the encoder up with a
# string eval of "\$<mnemonic>"; the set of hand-encoded instructions
# is now explicit.  $ldd and $std are the encoders defined earlier in
# this file.
my %encoder_for = (
    ldd   => $ldd,
    std   => $std,
    extrd => $extrd,
    shrpd => $shrpd,
    depd  => $depd,
);

# assemble($mnemonic, $mod, $args)
#   Returns the text to emit for one instruction: the encoder's result
#   when $mnemonic has a hand-assembler, otherwise the instruction
#   re-joined as "\t<mnemonic><mod>\t<args>".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE' ? $encoder->($mod, $args)
                                   : "\t$mnemonic$mod\t$args";
}

# Emit the accumulated templates line by line.
foreach my $line (split("\n", $code)) {
    # Evaluate `expr` spans (frame offsets, table constants).
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    if ($SIZE_T==4) {
        # 32-bit flavour: hand-encode the 2.0 instructions and strip the
        # "*" from completers.
        $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
        $line =~ s/cmpb,\*/comb,/;
        $line =~ s/,\*/,/;
    }
    $line =~ s/\bbv\b/bve/ if ($SIZE_T==8);     # 64-bit flavour returns via bve

    print $line, "\n";
}

# Buffered write errors only surface at close, so fail loudly rather
# than leave a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";