#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
# X[16] vector is packed to 8 64-bit registers and as result nothing
# is spilled on stack. In addition input data is loaded in compact
# instruction sequence, thus minimizing the window when the code is
# subject to [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# amount of active threads exceeds the number of physical cores.

# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. Multi-process benchmark saturates at 11x
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.

# The first command-line argument names the file that receives the
# generated assembly.  Without an argument the inherited STDOUT is
# used.  A file that cannot be created is a fatal error rather than
# being silently ignored.
$output=shift;
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Register allocation for the software code path.
# @X holds the 16-word message schedule packed two 32-bit words per
# 64-bit register; $Xi is scratch for the word extracted from a pair.
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";		# 0x0000000100000001, see .Lsoftware prologue
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);	# rotated by the caller after every round
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);	# indexed by $i/20

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";

# BODY_00_15($i,$a,$b,$c,$d,$e)
# Append the code for round $i (0..15) to $code.  $a..$e are the
# registers currently holding the five working variables.  The round
# adds K, rotl($a,5), ($b&$c)|(~$b&$d) and the message word to $e, and
# rotates $b left by 30.  Odd rounds take the word straight from the
# packed register; even rounds take it from $Xi, which the previous
# round (or the .Laligned prologue) filled with the upper half.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;

$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
# Pre-extract the upper word of the next pair for the following (even)
# round; round 15 is followed by Xupdate, which fills $Xi itself.
if ($i&1 && $i<15) {
	$code.=
	"	srlx	@X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
	add	$tmp1,$e,$e
___
}

# Xupdate($i,$a,$b,$c,$d,$e)
# Append the opening of round $i (16..79) to $code.  Every round gets
# the K addition and the two shifts forming rotl($a,5) in $tmp0/$tmp1.
# On even $i the packed pair @X[$j%8] ($j=$i/2) is additionally
# updated in place: it is XORed with pairs $j+1 and $j+4 and with the
# word-shifted combination of pairs $j+6/$j+7, then both 32-bit halves
# are rotated left by one with the help of $rot1m.  Odd rounds reuse
# the pair computed by the preceding even round.
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;

if ($i&1) {
$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
___
} else {
$code.=<<___;
	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
	srlx	@X[($j+7)%8],32,$tmp1
	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
	sll	$a,5,$tmp0		!! $i
	or	$tmp1,$Xi,$Xi
	add	@K[$i/20],$e,$e		!!
	xor	$Xi,@X[$j%8],@X[$j%8]
	srlx	@X[$j%8],31,$Xi
	add	@X[$j%8],@X[$j%8],@X[$j%8]
	and	$Xi,$rot1m,$Xi
	andn	@X[$j%8],$rot1m,@X[$j%8]
	srl	$a,27,$tmp1		!!
	or	$Xi,@X[$j%8],@X[$j%8]
___
}
}

# BODY_16_19($i,$a,$b,$c,$d,$e)
# Append round $i (16..19) to $code: schedule update via Xupdate, then
# the same ($b&$c)|(~$b&$d) round function as BODY_00_15.
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;	# was an undeclared package global; now lexical like its siblings

	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	# even round: use the upper word of the freshly updated pair
	$xi=$Xi;
	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
# Append a round that uses $b^$c^$d as round function.  It is invoked
# for rounds 20..39 and again for 60..79; the constant is selected by
# @K[$i/20] inside Xupdate.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e)
# Append round $i (40..59) to $code; the round function is
# ($b&$c)|(($b|$c)&$d).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$xi,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

# Function entry.  OPENSSL_sparcv9cap_P[1] is tested for CFR_SHA1: when
# the bit is set the hardware path (.Lhw_loop/.Lhwunaligned, using the
# hand-encoded SHA1 opcode) is taken, otherwise control goes to
# .Lsoftware, which sets up the constants and loads one 64-byte block
# per .Lloop iteration.
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA1, %g0
	be	.Lsoftware
	nop

	ld	[%o0 + 0x00], %f0	! load context
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x0c], %f3
	bne,pn	%icc, .Lhwunaligned
	 ld	[%o0 + 0x10], %f4

.Lhw_loop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02820		! SHA1

	bne,pt	SIZE_T_CC, .Lhw_loop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	retl
	st	%f4, [%o0 + 0x10]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02820		! SHA1

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop

.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],@X[0]
	ldx	[$tmp0+16],@X[2]
	ldx	[$tmp0+32],@X[4]
	ldx	[$tmp0+48],@X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],@X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],@X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],@X[7]

	sllx	@X[0],$tmp1,@X[0]
	ldx	[$tmp0+64],$tmp64
___
# Unaligned input: shift every doubleword left by the byte offset (in
# bits) and OR in the bits carried over from the following doubleword.
# The ninth doubleword ($tmp64) completes @X[7] after the loop.
for($i=0;$i<7;$i++)
{   $code.=<<___;
	srlx	@X[$i+1],$tmp2,$Xi
	sllx	@X[$i+1],$tmp1,@X[$i+1]
	or	$Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,@X[7],@X[7]
.Laligned:
	srlx	@X[0],32,$Xi
___
# Unroll all 80 rounds.  Instead of moving data between registers the
# register list @V is rotated after each round.  The global $i is
# deliberately carried over from one loop to the next.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the working variables to the saved context, store it back and
# loop until $inp reaches the end pointer kept in $len.
$code.=<<___;

	ld	[$ctx+0],@X[0]
	ld	[$ctx+4],@X[1]
	ld	[$ctx+8],@X[2]
	ld	[$ctx+12],@X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],@X[4]
	cmp	$inp,$len

	add	$A,@X[0],$A
	st	$A,[$ctx+0]
	add	$B,@X[1],$B
	st	$B,[$ctx+4]
	add	$C,@X[2],$C
	st	$C,[$ctx+8]
	add	$D,@X[3],$D
	st	$D,[$ctx+12]
	add	$E,@X[4],$E
	st	$E,[$ctx+16]

	bne	SIZE_T_CC,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.

# unvis($mnemonic,$rs1,$rs2,$rd)
# Return a ".word" directive encoding the three-operand floating-point
# register instruction $mnemonic, followed by the original text as a
# comment.  The instruction text is returned unchanged when the
# mnemonic is not in %visopf, when an operand is not of the form %fN,
# or when N is an odd number >= 32.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);	# parenthesised so that $opf is lexical as well
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# $_ aliases each operand in turn, so the assignments below
	# replace the register names by their numeric encodings.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unalignaddr($mnemonic,$rs1,$rs2,$rd)
# Return a ".word" directive encoding alignaddr with integer register
# operands, followed by the original text as a comment.  Register
# names %g/%o/%l/%i 0..7 map to numbers 0..31 through %bias.  If an
# operand does not match that form the instruction text is returned
# unchanged.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}

# Post-process the generated text line by line: evaluate `...`
# expressions, replace VIS instructions by their explicit encodings and
# print the result.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close STDOUT or die "error closing STDOUT: $!";