#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# Performance improvement is not really impressive on pre-T1 CPUs: +8%
# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however,
# it turned out to be 40% faster than 64-bit code generated by Sun C
# 5.8 and >2x faster than 64-bit code generated by gcc 3.4. And there
# is a gimmick: the X[16] vector is packed into 8 64-bit registers, so
# nothing is spilled to the stack. In addition, the input data is
# loaded with a compact instruction sequence, minimizing the window in
# which the code is subject to an [inter-thread] cache-thrashing
# hazard. The goal is to ensure scalability on UltraSPARC T1, or
# rather to avoid decay when the number of active threads exceeds the
# number of physical cores.

# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. A multi-process benchmark saturates at 11x the
# single-process result on an 8-core processor, or ~9GBps per 2.85GHz
# socket.

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";

# Rounds 0..15: F(b,c,d) = Ch(b,c,d) = (b&c)|(~b&d), with message
# words taken directly from the packed X[] registers.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;

$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
if ($i&1 && $i<15) {
	$code.="\tsrlx\t@X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
	add	$tmp1,$e,$e
___
}
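
# For reference, a plain-Perl model of one of the rounds emitted by
# BODY_00_15 above (editor's sketch, never called by the generator;
# the helper name is made up). The and/andn/or sequence computes
# Ch(b,c,d) in $tmp0/$tmp1 while the adds fold K, rol(a,5) and the
# message word into $e:
sub sha1_round_00_19_ref {			# illustration only
my ($a,$b,$c,$d,$e,$w)=@_;			# 32-bit values, $w = message word
my $f = ($b & $c) | (~$b & $d);			# Ch(b,c,d)
my $rol5 = (($a<<5)|($a>>27)) & 0xffffffff;	# a <<< 5
   $e = ($e + $f + $rol5 + $w + 0x5a827999) & 0xffffffff;
   $b = (($b<<30)|($b>>2)) & 0xffffffff;	# b <<< 30
return ($e,$b);					# updated e, rotated b
}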

sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;

if ($i&1) {
$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
___
} else {
$code.=<<___;
	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
	srlx	@X[($j+7)%8],32,$tmp1
	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
	sll	$a,5,$tmp0		!! $i
	or	$tmp1,$Xi,$Xi
	add	@K[$i/20],$e,$e		!!
	xor	$Xi,@X[$j%8],@X[$j%8]
	srlx	@X[$j%8],31,$Xi
	add	@X[$j%8],@X[$j%8],@X[$j%8]
	and	$Xi,$rot1m,$Xi
	andn	@X[$j%8],$rot1m,@X[$j%8]
	srl	$a,27,$tmp1		!!
	or	$Xi,@X[$j%8],@X[$j%8]
___
}
}

# Rounds 16..19: same Ch() round function as rounds 0..15, but the
# input word now comes from Xupdate().
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;

	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx\t@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

# Rounds 20..39, and (via K_60_79) rounds 60..79:
# F(b,c,d) = Parity(b,c,d) = b^c^d.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;

	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx\t@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
}

# Rounds 40..59: F(b,c,d) = Maj(b,c,d) = (b&c)|((b|c)&d).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;

	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx\t@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$xi,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}
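
# The "gimmick" mentioned at the top: every X[] register holds two
# 32-bit message words, and Xupdate() rotates both lanes left by one
# in parallel. "add X,X,X" shifts both lanes, the low lane's carry
# leaks into bit 32, and the $rot1m mask (0x0000000100000001) clears
# that seam and re-inserts each lane's own bit 31. A plain-Perl model
# of the same five instructions (editor's sketch, assuming 64-bit
# perl; the helper name is made up):
sub rol1_both_lanes_ref {			# illustration only
my ($x)=@_;					# two 32-bit lanes packed in 64 bits
my $m = 0x0000000100000001;			# $rot1m
my $lsb = ($x>>31) & $m;			# srlx+and: bit 31 of each lane
   $x = (($x & 0x7fffffffffffffff)<<1) & ~$m;	# add+andn: lanes<<1, seams cleared
return $x | $lsb;				# or: finish both rotations
}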

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA1, %g0
	be	.Lsoftware
	nop

	ld	[%o0 + 0x00], %f0	! load context
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x0c], %f3
	bne,pn	%icc, .Lhwunaligned
	ld	[%o0 + 0x10], %f4

.Lhw_loop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02820		! SHA1

	bne,pt	SIZE_T_CC, .Lhw_loop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	retl
	st	%f4, [%o0 + 0x10]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02820		! SHA1

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop

.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m		! 0x0000000100000001

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],@X[0]
	ldx	[$tmp0+16],@X[2]
	ldx	[$tmp0+32],@X[4]
	ldx	[$tmp0+48],@X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],@X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],@X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],@X[7]

	sllx	@X[0],$tmp1,@X[0]
	ldx	[$tmp0+64],$tmp64
___
for($i=0;$i<7;$i++)
{   $code.=<<___;
	srlx	@X[$i+1],$tmp2,$Xi
	sllx	@X[$i+1],$tmp1,@X[$i+1]
	or	$Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,@X[7],@X[7]
.Laligned:
	srlx	@X[0],32,$Xi
___

for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;

	ld	[$ctx+0],@X[0]
	ld	[$ctx+4],@X[1]
	ld	[$ctx+8],@X[2]
	ld	[$ctx+12],@X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],@X[4]
	cmp	$inp,$len

	add	$A,@X[0],$A
	st	$A,[$ctx+0]
	add	$B,@X[1],$B
	st	$B,[$ctx+4]
	add	$C,@X[2],$C
	st	$C,[$ctx+8]
	add	$D,@X[3],$D
	st	$D,[$ctx+12]
	add	$E,@X[4],$E
	st	$E,[$ctx+16]

	bne	SIZE_T_CC,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option of producing a
# "universal" binary, and to let the programmer detect at run-time
# whether the current CPU is VIS-capable.
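
# Worked example (editor's note): with the field layout used by
# unvis() below, "faligndata %f10, %f12, %f8" carries rd=8, rs1=10,
# rs2=12, opf=0x048, i.e. 0x81b00000|8<<25|10<<14|0x048<<5|12, so the
# post-processing pass at the bottom of the file replaces it with
#	.word	0x91b2890c !faligndata	%f10,%f12,%f8
# which even a pre-VIS assembler accepts.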

sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf =	(	"faligndata"	=> 0x048,
			"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }

    return	sprintf ".word\t0x%08x !%s",
		0x81b00300|$rd<<25|$rs1<<14|$rs2,
		$ref;
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;
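
# Likewise for the integer-register case (editor's note): the single
# "alignaddr %o1, %g0, %o1" in the unaligned hardware path maps, with
# the biases in unalignaddr() above, to rs1=9, rs2=0, rd=9 (%oN
# carries bias 8), and is emitted as
#	.word	0x93b24300 !alignaddr	%o1,%g0,%o1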