#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128-byte shared
# table]. Performance results are for the streamed GHASH subroutine
# on UltraSPARC pre-Tx CPUs and are expressed in cycles per processed
# byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why the difference between 32-bit and
# 64-bit compiler-generated code is so big. The compilers *were*
# instructed to generate code for UltraSPARC and should have used
# 64-bit registers for the Z vector (see C code) even in the 32-bit
# build... Oh well, it only means more impressive improvement
# coefficients for this assembler module;-) Loops are aggressively
# modulo-scheduled with respect to references to input data and Z.hi
# updates to achieve the 12-cycle timing. To anchor it to something
# else, sha1-sparcv9.pl spends 11.6 cycles to process one byte on an
# UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add a VIS3 lookup-table-free implementation using the polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4, or in absolute terms
# 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
# saturates at ~15.5x the single-process result on an 8-core
# processor, or ~20.5GBps per 2.85GHz socket.
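
# Below is a minimal Perl model of the "4-bit" gmult operation that
# the generated code implements (an illustrative sketch with a
# hypothetical name, not used by the generator; assumes 64-bit perl).
# $Htable references the 16-entry [hi,lo] per-key table [256 bytes in
# the real layout], $rem_4bit the 16-entry shared table of
# 0x0000,0x1C20,... values shifted left by 48, and $Xi the hash value
# as an array of 16 bytes as stored in memory.

sub gmult_4bit_model {
my ($Xi,$Htable,$rem_4bit)=@_;
my $m64 = 0xffffffffffffffff;

my $nlo = $Xi->[15];			# low byte is processed first
my $nhi = $nlo>>4;
   $nlo &= 0xf;
my ($Zhi,$Zlo) = @{$Htable->[$nlo]};
my $cnt = 15;

while (1) {
	my $rem = $Zlo & 0xf;			# 4 bits shifted out...
	$Zlo  = (($Zhi<<60)&$m64) | ($Zlo>>4);	# Z >>= 4
	$Zhi  = ($Zhi>>4) ^ $rem_4bit->[$rem];	# ...folded back via table
	$Zhi ^= $Htable->[$nhi][0];
	$Zlo ^= $Htable->[$nhi][1];

	last if (--$cnt < 0);

	$nlo  = $Xi->[$cnt];
	$nhi  = $nlo>>4;
	$nlo &= 0xf;

	$rem  = $Zlo & 0xf;
	$Zlo  = (($Zhi<<60)&$m64) | ($Zlo>>4);
	$Zhi  = ($Zhi>>4) ^ $rem_4bit->[$rem];
	$Zhi ^= $Htable->[$nlo][0];
	$Zlo ^= $Htable->[$nlo][1];
}
return ($Zhi,$Zlo);
}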

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
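	! tail: one final shift-and-fold round for the pending high
	! nibble, then store Z back at [$Xi]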
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by a pair of 64-bit reductions [with a shortcut in the
# first one, which allows us to break the dependency between the
# reductions and remove one multiplication from the critical path].
# In other words, X·H is assembled from just three 64x64-bit
# carry-less products: Xlo·Hlo, Xhi·Hhi and (Xlo^Xhi)·(Hlo^Hhi).
# While this might be suboptimal with regard to the sheer number of
# multiplications, other methods [such as aggregate reduction] would
# require more 64-bit registers, which we don't have in the 32-bit
# application context.

($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
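# In short: H is premultiplied by x [the "H<<=1" below], so that the
# bit-reflected GHASH convention lines up with xmulx's notion of
# carry-less multiplication, and the per-block reduction multiplier
# becomes 0xE1<<1 [see the "0xE1<<1<<56" comments below]. A scalar
# model of the twist (an illustrative sketch with a hypothetical
# name, not used by the generator; assumes 64-bit perl):

sub twist_H_model {
my ($Hhi,$Hlo)=@_;
my $m64 = 0xffffffffffffffff;
my $msb = $Hhi>>63;			# bit shifted out of H
$Hhi = (($Hhi<<1)&$m64)|($Hlo>>63);	# H <<= 1
$Hlo = ($Hlo<<1)&$m64;
if ($msb) {				# overflow: xor the reduction
	$Hhi ^= (0xE1<<57)&$m64;	# constant, which truncates to
	$Hlo ^= 1;			# 0xC2<<56 ["57 is not a typo"]
}
return ($Hhi,$Hlo);
}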
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1
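	! reduction: fold $C0 and $C1 into $C3:$C2 by carry-less
	! multiplication with $xE1; the low-order part of $C0's fold
	! was already computed above through the $V shortcut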
	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of the following subroutine is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option to produce a
# "universal" binary and let the programmer detect at run-time
# whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;
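
# For reference, here is what the substitution above produces for one
# VIS3 instruction (a worked example of the .word formula in unvis3):
# "xmulx %o1,%o2,%o3" has rs1=9, rs2=10, rd=11 [bias 8 for %o
# registers] and opf=0x115, hence
#
#	0x81b00000|11<<25|9<<14|0x115<<5|10 = 0x97b262aa
#
# and the emitted line reads ".word 0x97b262aa !xmulx %o1,%o2,%o3".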