# ghash-alpha.pl revision 1.3
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.
23 24$cnt="v0"; # $0 25$t0="t0"; 26$t1="t1"; 27$t2="t2"; 28$Thi0="t3"; # $4 29$Tlo0="t4"; 30$Thi1="t5"; 31$Tlo1="t6"; 32$rem="t7"; # $8 33################# 34$Xi="a0"; # $16, input argument block 35$Htbl="a1"; 36$inp="a2"; 37$len="a3"; 38$nlo="a4"; # $20 39$nhi="a5"; 40$Zhi="t8"; 41$Zlo="t9"; 42$Xhi="t10"; # $24 43$Xlo="t11"; 44$remp="t12"; 45$rem_4bit="AT"; # $28 46 47{ my $N; 48 sub loop() { 49 50 $N++; 51$code.=<<___; 52.align 4 53 extbl $Xlo,7,$nlo 54 and $nlo,0xf0,$nhi 55 sll $nlo,4,$nlo 56 and $nlo,0xf0,$nlo 57 58 addq $nlo,$Htbl,$nlo 59 ldq $Zlo,8($nlo) 60 addq $nhi,$Htbl,$nhi 61 ldq $Zhi,0($nlo) 62 63 and $Zlo,0x0f,$remp 64 sll $Zhi,60,$t0 65 lda $cnt,6(zero) 66 extbl $Xlo,6,$nlo 67 68 ldq $Tlo1,8($nhi) 69 s8addq $remp,$rem_4bit,$remp 70 ldq $Thi1,0($nhi) 71 srl $Zlo,4,$Zlo 72 73 ldq $rem,0($remp) 74 srl $Zhi,4,$Zhi 75 xor $t0,$Zlo,$Zlo 76 and $nlo,0xf0,$nhi 77 78 xor $Tlo1,$Zlo,$Zlo 79 sll $nlo,4,$nlo 80 xor $Thi1,$Zhi,$Zhi 81 and $nlo,0xf0,$nlo 82 83 addq $nlo,$Htbl,$nlo 84 ldq $Tlo0,8($nlo) 85 addq $nhi,$Htbl,$nhi 86 ldq $Thi0,0($nlo) 87 88.Looplo$N: 89 and $Zlo,0x0f,$remp 90 sll $Zhi,60,$t0 91 subq $cnt,1,$cnt 92 srl $Zlo,4,$Zlo 93 94 ldq $Tlo1,8($nhi) 95 xor $rem,$Zhi,$Zhi 96 ldq $Thi1,0($nhi) 97 s8addq $remp,$rem_4bit,$remp 98 99 ldq $rem,0($remp) 100 srl $Zhi,4,$Zhi 101 xor $t0,$Zlo,$Zlo 102 extbl $Xlo,$cnt,$nlo 103 104 and $nlo,0xf0,$nhi 105 xor $Thi0,$Zhi,$Zhi 106 xor $Tlo0,$Zlo,$Zlo 107 sll $nlo,4,$nlo 108 109 110 and $Zlo,0x0f,$remp 111 sll $Zhi,60,$t0 112 and $nlo,0xf0,$nlo 113 srl $Zlo,4,$Zlo 114 115 s8addq $remp,$rem_4bit,$remp 116 xor $rem,$Zhi,$Zhi 117 addq $nlo,$Htbl,$nlo 118 addq $nhi,$Htbl,$nhi 119 120 ldq $rem,0($remp) 121 srl $Zhi,4,$Zhi 122 ldq $Tlo0,8($nlo) 123 xor $t0,$Zlo,$Zlo 124 125 xor $Tlo1,$Zlo,$Zlo 126 xor $Thi1,$Zhi,$Zhi 127 ldq $Thi0,0($nlo) 128 bne $cnt,.Looplo$N 129 130 131 and $Zlo,0x0f,$remp 132 sll $Zhi,60,$t0 133 lda $cnt,7(zero) 134 srl $Zlo,4,$Zlo 135 136 ldq $Tlo1,8($nhi) 137 xor $rem,$Zhi,$Zhi 138 ldq $Thi1,0($nhi) 
139 s8addq $remp,$rem_4bit,$remp 140 141 ldq $rem,0($remp) 142 srl $Zhi,4,$Zhi 143 xor $t0,$Zlo,$Zlo 144 extbl $Xhi,$cnt,$nlo 145 146 and $nlo,0xf0,$nhi 147 xor $Thi0,$Zhi,$Zhi 148 xor $Tlo0,$Zlo,$Zlo 149 sll $nlo,4,$nlo 150 151 and $Zlo,0x0f,$remp 152 sll $Zhi,60,$t0 153 and $nlo,0xf0,$nlo 154 srl $Zlo,4,$Zlo 155 156 s8addq $remp,$rem_4bit,$remp 157 xor $rem,$Zhi,$Zhi 158 addq $nlo,$Htbl,$nlo 159 addq $nhi,$Htbl,$nhi 160 161 ldq $rem,0($remp) 162 srl $Zhi,4,$Zhi 163 ldq $Tlo0,8($nlo) 164 xor $t0,$Zlo,$Zlo 165 166 xor $Tlo1,$Zlo,$Zlo 167 xor $Thi1,$Zhi,$Zhi 168 ldq $Thi0,0($nlo) 169 unop 170 171 172.Loophi$N: 173 and $Zlo,0x0f,$remp 174 sll $Zhi,60,$t0 175 subq $cnt,1,$cnt 176 srl $Zlo,4,$Zlo 177 178 ldq $Tlo1,8($nhi) 179 xor $rem,$Zhi,$Zhi 180 ldq $Thi1,0($nhi) 181 s8addq $remp,$rem_4bit,$remp 182 183 ldq $rem,0($remp) 184 srl $Zhi,4,$Zhi 185 xor $t0,$Zlo,$Zlo 186 extbl $Xhi,$cnt,$nlo 187 188 and $nlo,0xf0,$nhi 189 xor $Thi0,$Zhi,$Zhi 190 xor $Tlo0,$Zlo,$Zlo 191 sll $nlo,4,$nlo 192 193 194 and $Zlo,0x0f,$remp 195 sll $Zhi,60,$t0 196 and $nlo,0xf0,$nlo 197 srl $Zlo,4,$Zlo 198 199 s8addq $remp,$rem_4bit,$remp 200 xor $rem,$Zhi,$Zhi 201 addq $nlo,$Htbl,$nlo 202 addq $nhi,$Htbl,$nhi 203 204 ldq $rem,0($remp) 205 srl $Zhi,4,$Zhi 206 ldq $Tlo0,8($nlo) 207 xor $t0,$Zlo,$Zlo 208 209 xor $Tlo1,$Zlo,$Zlo 210 xor $Thi1,$Zhi,$Zhi 211 ldq $Thi0,0($nlo) 212 bne $cnt,.Loophi$N 213 214 215 and $Zlo,0x0f,$remp 216 sll $Zhi,60,$t0 217 srl $Zlo,4,$Zlo 218 219 ldq $Tlo1,8($nhi) 220 xor $rem,$Zhi,$Zhi 221 ldq $Thi1,0($nhi) 222 s8addq $remp,$rem_4bit,$remp 223 224 ldq $rem,0($remp) 225 srl $Zhi,4,$Zhi 226 xor $t0,$Zlo,$Zlo 227 228 xor $Tlo0,$Zlo,$Zlo 229 xor $Thi0,$Zhi,$Zhi 230 231 and $Zlo,0x0f,$remp 232 sll $Zhi,60,$t0 233 srl $Zlo,4,$Zlo 234 235 s8addq $remp,$rem_4bit,$remp 236 xor $rem,$Zhi,$Zhi 237 238 ldq $rem,0($remp) 239 srl $Zhi,4,$Zhi 240 xor $Tlo1,$Zlo,$Zlo 241 xor $Thi1,$Zhi,$Zhi 242 xor $t0,$Zlo,$Zlo 243 xor $rem,$Zhi,$Zhi 244___ 245}} 246 247$code=<<___; 248#include 
<machine/asm.h> 249 250.text 251 252.set noat 253.set noreorder 254.globl gcm_gmult_4bit 255.align 4 256.ent gcm_gmult_4bit 257gcm_gmult_4bit: 258 .frame sp,0,ra 259 .prologue 0 260 261 ldq $Xlo,8($Xi) 262 ldq $Xhi,0($Xi) 263 264 lda $rem_4bit,rem_4bit 265___ 266 267 &loop(); 268 269$code.=<<___; 270 srl $Zlo,24,$t0 # byte swap 271 srl $Zlo,8,$t1 272 273 sll $Zlo,8,$t2 274 sll $Zlo,24,$Zlo 275 zapnot $t0,0x11,$t0 276 zapnot $t1,0x22,$t1 277 278 zapnot $Zlo,0x88,$Zlo 279 or $t0,$t1,$t0 280 zapnot $t2,0x44,$t2 281 282 or $Zlo,$t0,$Zlo 283 srl $Zhi,24,$t0 284 srl $Zhi,8,$t1 285 286 or $Zlo,$t2,$Zlo 287 sll $Zhi,8,$t2 288 sll $Zhi,24,$Zhi 289 290 srl $Zlo,32,$Xlo 291 sll $Zlo,32,$Zlo 292 293 zapnot $t0,0x11,$t0 294 zapnot $t1,0x22,$t1 295 or $Zlo,$Xlo,$Xlo 296 297 zapnot $Zhi,0x88,$Zhi 298 or $t0,$t1,$t0 299 zapnot $t2,0x44,$t2 300 301 or $Zhi,$t0,$Zhi 302 or $Zhi,$t2,$Zhi 303 304 srl $Zhi,32,$Xhi 305 sll $Zhi,32,$Zhi 306 307 or $Zhi,$Xhi,$Xhi 308 stq $Xlo,8($Xi) 309 stq $Xhi,0($Xi) 310 311 ret (ra) 312.end gcm_gmult_4bit 313___ 314 315$inhi="s0"; 316$inlo="s1"; 317 318$code.=<<___; 319.globl gcm_ghash_4bit 320.align 4 321.ent gcm_ghash_4bit 322gcm_ghash_4bit: 323 lda sp,-32(sp) 324 stq ra,0(sp) 325 stq s0,8(sp) 326 stq s1,16(sp) 327 .mask 0x04000600,-32 328 .frame sp,32,ra 329 .prologue 0 330 331 ldq_u $inhi,0($inp) 332 ldq_u $Thi0,7($inp) 333 ldq_u $inlo,8($inp) 334 ldq_u $Tlo0,15($inp) 335 ldq $Xhi,0($Xi) 336 ldq $Xlo,8($Xi) 337 338 lda $rem_4bit,rem_4bit 339 340.Louter: 341 extql $inhi,$inp,$inhi 342 extqh $Thi0,$inp,$Thi0 343 or $inhi,$Thi0,$inhi 344 lda $inp,16($inp) 345 346 extql $inlo,$inp,$inlo 347 extqh $Tlo0,$inp,$Tlo0 348 or $inlo,$Tlo0,$inlo 349 subq $len,16,$len 350 351 xor $Xlo,$inlo,$Xlo 352 xor $Xhi,$inhi,$Xhi 353___ 354 355 &loop(); 356 357$code.=<<___; 358 srl $Zlo,24,$t0 # byte swap 359 srl $Zlo,8,$t1 360 361 sll $Zlo,8,$t2 362 sll $Zlo,24,$Zlo 363 zapnot $t0,0x11,$t0 364 zapnot $t1,0x22,$t1 365 366 zapnot $Zlo,0x88,$Zlo 367 or $t0,$t1,$t0 368 
zapnot $t2,0x44,$t2 369 370 or $Zlo,$t0,$Zlo 371 srl $Zhi,24,$t0 372 srl $Zhi,8,$t1 373 374 or $Zlo,$t2,$Zlo 375 sll $Zhi,8,$t2 376 sll $Zhi,24,$Zhi 377 378 srl $Zlo,32,$Xlo 379 sll $Zlo,32,$Zlo 380 beq $len,.Ldone 381 382 zapnot $t0,0x11,$t0 383 zapnot $t1,0x22,$t1 384 or $Zlo,$Xlo,$Xlo 385 ldq_u $inhi,0($inp) 386 387 zapnot $Zhi,0x88,$Zhi 388 or $t0,$t1,$t0 389 zapnot $t2,0x44,$t2 390 ldq_u $Thi0,7($inp) 391 392 or $Zhi,$t0,$Zhi 393 or $Zhi,$t2,$Zhi 394 ldq_u $inlo,8($inp) 395 ldq_u $Tlo0,15($inp) 396 397 srl $Zhi,32,$Xhi 398 sll $Zhi,32,$Zhi 399 400 or $Zhi,$Xhi,$Xhi 401 br zero,.Louter 402 403.Ldone: 404 zapnot $t0,0x11,$t0 405 zapnot $t1,0x22,$t1 406 or $Zlo,$Xlo,$Xlo 407 408 zapnot $Zhi,0x88,$Zhi 409 or $t0,$t1,$t0 410 zapnot $t2,0x44,$t2 411 412 or $Zhi,$t0,$Zhi 413 or $Zhi,$t2,$Zhi 414 415 srl $Zhi,32,$Xhi 416 sll $Zhi,32,$Zhi 417 418 or $Zhi,$Xhi,$Xhi 419 420 stq $Xlo,8($Xi) 421 stq $Xhi,0($Xi) 422 423 .set noreorder 424 /*ldq ra,0(sp)*/ 425 ldq s0,8(sp) 426 ldq s1,16(sp) 427 lda sp,32(sp) 428 ret (ra) 429.end gcm_ghash_4bit 430 431 .section .rodata 432 .align 4 433rem_4bit: 434 .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16 435 .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16 436 .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16 437 .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16 438 .previous 439 440___ 441$output=shift and open STDOUT,">$output"; 442print $code; 443close STDOUT; 444 445